Example 1
    def test(self):
        n = 5
        if self.replay_buffer_type == "ReplayBuffer":
            rbuf = replay_buffers.ReplayBuffer(capacity=None, num_steps=n)
        elif self.replay_buffer_type == "PrioritizedReplayBuffer":
            rbuf = replay_buffers.PrioritizedReplayBuffer(capacity=None,
                                                          num_steps=n)
        else:
            assert False

        # 2 transitions for env_id=0
        for _ in range(2):
            trans1 = dict(
                state=0,
                action=1,
                reward=2,
                next_state=3,
                next_action=4,
                is_state_terminal=False,
            )
            rbuf.append(env_id=0, **trans1)
        # 4 transitions for env_id=1 with a terminal state
        for i in range(4):
            trans1 = dict(
                state=0,
                action=1,
                reward=2,
                next_state=3,
                next_action=4,
                is_state_terminal=(i == 3),
            )
            rbuf.append(env_id=1, **trans1)
        # 9 transitions for env_id=2
        for _ in range(9):
            trans1 = dict(
                state=0,
                action=1,
                reward=2,
                next_state=3,
                next_action=4,
                is_state_terminal=False,
            )
            rbuf.append(env_id=2, **trans1)

        # The buffer should have 9 sampleable transitions so far:
        #   - 0 from env_id=0 (only 2 appends, fewer than num_steps=5)
        #   - 4 from env_id=1 (its terminal state flushed the whole episode)
        #   - 5 from env_id=2 (9 appends - num_steps + 1)
        assert len(rbuf) == 9

        # env_id=0 episode ends
        rbuf.stop_current_episode(env_id=0)

        # Now it should have 9 + 2 = 11 transitions
        assert len(rbuf) == 11

        # env_id=2 episode ends
        rbuf.stop_current_episode(env_id=2)

        # Finally it should have 9 + 2 + 4 = 15 transitions
        assert len(rbuf) == 15
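
A minimal sketch (assuming `replay_buffers` is `pfrl.replay_buffers`, as in the test above) of the counting rule these assertions rely on: with num_steps=n, an n-step buffer only exposes a transition for sampling once n appends from the same env_id have accumulated, or once that env's episode is closed.

from pfrl import replay_buffers

rbuf = replay_buffers.ReplayBuffer(capacity=None, num_steps=3)
for t in range(5):
    # Five 1-step appends for a single environment
    rbuf.append(state=t, action=0, reward=1.0, next_state=t + 1,
                is_state_terminal=False, env_id=0)
# Only 5 - 3 + 1 = 3 n-step transitions are sampleable so far
assert len(rbuf) == 3
# Closing the episode flushes the remaining partial n-step windows
rbuf.stop_current_episode(env_id=0)
assert len(rbuf) == 5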
Example 2
    def test_capacity(self):
        capacity = self.capacity
        if capacity is None:
            return

        rbuf = replay_buffers.PrioritizedReplayBuffer(capacity)
        # Fill the buffer
        for _ in range(capacity):
            trans1 = dict(
                state=0,
                action=1,
                reward=2,
                next_state=3,
                next_action=4,
                is_state_terminal=True,
            )
            rbuf.append(**trans1)
        assert len(rbuf) == capacity

        # Add a new transition
        trans2 = dict(
            state=1,
            action=1,
            reward=2,
            next_state=3,
            next_action=4,
            is_state_terminal=True,
        )
        rbuf.append(**trans2)
        # The size should not change
        assert len(rbuf) == capacity
Example 3
    def setUp(self):
        self.rbuf = replay_buffers.PrioritizedReplayBuffer(100)
        self.trans1 = dict(
            state=0,
            action=1,
            reward=2,
            next_state=3,
            next_action=4,
            is_state_terminal=True,
        )
        self.rbuf.append(**self.trans1)
Example 4
    def test_normalize_by_max(self):

        rbuf = replay_buffers.PrioritizedReplayBuffer(
            self.capacity,
            normalize_by_max=self.normalize_by_max,
            error_max=1000,
            num_steps=self.num_steps,
        )

        # Add 100 transitions
        for i in range(100):
            trans = dict(
                state=i,
                action=1,
                reward=2,
                next_state=i + 1,
                next_action=1,
                is_state_terminal=False,
            )
            rbuf.append(**trans)
        assert len(rbuf) == 100

        def set_errors_based_on_state(rbuf, samples):
            # Use the value of 'state' as an error, so that state 0 will have
            # the smallest error, thus the largest weight
            errors = [s[0]["state"] for s in samples]
            rbuf.update_errors(errors)

        # Assign different errors to all the transitions first
        samples = rbuf.sample(100)
        set_errors_based_on_state(rbuf, samples)

        # Repeatedly check how weights are normalized
        for i in range(100):
            samples = rbuf.sample(i + 1)
            # All the weights must be unique
            assert len(set(s[0]["weight"] for s in samples)) == len(samples)
            # Now check the maximum weight in a minibatch
            max_w = max([s[0]["weight"] for s in samples])
            if self.normalize_by_max == "batch":
                # Maximum weight in a minibatch must be 1
                np.testing.assert_allclose(max_w, 1)
            elif self.normalize_by_max == "memory":
                # Maximum weight in a minibatch must be less than 1 unless
                # the minibatch contains the transition of least error.
                if any(s[0]["state"] == 0 for s in samples):
                    np.testing.assert_allclose(max_w, 1)
                else:
                    assert max_w < 1
            set_errors_based_on_state(rbuf, samples)
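
A short sketch (hypothetical TD-error values, same pfrl API as above) of the sample/update_errors cycle the test exercises: each sample() is followed by update_errors() with one error per sampled transition, and with normalize_by_max="batch" the largest importance weight in every minibatch is rescaled to 1.

import numpy as np
from pfrl import replay_buffers

rbuf = replay_buffers.PrioritizedReplayBuffer(100, normalize_by_max="batch")
for i in range(10):
    rbuf.append(state=i, action=0, reward=0.0, next_state=i + 1,
                is_state_terminal=False)

samples = rbuf.sample(4)                       # list of 1-step transition lists
weights = [s[0]["weight"] for s in samples]    # importance-sampling weights
np.testing.assert_allclose(max(weights), 1.0)  # "batch" normalization
td_errors = [1.0, 0.5, 2.0, 0.1]               # hypothetical per-sample errors
rbuf.update_errors(td_errors)                  # must precede the next sample()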
Example 5
def main():
    env = MapRootEnv()

    size = env.observation_space.shape[:2]  # video frame size
    size = (size[1], size[0])  # reorder to (width, height)
    action_size = 4
    n_atoms = 51
    v_max = 10
    v_min = -10
    q_func = MyDistributionalDuelingDQN(action_size, n_atoms, v_min, v_max,
                                        env.input_shape[2])
    pnn.to_factorized_noisy(q_func)
    # print(q_func.main_stream.mu)
    agent = agents.CategoricalDoubleDQN(
        q_func,
        None,
        replay_buffers.PrioritizedReplayBuffer(),
        0.99,
        None,
        gpu=0)
    agent.load("results\\first_end\\best")

    final_reward = 0
    obs = env.reset()
    with agent.eval_mode():
        while True:
            env.render()
            action = agent.act(obs)
            obs, reward, done, _ = env.step(action)
            final_reward += reward
            # print("reward:", reward)
            if done:
                print("Complete!")
                break
        print("final reward:", final_reward)
        trail = env.env.get_trail()
        print("trail len: ", len(trail))
        env.render_trailed_image()
        cv2.waitKey(0)
Example 6
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", type=str, default="SlimeVolley-v0")
    parser.add_argument(
        "--outdir",
        type=str,
        default="results",
        help=(
            "Directory path to save output files."
            " If it does not exist, it will be created."
        ),
    )
    parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 32)")
    parser.add_argument("--gpu", type=int, default=0)
    parser.add_argument("--demo", action="store_true", default=False)
    parser.add_argument("--load", type=str, default=None)
    parser.add_argument("--noisy-net-sigma", type=float, default=0.1)
    parser.add_argument("--steps", type=int, default=2 * 10 ** 6)
    parser.add_argument("--replay-start-size", type=int, default=1600)
    parser.add_argument("--eval-n-episodes", type=int, default=1000)
    parser.add_argument("--eval-interval", type=int, default=250000)
    parser.add_argument(
        "--log-level",
        type=int,
        default=20,
        help="Logging level. 10:DEBUG, 20:INFO etc.",
    )
    parser.add_argument(
        "--render",
        action="store_true",
        default=False,
        help="Render env states in a GUI window.",
    )
    parser.add_argument(
        "--monitor",
        action="store_true",
        default=False,
        help=(
            "Monitor env. Videos and additional information are saved as output files."
        ),
    )
    parser.add_argument("--gamma", type=float, default=0.98)
    parser.add_argument("--v-max", type=float, default=1)
    parser.add_argument("--n-step-return", type=int, default=3)
    args = parser.parse_args()

    import logging

    logging.basicConfig(level=args.log_level)

    # Set a random seed used in PFRL.
    utils.set_random_seed(args.seed)

    # Set different random seeds for train and test envs.
    train_seed = args.seed
    test_seed = 2 ** 31 - 1 - args.seed

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print("Output files are saved in {}".format(args.outdir))

    def make_env(test):
        if "SlimeVolley" in args.env:
            # You need to install slimevolleygym
            import slimevolleygym  # NOQA

        env = gym.make(args.env)
        # Use different random seeds for train and test envs
        env_seed = test_seed if test else train_seed
        env.seed(int(env_seed))
        if args.monitor:
            env = pfrl.wrappers.Monitor(
                env, args.outdir, mode="evaluation" if test else "training"
            )
        if args.render:
            env = pfrl.wrappers.Render(env)
        if isinstance(env.action_space, gym.spaces.MultiBinary):
            env = MultiBinaryAsDiscreteAction(env)
        return env

    env = make_env(test=False)
    eval_env = make_env(test=True)

    obs_size = env.observation_space.low.size
    n_actions = env.action_space.n

    n_atoms = 51
    v_max = args.v_max
    v_min = -args.v_max
    hidden_size = 512
    q_func = nn.Sequential(
        nn.Linear(obs_size, hidden_size),
        nn.ReLU(),
        nn.Linear(hidden_size, hidden_size),
        nn.ReLU(),
        DistributionalDuelingHead(hidden_size, n_actions, n_atoms, v_min, v_max),
    )

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32)

    # Noisy nets
    pnn.to_factorized_noisy(q_func, sigma_scale=args.noisy_net_sigma)
    # Turn off explorer
    explorer = explorers.Greedy()

    # Use the same eps as https://arxiv.org/abs/1710.02298
    opt = torch.optim.Adam(q_func.parameters(), 1e-4, eps=1.5e-4)

    # Prioritized Replay
    # Anneal beta from beta0 to 1 throughout training
    update_interval = 1
    betasteps = args.steps / update_interval
    rbuf = replay_buffers.PrioritizedReplayBuffer(
        10 ** 6,
        alpha=0.5,
        beta0=0.4,
        betasteps=betasteps,
        num_steps=args.n_step_return,
        normalize_by_max="memory",
    )

    agent = agents.CategoricalDoubleDQN(
        q_func,
        opt,
        rbuf,
        gpu=args.gpu,
        gamma=args.gamma,
        explorer=explorer,
        minibatch_size=32,
        replay_start_size=args.replay_start_size,
        target_update_interval=2000,
        update_interval=update_interval,
        batch_accumulator="mean",
        phi=phi,
        max_grad_norm=10,
    )

    if args.load:
        agent.load(args.load)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=eval_env, agent=agent, n_steps=None, n_episodes=args.eval_n_episodes,
        )
        print(
            "n_episodes: {} mean: {} median: {} stdev {}".format(
                eval_stats["episodes"],
                eval_stats["mean"],
                eval_stats["median"],
                eval_stats["stdev"],
            )
        )

    else:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_episodes,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            save_best_so_far_agent=True,
            eval_env=eval_env,
        )
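
The betasteps = args.steps / update_interval line above encodes the schedule described in the comment: beta is annealed linearly from beta0 to 1 over the expected number of buffer updates. A small sketch of that schedule (plain arithmetic restating the comment, not a call into pfrl internals):

def beta_at(update_count, beta0=0.4, betasteps=2 * 10 ** 6):
    # Linear annealing from beta0 to 1 over `betasteps` buffer updates
    return min(1.0, beta0 + (1.0 - beta0) * (update_count / betasteps))

print(beta_at(0))            # 0.4 at the start of training
print(beta_at(10 ** 6))      # ~0.7 halfway through
print(beta_at(3 * 10 ** 6))  # clamped to 1.0 once betasteps updates have passed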
Example 7
    def test_save_and_load(self):
        capacity = self.capacity
        num_steps = self.num_steps

        tempdir = tempfile.mkdtemp()

        rbuf = replay_buffers.PrioritizedReplayBuffer(capacity,
                                                      num_steps=num_steps)

        # Add two transitions
        correct_item = collections.deque([], maxlen=num_steps)
        for _ in range(num_steps):
            trans1 = dict(
                state=0,
                action=1,
                reward=2,
                next_state=3,
                next_action=4,
                is_state_terminal=False,
            )
            correct_item.append(trans1)
            rbuf.append(**trans1)
        correct_item2 = copy.deepcopy(correct_item)
        trans2 = dict(
            state=1,
            action=1,
            reward=2,
            next_state=3,
            next_action=4,
            is_state_terminal=True,
        )
        correct_item2.append(trans2)
        rbuf.append(**trans2)

        # Now it has two transitions
        assert len(rbuf) == 2

        # Save
        filename = os.path.join(tempdir, "rbuf.pkl")
        rbuf.save(filename)

        # Initialize rbuf
        rbuf = replay_buffers.PrioritizedReplayBuffer(capacity,
                                                      num_steps=num_steps)

        # Of course it has no transition yet
        assert len(rbuf) == 0

        # Load the previously saved buffer
        rbuf.load(filename)

        # Now it has two transitions again
        assert len(rbuf) == 2

        # And sampled transitions are exactly what I added!
        s2 = rbuf.sample(2)
        del s2[0][0]["weight"]
        del s2[1][0]["weight"]
        if s2[0][num_steps - 1]["state"] == 0:
            assert s2[0] == list(correct_item)
            assert s2[1] == list(correct_item2)
        else:
            assert s2[0] == list(correct_item2)
            assert s2[1] == list(correct_item)
Example 8
    def test_append_and_sample(self):
        capacity = self.capacity
        num_steps = self.num_steps
        rbuf = replay_buffers.PrioritizedReplayBuffer(
            capacity,
            normalize_by_max=self.normalize_by_max,
            error_max=5,
            num_steps=num_steps,
        )

        assert len(rbuf) == 0

        # Add one and sample one
        correct_item = collections.deque([], maxlen=num_steps)
        for _ in range(num_steps):
            trans1 = dict(
                state=0,
                action=1,
                reward=2,
                next_state=3,
                next_action=4,
                is_state_terminal=False,
            )
            correct_item.append(trans1)
            rbuf.append(**trans1)
        assert len(rbuf) == 1
        s1 = rbuf.sample(1)
        rbuf.update_errors([3.14])
        assert len(s1) == 1
        np.testing.assert_allclose(s1[0][0]["weight"], 1.0)
        del s1[0][0]["weight"]
        assert s1[0] == list(correct_item)

        # Append a second, distinct transition and sample two, which must be unique
        correct_item2 = copy.deepcopy(correct_item)
        trans2 = dict(
            state=1,
            action=1,
            reward=2,
            next_state=3,
            next_action=4,
            is_state_terminal=True,
        )
        correct_item2.append(trans2)
        rbuf.append(**trans2)
        assert len(rbuf) == 2
        s2 = rbuf.sample(2)
        rbuf.update_errors([3.14, 2.71])
        assert len(s2) == 2
        del s2[0][0]["weight"]
        del s2[1][0]["weight"]
        if s2[0][num_steps - 1]["state"] == 1:
            assert s2[0] == list(correct_item2)
            assert s2[1] == list(correct_item)
        else:
            assert s2[0] == list(correct_item)
            assert s2[1] == list(correct_item2)

        # Weights should be different for different TD-errors
        s3 = rbuf.sample(2)
        assert not np.allclose(s3[0][0]["weight"], s3[1][0]["weight"])

        # Weights should be equal for different but clipped TD-errors
        rbuf.update_errors([5, 10])
        s3 = rbuf.sample(2)
        np.testing.assert_allclose(s3[0][0]["weight"], s3[1][0]["weight"])

        # Weights should be equal for the same TD-errors
        rbuf.update_errors([3.14, 3.14])
        s4 = rbuf.sample(2)
        np.testing.assert_allclose(s4[0][0]["weight"], s4[1][0]["weight"])
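
A sketch (not part of the test above) of how the per-transition "weight" values are typically consumed: as importance-sampling coefficients on the per-sample loss, with the new absolute TD errors fed back through update_errors before the next sample(). The compute_td_errors callable is hypothetical.

import numpy as np

def prioritized_update(rbuf, compute_td_errors, batch_size=32):
    # One prioritized-replay update step, sketched under the assumption that
    # `compute_td_errors` maps sampled n-step transitions to per-sample errors.
    samples = rbuf.sample(batch_size)
    td_errors = np.asarray(compute_td_errors(samples), dtype=np.float32)
    weights = np.asarray([s[0]["weight"] for s in samples], dtype=np.float32)
    # Importance-sampling correction: scale each squared error by its weight
    weighted_loss = float(np.mean(weights * np.square(td_errors)))
    # Report |TD error| back so priorities are updated before the next sample()
    rbuf.update_errors(np.abs(td_errors).tolist())
    return weighted_loss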
Example 9
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", type=str, default="BreakoutNoFrameskip-v4")
    parser.add_argument(
        "--outdir",
        type=str,
        default="results",
        help=("Directory path to save output files."
              " If it does not exist, it will be created."),
    )
    parser.add_argument("--seed",
                        type=int,
                        default=0,
                        help="Random seed [0, 2 ** 31)")
    parser.add_argument("--gpu", type=int, default=0)
    parser.add_argument("--demo", action="store_true", default=False)
    parser.add_argument("--load-pretrained",
                        action="store_true",
                        default=False)
    parser.add_argument("--pretrained-type",
                        type=str,
                        default="best",
                        choices=["best", "final"])
    parser.add_argument("--load", type=str, default=None)
    parser.add_argument("--eval-epsilon", type=float, default=0.0)
    parser.add_argument("--noisy-net-sigma", type=float, default=0.5)
    parser.add_argument("--steps", type=int, default=5 * 10**7)
    parser.add_argument(
        "--max-frames",
        type=int,
        default=30 * 60 * 60,  # 30 minutes with 60 fps
        help="Maximum number of frames for each episode.",
    )
    parser.add_argument("--replay-start-size", type=int, default=2 * 10**4)
    parser.add_argument("--eval-n-steps", type=int, default=125000)
    parser.add_argument("--eval-interval", type=int, default=250000)
    parser.add_argument(
        "--log-level",
        type=int,
        default=20,
        help="Logging level. 10:DEBUG, 20:INFO etc.",
    )
    parser.add_argument(
        "--render",
        action="store_true",
        default=False,
        help="Render env states in a GUI window.",
    )
    parser.add_argument(
        "--monitor",
        action="store_true",
        default=False,
        help=
        ("Monitor env. Videos and additional information are saved as output files."
         ),
    )
    parser.add_argument("--n-best-episodes", type=int, default=200)
    args = parser.parse_args()

    import logging

    logging.basicConfig(level=args.log_level)

    # Set a random seed used in PFRL.
    utils.set_random_seed(args.seed)

    # Set different random seeds for train and test envs.
    train_seed = args.seed
    test_seed = 2**31 - 1 - args.seed

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print("Output files are saved in {}".format(args.outdir))

    def make_env(test):
        # Use different random seeds for train and test envs
        env_seed = test_seed if test else train_seed
        env = atari_wrappers.wrap_deepmind(
            atari_wrappers.make_atari(args.env, max_frames=args.max_frames),
            episode_life=not test,
            clip_rewards=not test,
        )
        env.seed(int(env_seed))
        if test:
            # Randomize actions like epsilon-greedy in evaluation as well
            env = pfrl.wrappers.RandomizeAction(env, args.eval_epsilon)
        if args.monitor:
            env = pfrl.wrappers.Monitor(
                env, args.outdir, mode="evaluation" if test else "training")
        if args.render:
            env = pfrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    eval_env = make_env(test=True)

    n_actions = env.action_space.n

    n_atoms = 51
    v_max = 10
    v_min = -10
    q_func = DistributionalDuelingDQN(
        n_actions,
        n_atoms,
        v_min,
        v_max,
    )

    # Noisy nets
    pnn.to_factorized_noisy(q_func, sigma_scale=args.noisy_net_sigma)
    # Turn off explorer
    explorer = explorers.Greedy()

    # Use the same hyper parameters as https://arxiv.org/abs/1710.02298
    opt = torch.optim.Adam(q_func.parameters(), 6.25e-5, eps=1.5 * 10**-4)

    # Prioritized Replay
    # Anneal beta from beta0 to 1 throughout training
    update_interval = 4
    betasteps = args.steps / update_interval
    rbuf = replay_buffers.PrioritizedReplayBuffer(
        10**6,
        alpha=0.5,
        beta0=0.4,
        betasteps=betasteps,
        num_steps=3,
        normalize_by_max="memory",
    )

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    Agent = agents.CategoricalDoubleDQN
    agent = Agent(
        q_func,
        opt,
        rbuf,
        gpu=args.gpu,
        gamma=0.99,
        explorer=explorer,
        minibatch_size=32,
        replay_start_size=args.replay_start_size,
        target_update_interval=32000,
        update_interval=update_interval,
        batch_accumulator="mean",
        phi=phi,
    )

    if args.load or args.load_pretrained:
        # At most one of load and load_pretrained may be specified
        assert not args.load or not args.load_pretrained
        if args.load:
            agent.load(args.load)
        else:
            agent.load(
                utils.download_model("Rainbow",
                                     args.env,
                                     model_type=args.pretrained_type)[0])

    if args.demo:
        eval_stats = experiments.eval_performance(env=eval_env,
                                                  agent=agent,
                                                  n_steps=args.eval_n_steps,
                                                  n_episodes=None)
        print("n_episodes: {} mean: {} median: {} stdev {}".format(
            eval_stats["episodes"],
            eval_stats["mean"],
            eval_stats["median"],
            eval_stats["stdev"],
        ))

    else:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_n_steps=args.eval_n_steps,
            eval_n_episodes=None,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            save_best_so_far_agent=True,
            eval_env=eval_env,
        )

        dir_of_best_network = os.path.join(args.outdir, "best")
        agent.load(dir_of_best_network)

        # run 200 evaluation episodes, each capped at 30 mins of play
        stats = experiments.evaluator.eval_performance(
            env=eval_env,
            agent=agent,
            n_steps=None,
            n_episodes=args.n_best_episodes,
            max_episode_len=args.max_frames / 4,
            logger=None,
        )
        with open(os.path.join(args.outdir, "bestscores.json"), "w") as f:
            json.dump(stats, f)
        print("The results of the best scoring network:")
        for stat in stats:
            print(str(stat) + ":" + str(stats[stat]))
Example 10
def main():
    import logging

    logging.basicConfig(level=logging.INFO)

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--outdir",
        type=str,
        default="results",
        help=("Directory path to save output files."
              " If it does not exist, it will be created."),
    )
    parser.add_argument("--env", type=str, default="CartPole-v1")
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--gpu", type=int, default=0)
    parser.add_argument("--final-exploration-steps", type=int, default=1000)
    parser.add_argument("--start-epsilon", type=float, default=1.0)
    parser.add_argument("--end-epsilon", type=float, default=0.1)
    parser.add_argument("--demo", action="store_true", default=False)
    parser.add_argument("--load", type=str, default=None)
    parser.add_argument("--steps", type=int, default=10**8)
    parser.add_argument("--prioritized-replay", action="store_true")
    parser.add_argument("--replay-start-size", type=int, default=50)
    parser.add_argument("--target-update-interval", type=int, default=100)
    parser.add_argument("--target-update-method", type=str, default="hard")
    parser.add_argument("--soft-update-tau", type=float, default=1e-2)
    parser.add_argument("--update-interval", type=int, default=1)
    parser.add_argument("--eval-n-runs", type=int, default=100)
    parser.add_argument("--eval-interval", type=int, default=1000)
    parser.add_argument("--n-hidden-channels", type=int, default=12)
    parser.add_argument("--n-hidden-layers", type=int, default=3)
    parser.add_argument("--gamma", type=float, default=0.95)
    parser.add_argument("--minibatch-size", type=int, default=None)
    parser.add_argument("--render-train", action="store_true")
    parser.add_argument("--render-eval", action="store_true")
    parser.add_argument("--monitor", action="store_true")
    parser.add_argument("--reward-scale-factor", type=float, default=1.0)
    args = parser.parse_args()

    # Set a random seed used in PFRL
    utils.set_random_seed(args.seed)

    args.outdir = experiments.prepare_output_dir(args,
                                                 args.outdir,
                                                 argv=sys.argv)
    print("Output files are saved in {}".format(args.outdir))

    def make_env(test):
        env = gym.make(args.env)
        env_seed = 2**32 - 1 - args.seed if test else args.seed
        env.seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = pfrl.wrappers.CastObservationToFloat32(env)
        if args.monitor:
            env = pfrl.wrappers.Monitor(env, args.outdir)
        if not test:
            # Scale rewards (and thus returns) to a reasonable range so that
            # training is easier
            env = pfrl.wrappers.ScaleReward(env, args.reward_scale_factor)
        if (args.render_eval and test) or (args.render_train and not test):
            env = pfrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    timestep_limit = env.spec.max_episode_steps
    obs_size = env.observation_space.low.size
    action_space = env.action_space

    n_atoms = 51
    v_max = 500
    v_min = 0

    n_actions = action_space.n
    q_func = q_functions.DistributionalFCStateQFunctionWithDiscreteAction(
        obs_size,
        n_actions,
        n_atoms,
        v_min,
        v_max,
        n_hidden_channels=args.n_hidden_channels,
        n_hidden_layers=args.n_hidden_layers,
    )
    # Use epsilon-greedy for exploration
    explorer = explorers.LinearDecayEpsilonGreedy(
        args.start_epsilon,
        args.end_epsilon,
        args.final_exploration_steps,
        action_space.sample,
    )

    opt = torch.optim.Adam(q_func.parameters(), 1e-3)

    rbuf_capacity = 50000  # 5 * 10 ** 5
    if args.minibatch_size is None:
        args.minibatch_size = 32
    if args.prioritized_replay:
        betasteps = (args.steps -
                     args.replay_start_size) // args.update_interval
        rbuf = replay_buffers.PrioritizedReplayBuffer(rbuf_capacity,
                                                      betasteps=betasteps)
    else:
        rbuf = replay_buffers.ReplayBuffer(rbuf_capacity)

    agent = pfrl.agents.CategoricalDQN(
        q_func,
        opt,
        rbuf,
        gpu=args.gpu,
        gamma=args.gamma,
        explorer=explorer,
        replay_start_size=args.replay_start_size,
        target_update_interval=args.target_update_interval,
        update_interval=args.update_interval,
        minibatch_size=args.minibatch_size,
        target_update_method=args.target_update_method,
        soft_update_tau=args.soft_update_tau,
    )

    if args.load:
        agent.load(args.load)

    eval_env = make_env(test=True)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=eval_env,
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=timestep_limit,
        )
        print("n_runs: {} mean: {} median: {} stdev {}".format(
            args.eval_n_runs,
            eval_stats["mean"],
            eval_stats["median"],
            eval_stats["stdev"],
        ))
    else:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            eval_env=eval_env,
            train_max_episode_len=timestep_limit,
        )
Example 11
def main():
    import logging

    logging.basicConfig(level=logging.INFO)

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--outdir",
        type=str,
        default="results",
        help=("Directory path to save output files."
              " If it does not exist, it will be created."),
    )
    parser.add_argument("--env", type=str, default="Pendulum-v0")
    parser.add_argument("--seed",
                        type=int,
                        default=0,
                        help="Random seed [0, 2 ** 32)")
    parser.add_argument("--gpu", type=int, default=0)
    parser.add_argument("--final-exploration-steps", type=int, default=10**4)
    parser.add_argument("--start-epsilon", type=float, default=1.0)
    parser.add_argument("--end-epsilon", type=float, default=0.1)
    parser.add_argument("--noisy-net-sigma", type=float, default=None)
    parser.add_argument("--demo", action="store_true", default=False)
    parser.add_argument("--load", type=str, default=None)
    parser.add_argument("--steps", type=int, default=10**5)
    parser.add_argument("--prioritized-replay", action="store_true")
    parser.add_argument("--replay-start-size", type=int, default=1000)
    parser.add_argument("--target-update-interval", type=int, default=10**2)
    parser.add_argument("--target-update-method", type=str, default="hard")
    parser.add_argument("--soft-update-tau", type=float, default=1e-2)
    parser.add_argument("--update-interval", type=int, default=1)
    parser.add_argument("--eval-n-runs", type=int, default=100)
    parser.add_argument("--eval-interval", type=int, default=10**4)
    parser.add_argument("--n-hidden-channels", type=int, default=100)
    parser.add_argument("--n-hidden-layers", type=int, default=2)
    parser.add_argument("--gamma", type=float, default=0.99)
    parser.add_argument("--minibatch-size", type=int, default=None)
    parser.add_argument("--render-train", action="store_true")
    parser.add_argument("--render-eval", action="store_true")
    parser.add_argument("--monitor", action="store_true")
    parser.add_argument("--reward-scale-factor", type=float, default=1e-3)
    parser.add_argument(
        "--actor-learner",
        action="store_true",
        help="Enable asynchronous sampling with asynchronous actor(s)",
    )  # NOQA
    parser.add_argument(
        "--num-envs",
        type=int,
        default=1,
        help=("The number of environments for sampling (only effective with"
              " --actor-learner enabled)"),
    )  # NOQA
    args = parser.parse_args()

    # Set a random seed used in PFRL
    utils.set_random_seed(args.seed)

    args.outdir = experiments.prepare_output_dir(args,
                                                 args.outdir,
                                                 argv=sys.argv)
    print("Output files are saved in {}".format(args.outdir))

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs
    assert process_seeds.max() < 2**32

    def clip_action_filter(a):
        return np.clip(a, action_space.low, action_space.high)

    def make_env(idx=0, test=False):
        env = gym.make(args.env)
        # Use different random seeds for train and test envs
        process_seed = int(process_seeds[idx])
        env_seed = 2**32 - 1 - process_seed if test else process_seed
        utils.set_random_seed(env_seed)
        # Cast observations to float32 because our model uses float32
        env = pfrl.wrappers.CastObservationToFloat32(env)
        if args.monitor:
            env = pfrl.wrappers.Monitor(env, args.outdir)
        if isinstance(env.action_space, spaces.Box):
            utils.env_modifiers.make_action_filtered(env, clip_action_filter)
        if not test:
            # Scale rewards (and thus returns) to a reasonable range so that
            # training is easier
            env = pfrl.wrappers.ScaleReward(env, args.reward_scale_factor)
        if (args.render_eval and test) or (args.render_train and not test):
            env = pfrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    timestep_limit = env.spec.max_episode_steps
    obs_space = env.observation_space
    obs_size = obs_space.low.size
    action_space = env.action_space

    if isinstance(action_space, spaces.Box):
        action_size = action_space.low.size
        # Use NAF to apply DQN to continuous action spaces
        q_func = q_functions.FCQuadraticStateQFunction(
            obs_size,
            action_size,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers,
            action_space=action_space,
        )
        # Use the Ornstein-Uhlenbeck process for exploration
        ou_sigma = (action_space.high - action_space.low) * 0.2
        explorer = explorers.AdditiveOU(sigma=ou_sigma)
    else:
        n_actions = action_space.n
        q_func = q_functions.FCStateQFunctionWithDiscreteAction(
            obs_size,
            n_actions,
            n_hidden_channels=args.n_hidden_channels,
            n_hidden_layers=args.n_hidden_layers,
        )
        # Use epsilon-greedy for exploration
        explorer = explorers.LinearDecayEpsilonGreedy(
            args.start_epsilon,
            args.end_epsilon,
            args.final_exploration_steps,
            action_space.sample,
        )

    if args.noisy_net_sigma is not None:
        pnn.to_factorized_noisy(q_func, sigma_scale=args.noisy_net_sigma)
        # Turn off explorer
        explorer = explorers.Greedy()

    opt = optim.Adam(q_func.parameters())

    rbuf_capacity = 5 * 10**5
    if args.minibatch_size is None:
        args.minibatch_size = 32
    if args.prioritized_replay:
        betasteps = (args.steps -
                     args.replay_start_size) // args.update_interval
        rbuf = replay_buffers.PrioritizedReplayBuffer(rbuf_capacity,
                                                      betasteps=betasteps)
    else:
        rbuf = replay_buffers.ReplayBuffer(rbuf_capacity)

    agent = DQN(
        q_func,
        opt,
        rbuf,
        gpu=args.gpu,
        gamma=args.gamma,
        explorer=explorer,
        replay_start_size=args.replay_start_size,
        target_update_interval=args.target_update_interval,
        update_interval=args.update_interval,
        minibatch_size=args.minibatch_size,
        target_update_method=args.target_update_method,
        soft_update_tau=args.soft_update_tau,
    )

    if args.load:
        agent.load(args.load)

    eval_env = make_env(test=True)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=eval_env,
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=timestep_limit,
        )
        print("n_runs: {} mean: {} median: {} stdev {}".format(
            args.eval_n_runs,
            eval_stats["mean"],
            eval_stats["median"],
            eval_stats["stdev"],
        ))

    elif not args.actor_learner:

        print(
            "WARNING: Since https://github.com/pfnet/pfrl/pull/112 we have started"
            " setting `eval_during_episode=True` in this script, which affects the"
            " timings of evaluation phases.")

        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            eval_env=eval_env,
            train_max_episode_len=timestep_limit,
            eval_during_episode=True,
        )
    else:
        # Use asynchronous actor-learner training with the given number of envs

        # When we use multiple envs, it is critical to ensure each env
        # can occupy a CPU core to get the best performance.
        # Therefore, we need to prevent potential CPU over-provisioning caused
        # by multi-threading in OpenMP and NumPy, so disable their
        # multi-threading here.
        os.environ["OMP_NUM_THREADS"] = "1"  # NOQA

        (
            make_actor,
            learner,
            poller,
            exception_event,
        ) = agent.setup_actor_learner_training(args.num_envs)

        poller.start()
        learner.start()

        experiments.train_agent_async(
            processes=args.num_envs,
            make_agent=make_actor,
            make_env=make_env,
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            stop_event=learner.stop_event,
            exception_event=exception_event,
        )

        poller.stop()
        learner.stop()
        poller.join()
        learner.join()
Example 12
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--env",
        type=str,
        default="BreakoutNoFrameskip-v4",
        help="OpenAI Atari domain to perform algorithm on.",
    )
    parser.add_argument(
        "--outdir",
        type=str,
        default="results",
        help=("Directory path to save output files."
              " If it does not exist, it will be created."),
    )
    parser.add_argument("--seed",
                        type=int,
                        default=0,
                        help="Random seed [0, 2 ** 31)")
    parser.add_argument("--gpu",
                        type=int,
                        default=0,
                        help="GPU to use, set to -1 if no GPU.")
    parser.add_argument("--demo", action="store_true", default=False)
    parser.add_argument("--load", type=str, default=None)
    parser.add_argument(
        "--final-exploration-frames",
        type=int,
        default=10**6,
        help="Timesteps after which we stop " + "annealing exploration rate",
    )
    parser.add_argument(
        "--final-epsilon",
        type=float,
        default=0.01,
        help="Final value of epsilon during training.",
    )
    parser.add_argument(
        "--eval-epsilon",
        type=float,
        default=0.001,
        help="Exploration epsilon used during eval episodes.",
    )
    parser.add_argument("--noisy-net-sigma", type=float, default=None)
    parser.add_argument(
        "--arch",
        type=str,
        default="doubledqn",
        choices=["nature", "nips", "dueling", "doubledqn"],
        help="Network architecture to use.",
    )
    parser.add_argument(
        "--steps",
        type=int,
        default=5 * 10**7,
        help="Total number of timesteps to train the agent.",
    )
    parser.add_argument(
        "--max-frames",
        type=int,
        default=30 * 60 * 60,  # 30 minutes with 60 fps
        help="Maximum number of frames for each episode.",
    )
    parser.add_argument(
        "--replay-start-size",
        type=int,
        default=5 * 10**4,
        help="Minimum replay buffer size before " +
        "performing gradient updates.",
    )
    parser.add_argument(
        "--target-update-interval",
        type=int,
        default=3 * 10**4,
        help="Frequency (in timesteps) at which " +
        "the target network is updated.",
    )
    parser.add_argument(
        "--eval-interval",
        type=int,
        default=10**5,
        help="Frequency (in timesteps) of evaluation phase.",
    )
    parser.add_argument(
        "--update-interval",
        type=int,
        default=4,
        help="Frequency (in timesteps) of network updates.",
    )
    parser.add_argument("--eval-n-runs", type=int, default=10)
    parser.add_argument("--no-clip-delta",
                        dest="clip_delta",
                        action="store_false")
    parser.add_argument("--num-step-return", type=int, default=1)
    parser.set_defaults(clip_delta=True)
    parser.add_argument("--agent",
                        type=str,
                        default="DoubleDQN",
                        choices=["DQN", "DoubleDQN", "PAL"])
    parser.add_argument(
        "--log-level",
        type=int,
        default=20,
        help="Logging level. 10:DEBUG, 20:INFO etc.",
    )
    parser.add_argument(
        "--render",
        action="store_true",
        default=False,
        help="Render env states in a GUI window.",
    )
    parser.add_argument(
        "--monitor",
        action="store_true",
        default=False,
        help=
        ("Monitor env. Videos and additional information are saved as output files."
         ),
    )
    parser.add_argument("--lr",
                        type=float,
                        default=2.5e-4,
                        help="Learning rate.")
    parser.add_argument(
        "--prioritized",
        action="store_true",
        default=False,
        help="Use prioritized experience replay.",
    )
    parser.add_argument(
        "--checkpoint-frequency",
        type=int,
        default=None,
        help="Frequency at which agents are stored.",
    )
    args = parser.parse_args()

    import logging

    logging.basicConfig(level=args.log_level)

    # Set a random seed used in PFRL.
    utils.set_random_seed(args.seed)

    # Set different random seeds for train and test envs.
    train_seed = args.seed
    test_seed = 2**31 - 1 - args.seed

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print("Output files are saved in {}".format(args.outdir))

    def make_env(test):
        # Use different random seeds for train and test envs
        env_seed = test_seed if test else train_seed
        env = atari_wrappers.wrap_deepmind(
            atari_wrappers.make_atari(args.env, max_frames=args.max_frames),
            episode_life=not test,
            clip_rewards=not test,
        )
        env.seed(int(env_seed))
        if test:
            # Randomize actions like epsilon-greedy in evaluation as well
            env = pfrl.wrappers.RandomizeAction(env, args.eval_epsilon)
        if args.monitor:
            env = pfrl.wrappers.Monitor(
                env, args.outdir, mode="evaluation" if test else "training")
        if args.render:
            env = pfrl.wrappers.Render(env)
        return env

    env = make_env(test=False)
    eval_env = make_env(test=True)

    n_actions = env.action_space.n
    q_func = parse_arch(args.arch, n_actions)

    if args.noisy_net_sigma is not None:
        pnn.to_factorized_noisy(q_func, sigma_scale=args.noisy_net_sigma)
        # Turn off explorer
        explorer = explorers.Greedy()
    else:
        explorer = explorers.LinearDecayEpsilonGreedy(
            1.0,
            args.final_epsilon,
            args.final_exploration_frames,
            lambda: np.random.randint(n_actions),
        )

    # Use the Nature paper's hyperparameters
    opt = pfrl.optimizers.RMSpropEpsInsideSqrt(
        q_func.parameters(),
        lr=args.lr,
        alpha=0.95,
        momentum=0.0,
        eps=1e-2,
        centered=True,
    )

    # Select a replay buffer to use
    if args.prioritized:
        # Anneal beta from beta0 to 1 throughout training
        betasteps = args.steps / args.update_interval
        rbuf = replay_buffers.PrioritizedReplayBuffer(
            10**6,
            alpha=0.6,
            beta0=0.4,
            betasteps=betasteps,
            num_steps=args.num_step_return,
        )
    else:
        rbuf = replay_buffers.ReplayBuffer(10**6, args.num_step_return)

    def phi(x):
        # Feature extractor
        return np.asarray(x, dtype=np.float32) / 255

    Agent = parse_agent(args.agent)
    agent = Agent(
        q_func,
        opt,
        rbuf,
        gpu=args.gpu,
        gamma=0.99,
        explorer=explorer,
        replay_start_size=args.replay_start_size,
        target_update_interval=args.target_update_interval,
        clip_delta=args.clip_delta,
        update_interval=args.update_interval,
        batch_accumulator="sum",
        phi=phi,
    )

    if args.load:
        agent.load(args.load)

    if args.demo:
        eval_stats = experiments.eval_performance(env=eval_env,
                                                  agent=agent,
                                                  n_steps=None,
                                                  n_episodes=args.eval_n_runs)
        print("n_runs: {} mean: {} median: {} stdev {}".format(
            args.eval_n_runs,
            eval_stats["mean"],
            eval_stats["median"],
            eval_stats["stdev"],
        ))
    else:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_n_steps=None,
            checkpoint_freq=args.checkpoint_frequency,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            save_best_so_far_agent=False,
            eval_env=eval_env,
        )
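
parse_arch and parse_agent are helpers defined elsewhere in the original script and are not shown in this excerpt. A hypothetical sketch of a parse_agent dispatcher matching the --agent choices above:

from pfrl import agents

def parse_agent(agent):
    # Map the --agent string to the corresponding pfrl agent class
    return {"DQN": agents.DQN,
            "DoubleDQN": agents.DoubleDQN,
            "PAL": agents.PAL}[agent]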
Example 13
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--outdir",
        type=str,
        default="results",
        help=(
            "Directory path to save output files."
            " If it does not exist, it will be created."
        ),
    )
    parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 31)")
    parser.add_argument(
        "--gpu", type=int, default=0, help="GPU to use, set to -1 if no GPU."
    )
    parser.add_argument(
        "--demo",
        action="store_true",
        default=False,
        help="Evaluate the agent without training.",
    )
    parser.add_argument(
        "--load",
        type=str,
        default=None,
        help="Load a saved agent from a given directory.",
    )
    parser.add_argument(
        "--final-exploration-steps",
        type=int,
        default=5 * 10 ** 5,
        help="Timesteps after which we stop annealing exploration rate",
    )
    parser.add_argument(
        "--final-epsilon",
        type=float,
        default=0.2,
        help="Final value of epsilon during training.",
    )
    parser.add_argument(
        "--steps",
        type=int,
        default=2 * 10 ** 6,
        help="Total number of timesteps to train the agent.",
    )
    parser.add_argument(
        "--replay-start-size",
        type=int,
        default=5 * 10 ** 4,
        help="Minimum replay buffer size before performing gradient updates.",
    )
    parser.add_argument(
        "--target-update-interval",
        type=int,
        default=1 * 10 ** 4,
        help="Frequency (in timesteps) at which the target network is updated.",
    )
    parser.add_argument(
        "--eval-interval",
        type=int,
        default=10 ** 5,
        help="Frequency (in timesteps) of evaluation phase.",
    )
    parser.add_argument(
        "--update-interval",
        type=int,
        default=1,
        help="Frequency (in timesteps) of network updates.",
    )
    parser.add_argument(
        "--eval-n-runs",
        type=int,
        default=100,
        help="Number of episodes used for evaluation.",
    )
    parser.add_argument(
        "--log-level",
        type=int,
        default=20,
        help="Logging level. 10:DEBUG, 20:INFO etc.",
    )
    parser.add_argument(
        "--render",
        action="store_true",
        default=False,
        help="Render env states in a GUI window.",
    )
    parser.add_argument("--lr", type=float, default=6.25e-5, help="Learning rate")
    parser.add_argument(
        "--num-envs", type=int, default=1, help="Number of envs run in parallel."
    )
    parser.add_argument(
        "--batch-size", type=int, default=32, help="Batch size used for training."
    )
    parser.add_argument(
        "--record",
        action="store_true",
        default=False,
        help="Record videos of evaluation envs. --render should also be specified.",
    )
    parser.add_argument("--gamma", type=float, default=0.99, help="Discount factor.")
    args = parser.parse_args()

    import logging

    logging.basicConfig(level=args.log_level)

    # Set a random seed used in PFRL.
    utils.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs
    assert process_seeds.max() < 2 ** 32

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print("Output files are saved in {}".format(args.outdir))

    max_episode_steps = 8

    def make_env(idx, test):
        from pybullet_envs.bullet.kuka_diverse_object_gym_env import (
            KukaDiverseObjectEnv,
        )  # NOQA

        # Use different random seeds for train and test envs
        process_seed = int(process_seeds[idx])
        env_seed = 2 ** 32 - 1 - process_seed if test else process_seed
        # Set a random seed for this subprocess
        utils.set_random_seed(env_seed)
        env = KukaDiverseObjectEnv(
            isDiscrete=True,
            renders=args.render and (args.demo or not test),
            height=84,
            width=84,
            maxSteps=max_episode_steps,
            isTest=test,
        )
        # Disable file caching to keep memory usage small
        env._p.setPhysicsEngineParameter(enableFileCaching=False)
        # KukaDiverseObjectEnv does not define observation_space, so set it here
        assert env.observation_space is None
        env.observation_space = gym.spaces.Box(
            low=0, high=255, shape=(84, 84, 3), dtype=np.uint8
        )
        # (84, 84, 3) -> (3, 84, 84)
        env = TransposeObservation(env, (2, 0, 1))
        env = ObserveElapsedSteps(env, max_episode_steps)
        # KukaDiverseObjectEnv internally asserts int actions
        env = CastAction(env, int)
        env.seed(int(env_seed))
        if test and args.record:
            assert args.render, "To use --record, --render needs to be specified."
            video_dir = os.path.join(args.outdir, "video_{}".format(idx))
            os.mkdir(video_dir)
            env = RecordMovie(env, video_dir)
        return env

    def make_batch_env(test):
        return pfrl.envs.MultiprocessVectorEnv(
            [functools.partial(make_env, idx, test) for idx in range(args.num_envs)]
        )

    eval_env = make_batch_env(test=True)
    n_actions = eval_env.action_space.n

    q_func = GraspingQFunction(n_actions, max_episode_steps)

    # Use the hyper parameters of the Nature paper
    opt = pfrl.optimizers.RMSpropEpsInsideSqrt(
        q_func.parameters(),
        lr=args.lr,
        alpha=0.95,
        momentum=0.0,
        eps=1e-2,
        centered=True,
    )

    # Anneal beta from beta0 to 1 throughout training
    betasteps = args.steps / args.update_interval
    rbuf = replay_buffers.PrioritizedReplayBuffer(
        10 ** 6, alpha=0.6, beta0=0.4, betasteps=betasteps
    )

    explorer = explorers.LinearDecayEpsilonGreedy(
        1.0,
        args.final_epsilon,
        args.final_exploration_steps,
        lambda: np.random.randint(n_actions),
    )

    def phi(x):
        # Feature extractor
        image, elapsed_steps = x
        # Normalize RGB values: [0, 255] -> [0, 1]
        norm_image = np.asarray(image, dtype=np.float32) / 255
        return norm_image, elapsed_steps

    agent = pfrl.agents.DoubleDQN(
        q_func,
        opt,
        rbuf,
        gpu=args.gpu,
        gamma=args.gamma,
        explorer=explorer,
        minibatch_size=args.batch_size,
        replay_start_size=args.replay_start_size,
        target_update_interval=args.target_update_interval,
        update_interval=args.update_interval,
        batch_accumulator="sum",
        phi=phi,
    )

    if args.load:
        agent.load(args.load)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=eval_env, agent=agent, n_steps=None, n_episodes=args.eval_n_runs
        )
        print(
            "n_runs: {} mean: {} median: {} stdev {}".format(
                args.eval_n_runs,
                eval_stats["mean"],
                eval_stats["median"],
                eval_stats["stdev"],
            )
        )
    else:
        experiments.train_agent_batch_with_evaluation(
            agent=agent,
            env=make_batch_env(test=False),
            eval_env=eval_env,
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            save_best_so_far_agent=False,
            log_interval=1000,
        )
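
TransposeObservation, ObserveElapsedSteps, CastAction and RecordMovie are custom wrappers defined elsewhere in the original example. As an illustration only, a hedged sketch of what an HWC-to-CHW transpose wrapper of this kind could look like (the details are assumptions, not the example's actual implementation):

import gym
import numpy as np

class TransposeObservation(gym.ObservationWrapper):
    """Sketch: reorder observation axes, e.g. (H, W, C) -> (C, H, W)."""

    def __init__(self, env, axes):
        super().__init__(env)
        self._axes = axes
        old_space = env.observation_space
        self.observation_space = gym.spaces.Box(
            low=old_space.low.transpose(axes),
            high=old_space.high.transpose(axes),
            dtype=old_space.dtype,
        )

    def observation(self, observation):
        return np.asarray(observation).transpose(self._axes)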
Example 14
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", type=str, default="BreakoutNoFrameskip-v4")
    parser.add_argument(
        "--outdir",
        type=str,
        default="results",
        help=("Directory path to save output files."
              " If it does not exist, it will be created."),
    )
    parser.add_argument("--seed",
                        type=int,
                        default=0,
                        help="Random seed [0, 2 ** 31)")
    parser.add_argument("--gpu", type=int, default=0)
    parser.add_argument("--demo", action="store_true", default=False)
    parser.add_argument("--load", type=str, default=None)
    parser.add_argument("--final-exploration-frames", type=int, default=10**6)
    parser.add_argument("--final-epsilon", type=float, default=0.01)
    parser.add_argument("--eval-epsilon", type=float, default=0.001)
    parser.add_argument("--noisy-net-sigma", type=float, default=None)
    parser.add_argument(
        "--arch",
        type=str,
        default="doubledqn",
        choices=["nature", "nips", "dueling", "doubledqn"],
    )
    parser.add_argument("--steps", type=int, default=5 * 10**7)
    parser.add_argument(
        "--max-frames",
        type=int,
        default=30 * 60 * 60,  # 30 minutes with 60 fps
        help="Maximum number of frames for each episode.",
    )
    parser.add_argument("--replay-start-size", type=int, default=5 * 10**4)
    parser.add_argument("--target-update-interval",
                        type=int,
                        default=3 * 10**4)
    parser.add_argument("--eval-interval", type=int, default=10**5)
    parser.add_argument("--update-interval", type=int, default=4)
    parser.add_argument("--eval-n-runs", type=int, default=10)
    parser.add_argument("--no-clip-delta",
                        dest="clip_delta",
                        action="store_false")
    parser.set_defaults(clip_delta=True)
    parser.add_argument("--agent",
                        type=str,
                        default="DoubleDQN",
                        choices=["DQN", "DoubleDQN", "PAL"])
    parser.add_argument(
        "--log-level",
        type=int,
        default=20,
        help="Logging level. 10:DEBUG, 20:INFO etc.",
    )
    parser.add_argument(
        "--render",
        action="store_true",
        default=False,
        help="Render env states in a GUI window.",
    )
    parser.add_argument(
        "--monitor",
        action="store_true",
        default=False,
        help=
        ("Monitor env. Videos and additional information are saved as output files."
         ),
    )
    parser.add_argument("--lr",
                        type=float,
                        default=2.5e-4,
                        help="Learning rate")
    parser.add_argument(
        "--prioritized",
        action="store_true",
        default=False,
        help="Use prioritized experience replay.",
    )
    parser.add_argument("--num-envs", type=int, default=1)
    parser.add_argument("--n-step-return", type=int, default=1)
    args = parser.parse_args()

    import logging

    logging.basicConfig(level=args.log_level)

    # Set a random seed used in PFRL.
    utils.set_random_seed(args.seed)

    # Set different random seeds for different subprocesses.
    # If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
    # If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
    process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs
    assert process_seeds.max() < 2**32

    args.outdir = experiments.prepare_output_dir(args, args.outdir)
    print("Output files are saved in {}".format(args.outdir))

    def make_env(idx, test):
        # Use different random seeds for train and test envs
        process_seed = int(process_seeds[idx])
        env_seed = 2**32 - 1 - process_seed if test else process_seed
        env = atari_wrappers.wrap_deepmind(
            atari_wrappers.make_atari(args.env, max_frames=args.max_frames),
            episode_life=not test,
            clip_rewards=not test,
            frame_stack=False,
        )
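        # Frame stacking is disabled here; frames are stacked later at the
        # vector-env level by VectorFrameStack (see make_batch_env).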
        if test:
            # Randomize actions like epsilon-greedy in evaluation as well
            env = pfrl.wrappers.RandomizeAction(env, args.eval_epsilon)
        env.seed(env_seed)
        if args.monitor:
            env = pfrl.wrappers.Monitor(
                env, args.outdir, mode="evaluation" if test else "training")
        if args.render:
            env = pfrl.wrappers.Render(env)
        return env

    def make_batch_env(test):
        vec_env = pfrl.envs.MultiprocessVectorEnv([
            functools.partial(make_env, idx, test)
            for idx in range(args.num_envs)
        ])
        vec_env = pfrl.wrappers.VectorFrameStack(vec_env, 4)
        return vec_env

    sample_env = make_env(0, test=False)

    n_actions = sample_env.action_space.n
    q_func = parse_arch(args.arch, n_actions)

    if args.noisy_net_sigma is not None:
        pnn.to_factorized_noisy(q_func, sigma_scale=args.noisy_net_sigma)
        # Turn off explorer
        explorer = explorers.Greedy()
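        # NoisyNet layers provide exploration through learned parameter noise,
        # so the epsilon-greedy schedule below is skipped in this case.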

    # Use the same hyperparameters as the Nature DQN paper.
    opt = optim.RMSprop(
        q_func.parameters(),
        lr=args.lr,
        alpha=0.95,
        momentum=0.0,
        eps=1e-2,
        centered=True,
    )

    # Select a replay buffer to use
    if args.prioritized:
        # Anneal beta from beta0 to 1 throughout training
        betasteps = args.steps / args.update_interval
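        # alpha controls how strongly TD errors bias sampling, while beta0 is the
        # initial importance-sampling correction, annealed to 1 over betasteps.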
        rbuf = replay_buffers.PrioritizedReplayBuffer(
            10**6,
            alpha=0.6,
            beta0=0.4,
            betasteps=betasteps,
            num_steps=args.n_step_return,
        )
    else:
        rbuf = replay_buffers.ReplayBuffer(10**6, num_steps=args.n_step_return)

    if args.noisy_net_sigma is None:
        # Decay epsilon linearly from 1.0 to final_epsilon over
        # final_exploration_frames steps.
        explorer = explorers.LinearDecayEpsilonGreedy(
            1.0,
            args.final_epsilon,
            args.final_exploration_frames,
            lambda: np.random.randint(n_actions),
        )

    def phi(x):
        # Feature extractor: convert uint8 frames to float32 scaled to [0, 1].
        return np.asarray(x, dtype=np.float32) / 255

    Agent = parse_agent(args.agent)
    agent = Agent(
        q_func,
        opt,
        rbuf,
        gpu=args.gpu,
        gamma=0.99,
        explorer=explorer,
        replay_start_size=args.replay_start_size,
        target_update_interval=args.target_update_interval,
        clip_delta=args.clip_delta,
        update_interval=args.update_interval,
        batch_accumulator="sum",
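        # batch_accumulator="sum" sums the per-sample losses over the minibatch
        # instead of averaging them.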
        phi=phi,
    )

    if args.load:
        agent.load(args.load)

    if args.demo:
        eval_stats = experiments.eval_performance(
            env=make_batch_env(test=True),
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
        )
        print("n_runs: {} mean: {} median: {} stdev: {}".format(
            args.eval_n_runs,
            eval_stats["mean"],
            eval_stats["median"],
            eval_stats["stdev"],
        ))
    else:
        experiments.train_agent_batch_with_evaluation(
            agent=agent,
            env=make_batch_env(test=False),
            eval_env=make_batch_env(test=True),
            steps=args.steps,
            eval_n_steps=None,
            eval_n_episodes=args.eval_n_runs,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            save_best_so_far_agent=False,
            log_interval=1000,
        )
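
    # Hypothetical invocation (the script file name and env id below are
    # assumptions, not part of the original listing):
    #   python train_dqn_batch_ale.py --env BreakoutNoFrameskip-v4 \
    #       --num-envs 8 --prioritized --gpu 0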
Esempio n. 15
0
else:
    if params.explorer_method == 0:
        explorer = explorers.LinearDecayEpsilonGreedy(
            params.epsilon_max, params.epsilon_min, params.epsilon_steps,
            lambda: np.random.randint(env.action_space.n),
        )
    else:
        explorer = RandomSelectionEpsilonGreedy(
            params.epsilon_min, params.epsilon_max, params.epsilon_num,
            params.epsilon_interval,
            lambda: np.random.randint(env.action_space.n),
        )

optimizer = torch.optim.Adam(q_func.parameters(), lr=params.lr, eps=1e-8)  # alternatively eps=1.5e-4


rbuf = replay_buffers.PrioritizedReplayBuffer(
    params.per_size,
    alpha=params.per_alpha,
    beta0=params.per_beta,
    betasteps=params.per_beta_steps,
    num_steps=params.per_num_steps,
    normalize_by_max="memory"
)
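# normalize_by_max="memory" rescales importance-sampling weights by the largest
# weight over the whole buffer, rather than per sampled minibatch ("batch").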

def phi(x):
    # Feature extractor: cast observations to float32 and scale by 1/255.
    return np.asarray(x, dtype=np.float32) / 255

if params.rainbow:
    agent = CategoricalDoubleDQN(
        q_func,
        optimizer,
        rbuf,
        gpu=0,
        gamma=params.discount,
        explorer=explorer,
Esempio n. 16
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--outdir",
        type=str,
        default="results",
        help=("Directory path to save output files."
              " If it does not exist, it will be created."),
    )
    parser.add_argument("--seed",
                        type=int,
                        default=0,
                        help="Random seed [0, 2 ** 31)")
    parser.add_argument("--gpu", type=int, default=0)
    parser.add_argument("--demo", action="store_true", default=False)
    parser.add_argument("--load-pretrained",
                        action="store_true",
                        default=False)
    parser.add_argument("--pretrained-type",
                        type=str,
                        default="best",
                        choices=["best", "final"])
    parser.add_argument("--load", type=str, default=None)
    parser.add_argument("--eval-epsilon", type=float, default=0.0)
    parser.add_argument("--noisy-net-sigma", type=float, default=0.2)
    parser.add_argument("--steps", type=int, default=5 * 10**7)
    parser.add_argument(
        "--max-frames",
        type=int,
        default=30 * 60 * 60,  # 30 minutes with 60 fps
        help="Maximum number of frames for each episode.",
    )
    parser.add_argument("--replay-start-size", type=int, default=2 * 10**3)
    parser.add_argument("--eval-n-steps", type=int, default=125000)
    parser.add_argument("--eval-interval", type=int, default=250000)
    parser.add_argument(
        "--log-level",
        type=int,
        default=20,
        help="Logging level. 10:DEBUG, 20:INFO etc.",
    )
    parser.add_argument(
        "--render",
        action="store_true",
        default=False,
        help="Render env states in a GUI window.",
    )
    parser.add_argument(
        "--monitor",
        action="store_true",
        default=False,
        help=("Monitor env. Videos and additional information"
              " are saved as output files."),
    )
    parser.add_argument("--n-best-episodes", type=int, default=200)
    args = parser.parse_args()

    import logging

    logging.basicConfig(level=args.log_level)

    # Set a random seed used in PFRL.
    utils.set_random_seed(args.seed)

    # Set different random seeds for train and test envs.
    train_seed = args.seed
    test_seed = 2**31 - 1 - args.seed

    test_ID = datetime.datetime.now().strftime("%Y-%m-%d_%H_%M_%S")
    args.outdir = experiments.prepare_output_dir(args, args.outdir, test_ID)
    print("Output files are saved in {}".format(args.outdir))

    env = MapRootEnv()
    eval_env = MapRootEnv()

    n_actions = env.action_space.n
    input_shape = env.input_shape

    n_atoms = 51
    v_max = 10
    v_min = -10
    q_func = MyDistributionalDuelingDQN(n_actions, n_atoms, v_min, v_max,
                                        input_shape[2])
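    # 51 atoms spanning [v_min, v_max] form the support of the categorical
    # return distribution, as in C51 / Rainbow.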

    # Noisy nets
    pnn.to_factorized_noisy(q_func, sigma_scale=args.noisy_net_sigma)
    # Turn off explorer
    explorer = explorers.Greedy()
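    # The factorized noisy layers inject exploration noise directly into the
    # network parameters, so a greedy policy is used for action selection.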

    # Use the same hyperparameters as Rainbow (https://arxiv.org/abs/1710.02298).
    opt = torch.optim.Adam(q_func.parameters(), 6.25e-5, eps=1.5 * 10**-4)

    # Prioritized Replay
    # Anneal beta from beta0 to 1 throughout training
    update_interval = 4
    betasteps = args.steps / update_interval
    rbuf = replay_buffers.PrioritizedReplayBuffer(
        5 * 10**4,
        alpha=0.5,
        beta0=0.4,
        betasteps=betasteps,
        num_steps=3,
        normalize_by_max="batch",
    )
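    # normalize_by_max="batch" rescales the importance-sampling weights by the
    # largest weight within each sampled minibatch.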

    Agent = agents.CategoricalDoubleDQN
    agent = Agent(q_func,
                  opt,
                  rbuf,
                  gpu=args.gpu,
                  gamma=0.80,
                  explorer=explorer,
                  minibatch_size=32,
                  replay_start_size=args.replay_start_size,
                  target_update_interval=32000,
                  update_interval=update_interval,
                  batch_accumulator="mean")

    if args.demo:
        eval_stats = experiments.eval_performance(env=eval_env,
                                                  agent=agent,
                                                  n_steps=args.eval_n_steps,
                                                  n_episodes=None)
        print("n_episodes: {} mean: {} median: {} stdev: {}".format(
            eval_stats["episodes"],
            eval_stats["mean"],
            eval_stats["median"],
            eval_stats["stdev"],
        ))

    else:
        experiments.train_agent_with_evaluation(
            agent=agent,
            env=env,
            steps=args.steps,
            eval_n_steps=args.eval_n_steps,
            eval_n_episodes=None,
            eval_interval=args.eval_interval,
            outdir=args.outdir,
            save_best_so_far_agent=True,
            eval_env=eval_env,
            logger=TBLogger(args.outdir))

        # Load the best network found during training before re-evaluating it.
        dir_of_best_network = os.path.join(args.outdir, "best")
        agent.load(dir_of_best_network)

        # Run `--n-best-episodes` evaluation episodes (200 by default),
        # each capped at max_frames // 4 steps (roughly 30 minutes of play).
        stats = experiments.evaluator.eval_performance(
            env=eval_env,
            agent=agent,
            n_steps=None,
            n_episodes=args.n_best_episodes,
            max_episode_len=args.max_frames // 4,
            logger=None)
        with open(os.path.join(args.outdir, "bestscores.json"), "w") as f:
            json.dump(stats, f)
        print("The results of the best scoring network:")
        for key, value in stats.items():
            print("{}: {}".format(key, value))