Example #1
def create_env(args: argparse.Namespace, report_each: int = 100, **kwargs) \
        -> tuple[wrappers.EvaluationEnv, np.ndarray, np.ndarray, np.ndarray]:
    # Create the environment
    env = wrappers.EvaluationEnv(gym.make("Taxi-v3"),
                                 seed=args.seed,
                                 report_each=report_each,
                                 **kwargs)

    # Extract a deterministic MDP into three NumPy arrays
    # - R[state][action] is the reward
    # - D[state][action] is a True/False flag indicating the end of the episode
    # - N[state][action] is the next state
    R, D, N = [
        np.array([[env.P[s][a][0][i] for a in range(env.action_space.n)]
                  for s in range(env.observation_space.n)]) for i in [2, 3, 1]
    ]

    return env, R, D, N
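As a usage illustration (not part of the original template), the extracted arrays allow a fully vectorized Bellman backup; `gamma` below is an assumed discount factor:

def value_iteration_step(V: np.ndarray, R: np.ndarray, D: np.ndarray, N: np.ndarray,
                         gamma: float = 0.99) -> np.ndarray:
    # One synchronous backup over all states: bootstrap with V[next_state],
    # except for terminal transitions, where D is True and the bootstrap is zero.
    return np.max(R + gamma * np.where(D, 0.0, V[N]), axis=-1)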
Example #2
            np.concatenate(states),
            {
                "actions": np.concatenate(actions),
                "action_probs": np.concatenate(action_probs),
                "advantages": np.concatenate(advantages),
                "returns": np.concatenate(returns)
            },
            batch_size=args.batch_size,
            epochs=args.epochs,
            verbose=0,
        )

        # Periodic evaluation
        iteration += 1
        if iteration % args.evaluate_each == 0:
            for _ in range(args.evaluate_for):
                evaluate_episode()

    # Final evaluation
    while True:
        evaluate_episode(start_evaluation=True)


if __name__ == "__main__":
    args = parser.parse_args([] if "__file__" not in globals() else None)

    # Create the environment
    env = wrappers.EvaluationEnv(gym.make("SingleCollect-v0"), args.seed)

    main(env, args)
Example #3
            for i in range(args.envs):
                replay_buffer.append(Transition(state[i], action[i], reward[i], done[i], next_state[i]))
            state = next_state

            # Training
            if len(replay_buffer) >= 4 * args.batch_size:
                # Note that until now we used `np.random.choice` with `replace=False` to generate
                # batch indices. However, this call is extremely slow for large buffers, because
                # it generates a whole permutation. With `np.random.randint`, indices may repeat,
                # but once the buffer is large, repeats happen with negligible probability.
                batch = np.random.randint(len(replay_buffer), size=args.batch_size)
                states, actions, rewards, dones, next_states = map(np.array, zip(*[replay_buffer[i] for i in batch]))
                # TODO: Perform the training

        # Periodic evaluation
        for _ in range(args.evaluate_for):
            evaluate_episode()

    # Final evaluation
    while True:
        evaluate_episode(start_evaluation=True)


if __name__ == "__main__":
    args = parser.parse_args([] if "__file__" not in globals() else None)

    # Create the environment
    env = wrappers.EvaluationEnv(gym.make(args.env), args.seed)

    main(env, args)
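For illustration only, the training TODO in this example is typically finished with a DQN-style update on the sampled batch; `network`, `target_network`, their methods, and `args.gamma` are assumed names that the actual template may define differently:

# A hedged sketch of a DQN-style training step (assumes `network.predict`
# returns Q-values of shape [batch, actions] and `target_network` is a
# periodically synchronized copy of `network`).
q_next = target_network.predict(next_states)
targets = network.predict(states)
targets[np.arange(len(actions)), actions] = \
    rewards + args.gamma * np.max(q_next, axis=-1) * ~dones
network.train(states, targets)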
Example #4

def main(env: wrappers.EvaluationEnv, args: argparse.Namespace) -> None:
    # Fix random seeds and threads
    np.random.seed(args.seed)
    tf.random.set_seed(args.seed)
    tf.config.threading.set_inter_op_parallelism_threads(args.threads)
    tf.config.threading.set_intra_op_parallelism_threads(args.threads)

    if args.recodex:
        # TODO: Perform evaluation of a trained model.
        while True:
            state, done = env.reset(start_evaluation=True), False
            while not done:
                # TODO: Choose an action
                action = None
                state, reward, done, _ = env.step(action)

    else:
        # TODO: Perform training
        raise NotImplementedError()


if __name__ == "__main__":
    args = parser.parse_args([] if "__file__" not in globals() else None)

    # Create the environment
    env = wrappers.EvaluationEnv(gym.make("CartPolePixels-v0"), args.seed)

    main(env, args)
Example #5
                {
                    "actions": np.concatenate(actions)[:, a],
                    "action_probs": np.concatenate(action_probs)[:, a],
                    "advantages": np.concatenate(advantages),
                    "returns": np.concatenate(returns)
                },
                batch_size=args.batch_size,
                epochs=args.epochs,
                verbose=0,
            )

        # Periodic evaluation
        iteration += 1
        if iteration % args.evaluate_each == 0:
            for _ in range(args.evaluate_for):
                evaluate_episode()

    # Final evaluation
    while True:
        evaluate_episode(start_evaluation=True)


if __name__ == "__main__":
    args = parser.parse_args([] if "__file__" not in globals() else None)

    # Create the environment
    env = wrappers.EvaluationEnv(
        gym.make("MultiCollect{}-v0".format(args.agents)), args.seed)

    main(env, args)
Example #6
                next_state, reward, done, _ = env.step(action)

                states.append(state)
                actions.append(action)
                rewards.append(reward)

                state = next_state

            # TODO(reinforce): Compute returns from the received rewards

            # TODO(reinforce): Add states, actions and returns to the training batch

        # TODO(reinforce): Train using the generated batch.

    # Final evaluation
    while True:
        state, done = env.reset(start_evaluation=True), False
        while not done:
            # TODO(reinforce): Choose greedy action
            action = None
            state, reward, done, _ = env.step(action)


if __name__ == "__main__":
    args = parser.parse_args([] if "__file__" not in globals() else None)

    # Create the environment
    env = wrappers.EvaluationEnv(gym.make("CartPole-v1"), args.seed)

    main(env, args)
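The TODO(reinforce) steps above ask for returns computed from the collected rewards; a minimal sketch (assuming a discount factor `args.gamma`, which the actual template may define differently) is:

# Discounted returns G_t = r_t + gamma * G_{t+1}, computed backwards.
returns, G = [], 0.0
for r in reversed(rewards):
    G = r + args.gamma * G
    returns.append(G)
returns.reverse()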
Example #7
            replay_buffer.append(episode)

            # Train the network if enough data is available
            if len(replay_buffer) >= args.batch_size:
                network.train([
                    replay_buffer[i]
                    for i in np.random.choice(len(replay_buffer),
                                              size=args.batch_size,
                                              replace=False)
                ])

        # TODO(memory_game): Maybe evaluate the current performance, using
        # the `evaluate_episode()` method, which returns the achieved return,
        # and setting `training=False` once the performance is high enough.

    # Final evaluation
    while True:
        evaluate_episode(True)


if __name__ == "__main__":
    args = parser.parse_args([] if "__file__" not in globals() else None)

    # Create the environment
    env = wrappers.EvaluationEnv(memory_game_environment.make(args.cards),
                                 args.seed,
                                 evaluate_for=args.evaluate_for,
                                 report_each=args.evaluate_for)

    main(env, args)
Example #8
            # TODO: Predict action distribution using `network.predict_actions`
            # and then sample from it, for example using `np.random.normal`. Do not
            # forget to clip the actions to the `env.action_space.{low,high}`
            # range, for example using `np.clip`.
            actions = None

            # TODO(paac): Perform steps in the vectorized environment

            # TODO(paac): Compute estimates of returns by one-step bootstrapping

            # TODO(paac): Train network using current states, chosen actions and estimated returns

        # Periodic evaluation
        for _ in range(args.evaluate_for):
            evaluate_episode()

    # Final evaluation
    while True:
        evaluate_episode(start_evaluation=True)


if __name__ == "__main__":
    args = parser.parse_args([] if "__file__" not in globals() else None)

    # Create the environment
    env = wrappers.EvaluationEnv(
        wrappers.DiscreteMountainCarWrapper(
            gym.make("MountainCarContinuous-v0"), tiles=args.tiles), args.seed)

    main(env, args)
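The first TODO above describes sampling continuous actions from a predicted distribution; a possible sketch, assuming `network.predict_actions(states)` returns per-action means and standard deviations, is:

# Sample actions from a normal distribution and clip them to the valid range.
mus, sds = network.predict_actions(states)
actions = np.clip(np.random.normal(mus, sds),
                  env.action_space.low, env.action_space.high)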
Example #9
        # TODO: Perform evaluation of a trained model.

        while True:
            state, done = env.reset(start_evaluation=True), False
            while not done:
                # TODO: Choose an action
                action = None
                state, reward, done, _ = env.step(action)

    else:
        # TODO: Perform training

        # If you want to create N multiprocessing parallel environments, use
        #   vector_env = gym.vector.AsyncVectorEnv([lambda: gym.make("CarRacingSoftFS{}-v0".format(args.frame_skip))] * N)
        #   vector_env.seed(args.seed) # The individual environments will get incremental seeds

        pass


if __name__ == "__main__":
    args = parser.parse_args([] if "__file__" not in globals() else None)

    # Create the environment
    env = wrappers.EvaluationEnv(gym.make("CarRacingSoftFS{}-v0".format(
        args.frame_skip)),
                                 args.seed,
                                 evaluate_for=15,
                                 report_each=1)

    main(env, args)
Example #10
        state, done = env.reset(), False
        while not done:
            if args.render_each and env.episode > 0 and env.episode % args.render_each == 0:
                env.render()

            # TODO: Perform an action.
            action = None

            next_state, reward, done, _ = env.step(action)

            # TODO: Update the action-value estimates

            state = next_state

    # Final evaluation
    while True:
        state, done = env.reset(start_evaluation=True), False
        while not done:
            # TODO: Choose (greedy) action
            action = None
            state, reward, done, _ = env.step(action)


if __name__ == "__main__":
    args = parser.parse_args([] if "__file__" not in globals() else None)

    # Create the environment
    env = wrappers.EvaluationEnv(wrappers.DiscreteMountainCarWrapper(gym.make("MountainCar1000-v0")), args.seed)

    main(env, args)
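As an illustration of the action-value update asked for above, a standard one-step Q-learning update would look as follows; `Q` (a NumPy array of shape [states, actions]), `args.alpha`, and `args.gamma` are assumed names:

# One-step Q-learning update; do not bootstrap on terminal transitions.
Q[state, action] += args.alpha * (
    reward + args.gamma * np.max(Q[next_state]) * (not done) - Q[state, action])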
Example #11
def main(args: argparse.Namespace) -> np.ndarray:
    # Create a random generator with a fixed seed
    generator = np.random.RandomState(args.seed)

    # Create the environment
    env = wrappers.EvaluationEnv(gym.make("Taxi-v3"),
                                 seed=args.seed,
                                 report_each=min(200, args.episodes))

    Q = np.zeros((env.observation_space.n, env.action_space.n))

    # The next action is always chosen in an epsilon-greedy way.
    def choose_next_action(Q: np.ndarray) -> tuple[int, float]:
        greedy_action = argmax_with_tolerance(Q[next_state])
        next_action = greedy_action if generator.uniform() >= args.epsilon \
            else env.action_space.sample()
        return next_action, args.epsilon / env.action_space.n \
            + (1 - args.epsilon) * (greedy_action == next_action)

    # The target policy is either the behavior policy (if not args.off_policy),
    # or the greedy policy (if args.off_policy).
    def compute_target_policy(Q: np.ndarray) -> np.ndarray:
        target_policy = np.eye(env.action_space.n)[argmax_with_tolerance(Q, axis=-1)]
        if not args.off_policy:
            target_policy = (1 - args.epsilon) * target_policy \
                + args.epsilon / env.action_space.n
        return target_policy

    # Run the TD algorithm
    for _ in range(args.episodes):
        next_state, done = env.reset(), False

        # Generate episode and update Q using the given TD method
        next_action, next_action_prob = choose_next_action(Q)
        while not done:
            action, action_prob, state = next_action, next_action_prob, next_state
            next_state, reward, done, _ = env.step(action)
            if not done:
                next_action, next_action_prob = choose_next_action(Q)

            # TODO: Perform the update to the state-action value function `Q`, using
            # a TD update with the following parameters:
            # - `args.n`: use `args.n`-step method
            # - `args.off_policy`:
            #    - if False, the epsilon-greedy behaviour policy is also the target policy
            #    - if True, the target policy is the greedy policy
            #      - for SARSA (with any `args.n`) and expected SARSA (with `args.n` > 1),
            #        importance sampling must be used
            # - `args.mode`: this argument can have the following values:
            #   - "sarsa": regular SARSA algorithm
            #   - "expected_sarsa": expected SARSA algorithm
            #   - "tree_backup": tree backup algorithm
            #
            # Perform the updates as soon as you can -- whenever you have all the information
            # to update `Q[state, action]`, do it. For each `action` use its corresponding
            # `action_prob` at the time of taking the `action` as the behaviour policy probability,
            # and the `compute_target_policy(Q)` with the current `Q` as the target policy.
            #
            # Do not forget that when `done` is True, bootstrapping on the
            # `next_state` is not used.
            #
            # Also note that when the episode ends and `args.n` > 1, there will
            # be several state-action pairs that also need to be updated. Perform
            # the updates in the order in which you encountered the state-action
            # pairs and during these updates, use the `compute_target_policy(Q)`
            # with the up-to-date value of `Q`.

    return Q
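The long TODO above covers the general n-step, off-policy case; for orientation, the simplest special case (`args.n == 1`, `args.mode == "expected_sarsa"`, on-policy) reduces to roughly the following, where `args.alpha` and `args.gamma` are assumed hyperparameters:

# One-step expected SARSA: bootstrap with the expectation of Q[next_state]
# under the target policy; skip bootstrapping when the episode has ended.
target_policy = compute_target_policy(Q)
bootstrap = 0.0 if done else np.dot(target_policy[next_state], Q[next_state])
Q[state, action] += args.alpha * (reward + args.gamma * bootstrap - Q[state, action])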
Example #12
    while training:
        # To generate an expert trajectory, you can use
        state, trajectory = env.expert_trajectory()

        # TODO: Perform a training episode
        state, done = env.reset(), False
        while not done:
            if args.render_each and env.episode and env.episode % args.render_each == 0:
                env.render()

            # TODO: Choose an action
            action = None

            state, reward, done, _ = env.step(action)

    # Final evaluation
    while True:
        state, done = env.reset(start_evaluation=True), False
        while not done:
            # TODO: Choose (greedy) action
            action = None
            state, reward, done, _ = env.step(action)


if __name__ == "__main__":
    args = parser.parse_args([] if "__file__" not in globals() else None)

    # Create the environment
    env = wrappers.EvaluationEnv(
        wrappers.DiscreteLunarLanderWrapper(gym.make("LunarLander-v2")),
        args.seed)

    main(env, args)
Example #13
        if args.render_each and env.episode % args.render_each == 0:
            # Produce an HTML visualization using all the stored states.
            env.render("html", path="{}{}.html".format(args.env, env.episode))
        return rewards

    # Evaluation in ReCodEx
    if args.recodex:
        while True:
            evaluate_episode(start_evaluation=True)

    # TODO: Perform training.
    #
    # Note that SAC had issues with exploding gradients (the model started
    # to predict NaNs after several updates); the problem went away after
    # passing `clipnorm=10` to `tf.optimizers.Adam`. The value `10` was
    # a first guess and is definitely not optimal.
    #
    # Vectorized Brax environment can be created using
    #   venv = wrappers.BraxWrapper(args.env, workers=args.threads)
    raise NotImplementedError()


if __name__ == "__main__":
    args = parser.parse_args([] if "__file__" not in globals() else None)

    # Create the environment
    env = wrappers.EvaluationEnv(wrappers.BraxWrapper(args.env), args.seed)

    main(env, args)
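The gradient-clipping remark in the TODO above amounts to a single optimizer argument; `args.learning_rate` is an assumed argument name:

# Clip the gradient norm to 10 to avoid the NaN issue mentioned above.
optimizer = tf.optimizers.Adam(args.learning_rate, clipnorm=10)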
Example #14
            action = None

            # Perform the action.
            next_state, reward, done, _ = env.step(action)

            states.append(state)
            actions.append(action)
            rewards.append(reward)

            state = next_state

        # TODO: Compute returns from the received rewards and update Q and C.

    # Final evaluation
    while True:
        state, done = env.reset(start_evaluation=True), False
        while not done:
            # TODO: Choose a greedy action
            action = None
            state, reward, done, _ = env.step(action)


if __name__ == "__main__":
    args = parser.parse_args([] if "__file__" not in globals() else None)

    # Create the environment
    env = wrappers.EvaluationEnv(
        wrappers.DiscreteCartPoleWrapper(gym.make("CartPole-v1")), args.seed)

    main(env, args)
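The TODO asking to update `Q` and `C` corresponds to an every-visit Monte Carlo update with incremental averaging; a minimal sketch (assuming undiscounted returns and that `Q` and `C` are NumPy arrays indexed by state and action) is:

# Walk the episode backwards, accumulate the return, and average it into Q.
G = 0.0
for state, action, reward in zip(reversed(states), reversed(actions), reversed(rewards)):
    G += reward
    C[state, action] += 1
    Q[state, action] += (G - Q[state, action]) / C[state, action]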