Example #1
def main(env: wrappers.EvaluationEnv, args: argparse.Namespace) -> None:
    # Fix random seeds and number of threads
    np.random.seed(args.seed)
    tf.random.set_seed(args.seed)
    tf.config.threading.set_inter_op_parallelism_threads(args.threads)
    tf.config.threading.set_intra_op_parallelism_threads(args.threads)

    # Construct the network
    network = Network(env, args)

    def evaluate_episode(start_evaluation: bool = False) -> float:
        rewards, state, done = 0, env.reset(start_evaluation), False
        while not done:
            if args.render_each and env.episode > 0 and env.episode % args.render_each == 0:
                env.render()

            # TODO: Predict the action using the greedy policy
            action = None
            state, reward, done, _ = env.step(action)
            rewards += reward
        return rewards

    # Create the vectorized environment
    vector_env = gym.vector.AsyncVectorEnv([
        lambda: wrappers.DiscreteMountainCarWrapper(
            gym.make("MountainCarContinuous-v0"), tiles=args.tiles)
    ] * args.workers)
    vector_env.seed(args.seed)
    states = vector_env.reset()

    training = True
    while training:
        # Training
        for _ in range(args.evaluate_each):
            # TODO: Predict action distribution using `network.predict_actions`
            # and then sample it using for example `np.random.normal`. Do not
            # forget to clip the actions to the `env.action_space.{low,high}`
            # range, for example using `np.clip`.
            actions = None

            # TODO(paac): Perform steps in the vectorized environment

            # TODO(paac): Compute estimates of returns by one-step bootstrapping

            # TODO(paac): Train network using current states, chosen actions and estimated returns

        # Periodic evaluation
        for _ in range(args.evaluate_for):
            evaluate_episode()

    # Final evaluation
    while True:
        evaluate_episode(start_evaluation=True)
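The template above leaves the greedy evaluation action as a TODO. A minimal way to fill it in, assuming the `network.predict_actions` interface used in Examples #2 and #5 (it returns per-state means and standard deviations of the Normal policy), is to act with the predicted mean:

            # Greedy policy: take the predicted mean action, clipped to the
            # valid action range (this mirrors the evaluation in Example #5).
            mus, _ = network.predict_actions([state])
            action = np.clip(mus[0], env.action_space.low, env.action_space.high)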
Example #2
def main(env, args):
    # Fix random seeds and number of threads
    np.random.seed(args.seed)
    tf.random.set_seed(args.seed)
    tf.config.threading.set_inter_op_parallelism_threads(args.threads)
    tf.config.threading.set_intra_op_parallelism_threads(args.threads)

    # Construct the network
    network = Network(env, args)
    weights = env.observation_space.nvec[-1]

    def evaluate_episode(start_evaluation=False):
        rewards, state, done = 0, env.reset(start_evaluation), False
        while not done:
            if args.render_each and env.episode > 0 and env.episode % args.render_each == 0:
                env.render()

            state = multi_hot_states([state], weights)
            action = network.predict_actions(state)[0][0]
            state, reward, done, _ = env.step(action)
            rewards += reward
        return rewards

    # Create the vectorized environment
    vector_env = gym.vector.AsyncVectorEnv([
        lambda: wrappers.DiscreteMountainCarWrapper(
            gym.make("MountainCarContinuous-v0"), tiles=args.tiles)
    ] * args.workers)
    vector_env.seed(args.seed)
    states = vector_env.reset()
    states = multi_hot_states(states, weights)

    training = True
    while training:
        # Training
        for _ in range(args.evaluate_each):
            # TODO: Predict action distribution using `network.predict_actions`
            # and then sample it using for example `np.random.normal`. Do not
            # forget to clip the actions to the `env.action_space.{low,high}`
            # range, for example using `np.clip`.

            mus, sds = network.predict_actions(states)

            actions = np.random.normal(mus, sds)
            actions = np.clip(actions, env.action_space.low,
                              env.action_space.high)

            # TODO(paac): Perform steps in the vectorized environment
            next_states, rewards, dones, _ = vector_env.step(actions)
            next_states = multi_hot_states(next_states, weights)

            # TODO(paac): Compute estimates of returns by one-step bootstrapping
            predicted_values = network.predict_values(next_states)
            returns = rewards + (args.gamma * np.array([
                0 if done else pred
                for done, pred in zip(dones, predicted_values)
            ]))

            # TODO(paac): Train network using current states, chosen actions and estimated returns
            network.train(states, actions, returns)

            states = next_states

        # Periodic evaluation
        total_reward = []
        for _ in range(args.evaluate_for):
            total_reward.append(evaluate_episode())
        print(f"Mean return over {args.evaluate_for} evaluation episodes: {np.mean(total_reward)}")
        if np.mean(total_reward) > 90:
            training = False

    # Final evaluation
    while True:
        evaluate_episode(start_evaluation=True)
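Example #2 relies on a `multi_hot_states` helper that is not shown. A minimal sketch, assuming each state is an array of active tile indices and `weights` is the total number of tiles:

def multi_hot_states(states, weights):
    # Encode each state as a binary vector with ones at its active tile indices.
    encoded = np.zeros([len(states), weights], dtype=np.float32)
    for i, state in enumerate(states):
        encoded[i, state] = 1
    return encoded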
Example #3
            # TODO(paac): Train network using current states, chosen actions and estimated returns
            network.train(states, actions, returns)

            states = next_states

        # Periodic evaluation
        total_reward = []
        for _ in range(args.evaluate_for):
            total_reward.append(evaluate_episode())
        print(f"Mean return over {args.evaluate_for} evaluation episodes: {np.mean(total_reward)}")
        if np.mean(total_reward) > 90:
            training = False

    # Final evaluation
    while True:
        evaluate_episode(start_evaluation=True)


if __name__ == "__main__":
    args = parser.parse_args([] if "__file__" not in globals() else None)

    # Create the environment
    env = wrappers.EvaluationWrapper(
        wrappers.DiscreteMountainCarWrapper(
            gym.make("MountainCarContinuous-v0"), tiles=args.tiles), args.seed)

    main(env, args)
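None of the examples above show the `Network` class they construct. The sketch below is one possible actor-critic implementation consistent with the calls used here (`predict_actions`, `predict_values`, `train`); the multi-hot input encoding follows Example #2, and `args.hidden_layer_size` and `args.learning_rate` are assumed hyper-parameters that do not appear in the original code:

class Network:
    def __init__(self, env, args):
        # Inputs are multi-hot encoded tile indices (see `multi_hot_states`).
        inputs = tf.keras.layers.Input(shape=[env.observation_space.nvec[-1]])

        # Actor head: mean in [-1, 1] and a positive standard deviation.
        hidden = tf.keras.layers.Dense(args.hidden_layer_size, activation="relu")(inputs)
        mus = tf.keras.layers.Dense(env.action_space.shape[0], activation="tanh")(hidden)
        sds = tf.keras.layers.Dense(env.action_space.shape[0], activation="softplus")(hidden)
        self._actor = tf.keras.Model(inputs, [mus, sds])
        self._actor_optimizer = tf.keras.optimizers.Adam(args.learning_rate)

        # Critic head: state value used for one-step bootstrapping.
        hidden = tf.keras.layers.Dense(args.hidden_layer_size, activation="relu")(inputs)
        values = tf.keras.layers.Dense(1)(hidden)
        self._critic = tf.keras.Model(inputs, values)
        self._critic_optimizer = tf.keras.optimizers.Adam(args.learning_rate)

    @tf.function
    def train(self, states, actions, returns):
        actions = tf.cast(actions, tf.float32)
        returns = tf.cast(returns, tf.float32)

        with tf.GradientTape() as actor_tape:
            mus, sds = self._actor(states, training=True)
            # Log-density of the chosen actions under the Normal(mus, sds) policy.
            log_probs = -0.5 * tf.math.log(2 * np.pi * tf.square(sds)) \
                - tf.square(actions - mus) / (2 * tf.square(sds))
            advantages = returns - tf.squeeze(self._critic(states), axis=-1)
            actor_loss = -tf.reduce_mean(
                tf.reduce_sum(log_probs, axis=-1) * tf.stop_gradient(advantages))
        self._actor_optimizer.apply_gradients(zip(
            actor_tape.gradient(actor_loss, self._actor.trainable_variables),
            self._actor.trainable_variables))

        with tf.GradientTape() as critic_tape:
            values = tf.squeeze(self._critic(states, training=True), axis=-1)
            critic_loss = tf.reduce_mean(tf.square(returns - values))
        self._critic_optimizer.apply_gradients(zip(
            critic_tape.gradient(critic_loss, self._critic.trainable_variables),
            self._critic.trainable_variables))

    def predict_actions(self, states):
        mus, sds = self._actor(np.asarray(states, np.float32))
        return mus.numpy(), sds.numpy()

    def predict_values(self, states):
        return self._critic(np.asarray(states, np.float32)).numpy()[:, 0]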
Example #4
    try:
        # Final evaluation
        returns = []
        while True:
            state, done = env.reset(start_evaluation=True), False

            r = 0
            while not done:
                action = np.argmax(W[state].sum(axis=0))
                state, reward, done, _ = env.step(action)
                r += reward
            returns.append(r)
    except KeyboardInterrupt:
        if not args.recodex:
            np.save(f"{sum(returns)}_{args.tiles}_W_matrix.npy", W)


if __name__ == "__main__":
    args = parser.parse_args([] if "__file__" not in globals() else None)

    # Create the environment

    env = wrappers.EvaluationWrapper(
        wrappers.DiscreteMountainCarWrapper(gym.make("MountainCar1000-v0"),
                                            tiles=args.tiles),
        args.seed,
        logname=
        f"{args.logdir}/alpha={args.alpha},alpha_dec={args.alpha_dec},epsilon={args.epsilon},epsilon_final={args.epsilon_final},epsilon_final_at={args.epsilon_final_at},episodes={args.episodes},tiles={args.tiles},gamma={args.gamma},seed={args.seed}"
    )
    main(env, args)
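Example #4 only shows the final evaluation; the `W` matrix it saves is produced by a training loop that is not included. A minimal sketch of a tile-coded Q-learning loop that would fit this usage, assuming `W` holds one row of action values per tile (the exact alpha/epsilon schedules from the log name are not reproduced):

    W = np.zeros([env.observation_space.nvec[-1], env.action_space.n])
    for _ in range(args.episodes):
        state, done = env.reset(), False
        while not done:
            # Epsilon-greedy over the tile-coded action values.
            if np.random.uniform() < args.epsilon:
                action = env.action_space.sample()
            else:
                action = np.argmax(W[state].sum(axis=0))
            next_state, reward, done, _ = env.step(action)

            # Semi-gradient Q-learning update on the active tiles.
            value_next = 0 if done else np.max(W[next_state].sum(axis=0))
            td_error = reward + args.gamma * value_next - W[state, action].sum()
            W[state, action] += args.alpha / args.tiles * td_error
            state = next_state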
Example #5
def main(env, args):
    global vector_env
    # Fix random seeds and number of threads
    np.random.seed(args.seed)
    tf.random.set_seed(args.seed)
    tf.config.threading.set_inter_op_parallelism_threads(args.threads)
    tf.config.threading.set_intra_op_parallelism_threads(args.threads)

    # Construct the network
    network = Network(env, args)

    def evaluate_episode(start_evaluation=False):
        rewards, state, done = 0, env.reset(start_evaluation), False
        while not done:
            if args.render_each and env.episode > 0 and env.episode % args.render_each == 0:
                env.render()

            mus, sds = network.predict_actions([state])
            mu, sd = mus[0], sds[0]
            # action = np.clip(np.random.normal(mu, sd), -1, 1)
            action = np.clip(mu, -1, 1)

            return_estimate = network.predict_values([state])[0]

            # print(f"mu:\t{mu}\tsd:\t{sd}\taction:\t{action}\treturn_est:\t{return_estimate}")
            # mus, _ = network.predict_actions([state])

            state, reward, done, _ = env.step([action])
            rewards += reward
        return rewards

    # Create the vectorized environment
    vector_env = gym.vector.AsyncVectorEnv([
        lambda: wrappers.DiscreteMountainCarWrapper(
            gym.make("MountainCarContinuous-v0"), tiles=args.tiles)
    ] * args.workers)
    vector_env.seed(args.seed)
    states = vector_env.reset()

    training = True
    while training:
        # Training
        for _ in range(args.evaluate_each):
            # TODO: Predict action distribution using `network.predict_actions`
            # and then sample it using for example `np.random.normal`. Do not
            # forget to clip the actions to the `env.action_space.{low,high}`
            # range, for example using `np.clip`.
            mus, sds = network.predict_actions(states)
            actions = np.reshape(np.random.normal(mus, sds), (args.workers, 1))
            # print(actions)

            # TODO(paac): Perform steps in the vectorized environment
            next_states, rewards, dones, _ = vector_env.step(
                np.clip(actions, -1, 1))

            # rewards -= 1

            # TODO(paac): Compute estimates of returns by one-step bootstrapping
            predicted_values = network.predict_values(next_states)
            return_estimates = rewards + (args.gamma * np.array([
                0 if done else pred
                for done, pred in zip(dones, predicted_values)
            ]))

            # TODO(paac): Train network using current states, chosen actions and estimated returns
            network.train(states, actions, return_estimates)
            states = next_states

        # Periodic evaluation
        for _ in range(args.evaluate_for):
            evaluate_episode()

        if np.mean(env._episode_returns[-100:]) > 90:
            training = False

    # Final evaluation
    while True:
        evaluate_episode(start_evaluation=True)
Example #6
        state, done = env.reset(), False
        while not done:
            if args.render_each and env.episode > 0 and env.episode % args.render_each == 0:
                env.render()

            # TODO: Perform an action.
            action = None

            next_state, reward, done, _ = env.step(action)

            # TODO: Update the action-value estimates

            state = next_state

    # Final evaluation
    while True:
        state, done = env.reset(start_evaluation=True), False
        while not done:
            # TODO: Choose (greedy) action
            action = None
            state, reward, done, _ = env.step(action)


if __name__ == "__main__":
    args = parser.parse_args([] if "__file__" not in globals() else None)

    # Create the environment
    env = wrappers.EvaluationEnv(wrappers.DiscreteMountainCarWrapper(gym.make("MountainCar1000-v0")), args.seed)

    main(env, args)
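Example #6 leaves the action choice and the value update as TODOs; Example #7 below shows one completion. A minimal epsilon-greedy version of both steps, assuming a zero-initialized table `q` of shape [number of states, number of actions]:

            # Epsilon-greedy action during training.
            if np.random.uniform() < args.epsilon:
                action = env.action_space.sample()
            else:
                action = np.argmax(q[state])

            next_state, reward, done, _ = env.step(action)

            # One-step Q-learning update of the action-value estimate.
            q[state, action] += args.alpha * (
                reward + args.gamma * np.max(q[next_state]) - q[state, action])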
Example #7
            next_state, reward, done, _ = env.step(action)

            # TODO: Update the action-value estimates
            alpha = alpha_schedule(args, e) if args.decrease_alpha else args.alpha
            q[state, action] += alpha * (
                reward + args.gamma * np.max(q[next_state]) - q[state, action])
            state = next_state

    # Final evaluation
    while True:
        state, done = env.reset(start_evaluation=True), False
        while not done:
            # TODO: Choose (greedy) action
            action = np.argmax(q[state])
            state, reward, done, _ = env.step(action)


if __name__ == "__main__":
    args = parser.parse_args([] if "__file__" not in globals() else None)

    # Create the environment
    env = wrappers.EvaluationWrapper(
        wrappers.DiscreteMountainCarWrapper(gym.make("MountainCar1000-v0")),
        args.seed,
        logname=
        f"alpha={args.alpha},epsilon={args.epsilon},gamma={args.gamma},init_bias={args.init_bias},de={args.decrease_epsilon},da={args.decrease_alpha},seed={args.seed}",
        evaluate_for=100)

    main(env, args)
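The `alpha_schedule` helper used in Example #7 is not shown. One plausible sketch, assuming a linear decay over the training episodes; both `args.alpha_final` and the use of `args.episodes` as the decay horizon are hypothetical, and the author's actual schedule may differ:

def alpha_schedule(args, episode):
    # Linearly interpolate from args.alpha down to args.alpha_final (hypothetical).
    fraction = min(episode / args.episodes, 1.0)
    return args.alpha + fraction * (args.alpha_final - args.alpha)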