Example #1
    def target_fn(x) -> float:
        env = cart_pole_evaluator.environment()

        epsilon, epsilon_final, gamma = x
        args.epsilon = epsilon
        args.epsilon_final = epsilon_final
        args.gamma = gamma

        Q = np.zeros((env.states, env.actions), dtype=np.float32)
        C = np.zeros_like(Q)

        train(args, env, Q, C)

        # Perform last 100 evaluation episodes
        mean_value = evaluate(args, env, Q)

        return -mean_value
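`target_fn` returns the negated mean evaluation return, so it is written to be minimized by a black-box hyperparameter search. A minimal random-search sketch around it follows; the sampling ranges are illustrative assumptions, not values from the original code.

import numpy as np

# Hypothetical random search over (epsilon, epsilon_final, gamma); ranges are assumptions.
best_x, best_value = None, float("inf")
for _ in range(20):
    x = (np.random.uniform(0.05, 0.5),     # epsilon
         np.random.uniform(0.0001, 0.05),  # epsilon_final
         np.random.uniform(0.9, 1.0))      # gamma
    value = target_fn(x)
    if value < best_value:
        best_x, best_value = x, value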
Example #2
                        type=int,
                        help="Render some episodes.")
    parser.add_argument("--threads",
                        default=0,
                        type=int,
                        help="Maximum number of threads to use.")
    args = parser.parse_args()

    # Fix random seed
    np.random.seed(42)
    tf.random.set_seed(42)
    tf.config.threading.set_inter_op_parallelism_threads(args.threads)
    tf.config.threading.set_intra_op_parallelism_threads(args.threads)

    # Create the environment
    env = cart_pole_evaluator.environment(discrete=False)

    # Construct the network
    network = Network(env, args)

    # Training
    for _ in range(args.episodes // args.batch_size):
        batch_states, batch_actions, batch_returns = [], [], []
        for _ in range(args.batch_size):
            # Perform episode
            states, actions, rewards = [], [], []
            state, done = env.reset(), False
            while not done:
                if args.render_each and env.episode > 0 and env.episode % args.render_each == 0:
                    env.render()
Example #3
                        help="Training episodes.")
    parser.add_argument("--epsilon",
                        default=0.2,
                        type=float,
                        help="Exploration factor.")
    parser.add_argument("--render_each",
                        default=0,
                        type=int,
                        help="Render some episodes.")
    args = parser.parse_args()

    # Fix random seed
    np.random.seed(42)

    # Create the environment
    env = cart_pole_evaluator.environment(discrete=True)

    # Create Q, C and other variables
    # TODO:
    # - Create Q, a zero-filled NumPy array with shape [env.states, env.actions],
    #   representing estimated Q value of a given (state, action) pair.
    # - Create C, a zero-filled NumPy array with shape [env.states, env.actions],
    #   representing number of observed returns of a given (state, action) pair.
    Q = np.zeros([env.states, env.actions])
    C = np.zeros([env.states, env.actions])

    for _ in range(args.episodes):
        # Perform episode
        state = env.reset()
        states, actions, rewards = [], [], []
        while True:
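The episode loop is truncated here. Once an episode's `states`, `actions` and `rewards` lists are collected, the Q and C arrays described above are typically updated with an incremental Monte Carlo rule, as in the sketch below (every-visit variant; `gamma` is an assumed discount factor, not shown in the truncated argument list).

# Sketch of an incremental every-visit Monte Carlo update for one finished episode.
G = 0.0
for state, action, reward in zip(reversed(states), reversed(actions), reversed(rewards)):
    G = gamma * G + reward
    C[state, action] += 1
    Q[state, action] += (G - Q[state, action]) / C[state, action]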
Example #4
    # Parse arguments
    parser = argparse.ArgumentParser()
    # TODO: Define reasonable defaults and optionally more parameters
    parser.add_argument("--episodes", default=600, type=int, help="Training episodes.")
    parser.add_argument("--epsilon", default=0.5, type=float, help="Exploration factor.")
    parser.add_argument("--gamma", default=0.3, type=float, help="Discount factor of the rewards.")
    parser.add_argument("--recodex", default=False, action="store_true", help="Evaluation in ReCodEx.")
    parser.add_argument("--render_each", default=50, type=int, help="Render some episodes.")
    parser.add_argument("--seed", default=42, type=int, help="Random seed.")
    args = parser.parse_args([] if "__file__" not in globals() else None)

    # Fix random seeds and threads
    np.random.seed(args.seed)

    # Create the environment
    env = cart_pole_evaluator.environment(discrete=True, seed=args.seed)

    # Create Q, C and other variables
    # TODO:
    # - Create Q, a zero-filled NumPy array with shape [env.states, env.actions],
    #   representing estimated Q value of a given (state, action) pair.
    # - Create C, a zero-filled NumPy array with shape [env.states, env.actions],
    #   representing number of observed returns of a given (state, action) pair.

    Q = np.zeros([env.states, env.actions])
    C = np.zeros([env.states, env.actions])


    for _ in range(args.episodes):
        # Perform episode
        state = env.reset()
Example #5
    parser.add_argument("--epsilon",
                        default=0.15,
                        type=float,
                        help="Exploration factor.")
    parser.add_argument("--epsilon_final",
                        default=0.001,
                        type=float,
                        help="Final exploration factor.")
    parser.add_argument("--gamma",
                        default=0.99,
                        type=float,
                        help="Discounting factor.")
    args = parser.parse_args()

    # Create the environment
    env = cart_pole_evaluator.environment()
    env2 = cart_pole_evaluator.environment()
    env3 = cart_pole_evaluator.environment()

    Q = np.zeros((env.states, env.actions))
    C = np.zeros((env.states, env.actions))
    Q2 = np.zeros((env.states, env.actions))
    C2 = np.zeros((env.states, env.actions))
    Q3 = np.zeros((env.states, env.actions))
    C3 = np.zeros((env.states, env.actions))

    Qs = ((Q, C, env), (Q2, C2, env2), (Q3, C3, env3))
    #a = (args.epsilon_final/args.epsilon) ** (1/max(args.episodes*0.9, 1000))
    #a = (args.epsilon_final - args.epsilon)/(args.episodes * 0.9)
    d = args.epsilon_final
    c = args.epsilon
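The commented-out lines sketch two possible epsilon schedules. Standalone versions of both, written as functions of the episode index, are shown below as a sketch (clipping at `epsilon_final`).

# Geometric decay: multiply epsilon by a constant factor each episode
# (the first commented-out formula above).
def geometric_epsilon(episode, epsilon, epsilon_final, episodes):
    a = (epsilon_final / epsilon) ** (1 / max(episodes * 0.9, 1000))
    return max(epsilon * a ** episode, epsilon_final)

# Linear decay: add a constant (negative) step each episode
# (the second commented-out formula above).
def linear_epsilon(episode, epsilon, epsilon_final, episodes):
    a = (epsilon_final - epsilon) / (episodes * 0.9)
    return max(epsilon + a * episode, epsilon_final)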
Example #6
def main(args,seed):
    # Fix random seeds and number of threads
    np.random.seed(seed)
    tf.random.set_seed(seed)
    tf.config.threading.set_inter_op_parallelism_threads(args.threads)
    tf.config.threading.set_intra_op_parallelism_threads(args.threads)

    # Create the environment
    env = cart_pole_evaluator.environment(discrete=False)
    # env2 = cart_pole_evaluator.environment(discrete=False)
    # print(env.actions)
    # print(env.state_shape)
    # print(env.action_shape)

    # Construct the network
    network = Network(env, args)
    A = np.array(range(env.actions))
    N = args.episodes // args.batch_size

    training = True

    # Training
    for n in range(N):
        batch_states, batch_actions, batch_returns = [], [], []
        for _ in range(args.batch_size):
            print('Episode {}/{}'.format(env.episode+1, args.episodes))
            # Perform episode
            states, actions, rewards = [], [], []
            state, done = env.reset(), False

            while not done:
                if args.render_each and env.episode > 0 and env.episode % args.render_each == 0:
                    env.render()

                # Compute action probabilities using `network.predict` and current `state`
                probabilities = network.predict([state])[0]
                S = sum(probabilities)

                # Choose `action` according to `probabilities` distribution (np.random.choice can be used)
                action = np.random.choice(A, p=probabilities/S)
                next_state, reward, done, _ = env.step(action)

                states.append(state)
                actions.append(action)
                rewards.append(reward)

                state = next_state

            # Compute returns by summing rewards (with discounting)

            G = 0
            Gs = []
            rewards.reverse()
            for r in rewards:
                G = r + args.gamma * G
                Gs.append(G)
            Gs.reverse()

            # Add states, actions and returns to the training batch

            batch_states.append(states)
            batch_actions.append(actions)
            batch_returns.append(Gs)

        # print('Reward {} -- mean[-10:] {}'.format(env._episode_returns[-1], np.mean(env._episode_returns[-10:])))

        # print('Last return: {}'.format(round(np.mean(env._episode_returns[-args.batch_size:]), 2)))

        if round(np.mean(env._episode_returns[-10:]), 2) > 460:
            training = False

        if not training:
            break

        # Train using the generated batch
        network.train(
            batch_states,
            batch_actions,
            batch_returns
        )
        # print('Training {}/{} done in {}s'.format(n+1, N, round(time.time() - T, 2)))

    # Final evaluation
    # Run exactly 100 evaluation episodes, so that the return below is reachable
    for _ in range(100):
        state, done = env.reset(True), False
        # R = 0
        while not done:
            # Compute action `probabilities` using `network.predict` and current `state`

            # Choose greedy action this time
            probabilities = network.predict([state])[0]
            action = np.argmax(probabilities)
            state, reward, done, _ = env.step(action)
            # R += reward
    return np.mean(env._episode_returns[-100:])
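The return computation above walks the reward list backwards. A tiny standalone check that this accumulation matches the definition G_t = r_t + gamma*r_{t+1} + gamma^2*r_{t+2} + ... (values here are illustrative):

gamma = 0.99
rewards = [1.0, 1.0, 1.0]
G, Gs = 0.0, []
for r in reversed(rewards):
    G = r + gamma * G
    Gs.append(G)
Gs.reverse()
# Gs[0] should equal 1 + gamma + gamma**2
assert abs(Gs[0] - (1.0 + gamma + gamma ** 2)) < 1e-9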
Example #7
                        default=0.005,
                        type=float,
                        help="Final exploration factor.")
    parser.add_argument("--gamma",
                        default=1.0,
                        type=float,
                        help="Discounting factor.")
    args = parser.parse_args()

    # stuff for my own learning control
    rewards_history = []
    target_reward = 500
    treshold_reward = 485

    # Create the environment
    env = cpe.environment()

    # init args
    eps = args.epsilon
    gamma = args.gamma

    # init policy
    policy = np.zeros((env.states, env.actions)) + 1 / env.actions

    # Combines Q and Return from Alg, remembers average return for this (state, act) combination
    avgReturn = np.zeros((env.states, env.actions), dtype=float)
    stateActionSeen = np.zeros((env.states, env.actions), dtype=int)

    #
    # Could be improved with addition of automatic policy search resets after
    # ..the learning process stops improving for certain number of episodes.
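The uniform `policy` together with the `avgReturn` and `stateActionSeen` arrays suggests on-policy Monte Carlo control with an epsilon-soft policy. Below is a sketch of the usual textbook improvement step for one state; it mirrors the standard epsilon-soft update and is not necessarily what the truncated code does.

import numpy as np

def epsilon_soft_update(policy, avgReturn, state, eps):
    # Put probability eps/|A| on every action and the remaining 1 - eps
    # on the currently greedy action for this state.
    n_actions = policy.shape[1]
    greedy = int(np.argmax(avgReturn[state]))
    policy[state] = eps / n_actions
    policy[state, greedy] += 1.0 - eps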
Example #8
class Network:
    def __init__(self, env, args):
        # TODO: Create a suitable network

        # Warning: If you plan to use Keras `.train_on_batch` and/or `.predict_on_batch`
        # methods, pass `experimental_run_tf_function=False` to compile. There is
        # a bug in TF 2.0 which causes the `*_on_batch` methods not to use `tf.function`.

        # Otherwise, if you are training manually, using `tf.function` is a good idea
        # to get good performance.
        pass

    # Define a training method. Generally you have two possibilities
    # - pass new q_values of all actions for a given state; all but one are the same as before
    # - pass only one new q_value for a given state, including the index of the action to which
    #   the new q_value belongs
    def train(self, states, q_values):
        # TODO
        pass

    def predict(self, states):
        # TODO
        pass


if __name__ == "__main__":
    # Parse arguments
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--batch_size", default=32, type=int, help="Batch size.")
    parser.add_argument("--episodes", default=1000, type=int, help="Episodes for epsilon decay.")
    parser.add_argument("--epsilon", default=0.3, type=float, help="Exploration factor.")
    parser.add_argument("--epsilon_final", default=0.01, type=float, help="Final exploration factor.")
    parser.add_argument("--gamma", default=1.0, type=float, help="Discounting factor.")
    parser.add_argument("--hidden_layers", default=1, type=int, help="Number of hidden layers.")
    parser.add_argument("--hidden_layer_size", default=20, type=int, help="Size of hidden layer.")
    parser.add_argument("--learning_rate", default=0.001, type=float, help="Learning rate.")
    parser.add_argument("--render_each", default=0, type=int, help="Render some episodes.")
    parser.add_argument("--threads", default=1, type=int, help="Maximum number of threads to use.")
    args = parser.parse_args()

    # Fix random seeds and number of threads
    np.random.seed(42)
    tf.random.set_seed(42)
    tf.config.threading.set_inter_op_parallelism_threads(args.threads)
    tf.config.threading.set_intra_op_parallelism_threads(args.threads)

    # Create the environment
    env = cart_pole_evaluator.environment(discrete=False)

    # Construct the network
    network = Network(env, args)

    # Replay memory; maxlen parameter can be passed to deque for a size limit,
    # which we however do not need in this simple task.
    replay_buffer = collections.deque()
    Transition = collections.namedtuple("Transition", ["state", "action", "reward", "done", "next_state"])

    evaluating = False
    epsilon = args.epsilon
    training = True
    while training:
        # Perform episode
        state, done = env.reset(), False
        while not done:
            if args.render_each and env.episode > 0 and env.episode % args.render_each == 0:
                env.render()

            # TODO: compute action using epsilon-greedy policy. You can compute
            # the q_values of a given state using
            #   q_values = network.predict(np.array([state], np.float32))[0]

            next_state, reward, done, _ = env.step(action)

            # Append state, action, reward, done and next_state to replay_buffer
            replay_buffer.append(Transition(state, action, reward, done, next_state))

            # TODO: If the replay_buffer is large enough, perform a training batch
            # of `args.batch_size` uniformly randomly chosen transitions.
            #
            # After you choose `states` and suitable targets, you can train the network as
            #   network.train(states, ...)

            state = next_state

        if args.epsilon_final:
            epsilon = np.exp(np.interp(env.episode + 1, [0, args.episodes], [np.log(args.epsilon), np.log(args.epsilon_final)]))

    # Final evaluation
    while True:
        state, done = env.reset(True), False
        while not done:
            action = np.argmax(network.predict(np.array([state], np.float32))[0])
            state, reward, done, _ = env.step(action)
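One possible way to fill in the two TODOs above, assuming the `train(states, q_values)` variant where full per-action targets are passed; this is a sketch, not the reference solution.

import numpy as np

def choose_action(network, state, epsilon, n_actions):
    # Epsilon-greedy over the network's Q-value estimates.
    if np.random.uniform() < epsilon:
        return np.random.randint(n_actions)
    return int(np.argmax(network.predict(np.array([state], np.float32))[0]))

def train_on_replay(network, replay_buffer, batch_size, gamma):
    # Sample a uniform batch of transitions and regress towards one-step targets.
    if len(replay_buffer) < batch_size:
        return
    indices = np.random.choice(len(replay_buffer), size=batch_size, replace=False)
    states, targets = [], []
    for i in indices:
        t = replay_buffer[int(i)]
        q = np.array(network.predict(np.array([t.state], np.float32))[0], np.float32)
        q_next = network.predict(np.array([t.next_state], np.float32))[0]
        q[t.action] = t.reward + (0.0 if t.done else gamma * np.max(q_next))
        states.append(t.state)
        targets.append(q)
    network.train(np.array(states, np.float32), np.array(targets, np.float32))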
Example #9
def main():
    # Fix random seed
    np.random.seed(42)

    # Parse arguments
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--episodes",
                        default=500,
                        type=int,
                        help="Training episodes.")
    parser.add_argument("--render_each",
                        default=0,
                        type=int,
                        help="Render some episodes.")

    parser.add_argument("--epsilon",
                        default=0.2,
                        type=float,
                        help="Exploration factor.")
    parser.add_argument("--epsilon_final",
                        default=0.1,
                        type=float,
                        help="Final exploration factor.")
    parser.add_argument("--gamma",
                        default=0.99,
                        type=float,
                        help="Discounting factor.")
    args = parser.parse_args()

    print(args)

    # Create the environment
    env = cart_pole_evaluator.environment()

    training = True

    Q = np.zeros([env.states, env.actions], dtype=np.float32)
    Q.fill(500)
    # Q.fill(1 / args.epsilon)

    C = np.zeros([env.states, env.actions], dtype=np.float32)

    eps_diff = (args.epsilon_final - args.epsilon) / float(args.episodes)
    eps_curr = args.epsilon

    while training:
        trajectory = []

        # Perform a training episode
        state, done = env.reset(), False
        while not done:
            if args.render_each and env.episode and env.episode % args.render_each == 0:
                env.render()

            if random.random() < eps_curr:
                action = random.randint(0, env.actions - 1)
            else:
                action = np.argmax(Q[state]).item()

            next_state, reward, done, _ = env.step(action)

            trajectory.append([state, action, reward])

            state = next_state

        G = 0.0

        for state, action, reward in reversed(trajectory):
            G = args.gamma * G + reward
            # returns[(state, action)].append(G)
            # Q[state, action] = np.mean(returns[(state, action)]).item()

            C[state, action] += 1
            Q[state, action] += (G - Q[state, action]) / C[state, action]


        eps_curr += eps_diff

        if args.render_each and env.episode % args.render_each == 0:
            print(f"eps curr: {eps_curr}")

            # Evaluation episode
            state, done = env.reset(), False
            while not done:
                env.render()
                action = np.argmax(Q[state]).item()
                state, _, done, _ = env.step(action)

        if env.episode > args.episodes:
            break

    # Perform last 100 evaluation episodes
    for _ in range(100):
        state, done = env.reset(True), False
        while not done:
            action = np.argmax(Q[state]).item()
            state, _, done, _ = env.step(action)