Example #1
def experiment(n_episodes, default_policy=False, policy=None, render=False):
    """
    Run a RL experiment that can be either training or testing

    Args:
        n_episodes: number of train/test episodes
        default_policy: boolean to enable testing/training phase
        policy: numpy tensor with a trained policy
        render: enable OpenAI environment graphical rendering

    Returns:
        Dictionary with:
            cumulative experiments outcomes
            list of steps per episode
            list of cumulative rewards
            trained policy
    """

    with tf.device('/gpu:0'):
        res = [0, 0]  # array of results accumulator: {[0]: Loss, [1]: Victory}
        scores = []  # Cumulative rewards
        steps = []  # Steps per episode

        reward_list = RingBuffer(100)
        env = gym.make('PongDeterministic-v4')

        input_dim = env.observation_space.shape[0]
        output_dim = env.action_space.n

        if default_policy:
            agent = DQNAgent(output_dim,
                             None,
                             use_ddqn=True,
                             default_policy=True,
                             model_filename=policy,
                             epsilon=0.05,
                             epsilon_lower_bound=0.05)
        else:
            layers = [
                Conv2D(32, (8, 8),
                       strides=(4, 4),
                       activation='relu',
                       input_shape=(84, 84, 4),
                       kernel_initializer=VarianceScaling(scale=2.0)),
                Conv2D(64, (4, 4),
                       strides=(2, 2),
                       activation='relu',
                       kernel_initializer=VarianceScaling(scale=2.0)),
                Conv2D(64, (3, 3),
                       strides=(1, 1),
                       activation='relu',
                       kernel_initializer=VarianceScaling(scale=2.0)),
                Flatten(),
                Dense(512, activation='relu'),
                Dense(output_dim)
            ]
            agent = DQNAgent(output_dim,
                             layers,
                             use_ddqn=True,
                             memory_size=700000,
                             gamma=0.99,
                             learn_thresh=50000,
                             epsilon_lower_bound=0.02,
                             epsilon_decay_function=lambda e: e - (0.98 / 950000),
                             update_rate=10000,
                             optimizer=Adam(0.00025))

        gathered_frame = 0
        for episode_number in tqdm(range(n_episodes), desc="Episode"):
            frame = env.reset()
            state = pre_processing(frame)
            empty_state = np.zeros(state.shape, dtype="uint8")
            cumulative_reward = 0

            has_lost_life = True

            t = 0
            while True:
                if has_lost_life:
                    next_action = 1  # [1, 4, 5][ran.randint(0, 2)]

                    stack = np.stack(
                        (empty_state, empty_state, empty_state, empty_state),
                        axis=2)
                    stack = np.reshape([stack], (1, 84, 84, 4))

                    for _ in range(ran.randint(1, 10)):
                        gathered_frame += 1
                        frame, reward, end, _ = env.step(next_action)
                        new_state = np.reshape(pre_processing(frame),
                                               (1, 84, 84, 1))
                        new_stack = np.append(new_state,
                                              stack[:, :, :, :3],
                                              axis=3)
                        stack = new_stack

                        if (render):
                            env.render()

                    has_lost_life = False

                next_action = agent.act(stack)
                new_state, reward, end, _ = env.step(next_action)

                if (render):
                    env.render()
                    time.sleep(0.02)

                reward = np.clip(reward, -1., 1.)

                if reward != 0:
                    has_lost_life = True

                cumulative_reward += reward

                new_state = np.reshape(pre_processing(new_state),
                                       (1, 84, 84, 1))
                new_stack = np.append(new_state, stack[:, :, :, :3], axis=3)
                agent.memoise(
                    (stack, next_action, reward, new_state, has_lost_life))

                stack = new_stack
                gathered_frame += 1

                if end:
                    reward_list.append(cumulative_reward)
                    if cumulative_reward > 0:
                        res[1] += 1
                        print("You won! steps:", t, "mean reward:",
                              reward_list.mean(), "frames:", gathered_frame)
                    else:
                        res[0] += 1
                        print("You lost! steps:", t, "mean reward:",
                              reward_list.mean(), "frames:", gathered_frame)
                    steps.append(t)
                    break

                agent.learn()
                t += 1

            scores.append(cumulative_reward)
            if episode_number >= 50 and episode_number % 10 == 0:
                model_name = "partial_model_pong" + str(episode_number)
                agent.save_model(model_name)

        env.close()
        return {
            "results": np.array(res),
            "steps": np.array(steps),
            "scores": np.array(scores),
            "agent": agent
        }
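Example #1 relies on two project helpers that are not shown on this page: pre_processing, which turns a raw Atari frame into the 84x84 grayscale image stacked above, and RingBuffer, which keeps the last 100 episode rewards for the running mean that is printed. The sketch below is a minimal guess at both, assuming an OpenCV-based resize and a deque-backed buffer; the project's actual implementations may differ.

import cv2  # opencv-python
import numpy as np
from collections import deque

def pre_processing(frame, width=84, height=84):
    """Convert a raw RGB Atari frame to an 84x84 uint8 grayscale image."""
    gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)      # drop colour channels
    resized = cv2.resize(gray, (width, height),
                         interpolation=cv2.INTER_AREA)  # downscale to 84x84
    return resized.astype("uint8")

class RingBuffer:
    """Fixed-size buffer of the most recent values with a mean() helper."""

    def __init__(self, maxlen):
        self.data = deque(maxlen=maxlen)  # oldest entries drop automatically

    def append(self, value):
        self.data.append(value)

    def mean(self):
        return float(np.mean(self.data)) if self.data else 0.0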
Example #2
def experiment(n_episodes, default_policy=False, policy=None, render=False):
    """
    Run a RL experiment that can be either training or testing

    Args:
        n_episodes: number of train/test episodes
        default_policy: boolean to enable testing/training phase
        policy: numpy tensor with a trained policy
        render: enable OpenAI environment graphical rendering

    Returns:
        Dictionary with:
            cumulative experiments outcomes
            list of steps per episode
            list of cumulative rewards
            trained policy
    """
    res = [0, 0]  # array of results accumulator: {[0]: Loss, [1]: Victory}
    scores = []  # Cumulative rewards
    steps = []  # Steps per episode

    env = gym.make('MountainCar-v0')
    env.seed(seed)

    input_dim = env.observation_space.shape[0]
    output_dim = env.action_space.n

    layer1 = Dense(15, input_dim=input_dim, activation='relu')
    layer2 = Dense(output_dim)

    agent1 = DQNAgent(output_dim, [layer1, layer2],
                      use_ddqn=True,
                      learn_thresh=1000,
                      update_rate=300,
                      epsilon_decay_function=lambda e: e - 0.001,
                      epsilon_lower_bound=0.01,
                      optimizer=keras.optimizers.RMSprop(0.001),
                      tb_dir=None)
    #agent2 = QLAgent([n_states, n_states, env.action_space.n], epsilon_decay_function=lambda e: e - 0.001, epsilon_lower_bound=0.01)
    #agent3 = SARSAAgent([n_states, n_states, env.action_space.n], epsilon_decay_function=lambda e: e - 0.001, epsilon_lower_bound=0.01)
    agent4 = DQNAgent(output_dim, [layer1, layer2],
                      use_ddqn=False,
                      learn_thresh=1000,
                      update_rate=300,
                      epsilon_decay_function=lambda e: e - 0.001,
                      epsilon_lower_bound=0.01,
                      optimizer=keras.optimizers.RMSprop(0.001),
                      tb_dir=None)

    agents = [agent1, agent4]
    agentE = EnsemblerAgent(env.action_space.n, agents,
                            EnsemblerType.TRUST_BASED)

    evaluate = False

    for i_episode in tqdm(range(n_episodes + 1), desc="Episode"):
        state = env.reset()
        # agent3.extract_policy()
        discretized_state = obs_to_state(env, state, n_states)
        cumulative_reward = 0

        state = np.reshape(state, [1, 2])

        if i_episode > 0 and i_episode % 100 == 0:
            evaluate = True

        if not evaluate:
            for t in range(env._max_episode_steps):
                if (render):
                    env.render()

                next_action = agentE.act(state, discretized_state)
                new_state, reward, end, _ = env.step(next_action)
                new_discretized_state = obs_to_state(env, new_state, n_states)
                original_state = new_state

                # Reward shaping
                # r1 = reward + 0.1 * original_state[0]
                # r2 = reward + 0.2 * np.sin(3 * original_state[0])
                # r3 = reward + 0.7 * (original_state[1] * original_state[1])

                # Candidate shaped rewards; only r4 is actually memoised below
                r1 = reward + original_state[0]
                r2 = reward + np.sin(3 * original_state[0])
                r3 = reward + (original_state[1] * original_state[1])
                r4 = abs(new_state[0] - (-0.5))  # r in [0, 1]

                new_state = np.reshape(new_state, [1, 2])

                agent1.memoise((state, next_action, r4, new_state, end))
                #agent2.update_q((discretized_state[0], discretized_state[1]), (new_discretized_state[0], new_discretized_state[1]), next_action, reward)
                #agent3.update_q((discretized_state[0], discretized_state[1]), (new_discretized_state[0], new_discretized_state[1]), next_action, reward)
                agent4.memoise((state, next_action, r4, new_state, end))

                if end:
                    if t == env._max_episode_steps - 1:
                        res[0] += 1
                    else:
                        res[1] += 1
                        print("ENTRATO!,", t, "steps", "reward: ",
                              cumulative_reward)

                    steps.append(t)
                    break
                else:
                    state = new_state
                    discretized_state = new_discretized_state
                    cumulative_reward += reward

                agent1.learn()
                agent4.learn()

            cumulative_reward += reward
            scores.append(cumulative_reward)
        else:
            evaluate = False
            eval_res = [0, 0]  # results accumulator: {[0]: Loss, [1]: Victory}
            eval_scores = []  # Cumulative rewards
            eval_steps = []  # Steps per episode

            for _ in range(100):  # 100 evaluation episodes
                state = env.reset()
                discretized_state = obs_to_state(env, state, n_states)

                state = np.reshape(state, [1, 2])
                cumulative_reward = 0

                for t in range(env._max_episode_steps):
                    if (render):
                        env.render()

                    next_action = agentE.act(state, discretized_state)
                    new_state, reward, end, _ = env.step(next_action)
                    new_discretized_state = obs_to_state(
                        env, new_state, n_states)
                    original_state = new_state
                    new_state = np.reshape(new_state, [1, 2])

                    if end:
                        if t == env._max_episode_steps - 1:
                            eval_res[0] += 1
                        else:
                            eval_res[1] += 1

                        eval_steps.append(t)
                        break
                    else:
                        state = new_state
                        discretized_state = new_discretized_state
                        cumulative_reward += reward

                cumulative_reward += reward
                eval_scores.append(cumulative_reward)

            testing_accuracy = accuracy(np.array(eval_res))
            testing_mean_steps = np.array(eval_steps).mean()
            testing_mean_score = np.array(eval_scores).mean()
            print("\nTraining episodes:", len(steps), "Training mean score:",
                  np.array(steps).mean(), "Training mean steps",
                  np.array(scores).mean(), "\nAccuracy:", testing_accuracy,
                  "Test mean score:", testing_mean_score, "Test mean steps:",
                  testing_mean_steps)

    env.close()
    return {
        "results": np.array(res),
        "steps": np.array(steps),
        "scores": np.array(scores)
    }
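This example leans on two module-level names that are not part of the snippet, seed (passed to env.seed) and n_states, plus the helper obs_to_state used to discretize the continuous MountainCar observation for the (commented-out) tabular agents and for EnsemblerAgent.act. A minimal sketch of obs_to_state, assuming a uniform n_states x n_states grid over the observation space:

import numpy as np

def obs_to_state(env, obs, n_states):
    """Map a continuous observation to a pair of discrete bucket indices."""
    low = env.observation_space.low
    high = env.observation_space.high
    step = (high - low) / n_states                 # bucket width per dimension
    position = int((obs[0] - low[0]) / step[0])
    velocity = int((obs[1] - low[1]) / step[1])
    # clamp so that obs == high still maps to a valid bucket
    return min(position, n_states - 1), min(velocity, n_states - 1)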
Example #3
def experiment(n_episodes, default_policy=False, policy=None, render=False):
    """
    Run a RL experiment that can be either training or testing

    Args:
        n_episodes: number of train/test episodes
        default_policy: boolean to enable testing/training phase
        policy: numpy tensor with a trained policy
        render: enable OpenAI environment graphical rendering

    Returns:
        Dictionary with:
            cumulative experiments outcomes
            list of steps per episode
            list of cumulative rewards
            trained policy
    """
    res = [0, 0]  # array of results accumulator: {[0]: Loss, [1]: Victory}
    scores = []  # Cumulative rewards
    steps = []  # steps per episode

    env = gym.make('CartPole-v0')
    env = env.unwrapped
    env.seed(seed)

    input_dim = env.observation_space.shape[0]
    output_dim = env.action_space.n

    layer1 = Dense(10, input_dim=input_dim, activation='relu')
    layer2 = Dense(output_dim)

    if default_policy:
        agent = DQNAgent(output_dim,
                         None,
                         use_ddqn=True,
                         default_policy=True,
                         model_filename=policy,
                         epsilon=0,
                         epsilon_lower_bound=0,
                         learn_thresh=0)
    else:
        def build_agent():
            # All ten ensemble members share the same hyper-parameters and the
            # same two layer objects (layer1, layer2).
            return DQNAgent(output_dim, [layer1, layer2],
                            use_ddqn=True,
                            learn_thresh=2000,
                            update_rate=100,
                            epsilon_decay_function=lambda e: e - 0.0001,
                            epsilon_lower_bound=0.1,
                            optimizer=keras.optimizers.RMSprop(0.001),
                            memory_size=2000,
                            tb_dir=None)

        agents = [build_agent() for _ in range(10)]

        agentE = EnsemblerAgent(output_dim, agents, EnsemblerType.TRUST_BASED)

    for i_ep in tqdm(range(n_episodes), desc="Episode"):
        state = env.reset()
        cumulative_reward = 0

        state = np.reshape(state, [1, 4])

        t = 0
        while True:
            if (render):
                env.render()
                time.sleep(0.1)

            next_action = agentE.act(state)

            new_state, reward, end, _ = env.step(next_action)

            x, x_dot, theta, theta_dot = new_state
            new_state = np.reshape(new_state, [1, 4])

            # Reward shaping
            r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
            r2 = (env.theta_threshold_radians -
                  abs(theta)) / env.theta_threshold_radians - 0.5
            r3 = -abs(theta_dot)

            # Even-indexed agents learn from the pole-angle reward r2,
            # odd-indexed agents from the angular-velocity reward r3
            # (r1 is computed above but not used).
            for i, agent in enumerate(agents):
                shaped_reward = r2 if i % 2 == 0 else r3
                agent.memoise((state, next_action, shaped_reward, new_state,
                               end))

            if end or t > 199:
                if t < 195:
                    res[0] += 1
                else:
                    res[1] += 1
                    print("ENTRATO!,", t, "steps", "reward: ",
                          cumulative_reward)

                steps.append(t)
                if i_ep % 100 == 0:
                    if evaluate(env, agentE):
                        cumulative_reward += reward
                        scores.append(cumulative_reward)
                        env.close()
                        return {
                            "results": np.array(res),
                            "steps": np.array(steps),
                            "scores": np.array(scores)
                        }

                break
            else:
                state = new_state
                cumulative_reward += reward

            for agent in agentE.agents:
                agent.learn()
            t += 1

        cumulative_reward += reward
        scores.append(cumulative_reward)

    env.close()
    return {
        "results": np.array(res),
        "steps": np.array(steps),
        "scores": np.array(scores)
    }
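The early-exit check above calls an evaluate(env, agentE) helper that is not shown on this page. A plausible sketch, assuming it plays 100 episodes without learning and reports whether the ensemble reaches the usual CartPole-v0 target of 195 steps on average (both the episode count and the threshold are assumptions):

import numpy as np

def evaluate(env, agent, n_episodes=100, solved_steps=195):
    """Return True if `agent` balances the pole for `solved_steps` steps on
    average over `n_episodes` evaluation episodes (no learning updates)."""
    episode_steps = []
    for _ in range(n_episodes):
        state = np.reshape(env.reset(), [1, 4])
        t = 0
        while True:
            action = agent.act(state)
            new_state, _, end, _ = env.step(action)
            state = np.reshape(new_state, [1, 4])
            t += 1
            if end or t > 199:  # cap episodes since the env is unwrapped
                episode_steps.append(t)
                break
    return np.mean(episode_steps) >= solved_steps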
Example #4
def experiment(n_episodes,
               default_policy=False,
               policy=None,
               render=False,
               agent_config=None):
    """
    Run a RL experiment that can be either training or testing

    Args:
        n_episodes: number of train/test episodes
        default_policy: boolean to enable testing/training phase
        policy: numpy tensor with a trained policy
        render: enable OpenAI environment graphical rendering
        agent_config: DQNAgent object

    Returns:
        Dictionary with:
            cumulative experiments outcomes
            list of steps per episode
            list of cumulative rewards
            trained policy
    """
    res = [0, 0]  # array of results accumulator: {[0]: Loss, [1]: Victory}
    scores = []  # Cumulative rewards
    steps = []  # Steps per episode

    env = gym.make('MountainCar-v0')
    env.seed(seed)

    input_dim = env.observation_space.shape[0]
    output_dim = env.action_space.n

    if agent_config is None:
        if default_policy:
            agent = DQNAgent(output_dim,
                             None,
                             use_ddqn=True,
                             default_policy=True,
                             model_filename=policy,
                             epsilon=0,
                             epsilon_lower_bound=0,
                             learn_thresh=0)
        else:
            layer1 = Dense(15, input_dim=input_dim, activation='relu')
            layer2 = Dense(output_dim)
            agent = DQNAgent(output_dim, [layer1, layer2],
                             use_ddqn=True,
                             learn_thresh=1000,
                             update_rate=300,
                             epsilon_decay_function=lambda e: e * 0.95,
                             epsilon_lower_bound=0.01,
                             optimizer=keras.optimizers.RMSprop(0.001))
    else:
        agent = agent_config

    for i_episode in tqdm(range(n_episodes), desc="Episode"):
        state = env.reset()
        cumulative_reward = 0

        # Model validation for early stopping
        if i_episode > 0 and (i_episode % 100) == 0 and not default_policy:
            agent.save_model("tmp_model")
            evaluation_result = experiment(500,
                                           default_policy=True,
                                           policy="tmp_model")
            acc = accuracy(evaluation_result["results"])
            if acc == 100:
                break
            else:
                print("Accuracy:", acc, "Episode:", i_episode)

        state = np.reshape(state, [1, 2])

        for t in range(env._max_episode_steps):
            if (render):
                env.render()

            next_action = agent.act(state)
            new_state, reward, end, _ = env.step(next_action)

            reward = abs(new_state[0] - (-0.5))  # r in [0, 1] (reward shaping)
            new_state = np.reshape(new_state, [1, 2])

            agent.memoise((state, next_action, reward, new_state, end))

            if end:
                if t == env._max_episode_steps - 1:
                    res[0] += 1
                else:
                    res[1] += 1
                    # print("ENTRATO!,", t, "steps")

                steps.append(t)
                break
            else:
                state = new_state
                cumulative_reward += reward

            agent.learn()

        cumulative_reward += reward
        scores.append(cumulative_reward)
    env.close()
    return {
        "results": np.array(res),
        "steps": np.array(steps),
        "scores": np.array(scores),
        "agent": agent
    }
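The validation branch above uses an accuracy helper that is not shown, and the function itself is normally driven by a small train/test script. Below is a minimal sketch of both, assuming accuracy is the percentage of victories in the [losses, victories] results array; the filename "model_mountaincar" is just an illustrative choice.

import numpy as np

def accuracy(results):
    """Percentage of victories, given results = np.array([losses, victories])."""
    return results[1] / results.sum() * 100

# Hypothetical usage: train for 1000 episodes, save the agent, then test the
# saved policy for 100 episodes with rendering enabled.
train_res = experiment(1000)
train_res["agent"].save_model("model_mountaincar")
test_res = experiment(100, default_policy=True, policy="model_mountaincar",
                      render=True)
print("Test accuracy:", accuracy(test_res["results"]), "%")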