Example 1
def q_learning(sess,
               env,
               agent,
               num_episodes,
               max_time_per_episode,
               discount_factor=0.99,
               epsilon=0.4,
               epsilon_decay=.95,
               use_experience_replay=False,
               max_replay_buffer_size=4000,
               batch_size=128,
               target=None,
               tf_saver=None,
               save_path=None,
               save_interval=None):
    """
    Q-Learning algorithm for off-policy TD control using Function Approximation.
    Finds the optimal greedy policy while following an epsilon-greedy policy.
    Depending on the flags, it supports either online learning or experience
    replay, as well as computing the targets with a slowly updated target
    network. You can reuse your Q-learning implementation from the last exercise.

    Args:
        sess: TensorFlow session used for training and for saving checkpoints.
        env: PLE game.
        agent: Action-value function estimator.
        num_episodes: Number of episodes to run for.
        max_time_per_episode: Maximum number of time steps before an episode is terminated.
        discount_factor: Gamma, the discount factor for future rewards.
        epsilon: Chance to sample a random action. Float between 0 and 1.
        epsilon_decay: Decay rate of the epsilon parameter.
        use_experience_replay: Whether experience replay should be used.
        max_replay_buffer_size: Maximum number of transitions kept in the replay buffer.
        batch_size: Number of samples per batch.
        target: Slowly updated target network used to compute the targets. Ignored if None.
        tf_saver: tf.train.Saver used to checkpoint the model. Ignored if None.
        save_path: Path the checkpoints are written to.
        save_interval: Save a checkpoint every save_interval episodes. Ignored if None.

    Returns:
        An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    """

    # Keeps track of useful statistics
    stats = EpisodeStats(episode_lengths=np.zeros(num_episodes),
                         episode_rewards=np.zeros(num_episodes))

    replay_buffer = ReplayBuffer(max_replay_buffer_size)
    action_set = env.getActionSet()

    for i_episode in range(num_episodes):

        # The policy we're following
        policy = make_epsilon_greedy_policy(agent.predict, len(action_set))

        # Print out which episode we're on, useful for debugging.
        # Also print reward for last episode
        last_reward = stats.episode_rewards[i_episode - 1]
        recent_rewards = stats.episode_rewards[max(i_episode - 100, 0):i_episode]
        avg_reward = np.mean(recent_rewards) if i_episode > 0 else 0.0
        print("\rEpisode {}/{}, last reward: {}, avg reward: {}".format(
            i_episode + 1, num_episodes, last_reward, avg_reward),
              end="")
        # sys.stdout.flush()

        # Reset the current environment
        env.reset_game()
        state = list(env.getGameState())
        done = False
        loss = None

        # Iterate through steps
        for t in range(max_time_per_episode):
            if env.game_over():
                done = True

            # The target network (if any) is updated after the training step below.

            # Take a step
            action_probs = policy([state], epsilon)
            action = np.random.choice(np.arange(len(action_probs)),
                                      p=action_probs)
            reward = env.act(action_set[action])
            next_state = list(env.getGameState())

            # episode stats
            stats.episode_lengths[i_episode] = t
            # print(reward)
            stats.episode_rewards[i_episode] += reward

            if done:
                print("\rStep {} ({}) loss: {}\n".format(
                    t, max_time_per_episode, loss),
                      end="")
                break

            if use_experience_replay:
                # Update replay buffer
                replay_buffer.add_transition(state, action, next_state, reward,
                                             done)

                # Sample minibatch from replay buffer
                batch_states, batch_actions, batch_next_states, batch_rewards, batch_dones = \
                    replay_buffer.next_batch(min(batch_size, replay_buffer.size()))

                # Pair each action with its batch row index, matching the
                # [[row, action]] format expected by agent.train below.
                batch_actions = list(
                    zip(range(len(batch_actions)), batch_actions))

                # Calculate TD target for batch. Use "old" fixed parameters if target network is available
                # to compute targets else use "old" parameters of value function estimate.
                batch_next_q_values = (target if target else
                                       agent.train_model).predict(
                                           batch_next_states, None, None)
                batch_best_next_action = np.argmax(batch_next_q_values, axis=1)
                batch_td_target = [
                    batch_rewards[j] + discount_factor *
                    batch_next_q_values[j][batch_best_next_action[j]]
                    for j in range(len(batch_states))
                ]

                # Update Q value estimator parameters by optimizing between Q network and Q-learning targets
                loss = agent.train(batch_states, batch_actions,
                                   batch_td_target)
            else:
                next_q_values = (target if target else agent).predict(
                    [next_state], None, None)
                best_next_action = np.argmax(next_q_values, axis=1)[0]
                # Bootstrap with the Q value of the greedy next action.
                td_target = [reward + discount_factor *
                             next_q_values[0][best_next_action]]
                loss = agent.train([state], [[0, action]], td_target)

            if target:
                target.update()

            epsilon *= epsilon_decay
            state = next_state

        if tf_saver is not None and save_interval and i_episode % save_interval == 0:
            tf_saver.save(sess, save_path, global_step=i_episode)

    return stats
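
Neither listing defines make_epsilon_greedy_policy. Judging from how Example 1 uses it (policy([state], epsilon) must return a probability vector over the actions, and the estimator is agent.predict, which is called elsewhere with two extra arguments), a minimal sketch could look like the following; the exact helper used in the original exercise may differ.

import numpy as np


def make_epsilon_greedy_policy(estimator, nA):
    """Wrap a Q-value estimator into an epsilon-greedy policy.

    Returns a function mapping (observation, epsilon) to a vector of
    nA action probabilities.
    """
    def policy_fn(observation, epsilon):
        # Spread the epsilon probability mass uniformly over all actions ...
        probs = np.ones(nA, dtype=float) * epsilon / nA
        # ... and put the remaining (1 - epsilon) mass on the greedy action.
        q_values = estimator(observation, None, None)  # same call signature as agent.predict above
        best_action = np.argmax(q_values)
        probs[best_action] += 1.0 - epsilon
        return probs
    return policy_fn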
Example 2
def q_learning(q_network,
               env,
               test_env,
               seed,
               total_timesteps,
               log_interval,
               test_interval,
               show_interval,
               logdir,
               lr,
               max_grad_norm,
               units_per_hlayer,
               activ_fcn,
               gamma=0.95,
               epsilon=0.4,
               epsilon_decay=.95,
               buffer_size=4000,
               batch_size=128,
               trace_length=32,
               tau=0.99,
               update_interval=30,
               early_stop=False,
               keep_model=2,
               save_model=True,
               restore_model=False,
               save_traj=False):
    # """
    # Q-Learning algorithm for off-policy TD control using Function Approximation.
    # Finds the optimal greedy policy while following an epsilon-greedy policy.
    # Implements the options of online learning or using experience replay and also
    # target calculation by target networks, depending on the flags. You can reuse
    # your Q-learning implementation of the last exercise.
    #
    # Args:
    #     env: PLE game
    #     approx: Action-Value function estimator
    #     num_episodes: Number of episodes to run for.
    #     max_time_per_episode: maximum number of time steps before episode is terminated
    #     discount_factor: gamma, discount factor of future rewards.
    #     epsilon: Chance to sample a random action. Float between 0 and 1.
    #     epsilon_decay: decay rate of epsilon parameter
    #     use_experience_replay: Indicator if experience replay should be used.
    #     batch_size: Number of samples per batch.
    #     target: Slowly updated target network to calculate the targets. Ignored if None.
    #
    # Returns:
    #     An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    # """
    logger = logging.getLogger(__name__)
    # logger.info(datetime.time)

    tf.reset_default_graph()
    set_global_seeds(seed)

    # Params
    ob_space = env.observation_space
    ac_space = env.action_space
    nd, = ob_space.shape
    n_ac = ac_space.n

    # Create learning agent and the replay buffer
    agent = DQNAgent(q_network=q_network,
                     ob_space=ob_space,
                     ac_space=ac_space,
                     lr=lr,
                     max_grad_norm=max_grad_norm,
                     units_per_hlayer=units_per_hlayer,
                     activ_fcn=activ_fcn,
                     log_interval=log_interval,
                     logdir=logdir,
                     batch_size=batch_size,
                     trace_length=trace_length,
                     update_interval=update_interval,
                     tau=tau,
                     keep_model=keep_model)
    summary_writer = agent.get_summary_writer()
    result_path = os.path.join(logdir, 'train_results.csv')
    if save_traj:
        rew_traj = []
        rew_results_path = os.path.join(
            logdir, ('lr' + str(lr) + '_tracking_results.csv'))
    else:
        rew_results_path = None
    replay_buffer = ReplayBuffer(buffer_size)

    # Keeps track of useful statistics
    stats = {'episode_lengths': [], 'episode_rewards': []}

    if restore_model:
        for el in os.listdir(logdir):
            if 'final' in el and '.meta' in el:
                # Load pre trained model and set network parameters
                logger.info('load %s' % os.path.join(logdir, el[:-5]))
                agent.load(os.path.join(logdir, el[:-5]))
                # Reset global step parameter.
                agent.sess.run(agent.global_step.assign(0))

    # ------------------ TRAINING --------------------------------------------
    logger.info("Start Training")
    early_stopped = False
    i_episode, i_sample, i_train = 0, 0, 0
    ep_len, rew = 0, 0
    horizon = 100
    reward_window = deque(maxlen=horizon)
    avg_rm = deque(maxlen=30)
    nbatch = batch_size * trace_length
    return_threshold = -0.05  # 40

    # Reset env
    obs = env.reset()
    obs = normalize_obs(obs)
    done = False
    rnn_state0 = agent.step_initial_state
    # With a plain feed-forward architecture there is no recurrent state, so we
    # sample a batch of single transitions instead of a batch of sequences.
    if rnn_state0 is None:
        trace_length = 1

    # Set the target network to be equal to the primary network
    agent.update_target(agent.target_ops)
    while i_sample < total_timesteps:
        if np.random.rand(1) < epsilon:
            # Explore: still advance the RNN state, but take a random action.
            _, next_rnn_state = agent.step([obs], rnn_state0)
            action = np.random.randint(0, n_ac)
        else:
            # Exploit: take the greedy action proposed by the network.
            AP, next_rnn_state = agent.step([obs], rnn_state0)
            action = AP[0]
        next_obs, reward, done, _ = env.step(action)
        next_obs = normalize_obs(next_obs)
        i_sample += 1
        # render only every i-th episode
        if show_interval != 0:
            if i_episode % show_interval == 0:
                env.render()

        ep_len += 1
        rew += reward
        reward_window.append(reward)

        # When episode is done, add episode information to tensorboard summary and stats
        if done:  # env.game_over():
            next_obs = list(np.zeros_like(next_obs, dtype=np.float32))

            stats['episode_lengths'].append(ep_len)
            stats['episode_rewards'].append(rew)

            if summary_writer is not None:
                summary = tf.Summary()
                summary.value.add(
                    tag='envs/ep_return',
                    simple_value=stats['episode_rewards'][i_episode])
                summary.value.add(
                    tag="envs/ep_length",
                    simple_value=stats['episode_lengths'][i_episode])
                summary_writer.add_summary(summary, i_episode)
                summary_writer.flush()

            if save_model and rew > return_threshold:
                return_threshold = rew
                logger.info('Save model at max reward %s' % return_threshold)
                agent.save('inter_model')

            i_episode += 1
            ep_len, rew = 0, 0

        # Update replay buffer
        replay_buffer.add_transition(obs, action, next_obs, reward, done)
        if save_traj:
            rew_traj.append(reward)

        # Update model parameters every #update_interval steps. Use real experience and replayed experience.
        if replay_buffer.size() > nbatch and (i_sample % update_interval == 0):
            if (env.spec._env_name == 'ContFlappyBird'):
                rm = sum(reward_window) / horizon
                if summary_writer is not None:
                    s_summary = tf.Summary()
                    s_summary.value.add(tag='envs/isample_return',
                                        simple_value=rm)
                    summary_writer.add_summary(s_summary, i_sample)
                    summary_writer.flush()
                if save_model and rm > return_threshold:
                    return_threshold = rm
                    logger.info('Save model at max rolling mean %s' %
                                return_threshold)
                    agent.save('inter_model')
                avg_rm.append(rm)

            if early_stop:
                if (i_sample > 60000) and (i_sample <=
                                           (60000 + update_interval)):
                    if (sum(avg_rm) / 30) <= -0.88:
                        print('Early stopping triggered')
                        early_stopped = True
                        break

            agent.update_target(agent.target_ops)

            # reset rnn state (history knowledge) before every training step
            rnn_state_train = agent.train_initial_state

            # Sample training mini-batch from replay buffer
            if rnn_state_train is not None:
                mb_obs, mb_actions, mb_next_obs, mb_rewards, _, batch_dones = \
                                                replay_buffer.recent_and_next_batch_of_seq(batch_size, trace_length)
            else:
                mb_obs, mb_actions, mb_next_obs, mb_rewards, _, batch_dones = \
                                                replay_buffer.recent_and_next_batch(batch_size)

            # Calculate TD target for batch. Use "old" fixed parameters if target network is available
            # to compute targets else use "old" parameters of value function estimate.
            # mb_next_obs = np.reshape(mb_next_obs, (-1, nd))
            mb_next_q_values, _ = agent.target_model.predict(
                mb_next_obs, rnn_state_train)
            mb_best_next_action = np.argmax(mb_next_q_values, axis=1)
            # Use the actual minibatch size here: for feed-forward nets the
            # sampled batch has batch_size entries, not batch_size * trace_length.
            mb_td_target = [
                mb_rewards[j] +
                gamma * mb_next_q_values[j][mb_best_next_action[j]]
                for j in range(len(mb_rewards))
            ]

            # Update Q value estimator parameters by optimizing between Q network and Q-learning targets
            loss = agent.train(mb_obs, mb_actions, mb_td_target,
                               rnn_state_train)
            i_train += 1

            # If test_interval > 0 the learned model is evaluated every "test_interval" gradient updates
            if test_interval > 0 and i_train > 0 and (i_train % test_interval
                                                      == 0):
                ep_return = agent.test_run(test_env, n_eps=10, n_pipes=2000)
                with open(result_path, "a") as csvfile:
                    writer = csv.writer(csvfile)
                    ep_return[0:0] = [i_sample, i_train]
                    writer.writerow(ep_return)

        if done:
            # Reset the model
            next_obs = env.reset()
            next_obs = normalize_obs(next_obs)

        epsilon *= epsilon_decay
        obs = next_obs
        rnn_state0 = next_rnn_state

    # Save final model when training is finished.
    if save_model:
        agent.save('final_model')
        logger.info('Finished Training. Saving Final model.')

    if rew_results_path is not None:
        logger.info('Save reward trajectory to %s' % rew_results_path)
        with open(rew_results_path, "a") as csvfile:
            writer = csv.writer(csvfile)
            traj = np.asanyarray(rew_traj).reshape(-1).tolist()
            traj[0:0] = [np.mean(traj)]  # i_train, i_sample
            writer.writerow(traj)

    logger.info('*******************************************************')
    logger.info('Total number of interactions with the environment: %s' %
                i_sample)
    logger.info('Total number of parameter updates during training: %s' %
                i_train)
    logger.info('*******************************************************\n')

    return early_stopped, i_sample
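
Both listings rely on a ReplayBuffer class that is not shown. A minimal sketch covering only the interface exercised in Example 1 (add_transition, size, next_batch) is given below; Example 2 additionally assumes sequence-aware sampling methods (recent_and_next_batch and recent_and_next_batch_of_seq) whose exact behaviour cannot be recovered from the listings, so they are omitted here.

import random
from collections import deque

import numpy as np


class ReplayBuffer:
    """FIFO transition store with uniform minibatch sampling."""

    def __init__(self, max_size):
        # Oldest transitions are dropped automatically once max_size is reached.
        self._data = deque(maxlen=max_size)

    def add_transition(self, state, action, next_state, reward, done):
        self._data.append((state, action, next_state, reward, done))

    def size(self):
        return len(self._data)

    def next_batch(self, batch_size):
        # Uniformly sample batch_size stored transitions and stack them column-wise.
        batch = random.sample(list(self._data), batch_size)
        states, actions, next_states, rewards, dones = zip(*batch)
        return (np.array(states), np.array(actions), np.array(next_states),
                np.array(rewards), np.array(dones))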