Example #1
def main(config):
    env_name = config['run']['env']
    env = gym.make(env_name)
    np.random.seed(config['random_seed'])
    tf.random.set_seed(config['random_seed'])
    env.seed(config['random_seed'])

    batch_size = config['train']['batch_size']
    state_dim = env.observation_space.shape

    # Use action_dim[0]: (a_dim,) --> a_dim
    action_dim = env.action_space.shape[0]

    # Define action boundaries for continuous but bounded action space
    action_low = env.action_space.low
    action_high = env.action_space.high

    print(f'-------- {env_name} --------')
    print('STATE DIM: ', state_dim)
    print('ACTION DIM: ', action_dim)
    print('ACTION LOW: ', action_low)
    print('ACTION HIGH: ', action_high)
    print('----------------------------')

    # Initialize memory for experience replay
    replay_buffer = ReplayBuffer(config['train']['replay_buffer_size'],
                                 config['random_seed'])

    # Take a random action in the environment to initialize networks
    env.reset()
    _, initial_reward, _, _ = env.step(env.action_space.sample())

    # Use agent_factory to build the agent using the algorithm specified in the config file
    Agent = agent_factory(config['agent']['model'])
    agent = Agent(config, state_dim, action_dim, action_low, action_high,
                  initial_reward)

    for episode in range(int(config['train']['max_episodes'])):
        s = env.reset()
        s = s / 255.0

        episode_reward = 0
        episode_average_max_q = 0
        # Losses are only defined once the replay buffer holds enough samples to train
        loss_actor, criticQ, criticV = None, None, None

        for step in range(int(config['train']['max_episode_len'])):
            if config['run']['render_env']:
                env.render()

            # 1. Use current behavioural policy network to predict an action to take
            # TODO: the [0] works for new SAC. Check again with DDPG updates.
            a = agent.actor.model.predict(np.expand_dims(s, 0))[0]

            # print('ACTION: ', a)
            # print('a[0]: ', a[0])

            # 2. Use action to take step in environment and receive next step, reward, etc.
            s2, r, terminal, info = env.step(a[0])
            s2 = s2 / 255.0

            # 3. Update the replay buffer with the most recent experience
            replay_buffer.add(np.reshape(s, state_dim),
                              np.reshape(a, action_dim), r,
                              np.reshape(s2, state_dim), terminal)

            # 4. When there are enough experiences in the replay buffer, sample minibatches of training experiences
            if replay_buffer.size() > batch_size:
                experience = replay_buffer.sample_batch(batch_size)

                # Train current behavioural networks
                # predicted_Q_value = agent.train_networks(experience)
                loss_actor, criticQ, criticV = agent.train_networks(experience)

                # Update for logging
                # episode_average_max_q += np.amax(predicted_Q_value)

                # Soft update of frozen target networks
                agent.update_target_networks()

            # Update information for next step
            s = s2
            episode_reward += r

            if terminal:
                print(
                    f'Episode {episode} training losses: ACTOR: {loss_actor} | CRITIC_Q: {criticQ} | CRITIC_V: {criticV}'
                )
                # print(f'| Reward: {int(episode_reward)} | Episode: {episode} | Qmax: {episode_average_max_q / float(step)}')
                break

    if config['run']['use_gym_monitor']:
        env.monitor.close()
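
All of the examples on this page depend on a ReplayBuffer class that is not shown here. Below is a minimal sketch of the interface Example #1 assumes (add, size, sample_batch); the internals are an assumption, not the original implementation, and some later examples (e.g. Example #8) unpack the sampled batch into separate per-field arrays instead.

# Minimal sketch of the replay buffer interface assumed above (hypothetical, not the original class).
import random
from collections import deque


class ReplayBuffer:
    def __init__(self, buffer_size, random_seed=1234):
        self.buffer = deque(maxlen=buffer_size)
        random.seed(random_seed)

    def add(self, s, a, r, s2, terminal):
        # Store one (state, action, reward, next_state, terminal) transition.
        self.buffer.append((s, a, r, s2, terminal))

    def size(self):
        return len(self.buffer)

    def sample_batch(self, batch_size):
        # Uniformly sample a minibatch of stored transitions.
        return random.sample(self.buffer, batch_size)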
Example #2
class Trainer():
    def __init__(self, params: Parameters):
        self.parms = params

        self.env = Env(params.game,
                       params.gamma,
                       norm_rewards=None,
                       norm_states=False)

        self.buffer = ReplayBuffer(params.replay_size)

        # Seed
        self.env.seed(params.seed)
        np.random.seed(params.seed)
        tf.random.set_seed(params.seed)

        self.critic = DDPGValueNet(feature_shape=self.env.features_shape,
                                   a_num=self.env.num_actions,
                                   lr=params.lr_c)
        self.target_critic = DDPGValueNet(
            feature_shape=self.env.features_shape,
            a_num=self.env.num_actions,
            lr=params.lr_c)
        self._copy_para(self.critic.model, self.target_critic.model)

        self.actor = CtsPolicy(action_bound=self.env.action_bound,
                               action_dim=self.env.num_actions,
                               lr=params.lr_a)
        self.target_actor = CtsPolicy(action_bound=self.env.action_bound,
                                      action_dim=self.env.num_actions,
                                      lr=params.lr_a)
        self._copy_para(self.actor, self.target_actor)

        self.ema = tf.train.ExponentialMovingAverage(decay=1.0 -
                                                     self.parms.tau)

    def _copy_para(self, from_model, to_model):
        """
        Copy parameters for soft updating
        :param from_model: latest model
        :param to_model: target model
        :return: None
        """
        for i, j in zip(from_model.trainable_weights,
                        to_model.trainable_weights):
            j.assign(i)

    def _ema_update(self):

        paras = self.actor.trainable_weights + \
                self.critic.model.trainable_weights

        self.ema.apply(paras)

        for i, j in zip(self.target_actor.trainable_weights + \
            self.target_critic.model.trainable_weights, paras):
            i.assign(self.ema.average(j))

    def _train(self):

        # Sample
        batch = self.buffer.sample(self.parms.batch_size)
        s = np.array([batch_[0] for batch_ in batch])
        a = np.array([batch_[1] for batch_ in batch])
        r = np.array([batch_[2] for batch_ in batch])
        s_next = np.array([batch_[3] for batch_ in batch])
        not_done = np.array([not batch_[4] for batch_ in batch])

        # Reshape
        r = r[:, np.newaxis]
        not_done = not_done[:, np.newaxis]

        # Train critic
        with tf.GradientTape() as tape:
            pi_next = self.target_actor(s_next)
            a_next = pi_next.sample()
            q_next = self.target_critic([s_next, a_next])
            y = r + self.parms.gamma * q_next * not_done
            q = self.critic([s, a])
            c_loss = tf.losses.mean_squared_error(y, q)
        c_grads = tape.gradient(c_loss, self.critic.model.trainable_weights)
        self.critic.model.optimizer.apply_gradients(
            zip(c_grads, self.critic.model.trainable_weights))

        # Train actor
        with tf.GradientTape() as tape:
            pi = self.actor(s)
            a = pi.sample()
            q = self.critic([s, a])
            a_loss = -tf.reduce_mean(q)
        a_grads = tape.gradient(a_loss, self.actor.trainable_weights)
        self.actor.optimizer.apply_gradients(
            zip(a_grads, self.actor.trainable_weights))

        self._ema_update()

    def train_step(self):

        # Episode information
        episode_ret = []

        # Initialize s
        s = self.env.reset()
        for _ in range(self.parms.train_step_len):
            # Interact
            pi = self.actor(s[np.newaxis, :])  # batch_size=1
            a = pi.sample()[0]
            s_next, r, done, info = self.env.step(a)

            # Store
            self.buffer.store((s, a, r, s_next, done))

            # Train
            if self.buffer.size() > self.parms.start_size:
                self._train()

            if done:
                _, ret = info['done']
                episode_ret.append(ret)
                s_next = self.env.reset()

            s = s_next

        return np.mean(episode_ret)
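
For reference, a minimal driver for this Trainer might look like the sketch below. The Parameters field names and values are assumptions inferred from the attributes used above (game, gamma, seed, replay_size, lr_a, lr_c, tau, batch_size, start_size, train_step_len), not the original configuration.

# Hypothetical driver loop; Parameters field names/values are assumptions.
params = Parameters(game='Pendulum-v0', gamma=0.99, seed=0,
                    replay_size=100000, lr_a=1e-3, lr_c=1e-3, tau=0.005,
                    batch_size=64, start_size=1000, train_step_len=1000)
trainer = Trainer(params)
for epoch in range(100):
    mean_return = trainer.train_step()
    print(f'epoch {epoch}: mean episode return {mean_return:.2f}')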
Example #3
def q_learning(sess,
               env,
               agent,
               num_episodes,
               max_time_per_episode,
               discount_factor=0.99,
               epsilon=0.4,
               epsilon_decay=.95,
               use_experience_replay=False,
               max_replay_buffer_size=4000,
               batch_size=128,
               target=None,
               tf_saver=None,
               save_path=None,
               save_interval=None):
    """
    Q-Learning algorithm for off-policy TD control using Function Approximation.
    Finds the optimal greedy policy while following an epsilon-greedy policy.
    Implements the options of online learning or using experience replay and also
    target calculation by target networks, depending on the flags. You can reuse
    your Q-learning implementation of the last exercise.

    Args:
        env: PLE game
        agent: Action-Value function estimator
        num_episodes: Number of episodes to run for.
        max_time_per_episode: maximum number of time steps before episode is terminated
        discount_factor: gamma, discount factor of future rewards.
        epsilon: Chance to sample a random action. Float between 0 and 1.
        epsilon_decay: decay rate of epsilon parameter
        use_experience_replay: Indicator if experience replay should be used.
        batch_size: Number of samples per batch.
        target: Slowly updated target network to calculate the targets. Ignored if None.

    Returns:
        An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    """

    # Keeps track of useful statistics
    stats = EpisodeStats(episode_lengths=np.zeros(num_episodes),
                         episode_rewards=np.zeros(num_episodes))

    replay_buffer = ReplayBuffer(max_replay_buffer_size)
    action_set = env.getActionSet()

    for i_episode in range(num_episodes):

        # The policy we're following
        policy = make_epsilon_greedy_policy(agent.predict, len(action_set))

        # Print out which episode we're on, useful for debugging.
        # Also print reward for last episode
        last_reward = stats.episode_rewards[i_episode - 1]
        avg_reward = np.mean(stats.episode_rewards[max(i_episode -
                                                       100, 0):i_episode])
        print("\rEpisode {}/{} ({}), avg reward: {}".format(
            i_episode + 1, num_episodes, last_reward, avg_reward),
              end="")
        # sys.stdout.flush()

        # Reset the current environment
        env.reset_game()
        state = list(env.getGameState())
        done = False
        loss = None

        # Iterate through steps
        for t in range(max_time_per_episode):
            if env.game_over():
                done = True

            # Update target network maybe
            if target:
                pass

            # Take a step
            action_probs = policy([state], epsilon)
            action = np.random.choice(np.arange(len(action_probs)),
                                      p=action_probs)
            reward = env.act(action_set[action])
            next_state = list(env.getGameState())

            # episode stats
            stats.episode_lengths[i_episode] = t
            # print(reward)
            stats.episode_rewards[i_episode] += reward

            if done:
                print("\rStep {} ({}) loss: {}\n".format(
                    t, max_time_per_episode, loss),
                      end="")
                break

            if use_experience_replay:
                # Update replay buffer
                replay_buffer.add_transition(state, action, next_state, reward,
                                             done)

                # Sample minibatch from replay buffer
                batch_states, batch_actions, batch_next_states, batch_rewards, batch_dones = \
                    replay_buffer.next_batch(min(batch_size, replay_buffer.size()))

                batch_actions = list(
                    zip(range(len(batch_actions)), batch_actions))

                # Calculate TD target for batch. Use "old" fixed parameters if target network is available
                # to compute targets else use "old" parameters of value function estimate.
                batch_next_q_values = (target if target else
                                       agent.train_model).predict(
                                           batch_next_states, None, None)
                batch_best_next_action = np.argmax(batch_next_q_values, axis=1)
                batch_td_target = [
                    batch_rewards[j] + discount_factor *
                    batch_next_q_values[j][batch_best_next_action[j]]
                    for j in range(len(batch_states))
                ]

                # Update Q value estimator parameters by optimizing between Q network and Q-learning targets
                loss = agent.train(batch_states, batch_actions,
                                   batch_td_target)
            else:
                next_q_values = (target if target else agent).predict(
                    [next_state], None, None)
                best_next_action = np.argmax(next_q_values, axis=1)[0]
                # TD target uses the Q-value of the greedy next action, not the action index
                td_target = reward + discount_factor * \
                    next_q_values[0][best_next_action]
                loss = agent.train([state], [[0, action]], [td_target])

            if target:
                target.update()

            epsilon *= epsilon_decay
            state = next_state

        if tf_saver is not None and save_interval and i_episode % save_interval == 0:
            tf_saver.save(sess, save_path, global_step=i_episode)

    return stats
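
make_epsilon_greedy_policy is not defined on this page. A sketch consistent with how it is called above (policy([state], epsilon) returning a probability vector over the discrete actions) is given below; the exact predict signature is an assumption based on the other agent.predict calls in this example.

# Sketch of the epsilon-greedy policy factory assumed above (not the original helper).
import numpy as np


def make_epsilon_greedy_policy(predict_fn, num_actions):
    def policy_fn(states, epsilon):
        # Uniform exploration floor, remaining probability mass on the greedy action.
        action_probs = np.ones(num_actions, dtype=float) * epsilon / num_actions
        q_values = predict_fn(states, None, None)
        best_action = np.argmax(q_values[0])
        action_probs[best_action] += 1.0 - epsilon
        return action_probs
    return policy_fn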
Example #4
def q_learning(q_network,
               env,
               test_env,
               seed,
               total_timesteps,
               log_interval,
               test_interval,
               show_interval,
               logdir,
               lr,
               max_grad_norm,
               units_per_hlayer,
               activ_fcn,
               gamma=0.95,
               epsilon=0.4,
               epsilon_decay=.95,
               buffer_size=4000,
               batch_size=128,
               trace_length=32,
               tau=0.99,
               update_interval=30,
               early_stop=False,
               keep_model=2,
               save_model=True,
               restore_model=False,
               save_traj=False):
    # """
    # Q-Learning algorithm for off-policy TD control using Function Approximation.
    # Finds the optimal greedy policy while following an epsilon-greedy policy.
    # Implements the options of online learning or using experience replay and also
    # target calculation by target networks, depending on the flags. You can reuse
    # your Q-learning implementation of the last exercise.
    #
    # Args:
    #     env: PLE game
    #     approx: Action-Value function estimator
    #     num_episodes: Number of episodes to run for.
    #     max_time_per_episode: maximum number of time steps before episode is terminated
    #     discount_factor: gamma, discount factor of future rewards.
    #     epsilon: Chance to sample a random action. Float between 0 and 1.
    #     epsilon_decay: decay rate of epsilon parameter
    #     use_experience_replay: Indicator if experience replay should be used.
    #     batch_size: Number of samples per batch.
    #     target: Slowly updated target network to calculate the targets. Ignored if None.
    #
    # Returns:
    #     An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    # """
    logger = logging.getLogger(__name__)
    # logger.info(datetime.time)

    tf.reset_default_graph()
    set_global_seeds(seed)

    # Params
    ob_space = env.observation_space
    ac_space = env.action_space
    nd, = ob_space.shape
    n_ac = ac_space.n

    # Create learning agent and the replay buffer
    agent = DQNAgent(q_network=q_network,
                     ob_space=ob_space,
                     ac_space=ac_space,
                     lr=lr,
                     max_grad_norm=max_grad_norm,
                     units_per_hlayer=units_per_hlayer,
                     activ_fcn=activ_fcn,
                     log_interval=log_interval,
                     logdir=logdir,
                     batch_size=batch_size,
                     trace_length=trace_length,
                     update_interval=update_interval,
                     tau=tau,
                     keep_model=keep_model)
    summary_writer = agent.get_summary_writer()
    result_path = os.path.join(logdir, 'train_results.csv')
    if save_traj:
        rew_traj = []
        rew_results_path = os.path.join(
            logdir, ('lr' + str(lr) + '_tracking_results.csv'))
    else:
        rew_results_path = None
    replay_buffer = ReplayBuffer(buffer_size)

    # Keeps track of useful statistics
    stats = {'episode_lengths': [], 'episode_rewards': []}

    if restore_model:
        for el in os.listdir(logdir):
            if 'final' in el and '.meta' in el:
                # Load pre trained model and set network parameters
                logger.info('load %s' % os.path.join(logdir, el[:-5]))
                agent.load(os.path.join(logdir, el[:-5]))
                # Reset global step parameter.
                agent.sess.run(agent.global_step.assign(0))

    # ------------------ TRAINING --------------------------------------------
    logger.info("Start Training")
    early_stopped = False
    i_episode, i_sample, i_train = 0, 0, 0
    ep_len, rew = 0, 0
    horizon = 100
    reward_window = deque(maxlen=horizon)
    avg_rm = deque(maxlen=30)
    nbatch = batch_size * trace_length
    return_threshold = -0.05  # 40

    # Reset env
    obs = env.reset()
    obs = normalize_obs(obs)
    done = False
    rnn_state0 = agent.step_initial_state
    if rnn_state0 is None:  # If we use a normal feed forward architecture, we sample a batch of single samples, not a batch of sequences.
        trace_length = 1

    # Set the target network to be equal to the primary network
    agent.update_target(agent.target_ops)
    while i_sample < total_timesteps:
        # Epsilon-greedy action selection
        if np.random.rand(1) < epsilon:
            # Still step the agent to advance the RNN state, but act randomly
            _, next_rnn_state = agent.step([obs], rnn_state0)
            action = np.random.randint(0, n_ac)
        else:
            # Greedy action from the Q network
            AP, next_rnn_state = agent.step([obs], rnn_state0)
            action = AP[0]
        next_obs, reward, done, _ = env.step(action)
        next_obs = normalize_obs(next_obs)
        i_sample += 1
        # render only every i-th episode
        if show_interval != 0:
            if i_episode % show_interval == 0:
                env.render()

        ep_len += 1
        rew += reward
        reward_window.append(reward)

        # When episode is done, add episode information to tensorboard summary and stats
        if done:  # env.game_over():
            next_obs = list(np.zeros_like(next_obs, dtype=np.float32))

            stats['episode_lengths'].append(ep_len)
            stats['episode_rewards'].append(rew)

            if summary_writer is not None:
                summary = tf.Summary()
                summary.value.add(
                    tag='envs/ep_return',
                    simple_value=stats['episode_rewards'][i_episode])
                summary.value.add(
                    tag="envs/ep_length",
                    simple_value=stats['episode_lengths'][i_episode])
                summary_writer.add_summary(summary, i_episode)
                summary_writer.flush()

            if save_model and rew > return_threshold:
                return_threshold = rew
                logger.info('Save model at max reward %s' % return_threshold)
                agent.save('inter_model')

            i_episode += 1
            ep_len, rew = 0, 0

        # Update replay buffer
        replay_buffer.add_transition(obs, action, next_obs, reward, done)
        if save_traj:
            rew_traj.append(reward)

        # Update model parameters every #update_interval steps. Use real experience and replayed experience.
        if replay_buffer.size() > nbatch and (i_sample % update_interval == 0):
            if (env.spec._env_name == 'ContFlappyBird'):
                rm = sum(reward_window) / horizon
                if summary_writer is not None:
                    s_summary = tf.Summary()
                    s_summary.value.add(tag='envs/isample_return',
                                        simple_value=rm)
                    summary_writer.add_summary(s_summary, i_sample)
                    summary_writer.flush()
                if save_model and rm > return_threshold:
                    return_threshold = rm
                    logger.info('Save model at max rolling mean %s' %
                                return_threshold)
                    agent.save('inter_model')
                avg_rm.append(rm)

            if early_stop:
                if (i_sample > 60000) and (i_sample <=
                                           (60000 + update_interval)):
                    if (sum(avg_rm) / 30) <= -0.88:
                        print('early stopping triggered')
                        early_stopped = True
                        break

            agent.update_target(agent.target_ops)

            # reset rnn state (history knowledge) before every training step
            rnn_state_train = agent.train_initial_state

            # Sample training mini-batch from replay buffer
            if rnn_state_train is not None:
                mb_obs, mb_actions, mb_next_obs, mb_rewards, _, batch_dones = \
                                                replay_buffer.recent_and_next_batch_of_seq(batch_size, trace_length)
            else:
                mb_obs, mb_actions, mb_next_obs, mb_rewards, _, batch_dones = \
                                                replay_buffer.recent_and_next_batch(batch_size)

            # Calculate TD target for batch. Use "old" fixed parameters if target network is available
            # to compute targets else use "old" parameters of value function estimate.
            # mb_next_obs = np.reshape(mb_next_obs, (-1, nd))
            mb_next_q_values, _ = agent.target_model.predict(
                mb_next_obs, rnn_state_train)
            mb_best_next_action = np.argmax(mb_next_q_values, axis=1)
            mb_td_target = [
                mb_rewards[j] +
                gamma * mb_next_q_values[j][mb_best_next_action[j]]
                for j in range(nbatch)
            ]

            # Update Q value estimator parameters by optimizing between Q network and Q-learning targets
            loss = agent.train(mb_obs, mb_actions, mb_td_target,
                               rnn_state_train)
            i_train += 1

            # If test_interval > 0 the learned model is evaluated every "test_interval" gradient updates
            if test_interval > 0 and i_train > 0 and (i_train % test_interval
                                                      == 0):
                ep_return = agent.test_run(test_env, n_eps=10, n_pipes=2000)
                with open(result_path, "a") as csvfile:
                    writer = csv.writer(csvfile)
                    ep_return[0:0] = [i_sample, i_train]
                    writer.writerow(ep_return)

        if done:
            # Reset the model
            next_obs = env.reset()
            next_obs = normalize_obs(next_obs)

        epsilon *= epsilon_decay
        obs = next_obs
        rnn_state0 = next_rnn_state

    # Save final model when training is finished.
    if save_model:
        agent.save('final_model')
        logger.info('Finished Training. Saving Final model.')

    if rew_results_path is not None:
        logger.info('Save reward trajectory to %s' % rew_results_path)
        with open(rew_results_path, "a") as csvfile:
            writer = csv.writer(csvfile)
            traj = np.asanyarray(rew_traj).reshape(-1).tolist()
            traj[0:0] = [np.mean(traj)]  # i_train, i_sample
            writer.writerow(traj)

    logger.info('*******************************************************')
    logger.info('Total number of interactions with the environment: %s' %
                i_sample)
    logger.info('Total number of parameter updates during training: %s' %
                i_train)
    logger.info('*******************************************************\n')

    return early_stopped, i_sample
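
normalize_obs is used above but not shown on this page, and its real implementation is environment specific. A hypothetical stand-in that simply rescales bounded observations is:

# Hypothetical stand-in for normalize_obs; the original helper is not shown on this page.
import numpy as np


def normalize_obs(obs, obs_low=None, obs_high=None):
    obs = np.asarray(obs, dtype=np.float32)
    if obs_low is None or obs_high is None:
        return obs  # no bounds supplied: pass the observation through unchanged
    # Scale each observation dimension into [0, 1].
    return (obs - obs_low) / (obs_high - obs_low)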
Example #5
class Trainer():
    def __init__(self, params: Parameters):
        self.parms = params

        self.env = Env(params.game,
                       params.gamma,
                       norm_rewards=None,
                       norm_states=False)

        self.buffer = ReplayBuffer(params.replay_size)

        # Seed
        self.env.seed(params.seed)
        np.random.seed(params.seed)
        tf.random.set_seed(params.seed)

        # Four critic nets
        critic_nets = [
            DDPGValueNet(feature_shape=self.env.features_shape,
                         a_num=self.env.num_actions,
                         lr=params.lr_c) for _ in range(4)
        ]
        self.critic1, self.critic2, self.target_critic1, self.target_critic2 = critic_nets

        # Two actor nets
        self.actor = CtsPolicy(action_bound=self.env.action_bound,
                               action_dim=self.env.num_actions,
                               lr=params.lr_a)
        self.target_actor = CtsPolicy(action_bound=self.env.action_bound,
                                      action_dim=self.env.num_actions,
                                      lr=params.lr_a)

        # Copy parameters into the target networks
        self._copy_para(self.critic1, self.target_critic1)
        self._copy_para(self.critic2, self.target_critic2)
        self._copy_para(self.actor, self.target_actor)

        self.train_step_cnt = 0

    def _copy_para(self, from_model, to_model):
        """
        Copy parameters for soft updating
        :param from_model: latest model
        :param to_model: target model
        :return: None
        """
        for i, j in zip(from_model.trainable_weights,
                        to_model.trainable_weights):
            j.assign(i)

    def _target_soft_update(self, net, target_net):
        """ soft update the target net with Polyak averaging """
        for target_param, param in zip(target_net.trainable_weights,
                                       net.trainable_weights):
            target_param.assign(  # copy weight value into target parameters
                target_param * (1.0 - self.parms.tau) + param * self.parms.tau)

    def _train(self):

        # Sample
        batch = self.buffer.sample(self.parms.batch_size)
        s = np.array([batch_[0] for batch_ in batch])
        a = np.array([batch_[1] for batch_ in batch])
        r = np.array([batch_[2] for batch_ in batch])
        s_next = np.array([batch_[3] for batch_ in batch])
        not_done = np.array([not batch_[4] for batch_ in batch])

        # Reshape
        r = r[:, np.newaxis]
        not_done = not_done[:, np.newaxis]

        # Set target y
        pi_next = self.target_actor(s_next)
        a_next = pi_next.sample()
        q_next = tf.minimum(self.target_critic1([s_next, a_next]),
                            self.target_critic2([s_next, a_next]))
        y = r + self.parms.gamma * q_next * not_done

        # Train critic1
        with tf.GradientTape() as c1_tape:
            q1 = self.critic1([s, a])
            c1_loss = tf.losses.mean_squared_error(y, q1)
        c1_grads = c1_tape.gradient(c1_loss, self.critic1.trainable_weights)
        self.critic1.optimizer.apply_gradients(
            zip(c1_grads, self.critic1.trainable_weights))

        # Train critic2
        with tf.GradientTape() as c2_tape:
            q2 = self.critic2([s, a])
            c2_loss = tf.losses.mean_squared_error(y, q2)
        c2_grads = c2_tape.gradient(c2_loss, self.critic2.trainable_weights)
        self.critic2.optimizer.apply_gradients(
            zip(c2_grads, self.critic2.trainable_weights))

        # Train actor
        if self.train_step_cnt % self.parms.actor_interval == 0:

            with tf.GradientTape() as a_tape:
                pi = self.actor(s)
                a = pi.sample()
                q = self.critic1([s, a])
                a_loss = -tf.reduce_mean(q)
            a_grads = a_tape.gradient(a_loss, self.actor.trainable_weights)
            self.actor.optimizer.apply_gradients(
                zip(a_grads, self.actor.trainable_weights))

            # Soft-update target network parameters
            self._target_soft_update(self.actor, self.target_actor)
            self._target_soft_update(self.critic1, self.target_critic1)
            self._target_soft_update(self.critic2, self.target_critic2)

    def train_step(self):

        # Episode information
        episode_ret = []

        # Initialize s
        s = self.env.reset()
        for _ in range(self.parms.train_step_len):
            # Interact
            pi = self.actor(s[np.newaxis, :])  # batch_size=1

            a = pi.sample()[0]
            s_next, r, done, info = self.env.step(a)

            # Store
            self.buffer.store((s, a, r, s_next, done))

            # Train
            if self.buffer.size() > self.parms.start_size:
                self._train()
                self.train_step_cnt += 1

            if done:
                _, ret = info['done']
                episode_ret.append(ret)
                s_next = self.env.reset()

            s = s_next

        return np.mean(episode_ret)
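
Written out, the target used in _train above is the standard clipped double-Q target, and _target_soft_update is Polyak averaging:

    y = r + \gamma\,(1 - d)\,\min\big(Q'_{\theta_1}(s', a'),\; Q'_{\theta_2}(s', a')\big), \qquad a' \sim \pi_{\phi'}(s')

    \theta' \leftarrow (1 - \tau)\,\theta' + \tau\,\theta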
Example #6
class DQNAgent:
    def __init__(self, env):
        self.env = env
        self.state_size = env.observation_space.shape
        self.action_size = env.action_space.n
        self.replay_buffer = ReplayBuffer(buffer_size=1000000, batch_size=32)
        self.target_update_frequency = 16
        self.target_update_counter = 0
        self.gamma = 0.95
        self.initial_epsilon = 1
        self.epsilon = self.initial_epsilon
        self.epsilon_decay_rate = 0.99995
        self.min_epsilon = 0.01
        self.rho = 0.95
        self.learning_rate = 0.00025
        self.training_scores = []

        # Main model: gets trained every step
        self.model = self.build_model()

        # Target model: this is what we .predict against every step
        self.target_model = self.build_model()
        self.target_model.set_weights(self.model.get_weights())

    def build_model(self):
        # Neural Network architecture for Deep-Q learning Model
        model = Sequential()
        model.add(
            Conv2D(filters=32,
                   kernel_size=8,
                   strides=4,
                   activation='relu',
                   input_shape=self.state_size))
        model.add(
            Conv2D(filters=32, kernel_size=4, strides=2, activation='relu'))
        model.add(
            Conv2D(filters=32, kernel_size=3, strides=1, activation='relu'))

        model.add(Flatten())
        model.add(
            Dense(256,
                  activation='relu',
                  kernel_regularizer=regularizers.l2(0.001)))
        model.add(
            Dense(128,
                  activation='relu',
                  kernel_regularizer=regularizers.l2(0.001)))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss=utils.huber_loss_mean,
                      optimizer=RMSprop(lr=self.learning_rate,
                                        rho=self.rho,
                                        epsilon=self.min_epsilon),
                      metrics=["accuracy"])
        model.summary()
        return model

    def reset_episode(self, initial_state):
        """Reset variables for a new episode."""
        # Gradually decrease exploration rate
        self.epsilon *= self.epsilon_decay_rate
        self.epsilon = max(self.epsilon, self.min_epsilon)

        self.prev_state = self.preprocess_state(initial_state)
        self.prev_action = np.argmax(self.model.predict(self.prev_state))
        return self.prev_action

    def preprocess_state(self, state):
        # Preprocessing code
        return np.expand_dims(np.array(state), axis=0)

    def reset_exploration(self, epsilon=None):
        """Reset exploration rate used when training."""
        self.epsilon = epsilon if epsilon is not None else self.initial_epsilon

    def plot_scores(self, scores, rolling_window=100):
        """Plot scores and optional rolling mean using specified window."""
        plt.title("Scores")
        plt.xlabel("Episodes -->")
        plt.ylabel("Scores -->")
        plt.plot(scores)
        rolling_mean = pd.Series(scores).rolling(rolling_window).mean()
        plt.plot(rolling_mean)

    def act(self, next_state, reward, done, mode="train", time_delay=None):
        """Pick next action and update weights of the neural network (when mode != 'test')."""
        next_state = self.preprocess_state(next_state)
        if mode == "test":
            # Test mode: Simply produce an action
            action = np.argmax(self.model.predict(next_state))
            if time_delay is not None:  # Add a time delay to watch the agent at a slightly slower pace.
                time.sleep(time_delay)
        else:
            # Exploration vs. exploitation
            do_exploration = np.random.uniform(0, 1) < self.epsilon
            if do_exploration:
                # Pick a random action
                action = np.random.randint(0, self.action_size)
            else:
                # Pick the best action from Q table
                action = np.argmax(self.model.predict(next_state))

            # Store the experience in replay memory
            self.replay_buffer.add(self.prev_state, self.prev_action, reward,
                                   next_state, done)

            # Learn
            self.replay(done)

        # Roll over current state, action for next step
        self.prev_state = next_state
        self.prev_action = action
        return action

    def replay(self, done):
        if self.replay_buffer.size() < self.replay_buffer.batch_size:
            return

        terminal_state = done  # Determine if the episode has ended.
        minibatch = self.replay_buffer.sample()

        # X : states, y : predictions
        X = []
        y = []

        prev_states = np.array([transition[0][0] for transition in minibatch])
        prev_qs = self.model.predict(prev_states)

        next_states = np.array([transition[3][0] for transition in minibatch])
        next_qs = self.target_model.predict(next_states)

        for index, (prev_state, prev_action, reward, next_state,
                    done) in enumerate(minibatch):
            # Setting the target for the model to improve upon
            if not done:
                target = reward + (self.gamma * np.max(next_qs[index]))
            else:
                target = reward

            new_q_value = prev_qs[index]
            new_q_value[prev_action] = target

            X.append(prev_state)
            y.append(new_q_value)

        # Fit on all samples as one batch, log only on terminal state
        self.model.fit(np.vstack(X),
                       np.vstack(y),
                       batch_size=self.replay_buffer.batch_size,
                       verbose=0,
                       shuffle=False)

        if terminal_state:
            self.target_update_counter += 1

        # If counter reaches set value, update target network with weights of main network
        if self.target_update_counter > self.target_update_frequency:
            self.target_model.set_weights(self.model.get_weights())
            self.target_update_counter = 0

    def run(self,
            num_episodes=20000,
            mode="train",
            time_delay=0.01,
            score_threshold=None,
            weights_path=None,
            scores_path=None):
        """Run agent in given reinforcement learning environment and return scores."""
        scores = []
        max_score = -np.inf
        min_score = np.inf
        max_avg_score = -np.inf
        avg_score = -np.inf
        for i_episode in range(1, num_episodes + 1):
            # Initialize episode
            state = self.env.reset()
            action = self.reset_episode(state)
            total_reward = 0
            done = False

            # Roll out steps until done
            while not done:
                next_state, reward, done, info = self.env.step(action)
                total_reward += reward
                action = self.act(next_state, reward, done, mode, time_delay)
                self.env.render()
            # Save final score
            scores.append(total_reward)
            # Print episode stats
            if mode == 'train':
                self.training_done = True

                if total_reward > max_score:
                    max_score = total_reward

                if total_reward < min_score:
                    min_score = total_reward

                if len(scores) > 100:
                    avg_score = np.mean(scores[-100:])
                    if avg_score > max_avg_score:
                        max_avg_score = avg_score

                if weights_path is not None and i_episode % 100 == 0:
                    self.model.save_weights(weights_path)
                    if scores_path is not None:
                        logs = {"scores": scores}
                        logs = pd.DataFrame.from_dict(data=logs,
                                                      orient='index')
                        logs.to_csv(scores_path, index=False)

                print(
                    "\rEpisode {}/{} | Episode Score: {} | Min. Score: {} | Max. Score: {} | Current Avg. Score: {} | Max. Average Score: {} | epsilon: {}"
                    .format(i_episode, num_episodes, total_reward, min_score,
                            max_score, avg_score, max_avg_score, self.epsilon),
                    end="")
                sys.stdout.flush()

            # Terminating loop if the agent achieves reward threshold
            if score_threshold is not None and max_avg_score > score_threshold:
                print(
                    "\nEnvironment solved after {} episodes".format(i_episode))
                break

        # Close rendering
        self.env.close()

        if mode == "test":
            print("\nScore: ", np.mean(scores))
        else:
            self.training_scores.append(scores)
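
A minimal way to exercise the agent above, assuming an image-based Gym environment (the environment id below is an example, not from the original code):

# Hypothetical usage of DQNAgent; the environment id and file paths are assumptions.
import gym

env = gym.make('Breakout-v0')
agent = DQNAgent(env)
# run() renders every step and, in train mode, periodically saves weights/scores if paths are given.
agent.run(num_episodes=5000, mode="train",
          weights_path="dqn_weights.h5", scores_path="dqn_scores.csv")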
Example #7
def main(config):
    tf.compat.v1.reset_default_graph()
    env_name = config['run']['env']
    env = gym.make(env_name)
    np.random.seed(config['random_seed'])
    tf.compat.v1.set_random_seed(config['random_seed'])
    env.seed(config['random_seed'])

    batch_size = config['train']['batch_size']
    state_dim = env.observation_space.shape

    # Use action_dim[0]: (a_dim,) --> a_dim
    action_dim = env.action_space.shape[0]

    # Define action boundaries for continuous but bounded action space
    action_low = env.action_space.low
    action_high = env.action_space.high

    print(f'-------- {env_name} --------')
    print('STATE DIM: ', state_dim)
    print('ACTION DIM: ', action_dim)
    print('ACTION LOW: ', action_low)
    print('ACTION HIGH: ', action_high)
    print('----------------------------')
    
    # Initialize memory for experience replay
    replay_buffer = ReplayBuffer(config['train']['replay_buffer_size'], config['random_seed'])
        
    # Set up summary TF operations
    summary_ops, summary_vars = build_summaries()

    with tf.compat.v1.Session() as sess:
        # sess.run(tf.compat.v1.global_variables_initializer())
        writer = tf.compat.v1.summary.FileWriter(config['output']['summary_dir'], sess.graph)

        # Use agent_factory to build the agent using the algorithm specified in the config file.
        Agent = agent_factory(config['agent']['model'])
        agent = Agent(config, state_dim, action_dim, action_low, action_high, sess)

        sess.run(tf.compat.v1.global_variables_initializer())

        for i in range(int(config['train']['max_episodes'])):
            s = env.reset()
            episode_reward = 0
            episode_average_max_q = 0

            for j in range(int(config['train']['max_episode_len'])):
                if config['run']['render_env']:
                    env.render()

                # 1. Predict an action to take
                a = agent.actor.predict_action(np.expand_dims(s, 0))

                # 2. Use action to take step in environment and receive next step, reward, etc.
                s2, r, terminal, info = env.step(a[0])

                # 3. Update the replay buffer with the most recent experience
                replay_buffer.add(np.reshape(s, state_dim), np.reshape(a, action_dim), r,
                                  np.reshape(s2, state_dim), terminal)

                # 4. When there are enough experiences in the replay buffer, sample minibatches of training experiences
                if replay_buffer.size() > batch_size:
                    experience = replay_buffer.sample_batch(batch_size)

                    # Train current behavioural networks
                    predicted_Q_value = agent.train_networks(experience)

                    # Update for logging
                    episode_average_max_q += np.amax(predicted_Q_value)

                    # Update target networks
                    agent.update_target_networks()

                # Update information for next step
                s = s2
                episode_reward += r

                if terminal:
                    summary_str = sess.run(summary_ops, feed_dict={
                        summary_vars[0]: episode_reward,
                        summary_vars[1]: episode_average_max_q / float(j)
                        })

                    writer.add_summary(summary_str, i)
                    writer.flush()

                    print('| Reward: {:d} | Episode: {:d} | Qmax: {:.4f}'.format(int(episode_reward), i, (episode_average_max_q / float(j))))
                    
                    break

    if config['run']['use_gym_monitor']:
        env.monitor.close()
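
build_summaries is not shown on this page. A sketch consistent with how summary_ops and summary_vars are used above (two scalars: episode reward and average max Q) would be:

# Sketch of the build_summaries helper assumed above (TF1-style scalar summaries).
import tensorflow as tf


def build_summaries():
    episode_reward = tf.compat.v1.Variable(0.)
    tf.compat.v1.summary.scalar("Reward", episode_reward)
    episode_ave_max_q = tf.compat.v1.Variable(0.)
    tf.compat.v1.summary.scalar("Qmax Value", episode_ave_max_q)

    summary_vars = [episode_reward, episode_ave_max_q]
    summary_ops = tf.compat.v1.summary.merge_all()
    return summary_ops, summary_vars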
Example #8
def main(config):
    env_name = config['run']['env']
    env = gym.make(env_name)
    np.random.seed(config['random_seed'])
    tf.set_random_seed(config['random_seed'])
    env.seed(config['random_seed'])

    batch_size = config['train']['batch_size']
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]

    # Define action boundaries for continuous but bounded action space
    action_bound = env.action_space.high

    print(f'-------- {env_name} --------')
    print('ACTION SPACE: ', action_dim)
    print('ACTION BOUND: ', action_bound)
    print('STATE SPACE: ', state_dim)
    print('----------------------------')


    # TODO (20190831, JP): add normalization for envs that require it.
    # Ensure action bound is symmetric - important
    assert np.all(env.action_space.high == -env.action_space.low)

    # Use agent_factory to build the agent using the algorithm specified in the config file.
    Agent = agent_factory(config['agent']['model'])
    agent = Agent(config, state_dim, action_dim, action_bound)

    # Initialize memory for experience replay
    replay_buffer = ReplayBuffer(config['train']['replay_buffer_size'], config['random_seed'])

    print(replay_buffer)
        
    # Set up summary TF operations
    summary_ops, summary_vars = build_summaries()

    with tf.Session() as sess:
        sess.run(tf.compat.v1.global_variables_initializer())
        writer = tf.summary.FileWriter(config['output']['summary_dir'], sess.graph)

        # Initialize target network weights
        agent.update_target_networks(sess)

        for i in range(int(config['train']['max_episodes'])):
            s = env.reset()

            episode_reward = 0
            episode_average_max_q = 0

            for j in range(int(config['train']['max_episode_len'])):
                if config['run']['render_env']:
                    env.render()

                # 1. Predict an action to take
                a = agent.actor_predict_action(np.reshape(s, (1, state_dim)), sess)

                # 2. Use action to take step in environment and receive next step, reward, etc.
                s2, r, terminal, info = env.step(a[0])

                # 3. Update the replay buffer with the most recent experience
                replay_buffer.add(np.reshape(s, (state_dim,)), np.reshape(a, (action_dim,)), r,
                                  np.reshape(s2, (state_dim,)), terminal)

                # 4. When there are enough experiences in the replay buffer, sample minibatches of training experiences
                if replay_buffer.size() > batch_size:
                    s_batch, a_batch, r_batch, s2_batch, t_batch = replay_buffer.sample_batch(batch_size)

                    # Train current behavioural networks
                    predicted_Q_value = agent.train_networks(s_batch, a_batch, r_batch, s2_batch, t_batch, sess)

                    # Update for logging
                    episode_average_max_q += np.amax(predicted_Q_value)

                    # Update target networks
                    agent.update_target_networks(sess)

                # Update information for next step
                s = s2
                episode_reward += r

                # TODO (20190815, JP): as this could be different for each agent, do
                # agent.summarize_episode(summary_ops, summary_vars, episode_reward, sess) for when each agent requires own summaries?
                if terminal:
                    summary_str = sess.run(summary_ops, feed_dict={
                        summary_vars[0]: episode_reward,
                        summary_vars[1]: episode_average_max_q / float(j)
                        })

                    writer.add_summary(summary_str, i)
                    writer.flush()

                    print('| Reward: {:d} | Episode: {:d} | Qmax: {:.4f}'.format(int(episode_reward), i, (episode_average_max_q / float(j))))
                    
                    break

    if config['run']['use_gym_monitor']:
        env.monitor.close()