Example #1
def main():
    ''' Create the environment
    '''
    env = gym.make(ENV_NAME)

    # For tensorboard
    writer = tf.summary.FileWriter("./tensorboard")

    assert STATE_DIM == np.prod(np.array(env.observation_space.shape))
    assert ACTION_DIM == np.prod(np.array(env.action_space.shape))

    env.seed(0)
    np.random.seed(0)
    ''' Create the replay memory
    '''
    replay_memory = Memory(REPLAY_MEM_CAPACITY)
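    # `Memory` is assumed to be a FIFO replay buffer defined elsewhere; the
    # interface used below is add_to_memory(experience_tuple),
    # sample_from_memory(batch_size) and size().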

    # Tensorflow part starts here!
    tf.reset_default_graph()
    ''' Create placeholders 
    '''
    # Placeholders
    state_placeholder = tf.placeholder(dtype=tf.float32,
                                       shape=[None, STATE_DIM],
                                       name='state_placeholder')
    action_placeholder = tf.placeholder(dtype=tf.float32,
                                        shape=[None, ACTION_DIM],
                                        name='action_placeholder')
    reward_placeholder = tf.placeholder(dtype=tf.float32,
                                        shape=[None],
                                        name='reward_placeholder')
    next_state_placeholder = tf.placeholder(dtype=tf.float32,
                                            shape=[None, STATE_DIM],
                                            name='next_state_placeholder')
    is_not_terminal_placeholder = tf.placeholder(
        dtype=tf.float32, shape=[None], name='is_not_terminal_placeholder')

    is_training_placeholder = tf.placeholder(dtype=tf.float32,
                                             shape=(),
                                             name='is_training_placeholder')
    ''' A counter to count the number of episodes
    '''
    episodes = tf.Variable(0.0, trainable=False, name='episodes')
    episode_incr_op = episodes.assign_add(1)
    ''' Create the actor network inside the actor scope and calculate actions
    '''
    with tf.variable_scope('actor'):
        actor = ActorNetwork(STATE_DIM,
                             ACTION_DIM,
                             HIDDEN_1_ACTOR,
                             HIDDEN_2_ACTOR,
                             HIDDEN_3_ACTOR,
                             trainable=True)
        unscaled_actions = actor.call(state_placeholder)
        ''' Scale the actions to fit within the bounds provided by the
        environment (a sketch of the assumed scale_actions helper is given
        after the main() function below)
        '''
        actions = scale_actions(unscaled_actions, env.action_space.low,
                                env.action_space.high)
    ''' Create the target actor network inside the target_actor scope and
    calculate the target actions. Apply stop_gradient to the target actions so
    that their gradient is never computed.
    '''
    with tf.variable_scope('target_actor', reuse=False):
        target_actor = ActorNetwork(STATE_DIM,
                                    ACTION_DIM,
                                    HIDDEN_1_ACTOR,
                                    HIDDEN_2_ACTOR,
                                    HIDDEN_3_ACTOR,
                                    trainable=True)

        unscaled_target_actions = target_actor.call(next_state_placeholder)
        ''' Scale the actions to fit within the bounds provided by the 
        environment
        '''
        target_actions_temp = scale_actions(unscaled_target_actions,
                                            env.action_space.low,
                                            env.action_space.high)
        target_actions = tf.stop_gradient(target_actions_temp)
    ''' Create the critic network inside the critic variable scope. Get the 
    Q-values of given actions and Q-values of actions suggested by the actor 
    network.
    '''
    with tf.variable_scope('critic'):
        critic = CriticNetwork(STATE_DIM,
                               ACTION_DIM,
                               HIDDEN_1_CRITIC,
                               HIDDEN_2_CRITIC,
                               HIDDEN_3_CRITIC,
                               trainable=True)

        q_values_of_given_actions = critic.call(state_placeholder,
                                                action_placeholder)
        q_values_of_suggested_actions = critic.call(state_placeholder, actions)
    ''' Create the target critic network inside the target_critic variable 
    scope. Calculate the target Q-values and apply stop_gradient to it.
    '''
    with tf.variable_scope('target_critic', reuse=False):
        target_critic = CriticNetwork(STATE_DIM,
                                      ACTION_DIM,
                                      HIDDEN_1_CRITIC,
                                      HIDDEN_2_CRITIC,
                                      HIDDEN_3_CRITIC,
                                      trainable=True)

        target_q_values_temp = target_critic.call(next_state_placeholder,
                                                  target_actions)
        target_q_values = tf.stop_gradient(target_q_values_temp)
    ''' Collect
    - the trainable variables of the actor network (its weights),
    - the weights of the target actor network,
    - the trainable variables of the critic network (its weights),
    - the weights of the target critic network.
    '''
    actor_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                   scope='actor')

    target_actor_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                          scope='target_actor')

    critic_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                    scope='critic')

    target_critic_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                           scope='target_critic')
    ''' Get the operators for updating the target networks. The
    update_target_networks function defined in utils returns a list of
    operators to be run from the tf session in order to update the target
    networks using a soft update (a sketch of this helper is given after the
    main() function below).
    '''
    update_targets_op = update_target_networks(TAU, target_actor_vars,
                                               actor_vars, target_critic_vars,
                                               critic_vars)
    ''' Create the tf operation to train the critic network:
    - calculate the TD-target
    - calculate the TD-error = TD-target - q_values_of_given_actions
    - calculate the critic network's loss (mean squared error of the TD-errors)
    - add L2 regularization on the critic's weights (biases excluded)
    - create a tf operation to train the critic network
    '''
    targets = tf.expand_dims(reward_placeholder, 1) + \
        tf.expand_dims(is_not_terminal_placeholder, 1) * GAMMA * \
            target_q_values
    td_errors = targets - q_values_of_given_actions
    critic_loss = tf.reduce_mean(tf.square(td_errors))

    # Add L2 regularization on the critic's weights (biases excluded)
    for var in critic_vars:
        if 'bias' not in var.name:
            critic_loss += L2_REG_CRITIC * 0.5 * tf.nn.l2_loss(var)

    # Optimize the critic (the learning rate decays as episodes progress)
    critic_train_op = tf.train.AdamOptimizer(
        LEARNING_RATE_CRITIC * LR_DECAY**episodes).minimize(
            critic_loss, var_list=critic_vars)
    ''' Create a tf operation to train the actor networks
    - Calculate the Actor network's loss
    - Create the tf operation to train the actor network
    '''
    # Actor loss: maximize the critic's Q-value of the suggested actions,
    # with L2 regularization on the actor's weights (biases excluded)
    actor_loss = -tf.reduce_mean(q_values_of_suggested_actions)
    for var in actor_vars:
        if 'bias' not in var.name:
            actor_loss += L2_REG_ACTOR * 0.5 * tf.nn.l2_loss(var)

    # Optimize actor
    actor_train_op = tf.train.AdamOptimizer(
        LEARNING_RATE_ACTOR * LR_DECAY**episodes).minimize(actor_loss,
                                                           var_list=actor_vars)

    # Init session
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    writer.add_graph(sess.graph)

    # Training
    num_steps = 0
    for episode in range(NUM_EPISODES):
        total_reward = 0
        num_steps_in_episode = 0

        # Ornstein-Uhlenbeck exploration noise (scale decays per episode)
        noise = np.zeros(ACTION_DIM)
        noise_scale = (INITIAL_NOISE_SCALE * NOISE_DECAY ** episode) * \
            (env.action_space.high - env.action_space.low)

        # Initial state
        state = env.reset()

        for _ in range(MAX_STEPS_PER_EPISODE):

            action = sess.run(actions, feed_dict={ \
                state_placeholder: state[None],
                is_training_placeholder: False})

            # Add Ornstein-Uhlenbeck noise to the actions
            noise = EXPLORATION_THETA * (EXPLORATION_MU - noise) + \
                EXPLORATION_SIGMA * np.random.randn(ACTION_DIM)

            action += noise_scale * noise

            # Take action on env
            next_state, reward, done, _info = env.step(action)
            next_state = np.squeeze(next_state)
            reward = np.squeeze(reward)
            action = action[0]

            total_reward += reward

            replay_memory.add_to_memory(
                (state, action, reward, next_state, 0.0 if done else 1.0))

            if num_steps % TRAIN_EVERY == 0 and \
                    replay_memory.size() >= MINI_BATCH_SIZE:
                batch = replay_memory.sample_from_memory(MINI_BATCH_SIZE)
                _, _ = sess.run([critic_train_op, actor_train_op],
                    feed_dict={
                        state_placeholder: np.asarray( \
                            [elem[0] for elem in batch]),
                        action_placeholder: np.asarray( \
                            [elem[1] for elem in batch]),
                        reward_placeholder: np.asarray( \
                            [elem[2] for elem in batch]),
                        next_state_placeholder: np.asarray( \
                            [elem[3] for elem in batch]),
                        is_not_terminal_placeholder: np.asarray( \
                            [elem[4] for elem in batch]),
                        is_training_placeholder: True
                })

                _ = sess.run(update_targets_op)

            state = next_state
            num_steps += 1
            num_steps_in_episode += 1

            if done:
                _ = sess.run(episode_incr_op)
                break

        print((episode, total_reward, num_steps_in_episode, noise_scale))

    env.close()
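
The example above calls a scale_actions helper (presumably from the same utils module) that is not shown. A minimal sketch of such a helper, assuming the actor's unscaled output lies in [-1, 1] (e.g. a tanh output layer); the actual utils implementation may differ:

def scale_actions(unscaled_actions, low, high):
    # Linearly map actions from [-1, 1] to the environment bounds [low, high]
    return low + (unscaled_actions + 1.0) * 0.5 * (high - low)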
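
The update_target_networks helper is described in the comment above only as returning a list of soft-update operators. A minimal sketch consistent with that description, implementing theta_target <- tau * theta + (1 - tau) * theta_target with tf.assign (an assumption about the utils code, using the same TF 1.x API):

def update_target_networks(tau, target_actor_vars, actor_vars,
                           target_critic_vars, critic_vars):
    # One assign op per target variable: nudge it towards the online value
    update_ops = []
    for target_var, var in zip(target_actor_vars + target_critic_vars,
                               actor_vars + critic_vars):
        update_ops.append(
            tf.assign(target_var, tau * var + (1.0 - tau) * target_var))
    return update_ops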
Example #2
class DDPGAgent:
    def __init__(self, state_dim, action_dim, action_max):
        # set to True to restore a previously saved model
        self.load_model = False

        tf.reset_default_graph()
        self.sess = tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions(
            allow_growth=True)))

        # information of state and action
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.action_max = float(action_max)
        self.action_min = -float(action_max)

        # hyperparameters
        self.h_critic = 16
        self.h_actor = 16
        self.lr_critic = 1e-3
        self.lr_actor = 1e-4
        self.discount_factor = 0.99
        self.tau = 0.01  # soft target update rate

        self.state_ph = tf.placeholder(dtype=tf.float32,
                                       shape=[None, self.state_dim])
        self.reward_ph = tf.placeholder(dtype=tf.float32, shape=[None])
        self.next_state_ph = tf.placeholder(dtype=tf.float32,
                                            shape=[None, self.state_dim])
        self.done_ph = tf.placeholder(dtype=tf.float32, shape=[None])

        with tf.variable_scope('actor'):
            self.action = self.generate_actor_network(self.state_ph, True)
        with tf.variable_scope('target_actor'):
            self.target_action = self.generate_actor_network(
                self.next_state_ph, False)
        with tf.variable_scope('critic'):
            self.qvalue = self.generate_critic_network(self.state_ph,
                                                       self.action, True)
        with tf.variable_scope('target_critic'):
            self.target_qvalue = self.generate_critic_network(
                self.next_state_ph, self.target_action, False)

        self.a_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                          scope='actor')
        self.ta_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                           scope='target_actor')
        self.c_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                          scope='critic')
        self.tc_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                           scope='target_critic')

        # Bellman target: y = r + gamma * (1 - done) * Q'(s', mu'(s'))
        q_target = tf.expand_dims(self.reward_ph, 1) + \
            self.discount_factor * (1 - tf.expand_dims(self.done_ph, 1)) * \
            self.target_qvalue
        td_errors = q_target - self.qvalue
        critic_loss = tf.reduce_mean(tf.square(td_errors))
        self.train_critic = tf.train.AdamOptimizer(self.lr_critic).minimize(
            critic_loss, var_list=self.c_params)

        actor_loss = -tf.reduce_mean(self.qvalue)
        self.train_actor = tf.train.AdamOptimizer(self.lr_actor).minimize(
            actor_loss, var_list=self.a_params)

        self.soft_target_update = [[
            tf.assign(ta, (1 - self.tau) * ta + self.tau * a),
            tf.assign(tc, (1 - self.tau) * tc + self.tau * c)
        ] for a, ta, c, tc in zip(self.a_params, self.ta_params, self.c_params,
                                  self.tc_params)]
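        # The lists above implement the DDPG soft target update
        # theta_target <- (1 - tau) * theta_target + tau * theta. Zipping the
        # actor and critic parameter lists together assumes both networks have
        # the same number of variables (true here: four dense layers each).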

        # exploration
        self.epsilon = 1.
        self.epsilon_start, self.epsilon_end = 1.0, 0
        self.exploration_steps = 100000.
        self.epsilon_decay_step = (self.epsilon_start -
                                   self.epsilon_end) / self.exploration_steps
        self.noise = np.zeros(action_dim)

        self.minibatch_size = 32
        self.pre_train_step = 3
        self.replay_buffer = ReplayBuffer(buffer_size=1000000,
                                          minibatch_size=self.minibatch_size)

        self.mu = 0
        self.theta = 0.15
        self.sigma = 0.2

        # tensorboard setting
        self.avg_q_max, self.loss_sum = 0, 0
        self.summary_placeholders, self.update_ops, self.summary_op = \
            self.setup_summary()
        self.summary_writer = tf.summary.FileWriter('summary/simple_ddpg',
                                                    self.sess.graph)

        self.sess.run(tf.global_variables_initializer())

        self.save_file = "./save_model/tensorflow_ddpg-1"
        self.load_file = "./save_model/tensorflow_ddpg-1"
        self.saver = tf.train.Saver()
        if self.load_model:
            self.saver.restore(self.sess, self.load_file)

    def choose_action(self, state):
        return self.sess.run(self.action,
                             feed_dict={self.state_ph: state[None]})[0]

    def train_network(self, state, action, reward, next_state, done, step):
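        # Note: the sampled actions are fed directly into self.action (the
        # actor's output tensor); TF 1.x allows feeding any tensor, so no
        # separate action placeholder is needed for the critic update.
        # The `step` argument is currently unused.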
        self.sess.run(self.train_critic,
                      feed_dict={
                          self.state_ph: state,
                          self.action: action,
                          self.reward_ph: reward,
                          self.next_state_ph: next_state,
                          self.done_ph: done
                      })
        self.sess.run(self.train_actor, feed_dict={self.state_ph: state})
        self.sess.run(self.soft_target_update)

    def generate_critic_network(self, state, action, trainable):

        hidden1 = tf.layers.dense(tf.concat([state, action], axis=1),
                                  self.h_critic,
                                  activation=tf.nn.relu,
                                  trainable=trainable)
        hidden2 = tf.layers.dense(hidden1,
                                  self.h_critic,
                                  activation=tf.nn.relu,
                                  trainable=trainable)
        hidden3 = tf.layers.dense(hidden2,
                                  self.h_critic,
                                  activation=tf.nn.relu,
                                  trainable=trainable)

        qvalue = tf.layers.dense(hidden3, 1, trainable=trainable)

        return qvalue

    def generate_actor_network(self, state, trainable):
        hidden1 = tf.layers.dense(state,
                                  self.h_actor,
                                  activation=tf.nn.relu,
                                  trainable=trainable)
        hidden2 = tf.layers.dense(hidden1,
                                  self.h_actor,
                                  activation=tf.nn.relu,
                                  trainable=trainable)
        hidden3 = tf.layers.dense(hidden2,
                                  self.h_actor,
                                  activation=tf.nn.relu,
                                  trainable=trainable)

        non_scaled_action = tf.layers.dense(hidden3,
                                            self.action_dim,
                                            activation=tf.nn.sigmoid,
                                            trainable=trainable)
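        # the sigmoid output in [0, 1] is rescaled linearly to
        # [action_min, action_max]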
        action = non_scaled_action * (self.action_max -
                                      self.action_min) + self.action_min

        return action

    def get_action(self, obs):
        # Select the best action, then add exploration noise scaled by a decaying epsilon

        action = self.choose_action(obs)
        self.printConsole("origianl action: " + str(action))

        if self.epsilon > self.epsilon_end:
            self.epsilon -= self.epsilon_decay_step

        self.printConsole("noise scale: " + str(self.epsilon))
        self.noise = self.ou_noise(self.noise)
        self.printConsole("             noise: " +
                          str(self.noise *
                              (self.action_max - self.action_min) / 2 *
                              max(self.epsilon, 0)))
        action = action + self.noise * (
            self.action_max - self.action_min) / 2 * max(self.epsilon, 0)
        action = np.maximum(action, self.action_min)
        action = np.minimum(action, self.action_max)

        return action

    def train_agent(self, obs, action, reward, obs_next, done, step):

        self.replay_buffer.add_to_memory((obs, action, reward, obs_next, done))

        if len(self.replay_buffer.replay_memory
               ) < self.minibatch_size * self.pre_train_step:
            return None

        minibatch = self.replay_buffer.sample_from_memory()
        s, a, r, ns, d = map(np.array, zip(*minibatch))

        self.train_network(s, a, r, ns, d, step)
        return None

    # make summary operators for tensorboard
    def setup_summary(self):
        episode_total_reward = tf.Variable(0.)
        episode_avg_max_q = tf.Variable(0.)
        episode_avg_loss = tf.Variable(0.)
        episode_total_score = tf.Variable(0.)

        tf.summary.scalar('Total Reward/Episode', episode_total_reward)
        tf.summary.scalar('Average Max Q/Episode', episode_avg_max_q)
        tf.summary.scalar('Average Loss/Episode', episode_avg_loss)
        tf.summary.scalar('Total Score/Episode', episode_total_score)

        summary_vars = [
            episode_total_reward, episode_avg_max_q, episode_avg_loss,
            episode_total_score
        ]
        summary_placeholders = [
            tf.placeholder(tf.float32) for _ in range(len(summary_vars))
        ]
        update_ops = [
            summary_vars[i].assign(summary_placeholders[i])
            for i in range(len(summary_vars))
        ]
        summary_op = tf.summary.merge_all()
        return summary_placeholders, update_ops, summary_op

    def ou_noise(self, x):
        return x + self.theta * (self.mu - x) + self.sigma * np.random.randn(
            self.action_dim)

    def printConsole(self, message):
        print(message)
        sys.__stdout__.flush()
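
A minimal usage sketch for this agent, assuming a Gym-style continuous-control environment. The ReplayBuffer implementation, the environment name, and the episode count below are illustrative assumptions consistent with the interface the class uses, not part of the original example:

import random
from collections import deque

import gym


class ReplayBuffer:
    # Minimal sketch of the replay-buffer interface used by DDPGAgent
    def __init__(self, buffer_size, minibatch_size):
        self.replay_memory = deque(maxlen=buffer_size)
        self.minibatch_size = minibatch_size

    def add_to_memory(self, experience):
        self.replay_memory.append(experience)

    def sample_from_memory(self):
        return random.sample(self.replay_memory, self.minibatch_size)


if __name__ == '__main__':
    env = gym.make('Pendulum-v0')  # any continuous-action environment
    agent = DDPGAgent(state_dim=env.observation_space.shape[0],
                      action_dim=env.action_space.shape[0],
                      action_max=env.action_space.high[0])

    for episode in range(100):
        state = env.reset()
        total_reward, done, step = 0.0, False, 0
        while not done:
            action = agent.get_action(state)
            next_state, reward, done, _ = env.step(action)
            agent.train_agent(state, action, reward, next_state,
                              float(done), step)
            state = next_state
            total_reward += reward
            step += 1
        print(episode, total_reward)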