Example #1
class NAF(object):
    def __init__(self,
                 sess,
                 env,
                 state_dim,
                 action_dim,
                 max_buffer_size=100000,
                 update_per_iteration=5,
                 mini_batch_size=64,
                 discount=0.99,
                 batch_norm=True,
                 learning_rate=1e-3,
                 tau=0.001,
                 hidden_layers=[200, 200]):

        self.session = sess
        self.env = env
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.action_lb = self.env.action_space.low
        self.action_ub = self.env.action_space.high
        self.discount = discount
        self.batch_norm = batch_norm
        self.mini_batch_size = mini_batch_size
        self.update_per_iteration = update_per_iteration
        self.hidden_layers = hidden_layers

        self.replay_buffer = ReplayBuffer(max_buffer_size, state_dim,
                                          action_dim)
        self.exploration = OUProcess(self.action_dim)

        self.network = {}
        self.network['x'], self.network['u'], self.network['is_train'], self.network['V'], self.network['P'], \
            self.network['M'], self.network['Q'], self.network['variables'] = self.create_networks(is_target=False)

        self.target = {}
        self.target['x'], self.target['u'], _, self.target['V'], self.target['P'], \
            self.target['M'], self.target['Q'], self.target['variables'] = self.create_networks(is_target=True)

        #define optimization operations
        self.network_optimization = {}
        with tf.name_scope('optimization'):
            self.network_optimization['y'] = tf.placeholder(tf.float32,
                                                            shape=(None, 1),
                                                            name='y')
            self.network_optimization['loss'] = tf.reduce_mean(
                tf.squared_difference(self.network['Q'],
                                      self.network_optimization['y']),
                name='loss')
            self.network_optimization['optimize'] = tf.train.AdamOptimizer(
                learning_rate).minimize(self.network_optimization['loss'])

        # define the operations to compute the target value y
        self.y_compute = {}
        with tf.name_scope('y'):
            # y = reward +  (1-terminal) * gamma * V
            self.y_compute['r'] = tf.placeholder(tf.float32, shape=(None, 1))
            self.y_compute['t'] = tf.placeholder(tf.int8, shape=(None, 1))
            self.y_compute['v'] = tf.placeholder(tf.float32, shape=(None, 1))
            self.y_compute['y'] = tf.to_float(self.y_compute['t'])
            self.y_compute['y'] = tf.mul(self.y_compute['y'], -1.0)
            self.y_compute['y'] = tf.add(self.y_compute['y'], 1.0)
            self.y_compute['y'] = tf.add(
                self.y_compute['r'],
                tf.mul(tf.mul(self.y_compute['v'], self.discount),
                       self.y_compute['y']))
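            # for example, with r = 1.0, terminal t = 1 and V' = 5.0 the
            # (1 - t) mask removes the bootstrap term and y = 1.0, whereas
            # with t = 0 we get y = 1.0 + self.discount * 5.0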

        # define the soft update operation between the normal networks and target networks
        self.soft_update_list = []
        with tf.name_scope('soft_update'):
            for source, dest in zip(self.network['variables'],
                                    self.target['variables']):
                self.soft_update_list.append(
                    dest.assign(tf.mul(source, tau) + tf.mul(dest, 1.0 - tau)))

        # after defining the computation graph, initialize all the variables
        self.session.run(tf.initialize_all_variables())

        summary_writer = tf.train.SummaryWriter('naf.graph',
                                                graph_def=self.session.graph)

    def create_networks(self, is_target):

        scope = 'tar_naf' if is_target else 'naf'

        with tf.variable_scope(scope):
            x = tf.placeholder(tf.float32,
                               shape=(None, self.state_dim),
                               name='observation')
            u = tf.placeholder(tf.float32,
                               shape=(None, self.action_dim),
                               name='actions')

            # determine the mode (training or evaluation) for batch normalization
            if self.batch_norm:
                # the target network is always in evaluation mode
                is_train = False if is_target else tf.placeholder(
                    tf.bool, name='is_train')
            else:
                is_train = None

            # define operations for the value function
            with tf.variable_scope('V'):
                V = x
                # add in the hidden layers
                for hidden_unit_num in self.hidden_layers:
                    if self.batch_norm:
                        V = fully_connected(inputs=V,
                                            activation_fn=None,
                                            num_outputs=hidden_unit_num)
                        # NOTE : we set the updates_collections to None to force the updates of mean and variance in place
                        V = batch_norm(inputs=V,
                                       center=True,
                                       scale=True,
                                       activation_fn=tf.nn.relu,
                                       is_training=is_train,
                                       updates_collections=None)
                    else:
                        V = fully_connected(inputs=V,
                                            activation_fn=tf.nn.relu,
                                            num_outputs=hidden_unit_num)

                # add in the last layer
                V = fully_connected(inputs=V,
                                    activation_fn=None,
                                    num_outputs=1)

            # define operations to compute the covariance matrix P
            with tf.variable_scope('L'):
                L = x
                # add in the hidden layers
                for hidden_unit_num in self.hidden_layers:
                    if self.batch_norm:
                        L = fully_connected(inputs=L,
                                            activation_fn=None,
                                            num_outputs=hidden_unit_num)
                        # NOTE : we set the updates_collections to None to force the updates of mean and variance in place
                        L = batch_norm(inputs=L,
                                       center=True,
                                       scale=True,
                                       activation_fn=tf.nn.relu,
                                       is_training=is_train,
                                       updates_collections=None)
                    else:
                        L = fully_connected(inputs=L,
                                            activation_fn=tf.nn.relu,
                                            num_outputs=hidden_unit_num)

                L = fully_connected(inputs=L,
                                    activation_fn=None,
                                    num_outputs=(self.action_dim *
                                                 (self.action_dim + 1) / 2))

                # construct the upper triangular matrix U
                pivot = 0
                rows = []
                for index in xrange(self.action_dim):
                    count = self.action_dim - index

                    # slice one element at position `pivot` from the second dimension and apply exp to it
                    # NOTE: the first dimension is the batch; -1 means all elements in that dimension are kept
                    diag_elem = tf.exp(tf.slice(L, (0, pivot), (-1, 1)))

                    # slice the next count - 1 elements from the second dimension
                    # count is the number of non-zero elements in this row
                    # NOTE: as index grows, count shrinks
                    non_diag_elems = tf.slice(L, (0, pivot + 1),
                                              (-1, count - 1))

                    # concatenate the tensors to form one row of the matrix
                    non_zero_elements = tf.concat(1,
                                                  (diag_elem, non_diag_elems))

                    # ((0, 0), (index, 0)) are the paddings, one pair per dimension
                    # (0, 0) leaves the first (batch) dimension untouched
                    # (index, 0) pads `index` zeros before the elements of the second
                    # dimension, shifting this row's non-zero entries into place
                    row = tf.pad(non_zero_elements, ((0, 0), (index, 0)))
                    rows.append(row)

                    # move past the elements we have already used
                    pivot += count

                # tf.pack packs a list of rank-R tensors into one rank-(R+1) tensor
                # axis=1 means the rows are stacked along the second dimension
                # NOTE: this yields the upper triangular matrix U, not L
                L = tf.pack(rows, axis=1)

                # covariance matrix P = L*L^{T} = U^{T}*U
                P = tf.batch_matmul(tf.transpose(L, perm=[0, 2, 1]), L)
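                # worked illustration: for action_dim = 2 the fully connected
                # layer above emits 3 numbers [l0, l1, l2] per sample, and the
                # loop builds
                #     U = [[exp(l0), l1     ],
                #          [0,       exp(l2)]]
                # so P = U^T * U is symmetric positive definite by construction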

            # define operations to compute the mean action Mu
            with tf.variable_scope('M'):
                M = x
                # add in the hidden layers
                for hidden_unit_num in self.hidden_layers:
                    if self.batch_norm:
                        M = fully_connected(inputs=M,
                                            activation_fn=None,
                                            num_outputs=hidden_unit_num)
                        # NOTE : we set the updates_collections to None to force the updates of mean and variance in place
                        # see https://github.com/tensorflow/tensorflow/issues/1122
                        M = batch_norm(inputs=M,
                                       center=True,
                                       scale=True,
                                       activation_fn=tf.nn.relu,
                                       is_training=is_train,
                                       updates_collections=None)
                    else:
                        M = fully_connected(inputs=M,
                                            activation_fn=tf.nn.relu,
                                            num_outputs=hidden_unit_num)

                # add in the last layer
                M = fully_connected(inputs=M,
                                    activation_fn=tf.tanh,
                                    num_outputs=self.action_dim)

            # define operations to compute the advantage function A
            with tf.name_scope('A'):
                # expand u - M to shape (batch, action_dim, 1), i.e. one column
                # vector per sample, so we can use batch matrix products
                u_minus_m = tf.expand_dims(u - M, -1)
                # transpose each instance (keeping the batch dimension first)
                # to get the matching row vector of shape (batch, 1, action_dim)
                u_minus_m_T = tf.transpose(u_minus_m, perm=[0, 2, 1])
                # A = -1/2 * (u-M)^{T} * P * (u-M)
                # shapes: (batch, 1, d) x (batch, d, d) x (batch, d, 1) -> (batch, 1, 1)
                A = -tf.batch_matmul(tf.batch_matmul(u_minus_m_T, P),
                                     u_minus_m) / 2
                # each A is now a 1x1 matrix; reshape it to (batch_size, 1)
                # (-1 means that dimension is computed automatically)
                A = tf.reshape(A, [-1, 1])

            with tf.name_scope('Q'):
                Q = A + V

        # get all the trainable variables from this scope
        variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                      scope=scope)

        return x, u, is_train, V, P, M, Q, variables

    def predict_target_v(self, x):
        return self.session.run(self.target['V'],
                                feed_dict={self.target['x']: x})

    def get_y(self, v, r, t):
        return self.session.run(self.y_compute['y'],
                                feed_dict={
                                    self.y_compute['r']: r,
                                    self.y_compute['v']: v,
                                    self.y_compute['t']: t
                                })

    def optimize_network(self, x, u, is_train, y):
        if self.batch_norm:
            feed_dict = {
                self.network['x']: x,
                self.network['u']: u,
                self.network['is_train']: is_train,
                self.network_optimization['y']: y
            }
        else:
            feed_dict = {
                self.network['x']: x,
                self.network['u']: u,
                self.network_optimization['y']: y
            }

        return self.session.run(self.network_optimization['optimize'],
                                feed_dict=feed_dict)

    def predict_action(self, x, is_train):
        if self.batch_norm:
            feed_dict = {
                self.network['x']: x,
                self.network['is_train']: is_train
            }
        else:
            feed_dict = {self.network['x']: x}
        return self.session.run([self.network['M'], self.network['P']],
                                feed_dict=feed_dict)

    def get_action(self, s):

        s = np.reshape(s, (1, self.state_dim))

        a, covariance = self.predict_action(s, False)

        return self.exploration.add_noise(a[0], self.action_lb, self.action_ub)

    def soft_update(self):
        self.session.run(self.soft_update_list)

    def learn(self, s, a, sprime, r, terminal):
        # first add the sample to the replay buffer
        self.replay_buffer.add(s, a, sprime, r, terminal)

        # we start learning once we have enough samples for one minibatch
        if self.replay_buffer.get_size() > self.mini_batch_size:

            # we perform several minibatch updates per call
            for i in xrange(self.update_per_iteration):
                state_set, action_set, sprime_set, reward_set, terminal_set = self.replay_buffer.sample_batch(
                    self.mini_batch_size)

                # compute V'
                v = self.predict_target_v(sprime_set)

                # compute y = r + (1 - terminal) * gamma * V'
                y = self.get_y(v, reward_set, terminal_set)

                # optimize the Q network using y (batch norm in training mode)
                self.optimize_network(state_set, action_set, True, y)

                # soft-update the target networks
                self.soft_update()

    def reset_exploration(self):
        self.exploration.reset()
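
# Usage sketch for the NAF agent above: a minimal training loop, assuming a
# Gym-style continuous-control environment (Pendulum-v0 is used here purely as
# an example) and the ReplayBuffer/OUProcess helpers the class relies on;
# MAX_EPISODES and MAX_STEPS are hypothetical constants.
import gym
import numpy as np
import tensorflow as tf

MAX_EPISODES = 1000
MAX_STEPS = 200

env = gym.make('Pendulum-v0')
with tf.Session() as sess:
    agent = NAF(sess, env,
                state_dim=env.observation_space.shape[0],
                action_dim=env.action_space.shape[0])
    for episode in xrange(MAX_EPISODES):
        s = env.reset()
        agent.reset_exploration()
        for _ in xrange(MAX_STEPS):
            a = agent.get_action(s)
            sprime, r, terminal, _ = env.step(a)
            agent.learn(s, a, sprime, r, terminal)
            s = sprime
            if terminal:
                break
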
Example #2
class DDPG(object):
    def __init__(self,
                 sess,
                 env,
                 state_dim,
                 action_dim,
                 max_buffer_size=100000,
                 update_per_iteration=5,
                 mini_batch_size=64,
                 discount=0.99,
                 batch_norm=True,
                 actor_learning_rate=0.0001,
                 critic_learning_rate=0.001,
                 tau=0.001,
                 hidden_layers=[400, 300]):

        self.session = sess
        self.env = env
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.action_lb = self.env.action_space.low
        self.action_ub = self.env.action_space.high
        self.discount = discount
        self.batch_norm = batch_norm
        self.mini_batch_size = mini_batch_size
        self.update_per_iteration = update_per_iteration
        self.hidden_layers = hidden_layers

        self.replay_buffer = ReplayBuffer(max_buffer_size, state_dim,
                                          action_dim)
        self.exploration = OUProcess(self.action_dim)

        # define the operations used in this algorithm
        self.critic = {}
        self.critic['x'], self.critic['u'], self.critic[
            'is_train'], self.critic['q'], self.critic[
                'variables'] = self.create_critic_network(is_target=False)

        self.target_critic = {}
        self.target_critic['x'], self.target_critic[
            'u'], _, self.target_critic['q'], self.target_critic[
                'variables'] = self.create_critic_network(is_target=True)

        self.actor = {}
        self.actor['x'], self.actor['is_train'], self.actor['a'], self.actor[
            'variables'] = self.create_actor_network(is_target=False)

        self.target_actor = {}
        self.target_actor['x'], _, self.target_actor['a'], self.target_actor[
            'variables'] = self.create_actor_network(is_target=True)

        self.critic_optimization = {}
        with tf.name_scope('critic_optimization'):
            self.critic_optimization['y'] = tf.placeholder(tf.float32,
                                                           shape=(None, 1),
                                                           name='y')
            self.critic_optimization['loss'] = tf.reduce_mean(
                tf.squared_difference(self.critic['q'],
                                      self.critic_optimization['y']),
                name='loss')
            self.critic_optimization['optimize'] = tf.train.AdamOptimizer(
                critic_learning_rate).minimize(
                    self.critic_optimization['loss'])

        # define operation to get y
        self.y_compute = {}
        with tf.name_scope('y'):
            # y = reward + (1-terminal) * gamma * target_q
            self.y_compute['r'] = tf.placeholder(tf.float32, shape=(None, 1))
            self.y_compute['t'] = tf.placeholder(tf.int8, shape=(None, 1))
            self.y_compute['q'] = tf.placeholder(tf.float32, shape=(None, 1))
            temp = tf.to_float(self.y_compute['t'])
            temp = tf.mul(temp, -1.0)
            temp = tf.add(temp, 1.0)
            self.y_compute['y'] = tf.add(
                self.y_compute['r'],
                tf.mul(tf.mul(self.y_compute['q'], self.discount), temp))

        # define the operation to get the gradient of Q with respect to action
        self.action_gradients = {}
        with tf.name_scope('action_grads'):
            self.action_gradients["action_grads"] = tf.gradients(
                self.critic['q'], self.critic['u'])

        self.actor_optimization = {}
        with tf.name_scope('actor_optimization'):
            # first define the placeholder for the gradient of Q with respect to action
            self.actor_optimization['action_grads'] = tf.placeholder(
                tf.float32, shape=(None, self.action_dim))
            # the actor performs gradient ascent on Q, so we pass the negative
            # action gradient as grad_ys; tf.gradients then yields gradients of
            # -Q with respect to the actor variables (chain rule), and
            # minimizing them with Adam ascends Q
            self.actor_optimization['actor_variable_grads'] = tf.gradients(
                self.actor['a'], self.actor['variables'],
                -self.actor_optimization['action_grads'])
            self.actor_optimization['optimize'] = tf.train.AdamOptimizer(
                actor_learning_rate).apply_gradients(
                    zip(self.actor_optimization['actor_variable_grads'],
                        self.actor['variables']))

        self.soft_update_list = []
        with tf.name_scope("soft_update"):
            for source, dest in zip(self.critic['variables'],
                                    self.target_critic['variables']):
                if 'BatchNorm' not in source.name:
                    self.soft_update_list.append(
                        dest.assign(
                            tf.mul(source, tau) + tf.mul(dest, 1.0 - tau)))
            for source, dest in zip(self.actor['variables'],
                                    self.target_actor['variables']):
                if 'BatchNorm' not in source.name:
                    self.soft_update_list.append(
                        dest.assign(
                            tf.mul(source, tau) + tf.mul(dest, 1.0 - tau)))

        # after defining the computation graph, initialize all the variables
        self.session.run(tf.initialize_all_variables())

        summary_writer = tf.train.SummaryWriter('critic.graph',
                                                graph_def=self.session.graph)

    def create_actor_network(self, is_target):

        scope = 'tar_actor' if is_target else 'actor'

        with tf.variable_scope(scope):
            x = tf.placeholder(tf.float32,
                               shape=(None, self.state_dim),
                               name='observation')

            # determine the mode (training or evaluation) for batch normalization
            if self.batch_norm:
                # the target network is always in evaluation mode
                is_train = False if is_target else tf.placeholder(
                    tf.bool, name='is_train')
            else:
                is_train = None

            net = x

            for hidden_unit_num in self.hidden_layers:
                if self.batch_norm:
                    net = fully_connected(inputs=net,
                                          activation_fn=None,
                                          num_outputs=hidden_unit_num)
                    # NOTE : we set the updates_collections to None to force the updates of mean and variance in place
                    net = batch_norm(inputs=net,
                                     center=True,
                                     scale=True,
                                     activation_fn=tf.nn.relu,
                                     is_training=is_train,
                                     updates_collections=None)
                else:
                    net = fully_connected(inputs=net,
                                          activation_fn=tf.nn.relu,
                                          num_outputs=hidden_unit_num)

            net = fully_connected(
                inputs=net,
                activation_fn=tf.tanh,
                num_outputs=self.action_dim,
                weights_initializer=tf.random_uniform_initializer(-3e-3, 3e-3),
                biases_initializer=tf.random_uniform_initializer(-3e-3, 3e-3))

        # get all the trainable variables from this scope
        variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                      scope=scope)

        return x, is_train, net, variables

    def create_critic_network(self, is_target):
        scope = 'tar_critic' if is_target else 'critic'

        with tf.variable_scope(scope):
            x = tf.placeholder(tf.float32,
                               shape=(None, self.state_dim),
                               name='observation')
            u = tf.placeholder(tf.float32,
                               shape=(None, self.action_dim),
                               name='actions')

            # determine the mode (training or evaluation) for batch normalization
            if self.batch_norm:
                # the target network is always in evaluation mode
                is_train = False if is_target else tf.placeholder(
                    tf.bool, name='is_train')
            else:
                is_train = None

            # first concatenate the inputs
            # NOTE: this differs from the architecture in the original paper; here the action is fed in from the first layer
            with tf.name_scope('merge'):
                net = tf.concat(1, [x, u])

            for hidden_unit_num in self.hidden_layers:
                if self.batch_norm:
                    net = fully_connected(inputs=net,
                                          activation_fn=None,
                                          num_outputs=hidden_unit_num)
                    # NOTE : we set the updates_collections to None to force the updates of mean and variance in place
                    net = batch_norm(inputs=net,
                                     center=True,
                                     scale=True,
                                     activation_fn=tf.nn.relu,
                                     is_training=is_train,
                                     updates_collections=None)
                else:
                    net = fully_connected(inputs=net,
                                          activation_fn=tf.nn.relu,
                                          num_outputs=hidden_unit_num)

            net = fully_connected(
                inputs=net,
                activation_fn=None,
                num_outputs=1,
                weights_initializer=tf.random_uniform_initializer(-3e-3, 3e-3),
                biases_initializer=tf.random_uniform_initializer(-3e-3, 3e-3))

        # get all the trainable variables from this scope
        variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                      scope=scope)

        return x, u, is_train, net, variables

    # define the functions for executing operations
    def predict_target_q(self, x, u):
        return self.session.run(self.target_critic['q'],
                                feed_dict={
                                    self.target_critic['x']: x,
                                    self.target_critic['u']: u
                                })

    def predict_target_action(self, x):
        return self.session.run(self.target_actor['a'],
                                feed_dict={self.target_actor['x']: x})

    def get_y(self, q, r, t):
        return self.session.run(self.y_compute['y'],
                                feed_dict={
                                    self.y_compute['r']: r,
                                    self.y_compute['q']: q,
                                    self.y_compute['t']: t
                                })

    def optimize_critic(self, x, u, is_train, y):
        if self.batch_norm:
            return self.session.run(self.critic_optimization['optimize'],
                                    feed_dict={
                                        self.critic['x']: x,
                                        self.critic['u']: u,
                                        self.critic['is_train']: is_train,
                                        self.critic_optimization['y']: y
                                    })
        else:
            return self.session.run(self.critic_optimization['optimize'],
                                    feed_dict={
                                        self.critic['x']: x,
                                        self.critic['u']: u,
                                        self.critic_optimization['y']: y
                                    })

    def predict_action(self, x, is_train):
        if self.batch_norm:
            return self.session.run(self.actor['a'],
                                    feed_dict={
                                        self.actor['x']: x,
                                        self.actor['is_train']: is_train
                                    })
        else:
            return self.session.run(self.actor['a'],
                                    feed_dict={self.actor['x']: x})

    def action_grads(self, x, u, is_train):
        if self.batch_norm:
            return self.session.run(self.action_gradients["action_grads"],
                                    feed_dict={
                                        self.critic['x']: x,
                                        self.critic['u']: u,
                                        self.critic['is_train']: is_train
                                    })
        else:
            return self.session.run(self.action_gradients["action_grads"],
                                    feed_dict={
                                        self.critic['x']: x,
                                        self.critic['u']: u
                                    })

    def optimize_actor(self, x, a_grads, is_train):
        if self.batch_norm:
            return self.session.run(
                self.actor_optimization['optimize'],
                feed_dict={
                    self.actor['x']: x,
                    self.actor['is_train']: is_train,
                    self.actor_optimization['action_grads']: a_grads
                })
        else:
            return self.session.run(
                self.actor_optimization['optimize'],
                feed_dict={
                    self.actor['x']: x,
                    self.actor_optimization['action_grads']: a_grads
                })

    def soft_update(self):
        self.session.run(self.soft_update_list)

    def get_action(self, s):

        # first make sure s has a valid shape
        s = np.reshape(s, (1, self.state_dim))

        a = self.predict_action(s, False)

        # a is a batch of size 1, so we take its first element
        return self.exploration.add_noise(a[0], self.action_lb, self.action_ub)

    def learn(self, s, a, sprime, r, t):

        # first add the sample to the replay buffer
        self.replay_buffer.add(s, a, sprime, r, t)

        # we start learning once we have enough samples for one minibatch
        if self.replay_buffer.get_size() > self.mini_batch_size:

            # we perform several minibatch updates per call
            for i in xrange(self.update_per_iteration):
                state_set, action_set, sprime_set, reward_set, terminal_set = self.replay_buffer.sample_batch(
                    self.mini_batch_size)

                # first optimize the critic
                # compute Q'
                q = self.predict_target_q(
                    sprime_set, self.predict_target_action(sprime_set))

                # compute y = r + (1 - terminal) * gamma * Q'
                y = self.get_y(q, reward_set, terminal_set)

                # optimize the critic using y (batch norm in training mode)
                self.optimize_critic(state_set, action_set, True, y)

                # then optimize the actor
                actions = self.predict_action(state_set, True)
                a_grads = self.action_grads(state_set, actions, False)
                # NOTE: tf.gradients returns a list (one tensor per input), so we take its first element
                self.optimize_actor(state_set, a_grads[0], True)

                # soft-update the target networks
                self.soft_update()

    def reset_exploration(self):
        self.exploration.reset()
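
# Neither listing includes the OUProcess helper used by both agents. The sketch
# below is one plausible implementation of an Ornstein-Uhlenbeck exploration
# noise process with the interface used above (reset, add_noise); the
# theta/sigma defaults and the clipping to the action bounds are assumptions.
import numpy as np

class OUProcessSketch(object):
    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dim = action_dim
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # restart the noise process at its mean
        self.state = np.ones(self.action_dim) * self.mu

    def add_noise(self, action, action_lb, action_ub):
        # one Euler-Maruyama step of dx = theta * (mu - x) + sigma * dW
        dx = self.theta * (self.mu - self.state) \
            + self.sigma * np.random.randn(self.action_dim)
        self.state = self.state + dx
        # keep the perturbed action inside the valid action range
        return np.clip(action + self.state, action_lb, action_ub)
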
Example #3
def train(file_name):
    # Create folders.
    if not os.path.isdir(SAVE_DIR):
        os.makedirs(SAVE_DIR)
    if not os.path.isdir(FIGURE_TRAINING_DIR):
        os.makedirs(FIGURE_TRAINING_DIR)

    # Obtain environment parameters.
    env = make_atari(ENV_NAME)
    obs_space = env.observation_space
    action_space = env.action_space
    env.close()

    # Build networks.
    main_network = QValueNetwork(obs_space, action_space, name="main_network")
    target_network = QValueNetwork(obs_space,
                                   action_space,
                                   name="target_network",
                                   auxiliary_network=main_network)
    variables_initializer = tf.global_variables_initializer()

    # Create parallel environments.
    par_env = ParallelEnvironment(
        [make_atari(ENV_NAME) for _ in range(NUM_ENV)])

    replay_buffer = ReplayBuffer(buffer_size=BUFFER_SIZE)
    start_time = time.time()
    list_episodic_reward = []
    episodic_reward = np.zeros(NUM_ENV)

    obs = par_env.reset()

    with tf.Session() as sess:
        # Initialize all variables.
        sess.run(variables_initializer)
        # Only save the main network.
        saver = tf.train.Saver(var_list=main_network.variables)

        # Initialize buffers.
        while replay_buffer.get_size() < INITIAL_BUFFER_SIZE:
            # Sample random action.
            action = np.random.randint(action_space.n, size=NUM_ENV)
            # Interact with the environment.
            obs_next, reward, done, _ = par_env.step(action)
            episodic_reward += reward
            for i in range(NUM_ENV):
                if done[i]:
                    episodic_reward[i] = 0
            # Store data.
            for i in range(NUM_ENV):
                data = [obs[i], action[i], reward[i], done[i], obs_next[i]]
                replay_buffer.append(data)
            # Update observation.
            obs = obs_next

        step = 0
        next_target_network_update_step = 0
        next_autosave_step = 0
        while step < TOTAL_STEP:
            # Synchronize the target network periodically (target network <- main network).
            if step >= next_target_network_update_step:
                sess.run(target_network.sync_op)
                next_target_network_update_step += TARGET_NETWORK_UPDATE_STEP

            # Sample action with epsilon-greedy policy.
            epsilon = EPSILON_MAX - (EPSILON_MAX - EPSILON_MIN) * np.minimum(
                step / EPSILON_DECAY_STEP, 1)
            random_uniform = np.random.uniform(size=NUM_ENV)
            action = np.zeros(NUM_ENV, dtype=np.int32)
            random_action_index = np.argwhere(random_uniform < epsilon)
            if np.shape(random_action_index)[0] > 0:
                action[tuple(
                    np.transpose(random_action_index))] = np.random.randint(
                        action_space.n, size=np.shape(random_action_index)[0])
            greedy_action_index = np.argwhere(random_uniform >= epsilon)
            if np.shape(greedy_action_index)[0] > 0:
                q = sess.run(target_network.q,
                             feed_dict={
                                 target_network.Obs:
                                 np.array(obs)[tuple(
                                     np.transpose(greedy_action_index))] /
                                 255.0
                             })
                action[tuple(np.transpose(greedy_action_index))] = np.argmax(
                    q, axis=1)
            # Interact with the environment.
            obs_next, reward, done, _ = par_env.step(action)
            episodic_reward += reward
            for i in range(NUM_ENV):
                if done[i]:
                    list_episodic_reward.append((step, episodic_reward[i]))
                    delta_time = int(time.time() - start_time)
                    print("Step ",
                          step,
                          "/",
                          TOTAL_STEP,
                          ": Time spent = ",
                          delta_time,
                          " s , Episodic reward = ",
                          episodic_reward[i],
                          sep="")
                    episodic_reward[i] = 0
            # Store data.
            for i in range(NUM_ENV):
                data = [obs[i], action[i], reward[i], done[i], obs_next[i]]
                replay_buffer.append(data)
            # Update observation.
            obs = obs_next

            # Learning rate.
            lr = LEARNING_RATE[-1]
            for i in range(len(LR_ANNEAL_STEP)):
                if step < LR_ANNEAL_STEP[i]:
                    lr = LEARNING_RATE[i]
                    break

            for _ in range(NUM_ENV):
                # Sample training data from the replay buffer.
                batch_data = replay_buffer.sample(BATCH_SIZE)
                batch_obs, batch_action, batch_reward, batch_done, batch_obs_next = \
                  [np.array([batch_data[j][i] for j in range(BATCH_SIZE)]) for i in range(len(batch_data[0]))]

                # Compute the target Q value:
                #   target_q = r + (1 - done) * REWARD_DISCOUNT * max[q(s', a)]
                q_next = sess.run(
                    target_network.q,
                    feed_dict={target_network.Obs: batch_obs_next / 255.0})
                max_qnext = np.amax(q_next, axis=1)
                target_q = batch_reward + (
                    1 - batch_done) * REWARD_DISCOUNT * max_qnext

                # Update the main network.
                sess.run(main_network.train_op,
                         feed_dict={
                             main_network.Obs: batch_obs / 255.0,
                             main_network.Action: batch_action,
                             main_network.TargetQ: target_q,
                             main_network.LR: lr
                         })

            # Save the main network periodically.
            if step >= next_autosave_step:
                saver.save(sess, SAVE_DIR + file_name)
                next_autosave_step += AUTOSAVE_STEP

            # Update step.
            step += NUM_ENV

        # Save the main network one final time (reusing the saver created above).
        saver.save(sess, SAVE_DIR + file_name)

    total_time = int(time.time() - start_time)
    print("Training finished in ", total_time, " s.", sep="")

    # Close the environment.
    par_env.close()

    # Plot the episodic reward against training step curve.
    plot_episodic_reward(list_episodic_reward, file_name)
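
# Standalone sketch of the vectorized epsilon-greedy selection used in train()
# above: q_values stands for the (NUM_ENV, num_actions) output of the Q-network,
# and the default schedule constants are placeholders for EPSILON_MAX /
# EPSILON_MIN / EPSILON_DECAY_STEP.
import numpy as np

def epsilon_greedy_actions(q_values, step, num_actions,
                           epsilon_max=1.0, epsilon_min=0.1,
                           decay_step=1000000):
    # linearly anneal epsilon from epsilon_max to epsilon_min over decay_step steps
    epsilon = epsilon_max - (epsilon_max - epsilon_min) * min(
        float(step) / decay_step, 1.0)
    num_env = q_values.shape[0]
    # start from the greedy action for every environment
    actions = np.argmax(q_values, axis=1)
    # overwrite a random subset of environments with uniformly random actions
    explore = np.random.uniform(size=num_env) < epsilon
    actions[explore] = np.random.randint(num_actions, size=explore.sum())
    return actions
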