Example #1
    def train(self, obs0, actions, rewards, obs1, dones, importance_weights):
        with tf.GradientTape() as tape:
            # Q(s_t, ·) from the online network; pick out Q(s_t, a_t) for the taken actions.
            q_t = self.q_network(obs0)
            q_t_selected = tf.reduce_sum(q_t * tf.one_hot(actions, self.num_actions, dtype=tf.float32), 1)

            q_tp1 = self.target_q_network(obs1)

            if self.double_q:
                # Double DQN: the online network chooses the argmax action,
                # the target network evaluates it.
                q_tp1_using_online_net = self.q_network(obs1)
                q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
                q_tp1_best = tf.reduce_sum(
                    q_tp1 * tf.one_hot(q_tp1_best_using_online_net, self.num_actions, dtype=tf.float32), 1)
            else:
                q_tp1_best = tf.reduce_max(q_tp1, 1)

            # Zero out the bootstrap value on terminal transitions.
            dones = tf.cast(dones, q_tp1_best.dtype)
            q_tp1_best_masked = (1.0 - dones) * q_tp1_best

            # One-step TD target: r + gamma * max_a' Q_target(s', a').
            q_t_selected_target = rewards + self.gamma * q_tp1_best_masked

            # TD error against the detached target; importance weights come from
            # prioritized replay.
            td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
            errors = huber_loss(td_error)
            weighted_error = tf.reduce_mean(importance_weights * errors)

        grads = tape.gradient(weighted_error, self.q_network.trainable_variables)
        if self.grad_norm_clipping:
            # Clip each gradient's norm before applying the update.
            grads = [tf.clip_by_norm(grad, self.grad_norm_clipping) for grad in grads]
        grads_and_vars = zip(grads, self.q_network.trainable_variables)
        self.optimizer.apply_gradients(grads_and_vars)

        return td_error
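Every snippet on this page calls a huber_loss helper that none of them defines. A minimal sketch of the element-wise Huber loss these training steps appear to assume (the delta=1.0 default is an assumption; it matches the form used in OpenAI Baselines):

import tensorflow as tf

def huber_loss(x, delta=1.0):
    # Quadratic for |x| <= delta, linear beyond it, so large TD errors
    # do not dominate the gradient.
    return tf.where(
        tf.abs(x) < delta,
        0.5 * tf.square(x),
        delta * (tf.abs(x) - 0.5 * delta),
    )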
Example #2
    def nstep_loss(self, obses_t, actions, rewards, weights, agent_id):
        # Flatten the (batch, n_step) leading dimensions into a single batch axis.
        s = obses_t.shape
        obses_t = tf.reshape(obses_t, (s[0] * s[1], *s[2:]))
        s = actions.shape
        actions = tf.reshape(actions, (s[0] * s[1], *s[2:]))
        s = rewards.shape
        rewards = tf.reshape(rewards, (s[0] * s[1], *s[2:]))
        s = weights.shape
        weights = tf.reshape(weights, (s[0] * s[1], *s[2:]))

        inputs = {0: obses_t, 1: tf.tile(self.one_hot_agents[agent_id], (s[0]*s[1], 1))}
        fc_values = self.model(inputs)
        q_t = self.agent_heads[agent_id](fc_values)

        q_t_selected = tf.reduce_sum(q_t * tf.one_hot(actions, self.config.num_actions, dtype=tf.float32), 1)
        # print(f'q_t_selected.shape is {q_t_selected.shape}')

        td_error = q_t_selected - tf.stop_gradient(rewards)

        errors = huber_loss(td_error)
        weighted_loss = tf.reduce_mean(weights * errors)

        return weighted_loss, td_error
Example #3
    def nstep_train(self, obs0, actions, rewards, obs1, dones,
                    importance_weights, fps, extra_datas):
        batch_size = obs0.shape[0]
        # tile_time = batch_size // self.num_agents
        # td_error_ = tf.Variable(initial_value=tf.zeros(shape=batch_size))
        loss = []
        td_error_ = []
        with tf.GradientTape() as tape:
            for a in self.agent_ids:
                fc_values = self.value_network({
                    0:
                    obs0[:, a, :],
                    1:
                    tf.tile(self.one_hot_agents[a], (batch_size, 1)),
                    2:
                    fps[:, a, :],
                    3:
                    extra_datas[:, a, :]
                })

                q_t = self.q_fc_list[a](fc_values)
                # print(f'q_values for agent {a} is {q_t}')

                q_t_selected = tf.reduce_sum(
                    q_t * tf.one_hot(
                        actions[:, a], self.num_actions, dtype=tf.float32), 1)
                # print(f'q_t_selected is {q_t_selected.numpy()}')

                q_t_selected_target = rewards[:, a]  # n-step rewards sum
                # print(f'q_t_selected_target is {q_t_selected_target}')

                td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)

                td_error_.append(td_error.numpy())
                errors = huber_loss(td_error)
                weighted_error = tf.reduce_mean(importance_weights[:, a] *
                                                errors)

                loss.append(weighted_error)

            sum_loss = tf.reduce_mean(loss)

        # Collect the shared trunk's variables plus every agent head's variables.
        param = list(self.value_network.trainable_variables)
        for a in self.agent_ids:
            param += self.q_fc_list[a].trainable_variables

        grads = tape.gradient(sum_loss, param)
        grads_and_vars = list(zip(grads, param))
        self.optimizer.apply_gradients(grads_and_vars)

        return np.mean(td_error_)
Example #4
    def nstep_loss(self, obses_t_a, actions_a, rewards_a, dones_a, weights_a,
                   fps_a, agent_id):
        # print(f'obses_t.shape {obses_t.shape}')
        s = obses_t_a.shape
        obses_t_a = tf.reshape(obses_t_a, (s[0] * s[1], *s[2:]))
        # print(f'obses_t_a.shape {obses_t_a.shape}')
        s = actions_a.shape
        actions_a = tf.reshape(actions_a, (s[0], s[1], *s[2:]))
        # print(f'actions_a.shape {actions_a.shape}')
        s = rewards_a.shape
        rewards_a = tf.reshape(rewards_a, (s[0], s[1], *s[2:]))
        # print(f'rewards_a.shape {rewards_a.shape}')
        s = dones_a.shape
        # print(f's {s}')
        dones_a = tf.reshape(dones_a, (s[0], s[1], 1))
        # print(f'dones_a.shape {dones_a.shape}')
        s = weights_a.shape
        weights_a = tf.reshape(weights_a, (s[0], s[1], *s[2:]))
        # print(f'weights_a.shape {weights_a.shape}')
        s = fps_a.shape
        fps_a = tf.reshape(fps_a, (s[0] * s[1], *s[2:]))
        # print(f'fps_a.shape {fps_a.shape}')

        inputs_a = {
            '0': obses_t_a,
            '1': tf.tile(self.one_hot_agents[agent_id], (s[0] * s[1], 1)),
            '2': fps_a,
            '3': dones_a
        }

        fc_values = self.model(inputs_a)
        # s = fc_values.shape
        # print(f'fc_values.shape {fc_values.shape}')
        # fc_values = tf.reshape(fc_values, (s[0] * s[1], *s[2:]))
        q_t = self.agent_heads[agent_id](fc_values)

        q_t_selected = tf.reduce_sum(
            q_t * tf.one_hot(
                actions_a[:, -1], self.config.num_actions, dtype=tf.float32),
            1)
        # print(f'q_t_selected.shape is {q_t_selected.shape}')

        td_error = q_t_selected - tf.stop_gradient(rewards_a[:, -1])

        errors = huber_loss(td_error)
        weighted_loss = tf.reduce_mean(weights_a[:, -1] * errors)

        return weighted_loss, td_error
Example #5
    def train(self, obs0, actions, rewards, obs1, dones, importance_weights, fps, extra_datas):
        batch_size = obs0.shape[0]
        td_error_ = tf.Variable(initial_value=tf.zeros(shape=batch_size))
        for a in self.agent_ids:
            with tf.GradientTape() as tape:
                fc_values = self.value_network({0: obs0[:, a, :], 1: tf.ones(shape=(batch_size, 1)) * a})
                q_t = self.q_fc_list[a](fc_values)

                q_t_selected = tf.reduce_sum(q_t * tf.one_hot(actions[:, a], self.num_actions, dtype=tf.float32), 1)

                fc_tp1 = self.target_network({0: obs1[:, a, :], 1: tf.ones(shape=(batch_size, 1)) * a})
                q_tp1 = self.target_q_fc_list[a](fc_tp1)

                if self.double_q:
                    fc_tp1_using_online_net = self.value_network({0: obs1[:, a, :], 1: tf.ones(shape=(batch_size, 1)) * a})
                    q_tp1_using_online_net = self.q_fc_list[a](fc_tp1_using_online_net)
                    q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
                    q_tp1_best = tf.reduce_sum(q_tp1 * tf.one_hot(q_tp1_best_using_online_net, self.num_actions, dtype=tf.float32), 1)
                else:
                    q_tp1_best = tf.reduce_max(q_tp1, 1)

                dones = tf.cast(dones, q_tp1_best.dtype)
                q_tp1_best_masked = (1.0 - dones) * q_tp1_best

                q_t_selected_target = rewards[:, a] + self.gamma * q_tp1_best_masked

                td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
                td_error_.assign_add(td_error)
                errors = huber_loss(td_error)
                weighted_error = tf.reduce_mean(importance_weights[:, a] * errors)

            # Compute and apply gradients per agent, outside the tape context.
            param = tape.watched_variables()

            grads = tape.gradient(weighted_error, param)
            grads = [grad if grad is not None else tf.zeros_like(var) for var, grad in zip(param, grads)]

            grads_and_vars = list(zip(grads, param))
            self.optimizer.apply_gradients(grads_and_vars)

        return td_error_
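Examples #1 and #5 read from target networks (self.target_q_network, self.target_network, self.target_q_fc_list) but do not show how those copies are refreshed. A minimal sketch of the usual hard update for the layout in Example #5; the method name update_targets is hypothetical and the call frequency is left to the training loop:

    def update_targets(self):
        # Hard update: copy the online trunk and each agent head into its target copy.
        for online_var, target_var in zip(self.value_network.variables, self.target_network.variables):
            target_var.assign(online_var)
        for a in self.agent_ids:
            for online_var, target_var in zip(self.q_fc_list[a].variables, self.target_q_fc_list[a].variables):
                target_var.assign(online_var)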
Example #6
    def nstep_loss(self, obses_t_a, actions_a, rewards_a, dones_a, weights_a,
                   fps_a, agent_id):
        # print(f'obses_t_a.shape {obses_t_a.shape}')
        q_t = self.network.value(obses_t_a, fps_a, agent_id)

        q_t_selected = tf.reduce_sum(
            q_t *
            tf.one_hot(actions_a, self.config.num_actions, dtype=tf.float32),
            1)
        # print(f'q_t_selected.shape is {q_t_selected.shape}')

        td_error = q_t_selected - tf.stop_gradient(rewards_a)

        errors = huber_loss(td_error)
        weighted_loss = tf.reduce_mean(weights_a * errors)

        return weighted_loss, td_error
Example #7
def train(batch_x, batch_y, shared_network, opt):
    with tf.GradientTape() as tape:
        q_eval_arr = shared_network(batch_x)['agent_0']
        # print('q_eval_arr ', q_eval_arr)
        one_hot = tf.one_hot(batch_y, 4)
        # print(one_hot)
        q_t_selected = tf.reduce_sum(q_eval_arr * one_hot, 1)
        # The "target" values come from the same network evaluated on a
        # noise-perturbed copy of the input; there is no separate target network here.
        target_q_values = shared_network(
            batch_x + np.random.normal(0, 1, batch_x.shape))['agent_0']
        max_target_q_values = tf.reduce_max(target_q_values, axis=1)
        td_error = q_t_selected - max_target_q_values
        # print('td_error ', td_error)
        errors = huber_loss(td_error)
        loss = tf.reduce_mean(errors)

    # Only the variables belonging to the 'agent_0' head are updated.
    param = [
        v for v in shared_network.trainable_variables if 'agent_0' in v.name
    ]
    gradients_of_network = tape.gradient(loss, param)
    opt.apply_gradients(zip(gradients_of_network, param))
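The standalone train() above expects a network whose output is a dict keyed by agent name and a huber_loss helper in scope (see the sketch after Example #1). A minimal, hypothetical way to exercise it; the model architecture, input width, batch size, and optimizer below are assumptions, not part of the original snippet:

import numpy as np
import tensorflow as tf

# Toy model with a dict output so that shared_network(batch_x)['agent_0'] works,
# and with a head named 'agent_0_head' so only its variables get updated.
inputs = tf.keras.Input(shape=(8,))
hidden = tf.keras.layers.Dense(32, activation='relu')(inputs)
q_values = tf.keras.layers.Dense(4, name='agent_0_head')(hidden)
shared_network = tf.keras.Model(inputs, {'agent_0': q_values})

opt = tf.keras.optimizers.Adam(1e-3)
batch_x = np.random.randn(16, 8).astype(np.float32)
batch_y = np.random.randint(0, 4, size=16)

train(batch_x, batch_y, shared_network, opt)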
Example #8
    def nstep_train(self, obs0, actions, rewards, obs1, dones, importance_weights, fps, extra_datas):
        batch_size = obs0.shape[0]
        # tile_time = batch_size // self.num_agents
        td_error_ = tf.Variable(initial_value=tf.zeros(shape=batch_size//self.n_step))
        # Accumulate the per-agent losses as a plain tensor so gradients can flow
        # through the sum (a tf.Variable with assign_add would break the tape).
        loss = tf.constant(0.0)
        with tf.GradientTape() as tape:
            for a in self.agent_ids:
                # print(f'obs0[:, a, :] shape is {obs0[:, a, :].shape}')
                # print(f'tf.tile(self.one_hot_agents[a], (batch_size, 1)) shape is {tf.tile(self.one_hot_agents[a], (batch_size, 1)).shape}')
                # print(f'fps[:, a, :] shape is {fps[:, a, :].shape}')
                # print(f'extra_datas[:, a, :] shape is {extra_datas[:, a, :].shape}')
                # print(f'dones shape is {dones.shape}')

                fc_values = self.value_network({0: obs0[:, a, :],
                                                1: tf.tile(self.one_hot_agents[a], (batch_size, 1)),
                                                2: fps[:, a, :],
                                                3: tf.expand_dims(extra_datas[:, a, :], axis=1),
                                                4: dones})

                q_t = self.q_fc_list[a](fc_values)
                # print(f'q_values.shape for agent {a} is {q_t.shape}')
                # print(f'tf.one_hot(actions[:, a], self.num_actions, dtype=tf.float32).shape for agent {a} is {tf.one_hot(actions[-1, a], self.num_actions, dtype=tf.float32).shape}')

                q_t_selected = tf.reduce_sum(q_t * tf.one_hot(actions[-1, a], self.num_actions, dtype=tf.float32), 1)
                # print(f'q_t_selected.dtype is {q_t_selected.dtype}')

                q_t_selected_target = rewards[-1, a]  # n-step rewards sum
                # print(f'q_t_selected_target.dtype is {q_t_selected_target.dtype}')

                td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)

                td_error_.assign_add(td_error)
                errors = huber_loss(td_error)
                weighted_error = tf.reduce_mean(importance_weights[-1, a] * errors)

                loss = loss + weighted_error

        param = tape.watched_variables()
        grads = tape.gradient(loss, param)
        grads = [grad if grad is not None else tf.zeros_like(var) for var, grad in zip(param, grads)]

        grads_and_vars = list(zip(grads, param))
        self.optimizer.apply_gradients(grads_and_vars)

        return td_error_
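Several of the n-step variants (Examples #2, #3, #4, #8) compare q_t_selected against rewards described as an already-computed "n-step rewards sum". A sketch of how such a target is typically built from raw per-step rewards; the helper name nstep_return and the assumption that the replay buffer hands over the next n rewards per transition are hypothetical:

def nstep_return(rewards, gamma):
    # rewards: the next n raw rewards [r_t, r_{t+1}, ..., r_{t+n-1}] for one transition.
    # Returns the discounted sum r_t + gamma*r_{t+1} + ... + gamma**(n-1)*r_{t+n-1},
    # i.e. the "n-step rewards sum" the losses above subtract from q_t_selected.
    ret = 0.0
    for r in reversed(rewards):
        ret = r + gamma * ret
    return ret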