import numpy as np

# NOTE: `utils` (providing utils.discount) and `logger` are assumed to be
# project-local modules from the surrounding codebase.
import utils
import logger


def generate_advantage(data_dict, baseline_network):
    '''
        @brief: compute advantage estimates for every path and return
            the standardized advantages
    '''
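    # expected inputs (inferred from the code below): data_dict is a list of
    # path dicts, each holding a 'rewards' array; baseline_network exposes
    # predict(path) and an args namespace with advantage_method, gamma and
    # gae_lam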

    for path in data_dict:
        # the predicted value function (baseline function)
        path["baseline"] = baseline_network.predict(path)

    advantage_method = baseline_network.args.advantage_method
    gamma = baseline_network.args.gamma
    gae_lam = baseline_network.args.gae_lam

    # estimate the advantages
    if advantage_method == 'raw':
        for path in data_dict:
            # the gamma discounted rollout value function
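            # utils.discount is assumed to compute the discounted cumulative
            # sum: returns[t] = sum_k gamma**k * rewards[t + k]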
            path["returns"] = utils.discount(path["rewards"], gamma)
            path["advantage"] = path["returns"] - path["baseline"]
            path['target_return'] = path['returns']
    else:
        assert advantage_method == 'gae', logger.error(
            'invalid advantage estimation method: {}'.format(advantage_method))

        for path in data_dict:
            # the gamma discounted rollout value function
            path["returns"] = utils.discount(path["rewards"], gamma)

            # init the advantage
            path["advantage"] = np.zeros(path['returns'].shape)

            num_steps = len(path['returns'])

            # generate the GAE advantage
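            # GAE (Schulman et al., 2016), computed backwards from the final
            # step, with the terminal value V(s_{T+1}) taken as 0:
            #   delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
            #   A_t     = delta_t + gamma * gae_lam * A_{t+1}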
            for i_step in reversed(range(num_steps)):
                if i_step < num_steps - 1:
                    delta = path['rewards'][i_step] \
                        + gamma * path['baseline'][i_step + 1] \
                        - path['baseline'][i_step]

                    path['advantage'][i_step] = \
                        delta + gamma * gae_lam * path['advantage'][i_step + 1]
                else:
                    delta = path['rewards'][i_step] - path['baseline'][i_step]
                    path['advantage'][i_step] = delta

            # value-function regression target: A_t + V(s_t), i.e. the
            # lambda-return estimate of the value at step t
            path['target_return'] = path['advantage'] + path['baseline']

    # standardize the advantage estimates across all paths
    advant_n = np.concatenate([path["advantage"] for path in data_dict])
    advant_n -= advant_n.mean()
    advant_n /= (advant_n.std() + 1e-8)  # standardize to mean 0 stddev 1
    return advant_n
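

# The method below appears to belong to an agent/trainer class; the minimal
# shell here is a hypothetical stand-in (the class name and constructor are
# assumptions, not part of the source) so that the snippet parses.
class Agent(object):

    def __init__(self, args, baseline_network):
        # args is expected to carry advantage_method, gamma, gae_lam and
        # use_gnn_as_value
        self.args = args
        self.baseline_network = baseline_network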
    def generate_advantage(self, data_dict, feed_dict):
        '''
            @brief: compute advantage estimates for every path and return
                the standardized advantages
        '''
        # get the baseline function
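        # with a GNN value network, predict() returns one flat array covering
        # every timestep of every path; it is sliced back into per-path
        # segments below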
        if self.args.use_gnn_as_value:
            baseline_data = self.baseline_network.predict(feed_dict)
            current_id = 0
            for path in data_dict:
                path['baseline'] = baseline_data[current_id:current_id +
                                                 len(path['rewards'])]
                current_id += len(path['rewards'])

            assert current_id == len(baseline_data), logger.error(
                'Extra baseline predicted? ({} vs {})'.format(
                    current_id, len(baseline_data)))
        else:
            for path in data_dict:
                # the predicted value function (baseline function)
                path["baseline"] = self.baseline_network.predict(path)

        # estimate the advantages
        if self.args.advantage_method == 'raw':
            for path in data_dict:
                # the gamma discounted rollout value function
                path["returns"] = utils.discount(path["rewards"],
                                                 self.args.gamma)
                path["advantage"] = path["returns"] - path["baseline"]
                path['target_return'] = path['returns']
        else:
            assert self.args.advantage_method == 'gae', logger.error(
                'invalid advantage estimation method: {}'.format(
                    self.args.advantage_method))

            for path in data_dict:
                # the gamma discounted rollout value function
                path["returns"] = utils.discount(path["rewards"],
                                                 self.args.gamma)

                # init the advantage
                path["advantage"] = np.zeros(path['returns'].shape)

                num_steps = len(path['returns'])

                # generate the GAE advantage
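                # same recurrence as in the module-level version above:
                #   delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
                #   A_t     = delta_t + gamma * gae_lam * A_{t+1}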
                for i_step in reversed(range(num_steps)):
                    if i_step < num_steps - 1:
                        delta = path['rewards'][i_step] \
                            + self.args.gamma * path['baseline'][i_step + 1] \
                            - path['baseline'][i_step]
                        path['advantage'][i_step] = \
                            delta + self.args.gamma * self.args.gae_lam \
                            * path['advantage'][i_step + 1]
                    else:
                        delta = path['rewards'][i_step] \
                            - path['baseline'][i_step]
                        path['advantage'][i_step] = delta

                path['target_return'] = path['advantage'] + path['baseline']

        # standardize the advantage estimates across all paths
        advant_n = np.concatenate([path["advantage"] for path in data_dict])
        advant_n -= advant_n.mean()
        advant_n /= (advant_n.std() + 1e-8)  # standardize to mean 0 stddev 1
        return advant_n
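

# A minimal usage sketch for the module-level function (hypothetical names:
# collect_rollouts, env, policy and the construction of baseline_network are
# placeholders, not part of this module):
#
#   paths = collect_rollouts(env, policy)  # list of {'rewards': np.ndarray, ...}
#   advantages = generate_advantage(paths, baseline_network)
#   # each path now also carries 'baseline', 'returns', 'advantage' and
#   # 'target_return' (the regression target for the value network)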