Code Example #1
    # assumes: import paddle.fluid as fluid; from paddle.fluid import layers
    def learn(self, obs, action, reward):
        act_prob = self.model(obs)  # action probabilities from the policy network
        # log_prob = layers.cross_entropy(act_prob, action)
        # negative log-likelihood of the action actually taken, per sample
        log_prob = layers.reduce_sum(
            -1.0 * layers.log(act_prob) * layers.one_hot(
                action, act_prob.shape[1]),
            dim=1)
        cost = log_prob * reward  # weight each sample by its return
        cost = layers.reduce_mean(cost)

        optimizer = fluid.optimizer.Adam(self.lr)
        optimizer.minimize(cost)
        return cost
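
The reduce_sum(-log(act_prob) * one_hot(action)) expression is the per-sample negative log-likelihood of the chosen action, i.e. the same quantity the commented-out layers.cross_entropy(act_prob, action) call would produce. A small numpy sketch (made-up numbers, purely for illustration) of that equivalence:

import numpy as np

act_prob = np.array([[0.7, 0.3],
                     [0.2, 0.8]])   # softmax outputs, shape [B, 2]
action = np.array([0, 1])           # chosen action indices, shape [B]

one_hot = np.eye(act_prob.shape[1])[action]              # [B, 2]
log_prob = np.sum(-np.log(act_prob) * one_hot, axis=1)   # [B]
# picks out -log(act_prob[i, action[i]]) for each sample i
assert np.allclose(log_prob, -np.log(act_prob[np.arange(2), action]))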
Code Example #2
    def learn(self, obs, action, reward):
        """ Update the policy model with the policy gradient algorithm
        """
        act_prob = self.model(obs)  # get the output action probabilities
        # log_prob = layers.cross_entropy(act_prob, action) # cross entropy
        log_prob = layers.reduce_sum(-1.0 * layers.log(act_prob) *
                                     layers.one_hot(action, act_prob.shape[1]),
                                     dim=1)
        cost = log_prob * reward
        cost = layers.reduce_mean(cost)
        print('====loss', cost)  # debug print of the cost Variable (not the numeric loss)
        optimizer = fluid.optimizer.Adam(self.lr)
        optimizer.minimize(cost)
        return cost
Code Example #3
File: algo.py  Project: Feynman1999/myRL
    def learn(self, obs, action, reward):
        """

        :param obs:      [B,4]
        :param action:   [B,1]
        :param reward:   [B,]
        :return: the mean policy-gradient cost (a scalar Variable)
        """
        act_prob = self.model(obs)  # [B,2]
        # [B, 2] -> [B, ]
        log_prob = layers.reduce_sum(
            -1.0 * layers.log(act_prob) *
            layers.one_hot(action, depth=act_prob.shape[1]),
            dim=1,
            keep_dim=False)
        cost = log_prob * reward
        cost = layers.reduce_mean(cost)

        optimizer = fluid.optimizer.Adam(self.lr)
        optimizer.minimize(cost)
        return cost
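
All three snippets build the same REINFORCE-style loss, mean(-log pi(a|s) * R), inside a PaddlePaddle fluid static graph; self.model is the policy network and self.lr the learning rate of the surrounding class. Below is a minimal, self-contained sketch of how such a learn() body is typically wired up and executed. The single softmax fc layer, the placeholder names, and the fake batch are illustrative assumptions, not code from the projects listed above.

import numpy as np
import paddle.fluid as fluid
from paddle.fluid import layers

OBS_DIM, ACT_DIM, LR = 4, 2, 1e-3

learn_program = fluid.Program()
startup_program = fluid.Program()
with fluid.program_guard(learn_program, startup_program):
    obs = layers.data(name='obs', shape=[OBS_DIM], dtype='float32')  # [B, 4]
    action = layers.data(name='action', shape=[1], dtype='int64')    # [B, 1]
    reward = layers.data(name='reward', shape=[1], dtype='float32')  # [B, 1]

    act_prob = layers.fc(input=obs, size=ACT_DIM, act='softmax')  # toy policy network
    # same loss as in the learn() examples above
    log_prob = layers.reduce_sum(
        -1.0 * layers.log(act_prob) * layers.one_hot(action, depth=ACT_DIM),
        dim=1)
    cost = layers.reduce_mean(log_prob * layers.reshape(reward, shape=[-1]))
    fluid.optimizer.Adam(learning_rate=LR).minimize(cost)

exe = fluid.Executor(fluid.CPUPlace())
exe.run(startup_program)  # initialize the fc layer's parameters

# one fake batch, just to show the feed/fetch pattern
feed = {
    'obs': np.random.rand(8, OBS_DIM).astype('float32'),
    'action': np.random.randint(ACT_DIM, size=(8, 1)).astype('int64'),
    'reward': np.random.rand(8, 1).astype('float32'),
}
loss, = exe.run(learn_program, feed=feed, fetch_list=[cost])
print('loss:', loss)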