Example #1
    def eps_greedy_sampling(self, scores, mask, eps):
        scores = scores * mask
        scores_padded = layers.squeeze(
            fluid_sequence_pad(scores, 0, maxlen=128),
            [2])  # (b*s, 1) -> (b, s, 1) -> (b, s)
        mask_padded = layers.squeeze(fluid_sequence_pad(mask, 0, maxlen=128),
                                     [2])
        seq_lens = fluid_sequence_get_seq_len(scores)

        def get_greedy_prob(scores_padded, mask_padded):
            s = scores_padded - (mask_padded * (-1) + 1) * self.BIG_VALUE
            max_value = layers.reduce_max(s, dim=1, keep_dim=True)
            greedy_prob = layers.cast(s >= max_value, 'float32')
            return greedy_prob

        greedy_prob = get_greedy_prob(scores_padded, mask_padded)
        eps_prob = mask_padded * eps / layers.reduce_sum(
            mask_padded, dim=1, keep_dim=True)

        final_prob = (greedy_prob + eps_prob) * mask_padded
        final_prob = final_prob / layers.reduce_sum(
            final_prob, dim=1, keep_dim=True)

        sampled_id = layers.reshape(layers.sampling_id(final_prob), [-1, 1])
        max_id = layers.cast(layers.cast(seq_lens, 'float32') - 1, 'int64')
        sampled_id = layers.elementwise_min(sampled_id, max_id)
        return layers.cast(sampled_id, 'int64')
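Stripped of the LoD padding and sequence bookkeeping, the distribution sampled above is a mix of a greedy one-hot over the masked scores and a uniform epsilon term over the valid positions. A minimal NumPy sketch of that mixing, with illustrative scores and mask (the 1e9 constant plays the role of self.BIG_VALUE):

import numpy as np

def eps_greedy_probs(scores, mask, eps):
    # push masked-out positions to a huge negative value, then take the greedy one-hot
    masked = scores - (1.0 - mask) * 1e9
    greedy = (masked >= masked.max(axis=1, keepdims=True)).astype('float32')
    # spread eps uniformly over the valid positions
    eps_part = mask * eps / mask.sum(axis=1, keepdims=True)
    final = (greedy + eps_part) * mask
    return final / final.sum(axis=1, keepdims=True)

scores = np.array([[0.2, 0.7, 0.1, 0.0]], dtype='float32')
mask = np.array([[1.0, 1.0, 1.0, 0.0]], dtype='float32')   # last position is padding
print(eps_greedy_probs(scores, mask, eps=0.1))  # ~[0.03, 0.94, 0.03, 0.0]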
Example #2
 def normalize(scores_padded, mask_padded):
     mean_S = layers.reduce_sum(scores_padded, dim=1,
                                keep_dim=True) / layers.reduce_sum(
                                    mask_padded, dim=1, keep_dim=True)
     S = scores_padded - mean_S
     std_S = layers.sqrt(
         layers.reduce_sum(layers.square(S * mask_padded),
                           dim=1,
                           keep_dim=True))
     return S / (std_S + self.SAFE_EPS)
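The helper above is a masked score normalization: the mean is taken only over valid positions, and the scale is the square root of the masked sum of squares (not divided by the count, so it is not a conventional standard deviation). A NumPy sketch of the same arithmetic, assuming padded score entries are already zero and using an assumed SAFE_EPS:

import numpy as np

SAFE_EPS = 1e-6  # assumed value of self.SAFE_EPS

def normalize(scores_padded, mask_padded):
    # padded score entries are zero, so a plain sum divided by the mask count is a masked mean
    mean_S = scores_padded.sum(1, keepdims=True) / mask_padded.sum(1, keepdims=True)
    S = scores_padded - mean_S
    # square root of the masked sum of squares, as in the fluid code
    std_S = np.sqrt(((S * mask_padded) ** 2).sum(1, keepdims=True))
    return S / (std_S + SAFE_EPS)

scores = np.array([[1.0, 3.0, 0.0]])   # third position is padding
mask = np.array([[1.0, 1.0, 0.0]])
print(normalize(scores, mask))  # valid positions become [-0.707, 0.707]; padding is masked downstream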
Example #3
 def get_seq_len_mask(input, atten_input, max_seq_len,
                      max_atten_seq_len):
     ones = layers.reduce_sum(
         input, dim=1, keep_dim=True) * 0 + 1  # (batch*seq_len, 1)
     atten_ones = layers.reduce_sum(atten_input, dim=1,
                                    keep_dim=True) * 0 + 1
     ones_padded = fluid_sequence_pad(
         ones, 0, max_seq_len)  # (batch, seq_len, 1)
     atten_ones_padded = fluid_sequence_pad(atten_ones, 0,
                                            max_atten_seq_len)
     seq_len_mask = layers.matmul(
         ones_padded, layers.transpose(atten_ones_padded,
                                       perm=[0, 2, 1]))
     seq_len_mask.stop_gradient = True
     return seq_len_mask  # (batch, seq_len, atten_seq_len)
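Since both `ones` tensors contain a 1 for every real (non-padded) position, the batched matmul is just an outer product per batch element: entry (i, j) of the mask is 1 exactly when position i of the input sequence and position j of the attention sequence are both real. A NumPy sketch with illustrative lengths:

import numpy as np

# batch of two sequences, padded to max_seq_len=3 and max_atten_seq_len=4
ones_padded = np.array([[[1.], [1.], [0.]],            # first sequence has length 2
                        [[1.], [1.], [1.]]])           # second has length 3
atten_ones_padded = np.array([[[1.], [1.], [1.], [0.]],
                              [[1.], [1.], [0.], [0.]]])

# (batch, seq_len, 1) @ (batch, 1, atten_seq_len) -> (batch, seq_len, atten_seq_len)
seq_len_mask = np.matmul(ones_padded, atten_ones_padded.transpose(0, 2, 1))
print(seq_len_mask[0])  # rows 0-1 can attend to columns 0-2; padded row/columns are zero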
Example #4
    def _ensemble_predict(self, obs):
        actor_outputs = []
        for i in range(self.ensemble_num):
            actor_outputs.append(self.actors[i].predict(obs))
        batch_actions = layers.concat(actor_outputs, axis=0)
        batch_obs = layers.expand(obs, expand_times=[self.ensemble_num, 1])

        critic_outputs = []
        for i in range(self.ensemble_num):
            critic_output = self.critics[i].predict(batch_obs, batch_actions)
            critic_output = layers.unsqueeze(critic_output, axes=[1])
            critic_outputs.append(critic_output)
        score_matrix = layers.concat(critic_outputs, axis=1)

        # Normalize scores given by each critic
        sum_critic_score = layers.reduce_sum(
            score_matrix, dim=0, keep_dim=True)
        sum_critic_score = layers.expand(
            sum_critic_score, expand_times=[self.ensemble_num, 1])
        norm_score_matrix = score_matrix / sum_critic_score

        actions_mean_score = layers.reduce_mean(
            norm_score_matrix, dim=1, keep_dim=True)
        best_score_id = layers.argmax(actions_mean_score, axis=0)
        best_score_id = layers.cast(best_score_id, dtype='int32')
        ensemble_predict_action = layers.gather(batch_actions, best_score_id)
        ensemble_predict_action = layers.squeeze(
            ensemble_predict_action, axes=[0])
        return ensemble_predict_action
Example #5
    def define_learn(self, obs, action, reward, next_obs, terminal, weight):
        #Q(s,a|θ)
        pred_value = self.model.value(obs)
        #Q(s',a'|θ')
        targetQ_predict_value = self.target_model.value(next_obs)
        #Q(s',a'|θ)
        next_s_predict_value = self.model.value(next_obs)
        #argMax[Q(s',a'|θ)]
        greedy_action = fluid_argmax(next_s_predict_value)
        predict_onehot = fluid.layers.one_hot(greedy_action, self.action_dim)
        #Q(s',argMax[Q(s',a'|θ)]|θ')
        best_v = fluid.layers.reduce_sum(fluid.layers.elementwise_mul(
            predict_onehot, targetQ_predict_value),
                                         dim=1)
        best_v.stop_gradient = True
        # TD target: R + γ*Q(s', argmax_a'[Q(s',a'|θ)] | θ')
        target = reward + (
            1.0 - layers.cast(terminal, dtype='float32')) * self.gamma * best_v

        action_onehot = layers.one_hot(action, self.action_dim)
        action_onehot = layers.cast(action_onehot, dtype='float32')
        pred_action_value = layers.reduce_sum(layers.elementwise_mul(
            action_onehot, pred_value),
                                              dim=1)

        # Compute the new TD error
        newTd = layers.abs(target - pred_action_value)
        cost = layers.square_error_cost(pred_action_value, target)
        # weight is the per-sample importance weight; it scales each sample's contribution to the cost
        cost = weight * cost
        cost = layers.reduce_mean(cost)
        optimizer = fluid.optimizer.Adam(self.lr, epsilon=1e-3)
        optimizer.minimize(cost)
        return cost, newTd
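This is the Double DQN update: the greedy action is picked with the online network Q(s',·|θ) but evaluated with the target network Q(s',·|θ'). A minimal NumPy sketch of the target computation with made-up numbers (the one-hot/reduce_sum pair in the graph is just the row-wise gather done with fancy indexing here):

import numpy as np

gamma = 0.99
reward = np.array([1.0, 0.0])
terminal = np.array([0.0, 1.0])                 # second transition ends the episode

q_next_online = np.array([[1.0, 3.0, 2.0],      # Q(s',.|θ): argmax picks actions [1, 2]
                          [0.5, 0.2, 0.9]])
q_next_target = np.array([[0.8, 2.5, 2.9],      # Q(s',.|θ')
                          [0.4, 0.1, 0.7]])

greedy_action = q_next_online.argmax(axis=1)                              # [1, 2]
best_v = q_next_target[np.arange(len(greedy_action)), greedy_action]      # [2.5, 0.7]
target = reward + (1.0 - terminal) * gamma * best_v
print(target)  # [3.475, 0.0]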
Example #6
    def learn(self, obs, action, reward, next_obs, terminal):
        """ 使用DQN算法更新self.model的value网络
        """
        # 从target_model中获取 max Q' 的值,用于计算target_Q
        next_pred_value = self.target_model.value(next_obs)
        best_v = layers.reduce_max(next_pred_value, dim=1)
        best_v.stop_gradient = True  # block gradient flow through the target
        terminal = layers.cast(terminal, dtype='float32')
        target = reward + (1.0 - terminal) * self.gamma * best_v

        pred_value = self.model.value(obs)  # predicted Q values
        # Convert action to a one-hot vector, e.g. 3 => [0,0,0,1,0]
        action_onehot = layers.one_hot(action, self.act_dim)
        action_onehot = layers.cast(action_onehot, dtype='float32')
        # The element-wise product below picks out Q(s,a) for the taken action
        # e.g. pred_value = [[2.3, 5.7, 1.2, 3.9, 1.4]], action_onehot = [[0,0,0,1,0]]
        #  ==> pred_action_value = [[3.9]]
        pred_action_value = layers.reduce_sum(layers.elementwise_mul(
            action_onehot, pred_value),
                                              dim=1)

        # The loss is the mean squared error between Q(s,a) and target_Q
        cost = layers.square_error_cost(pred_action_value, target)
        cost = layers.reduce_mean(cost)
        optimizer = fluid.optimizer.Adam(learning_rate=self.lr)  # use the Adam optimizer
        optimizer.minimize(cost)
        return cost
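By contrast with the Double DQN example above, plain DQN takes the maximum of the target network's Q values directly. A toy NumPy sketch of the target term, with illustrative numbers:

import numpy as np

gamma = 0.9
reward = np.array([1.0, -1.0])
terminal = np.array([0.0, 1.0])
next_pred_value = np.array([[0.3, 1.2, 0.7],    # target network Q values for s'
                            [2.0, 0.1, 0.4]])

best_v = next_pred_value.max(axis=1)                  # [1.2, 2.0]
target = reward + (1.0 - terminal) * gamma * best_v
print(target)  # [2.08, -1.0]: the terminal transition keeps only its reward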
Example #7
    def infer_onestep(self, inputs):
        """inference the gru-unit by one step"""
        prev_hidden = inputs['prev_hidden']
        first_step_mask = inputs['first_step_mask']
        item_embedding = self._build_embeddings(
            inputs, self.item_slot_names)  # (b*cand_len, dim), as candidates
        last_click_embedding = self._build_embeddings(
            inputs, self.last_click_slot_names)  # (b, dim)
        last_item_embedding = self._build_embeddings(
            inputs, self.last_item_slot_names)  # (b, dim)

        item_fc = self.item_fc_op(item_embedding)
        last_item_fc = self.item_fc_op(last_item_embedding) * first_step_mask
        item_hidden = self.simple_step_rnn(last_item_fc,
                                           last_click_embedding,
                                           h_0=prev_hidden)
        action_hat = self.actor_policy(item_hidden)

        # inner product
        expand_action_hat = layers.sequence_expand(
            action_hat, item_fc)  # (b*cand_len, dim)
        scores = layers.reduce_sum(expand_action_hat * item_fc, 1)

        output_dict = OrderedDict()
        output_dict['hidden'] = item_hidden
        output_dict['scores'] = scores
        return output_dict
Example #8
    def sampling_rnn(self,
                     item_fc,
                     h_0,
                     pos_embed,
                     forward_func,
                     sampling_type,
                     eps=0,
                     eta=1):
        mask = layers.reduce_sum(item_fc, dim=1, keep_dim=True) * 0 + 1
        drnn = fluid.layers.DynamicRNN()
        with drnn.block():
            # e.g. batch_size = 2
            _ = drnn.step_input(item_fc)
            cur_pos_embed = drnn.step_input(pos_embed)  # lod = []
            cur_h_0 = drnn.memory(init=h_0, need_reorder=True)  # lod = [0,1,2]
            item_fc = drnn.static_input(item_fc)
            mask = drnn.memory(init=mask, need_reorder=True)

            # step_input will remove lod info
            cur_pos_embed = layers.lod_reset(cur_pos_embed, cur_h_0)

            # expand
            expand_h_0 = layers.sequence_expand(
                cur_h_0, item_fc)  # lod = [0,1,2,3,4,5,6,7]
            expand_pos_embed = layers.sequence_expand(
                cur_pos_embed, item_fc)  # lod = [0,1,2,3,4,5,6,7]
            expand_item_fc = layers.lod_reset(item_fc, expand_h_0)
            # forward
            expand_next_h_0, expand_scores = forward_func(
                expand_item_fc, expand_h_0, expand_pos_embed)
            # reset result lod
            expand_next_h_0 = layers.lod_reset(expand_next_h_0,
                                               item_fc)  # lod = [0,4,7]
            expand_scores = layers.lod_reset(expand_scores,
                                             item_fc)  # lod = [0,4,7]

            if sampling_type == 'eps_greedy':
                selected_index = self.eps_greedy_sampling(expand_scores,
                                                          mask,
                                                          eps=eps)
            elif sampling_type == 'softmax':
                selected_index = self.softmax_sampling(expand_scores,
                                                       mask,
                                                       eta=eta)

            drnn.output(selected_index)

            next_h_0 = fluid_sequence_index(expand_next_h_0, selected_index)
            next_mask = fluid_sequence_scatter(
                mask, layers.reshape(selected_index, [-1]), 0.0)

            # update
            drnn.update_memory(cur_h_0, next_h_0)
            drnn.update_memory(mask, next_mask)

        drnn_output = drnn()
        return drnn_output
Example #9
    def train_rnn(self,
                  item_fc,
                  atten_item_fc,
                  h_0,
                  pos,
                  pos_embed,
                  output_type=''):
        shifted_item_fc = fluid_sequence_advance(item_fc, OOV=0)
        drnn = fluid.layers.DynamicRNN()
        with drnn.block():
            cur_item_fc = drnn.step_input(shifted_item_fc)
            cur_pos_embed = drnn.step_input(pos_embed)
            cur_h_0 = drnn.memory(init=h_0, need_reorder=True)

            # step_input will remove lod info
            cur_item_fc = layers.lod_reset(cur_item_fc, cur_h_0)
            cur_pos_embed = layers.lod_reset(cur_pos_embed, cur_h_0)

            next_h_0, hidden_fc = self.sampling_rnn_forward(
                cur_item_fc, cur_h_0, cur_pos_embed)

            if output_type == 'c_Q':
                cur_atten_item_fc = drnn.step_input(atten_item_fc)
                cur_atten_item_fc = layers.lod_reset(cur_atten_item_fc,
                                                     cur_h_0)

                Q = layers.reduce_sum(hidden_fc * cur_atten_item_fc,
                                      dim=1,
                                      keep_dim=True)
                drnn.output(Q)

            elif output_type == 'max_Q':
                cur_pos = drnn.step_input(pos)
                pos = drnn.static_input(pos)
                atten_item_fc = drnn.static_input(atten_item_fc)

                expand_Q = self._dot_attention(hidden_fc, atten_item_fc)

                cur_step_id = layers.slice(cur_pos,
                                           axes=[0, 1],
                                           starts=[0, 0],
                                           ends=[1, 1])
                mask = layers.cast(pos >= cur_step_id, 'float32')
                expand_Q = expand_Q * mask
                max_Q = layers.sequence_pool(expand_Q, 'max')
                drnn.output(max_Q)

            else:
                raise NotImplementedError(output_type)

            # update
            drnn.update_memory(cur_h_0, next_h_0)

        drnn_output = drnn()
        return drnn_output
Example #10
 def _build_embeddings(self, inputs, list_names):
     list_embed = []
     for name in list_names:
         embed_name = self._get_embed_name(name)
         c_embed = self.dict_data_embed_op[embed_name](inputs[name])
         if len(c_embed.shape) == 3:                             # squeeze (batch*num_items, None, 16)
             c_embed = layers.reduce_sum(c_embed, dim=1)
         list_embed.append(c_embed)                              # (batch*num_items, 16)
     concated_embed = layers.concat(input=list_embed, axis=1)    # (batch*num_items, concat_dim)
     concated_embed = layers.softsign(concated_embed)
     return concated_embed
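In short: each slot is embedded, multi-valued slots (3-D embeddings) are sum-pooled over their value axis, everything is concatenated, and softsign (x / (1 + |x|)) keeps the result bounded. A NumPy sketch with toy shapes (the slot tensors here are illustrative, not from the original embedding ops):

import numpy as np

def softsign(x):
    return x / (1.0 + np.abs(x))

single_embed = np.array([[0.2, -0.4]])                 # (batch*num_items, 2)
multi_embed = np.array([[[0.1, 0.3], [0.5, -0.1]]])    # (batch*num_items, num_values, 2)

list_embed = [single_embed, multi_embed.sum(axis=1)]   # 3-D slots are sum-pooled first
concated_embed = np.concatenate(list_embed, axis=1)    # (batch*num_items, concat_dim)
print(softsign(concated_embed))                        # every entry lies in (-1, 1)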
Example #11
    def learn(self, obs, action, reward):
        act_prob = self.model(obs)
        # log_prob = layers.cross_entropy(act_prob, action)
        log_prob = layers.reduce_sum(
            -1.0 * layers.log(act_prob) * layers.one_hot(
                action, act_prob.shape[1]),
            dim=1)
        cost = log_prob * reward
        cost = layers.reduce_mean(cost)

        optimizer = fluid.optimizer.Adam(self.lr)
        optimizer.minimize(cost)
        return cost
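The commented-out cross_entropy line and the explicit one_hot/log formulation compute the same quantity: the negative log-probability of the taken action. Weighted by the reward and averaged, this is the REINFORCE loss. A NumPy sketch of the scalar being minimized (no optimizer step):

import numpy as np

act_prob = np.array([[0.7, 0.3],        # policy output per sample
                     [0.2, 0.8]])
action = np.array([0, 1])               # taken actions
reward = np.array([1.0, 2.0])           # e.g. discounted returns

one_hot = np.eye(act_prob.shape[1])[action]
log_prob = (-np.log(act_prob) * one_hot).sum(axis=1)   # -log pi(a|s), same as cross entropy
cost = (log_prob * reward).mean()
print(cost)  # ~0.40; minimizing it raises the probability of well-rewarded actions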
Example #12
    def sampling_rnn(self,
                     item_fc,
                     atten_item_fc,
                     h_0,
                     pos_embed,
                     sampling_type,
                     eps=0,
                     eta=1):
        oov_item_fc = layers.fill_constant_batch_size_like(item_fc,
                                                           shape=item_fc.shape,
                                                           value=0,
                                                           dtype='float32')
        oov_item_fc = layers.lod_reset(oov_item_fc, h_0)
        mask = layers.reduce_sum(item_fc, dim=1, keep_dim=True) * 0 + 1
        drnn = fluid.layers.DynamicRNN()
        with drnn.block():
            _ = drnn.step_input(item_fc)
            cur_pos_embed = drnn.step_input(pos_embed)
            cur_item_fc = drnn.memory(init=oov_item_fc, need_reorder=True)
            cur_h_0 = drnn.memory(init=h_0, need_reorder=True)
            mask = drnn.memory(init=mask, need_reorder=True)
            item_fc = drnn.static_input(item_fc)
            atten_item_fc = drnn.static_input(atten_item_fc)

            # step_input will remove lod info
            cur_pos_embed = layers.lod_reset(cur_pos_embed, cur_h_0)

            next_h_0, hidden_fc = self.sampling_rnn_forward(
                cur_item_fc, cur_h_0, cur_pos_embed)
            expand_Q = self._dot_attention(hidden_fc, atten_item_fc)

            if sampling_type == 'eps_greedy':
                selected_index = self.eps_greedy_sampling(expand_Q,
                                                          mask,
                                                          eps=eps)
            elif sampling_type == 'softmax':
                selected_index = self.softmax_sampling(expand_Q, mask, eta=eta)

            drnn.output(selected_index)

            next_item_fc = fluid_sequence_index(item_fc, selected_index)
            next_mask = fluid_sequence_scatter(
                mask, layers.reshape(selected_index, [-1]), 0.0)

            # update
            drnn.update_memory(cur_item_fc, next_item_fc)
            drnn.update_memory(cur_h_0, next_h_0)
            drnn.update_memory(mask, next_mask)

        drnn_output = drnn()
        return drnn_output
Example #13
 def learn(self, obs, action, reward):
     """ 用policy gradient 算法更新policy model
     """
     act_prob = self.model(obs)  # 获取输出动作概率
     # log_prob = layers.cross_entropy(act_prob, action) # 交叉熵
     log_prob = layers.reduce_sum(-1.0 * layers.log(act_prob) *
                                  layers.one_hot(action, act_prob.shape[1]),
                                  dim=1)
     cost = log_prob * reward
     cost = layers.reduce_mean(cost)
     print('====loss', cost)  # prints the Variable description at graph-build time, not the runtime loss
     optimizer = fluid.optimizer.Adam(self.lr)
     optimizer.minimize(cost)
     return cost
Example #14
    def __call__(self, input_Q, input_K, input_V, num_head, mask):
        """
        args:
            input_Q: (batch, max_num0, dim0)
            input_K: (batch, max_num1, dim1)
            input_V: (batch, max_num1, dim2)
            mask: (batch, max_num0, max_num1)
        returns:
            output: (batch, max_num0, dim)
        """
        # fcs
        Q = self.Q_fc_op(input_Q)
        K = self.K_fc_op(input_K)
        V = self.V_fc_op(input_V)

        # multi head
        dim = Q.shape[-1]
        assert dim % num_head == 0, (dim, num_head)
        list_output = []
        sub_Qs = fluid_split(Q, num_head, 2)
        sub_Ks = fluid_split(K, num_head, 2)
        sub_Vs = fluid_split(V, num_head, 2)
        for head_id in range(num_head):
            sub_Q = sub_Qs[head_id]  # (batch, max_num, dim/num_head)
            sub_K = sub_Ks[head_id]
            sub_V = sub_Vs[head_id]
            # matmul -> scale -> mask -> softmax -> mask -> /sum
            Q_K_T = layers.matmul(sub_Q, layers.transpose(
                sub_K, perm=[0, 2, 1]))  # (batch, max_num0, max_num1)
            Q_K_T = Q_K_T / np.sqrt(self._nf)
            Q_K_T = Q_K_T * mask

            Q_K_T = layers.softmax(Q_K_T)
            Q_K_T = Q_K_T * mask

            Q_K_T = Q_K_T / (layers.reduce_sum(Q_K_T, dim=2, keep_dim=True) +
                             self._safe_eps)

            # weighted sum
            atten_out = layers.matmul(Q_K_T,
                                      sub_V)  # (batch, max_num0, dim/num_head)
            list_output.append(atten_out)
        output = layers.concat(list_output, 2)
        return output
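Per head the pipeline is matmul → scale → mask → softmax → mask → renormalize. The mask multiplies the logits (it does not set them to -inf), which is why it has to be applied again after the softmax and the rows renormalized. A single-head NumPy sketch of that weighting scheme, with assumed values standing in for self._nf and self._safe_eps:

import numpy as np

def masked_attention(Q, K, V, mask, nf=8.0, safe_eps=1e-6):
    logits = np.matmul(Q, K.transpose(0, 2, 1)) / np.sqrt(nf)   # (batch, m0, m1)
    logits = logits * mask                        # zero (not -inf) the padded logits ...
    weights = np.exp(logits)
    weights = weights / weights.sum(axis=2, keepdims=True)      # softmax over keys
    weights = weights * mask                      # ... so mask again after the softmax
    weights = weights / (weights.sum(axis=2, keepdims=True) + safe_eps)
    return np.matmul(weights, V)                  # (batch, m0, dim_v)

Q = np.random.rand(1, 2, 4)
K = np.random.rand(1, 3, 4)
V = np.random.rand(1, 3, 4)
mask = np.array([[[1., 1., 0.], [1., 1., 0.]]])   # third key position is padding
print(masked_attention(Q, K, V, mask).shape)      # (1, 2, 4)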
Example #15
    def learn(self, obs, action, reward, next_obs, terminal):
        '''
        :param obs: St
        :param action: At
        :param reward: Rt+1
        :param next_obs: St+1
        :param terminal: done, True means the episode has ended
        :return: value of the loss function
        '''

        # Compute target_Q via the target network
        target_Q_tensor = self.target_model.value(next_obs)  # value vector for St+1
        max_Q = layers.reduce_max(target_Q_tensor, dim=1)  # max of each row, reduced along dim=1
        max_Q.stop_gradient = True  # stop gradient updates through the target

        # terminal is a tensor rather than a scalar, so it cannot be used in a conditional directly
        terminal = layers.cast(terminal, dtype="float32")
        target_Q = reward + (1.0 - terminal) * self.gamma * max_Q

        # Compute predict_Q via the main network
        predict_Q_tensor = self.model.value(obs)
        # Convert action to a one-hot vector and cast every element to float
        action_onehot = layers.one_hot(action, self.act_dim)
        action_onehot = layers.cast(action_onehot, dtype="float32")
        # Element-wise multiply, then reduce the tensor rank
        # e.g. predict_Q_tensor = [[2.3, 5.7, 1.2, 3.9, 1.4],  action_onehot = [[0, 0, 0, 1, 0]
        #                          [2.1, 3.7, 4.5, 6.7, 7.1]]                   [0, 1, 0, 0, 0]]
        # the element-wise product is then [[0, 0, 0, 3.9, 0]
        #                                   [0, 3.7, 0, 0, 0]]
        # and the reduce_sum over dim=1 gives [3.9, 3.7]
        predict_Q = layers.reduce_sum(layers.elementwise_mul(
            action_onehot, predict_Q_tensor),
                                      dim=1)

        # Average the per-sample loss over the batch
        cost = layers.square_error_cost(predict_Q, target_Q)
        cost = layers.reduce_mean(cost)

        # Declare the optimizer (Adam)
        optimizer = fluid.optimizer.Adam(learning_rate=self.lr)
        optimizer.minimize(cost)  # specify the optimization objective

        return cost
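The worked numbers in the comments can be checked directly; this is only the one-hot selection step of the graph above, reproduced in NumPy:

import numpy as np

predict_Q_tensor = np.array([[2.3, 5.7, 1.2, 3.9, 1.4],
                             [2.1, 3.7, 4.5, 6.7, 7.1]])
action_onehot = np.array([[0, 0, 0, 1, 0],
                          [0, 1, 0, 0, 0]], dtype='float32')

predict_Q = (predict_Q_tensor * action_onehot).sum(axis=1)
print(predict_Q)  # [3.9 3.7], as in the comment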
Example #16
    def learn(self, obs, action, reward, next_obs, terminal):
        next_pred_value = self.target_model.value(next_obs)
        best_v = layers.reduce_max(next_pred_value, dim=-1)
        best_v.stop_gradient = True
        terminal = layers.cast(terminal, dtype="float32")
        target = reward + (1.0 - terminal) * self.gamma * best_v

        pred_value = self.model.value(obs)
        action_onehot = layers.one_hot(action, self.act_dim)
        action_onehot = layers.cast(action_onehot, dtype="float32")
        pred_action_value = layers.reduce_sum(layers.elementwise_mul(
            pred_value, action_onehot),
                                              dim=-1)

        cost = layers.square_error_cost(target, pred_action_value)
        cost = layers.reduce_mean(cost)
        optimizer = fluid.optimizer.Adam(learning_rate=self.lr)
        optimizer.minimize(cost)
        return cost
Example #17
 def _dot_attention(self, input, atten_items):
     """
     args:
         input: (batch, dim), lod_level = 0
         atten_items: (batch*seq_len, dim), lod_level = 1
     return:
         atten_weights: (batch*seq_len, 1), lod_level = 1
     """
     expand_input = layers.sequence_expand(
         input, atten_items)  #(batch*seq_len, dim), lod_level = 0
     expand_input = layers.lod_reset(
         expand_input, atten_items)  #(batch*seq_len, dim), lod_level = 1
     if self._attention_type == 'concat_fc':
         atten_weights = self.atten_fc_op(
             layers.concat([expand_input, atten_items], 1))
     elif self._attention_type == 'dot':
         atten_weights = layers.reduce_sum(
             expand_input * atten_items, dim=1,
             keep_dim=True)  #(batch*seq_len, 1), lod_level = 1
     return atten_weights
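In the 'dot' branch, the attention weight of each candidate item is simply the inner product between the (sequence-expanded) query and that item's vector. Ignoring the LoD bookkeeping, a NumPy sketch for a single sequence with illustrative vectors:

import numpy as np

query = np.array([0.5, 1.0, -0.5])            # one query vector, shape (dim,)
atten_items = np.array([[1.0, 0.0, 0.0],      # candidate items, shape (seq_len, dim)
                        [0.0, 1.0, 0.0],
                        [1.0, 1.0, 1.0]])

# sequence_expand repeats the query for every item of its sequence;
# the weights are then a row-wise inner product
expand_input = np.tile(query, (len(atten_items), 1))
atten_weights = (expand_input * atten_items).sum(axis=1, keepdims=True)
print(atten_weights.ravel())  # [0.5, 1.0, 1.0]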
Example #18
    def learn(self, obs, action, reward):
        """

        :param obs:      [B,4]
        :param action:   [B,1]
        :param reward:   [B,]
        :return:
        """
        act_prob = self.model(obs)  # [B,2]
        # [B, 2] -> [B, ]
        log_prob = layers.reduce_sum(
            -1.0 * layers.log(act_prob) *
            layers.one_hot(action, depth=act_prob.shape[1]),
            dim=1,
            keep_dim=False)
        cost = log_prob * reward
        cost = layers.reduce_mean(cost)

        optimizer = fluid.optimizer.Adam(self.lr)
        optimizer.minimize(cost)
        return cost
Example #19
    def ensemble_predict(self, obs):
        """ ensemble predict:
        1. For actions of all actors, each critic will score them
           and normalize its scores;
        2. For each actor, will calculate its score by 
           average scores given by all critics
        3. choose action of the actor whose score is best
        """
        actor_outputs = []
        for i in range(self.ensemble_num):
            actor_outputs.append(self.models[i].policy(obs))
        batch_actions = layers.concat(actor_outputs, axis=0)
        batch_obs = layers.expand(obs, expand_times=[self.ensemble_num, 1])

        critic_outputs = []
        for i in range(self.ensemble_num):
            critic_output = self.models[i].value(batch_obs, batch_actions)
            critic_output = layers.unsqueeze(critic_output, axes=[1])
            critic_outputs.append(critic_output)
        score_matrix = layers.concat(critic_outputs, axis=1)

        # Normalize scores given by each critic
        sum_critic_score = layers.reduce_sum(score_matrix,
                                             dim=0,
                                             keep_dim=True)
        sum_critic_score = layers.expand(sum_critic_score,
                                         expand_times=[self.ensemble_num, 1])
        norm_score_matrix = score_matrix / sum_critic_score

        actions_mean_score = layers.reduce_mean(norm_score_matrix,
                                                dim=1,
                                                keep_dim=True)
        best_score_id = layers.argmax(actions_mean_score, axis=0)
        best_score_id = layers.cast(best_score_id, dtype='int32')
        ensemble_predict_action = layers.gather(batch_actions, best_score_id)
        return ensemble_predict_action
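The three docstring steps map onto a column-wise normalization of the (actions x critics) score matrix, a row-wise mean, and an argmax. A NumPy sketch with two actors, two critics, and made-up scores (the real code then gathers the winning row from batch_actions):

import numpy as np

# rows: candidate actions, one per actor; columns: critics
score_matrix = np.array([[1.0, 4.0],
                         [3.0, 2.0]])

# 1. normalize each critic's scores so they sum to 1 over the candidate actions
norm_score_matrix = score_matrix / score_matrix.sum(axis=0, keepdims=True)
# 2. score each actor by the mean of its normalized scores across critics
actions_mean_score = norm_score_matrix.mean(axis=1, keepdims=True)
# 3. choose the action whose mean normalized score is highest
best_score_id = actions_mean_score.argmax()
print(actions_mean_score.ravel(), best_score_id)  # [0.458, 0.542] -> actor 1 wins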