Example #1
    def softmax_sampling(self, scores, mask, eta):
        scores = scores * mask
        scores_padded = layers.squeeze(
            fluid_sequence_pad(scores, 0, maxlen=128),
            [2])  # (b*s, 1) -> (b, s, 1) -> (b, s)
        mask_padded = layers.squeeze(fluid_sequence_pad(mask, 0, maxlen=128),
                                     [2])
        seq_lens = fluid_sequence_get_seq_len(scores)

        def normalize(scores_padded, mask_padded):
            mean_S = layers.reduce_sum(scores_padded, dim=1,
                                       keep_dim=True) / layers.reduce_sum(
                                           mask_padded, dim=1, keep_dim=True)
            S = scores_padded - mean_S
            std_S = layers.sqrt(
                layers.reduce_sum(layers.square(S * mask_padded),
                                  dim=1,
                                  keep_dim=True))
            return S / (std_S + self.SAFE_EPS)

        norm_S = normalize(scores_padded, mask_padded)
        # push masked-out positions to a large negative value
        norm_S = norm_S * mask_padded - (mask_padded *
                                         (-1) + 1) * self.BIG_VALUE
        soft_prob = layers.softmax(norm_S / eta) * mask_padded
        sampled_id = layers.reshape(layers.sampling_id(soft_prob), [-1, 1])
        max_id = layers.cast(layers.cast(seq_lens, 'float32') - 1, 'int64')
        sampled_id = layers.elementwise_min(sampled_id, max_id)
        return layers.cast(sampled_id, 'int64')
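For intuition, here is a minimal NumPy sketch of the same masked softmax sampling for a single padded row. It is only an illustration, not fluid code: softmax_sample_row, big_value and eps are stand-ins for the eta argument, self.BIG_VALUE and self.SAFE_EPS used above.

import numpy as np

def softmax_sample_row(scores, mask, eta=1.0, big_value=1e6, eps=1e-8):
    # normalize the valid entries (same recipe as normalize() above)
    s = scores - (scores * mask).sum() / mask.sum()
    std = np.sqrt(((s * mask) ** 2).sum())
    s = s / (std + eps)
    # push masked-out positions to a large negative value, then temperature softmax
    s = s * mask - (1.0 - mask) * big_value
    p = np.exp(s / eta) * mask
    p = p / p.sum()
    return np.random.choice(len(scores), p=p)

scores = np.array([0.2, 1.5, 0.7, 0.0])
mask = np.array([1.0, 1.0, 1.0, 0.0])  # last slot is padding
print(softmax_sample_row(scores, mask, eta=0.5))  # index in {0, 1, 2}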
    def learn(self, obs, action, reward, next_obs, terminal):
        """ 使用DQN算法更新self.model的value网络
        """
        # 从target_model中获取 max Q' 的值,用于计算target_Q
        next_pred_value = self.target_model.value(next_obs)
        best_v = layers.reduce_max(next_pred_value, dim=1)
        best_v.stop_gradient = True  # block gradient propagation
        terminal = layers.cast(terminal, dtype='float32')
        target = reward + (1.0 - terminal) * self.gamma * best_v

        pred_value = self.model.value(obs)  # predicted Q values
        # Convert action to a one-hot vector, e.g. 3 => [0,0,0,1,0]
        action_onehot = layers.one_hot(action, self.act_dim)
        action_onehot = layers.cast(action_onehot, dtype='float32')
        # The next statement multiplies element-wise to pick out Q(s,a) for the taken action,
        # e.g. pred_value = [[2.3, 5.7, 1.2, 3.9, 1.4]], action_onehot = [[0,0,0,1,0]]
        #  ==> pred_action_value = [[3.9]]
        pred_action_value = layers.reduce_sum(layers.elementwise_mul(
            action_onehot, pred_value),
                                              dim=1)

        # The loss is the mean squared error between Q(s,a) and target_Q
        cost = layers.square_error_cost(pred_action_value, target)
        cost = layers.reduce_mean(cost)
        optimizer = fluid.optimizer.Adam(learning_rate=self.lr)  # use the Adam optimizer
        optimizer.minimize(cost)
        return cost
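The one-hot multiply-and-sum above is simply a way to gather Q(s, a) for the taken action without an explicit indexing op. A quick NumPy check of that identity, using the numbers from the comment (NumPy here is for illustration only):

import numpy as np

pred_value = np.array([[2.3, 5.7, 1.2, 3.9, 1.4]])   # Q(s, .) for one sample
action = np.array([3])                               # the action that was taken
action_onehot = np.eye(pred_value.shape[1])[action]  # [[0., 0., 0., 1., 0.]]

pred_action_value = (action_onehot * pred_value).sum(axis=1)
print(pred_action_value)  # [3.9] == Q(s, a=3)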
Example #3
    def define_learn(self, obs, action, reward, next_obs, terminal, weight):
        #Q(s,a|θ)
        pred_value = self.model.value(obs)
        #Q(s',a'|θ')
        targetQ_predict_value = self.target_model.value(next_obs)
        #Q(s',a'|θ)
        next_s_predict_value = self.model.value(next_obs)
        #argMax[Q(s',a'|θ)]
        greedy_action = fluid_argmax(next_s_predict_value)
        predict_onehot = fluid.layers.one_hot(greedy_action, self.action_dim)
        #Q(s',argMax[Q(s',a'|θ)]|θ')
        best_v = fluid.layers.reduce_sum(fluid.layers.elementwise_mul(
            predict_onehot, targetQ_predict_value),
                                         dim=1)
        best_v.stop_gradient = True
        # TD target: R + γ*Q(s',argMax[Q(s',a'|θ)]|θ')
        target = reward + (
            1.0 - layers.cast(terminal, dtype='float32')) * self.gamma * best_v

        action_onehot = layers.one_hot(action, self.action_dim)
        action_onehot = layers.cast(action_onehot, dtype='float32')
        pred_action_value = layers.reduce_sum(layers.elementwise_mul(
            action_onehot, pred_value),
                                              dim=1)

        # compute the new TD error
        newTd = layers.abs(target - pred_action_value)
        cost = layers.square_error_cost(pred_action_value, target)
        # weight is the per-sample weight; it scales how much each sample contributes to the cost
        cost = weight * cost
        cost = layers.reduce_mean(cost)
        optimizer = fluid.optimizer.Adam(self.lr, epsilon=1e-3)
        optimizer.minimize(cost)
        return cost, newTd
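The only difference from the plain DQN `learn` above is how `best_v` is formed: the online network chooses the greedy action, the target network evaluates it. A small NumPy sketch of both targets for a single transition; all numbers (gamma, reward, the two Q rows) are invented for illustration:

import numpy as np

gamma, reward, terminal = 0.9, 1.0, 0.0
q_online_next = np.array([1.0, 4.0, 2.5])  # Q(s', .|theta), online network
q_target_next = np.array([1.2, 2.0, 3.0])  # Q(s', .|theta'), target network

# DQN target: max taken directly over the target network
dqn_target = reward + (1.0 - terminal) * gamma * q_target_next.max()

# Double DQN target: argmax from the online network, value from the target network
a_star = q_online_next.argmax()
ddqn_target = reward + (1.0 - terminal) * gamma * q_target_next[a_star]

print(dqn_target, ddqn_target)  # 3.7 vs 2.8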
Example #4
    def eps_greedy_sampling(self, scores, mask, eps):
        scores = scores * mask
        scores_padded = layers.squeeze(
            fluid_sequence_pad(scores, 0, maxlen=128),
            [2])  # (b*s, 1) -> (b, s, 1) -> (b, s)
        mask_padded = layers.squeeze(fluid_sequence_pad(mask, 0, maxlen=128),
                                     [2])
        seq_lens = fluid_sequence_get_seq_len(scores)

        def get_greedy_prob(scores_padded, mask_padded):
            s = scores_padded - (mask_padded * (-1) + 1) * self.BIG_VALUE
            max_value = layers.reduce_max(s, dim=1, keep_dim=True)
            greedy_prob = layers.cast(s >= max_value, 'float32')
            return greedy_prob

        greedy_prob = get_greedy_prob(scores_padded, mask_padded)
        eps_prob = mask_padded * eps / layers.reduce_sum(
            mask_padded, dim=1, keep_dim=True)

        final_prob = (greedy_prob + eps_prob) * mask_padded
        final_prob = final_prob / layers.reduce_sum(
            final_prob, dim=1, keep_dim=True)

        sampled_id = layers.reshape(layers.sampling_id(final_prob), [-1, 1])
        max_id = layers.cast(layers.cast(seq_lens, 'float32') - 1, 'int64')
        sampled_id = layers.elementwise_min(sampled_id, max_id)
        return layers.cast(sampled_id, 'int64')
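The distribution sampled above mixes a greedy one-hot over the unmasked positions with a uniform epsilon floor, and renormalizes so padded slots get zero probability. A minimal NumPy sketch of that construction for one row (function and constant names are illustrative, not fluid API):

import numpy as np

def eps_greedy_prob_row(scores, mask, eps, big_value=1e6):
    s = scores - (1.0 - mask) * big_value       # masked slots become very negative
    greedy = (s >= s.max()).astype(np.float64)  # 1.0 at the (unmasked) argmax
    eps_floor = mask * eps / mask.sum()         # uniform eps mass over valid slots
    prob = (greedy + eps_floor) * mask
    return prob / prob.sum()

scores = np.array([0.2, 1.5, 0.7, 9.9])
mask = np.array([1.0, 1.0, 1.0, 0.0])           # last slot is padding
print(eps_greedy_prob_row(scores, mask, eps=0.1))  # ~[0.03, 0.94, 0.03, 0.00]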
Example #5
    def _cut_by_decode_len(self, input, decode_len):
        zeros = layers.fill_constant_batch_size_like(input,
                                                     shape=[-1, 1],
                                                     value=0,
                                                     dtype='int64')
        output = layers.sequence_slice(layers.cast(input, 'float32'),
                                       offset=zeros,
                                       length=decode_len)
        return layers.cast(output, input.dtype)
    def dynamic_rnn(self, item_fc, h_0, output_type=None, double_type=None, double_id=None):
        drnn = fluid.layers.DynamicRNN()
        pos = fluid_sequence_get_pos(item_fc)
        with drnn.block():
            cur_item_fc = drnn.step_input(item_fc)
            cur_h_0 = drnn.memory(init=h_0, need_reorder=True)

            cur_item_fc = layers.lod_reset(cur_item_fc, cur_h_0)
            next_h_0 = self.simple_step_rnn(cur_item_fc, h_0=cur_h_0)

            if output_type == 'c_Q':
                Q = self.out_Q_fc2_op(self.out_Q_fc1_op(next_h_0))
                drnn.output(Q)

            elif output_type in ['max_Q', 'double_Q']:
                # batch_size = 2
                # item_fc: lod = [0,4,7]
                # cur_h_0: lod = [0,1,2]
                item_fc = drnn.static_input(item_fc)
                pos = drnn.static_input(pos)
                cur_step = drnn.memory(shape=[1], dtype='int64', value=0)

                expand_h_0 = layers.sequence_expand(cur_h_0, item_fc)               # lod = [0,1,2,3,4,5,6,7]
                new_item_fc = layers.lod_reset(item_fc, expand_h_0)                 # lod = [0,1,2,3,4,5,6,7]
                next_expand_h_0 = self.simple_step_rnn(new_item_fc, expand_h_0)     # lod = [0,1,2,3,4,5,6,7]
                next_expand_h_0 = layers.lod_reset(next_expand_h_0, item_fc)        # lod = [0,4,7]

                expand_Q = self.out_Q_fc2_op(self.out_Q_fc1_op(next_expand_h_0))
                cur_step_id = layers.slice(cur_step, axes=[0, 1], starts=[0, 0], ends=[1, 1])
                mask = layers.cast(pos >= cur_step_id, 'float32')
                expand_Q = expand_Q * mask

                if output_type == 'max_Q':
                    max_Q = layers.sequence_pool(expand_Q, 'max')                       # lod = [0,1,2]
                    drnn.output(max_Q)
                elif output_type == 'double_Q':
                    if double_type == 'max_id':
                        max_id = self.eps_greedy_sampling(expand_Q, mask, eps=0)
                        drnn.output(max_id)
                    elif double_type == 'double_Q':
                        cur_double_id = drnn.step_input(double_id)

                        double_Q = fluid_sequence_index(expand_Q, cur_double_id)
                        drnn.output(double_Q)

                # update
                next_step = cur_step + 1
                drnn.update_memory(cur_step, next_step)

            elif output_type == 'hidden':
                drnn.output(next_h_0)                

            else:
                raise NotImplementedError(output_type)

            # update
            drnn.update_memory(cur_h_0, next_h_0)

        drnn_output = drnn()
        return drnn_output
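The lod annotations in the comments describe variable-length sequences: lod = [0, 4, 7] means two sequences holding items 0-3 and 4-6. sequence_expand then repeats each batch-level row to match those lengths. A rough NumPy sketch of that behaviour (expand_by_lod is an illustrative helper, not fluid's API):

import numpy as np

def expand_by_lod(x, lod):
    # repeat row i of x (lod[i+1] - lod[i]) times, mimicking layers.sequence_expand
    lengths = np.diff(lod)
    return np.repeat(x, lengths, axis=0)

h_0 = np.array([[0.1, 0.2],    # hidden state of sequence 0
                [0.3, 0.4]])   # hidden state of sequence 1
item_lod = [0, 4, 7]           # two sequences with 4 and 3 items
expanded = expand_by_lod(h_0, item_lod)
print(expanded.shape)          # (7, 2): first row repeated 4x, second 3x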
Example #7
    def _ensemble_predict(self, obs):
        actor_outputs = []
        for i in range(self.ensemble_num):
            actor_outputs.append(self.actors[i].predict(obs))
        batch_actions = layers.concat(actor_outputs, axis=0)
        batch_obs = layers.expand(obs, expand_times=[self.ensemble_num, 1])

        critic_outputs = []
        for i in range(self.ensemble_num):
            critic_output = self.critics[i].predict(batch_obs, batch_actions)
            critic_output = layers.unsqueeze(critic_output, axes=[1])
            critic_outputs.append(critic_output)
        score_matrix = layers.concat(critic_outputs, axis=1)

        # Normalize scores given by each critic
        sum_critic_score = layers.reduce_sum(
            score_matrix, dim=0, keep_dim=True)
        sum_critic_score = layers.expand(
            sum_critic_score, expand_times=[self.ensemble_num, 1])
        norm_score_matrix = score_matrix / sum_critic_score

        actions_mean_score = layers.reduce_mean(
            norm_score_matrix, dim=1, keep_dim=True)
        best_score_id = layers.argmax(actions_mean_score, axis=0)
        best_score_id = layers.cast(best_score_id, dtype='int32')
        ensemble_predict_action = layers.gather(batch_actions, best_score_id)
        ensemble_predict_action = layers.squeeze(
            ensemble_predict_action, axes=[0])
        return ensemble_predict_action
Example #8
    def learn(self, obs, action, reward, next_obs, terminal):
        next_pred_value = self.target_model.value(next_obs)
        best_v = layers.reduce_max(next_pred_value, dim=-1)
        best_v.stop_gradient = True
        terminal = layers.cast(terminal, dtype="float32")
        target = reward + (1.0 - terminal) * self.gamma * best_v

        pred_value = self.model.value(obs)
        action_onehot = layers.one_hot(action, self.act_dim)
        action_onehot = layers.cast(action_onehot, dtype="float32")
        pred_action_value = layers.reduce_sum(layers.elementwise_mul(
            pred_value, action_onehot),
                                              dim=-1)

        cost = layers.square_error_cost(target, pred_action_value)
        cost = layers.reduce_mean(cost)
        optimizer = fluid.optimizer.Adam(learning_rate=self.lr)
        optimizer.minimize(cost)
        return cost
Example #9
    def learn(self, obs, action, reward, next_obs, terminal):
        '''
        :param obs: St
        :param action: At
        :param reward: Rt+1
        :param next_obs: St+1
        :param terminal: done, True means the episode has ended
        :return: value of the loss function
        '''

        # Compute target_Q with the target network
        target_Q_tensor = self.target_model.value(next_obs)  # value vector for St+1
        max_Q = layers.reduce_max(target_Q_tensor, dim=1)  # row-wise max, reducing along dim=1
        max_Q.stop_gradient = True  # stop gradient updates

        # terminal is a tensor rather than a scalar, so it cannot be used directly in a conditional
        terminal = layers.cast(terminal, dtype="float32")
        target_Q = reward + (1.0 - terminal) * self.gamma * max_Q

        # Compute predict_Q with the main network
        predict_Q_tensor = self.model.value(obs)
        # Convert action to a one-hot vector and cast it to float
        action_onehot = layers.one_hot(action, self.act_dim)
        action_onehot = layers.cast(action_onehot, dtype="float32")
        # Element-wise multiply, then reduce the tensor rank,
        # e.g. predict_Q_tensor = [[2.3, 5.7, 1.2, 3.9, 1.4],  action_onehot = [[0, 0, 0, 1, 0]
        #                          [2.1, 3.7, 4.5, 6.7, 7.1]]                   [0, 1, 0, 0, 0]]
        # the element-wise product is [[0, 0, 0, 3.9, 0]
        #                              [0, 3.7, 0, 0, 0]]
        # and reduce_sum over dim=1 gives [3.9, 3.7]
        predict_Q = layers.reduce_sum(layers.elementwise_mul(
            action_onehot, predict_Q_tensor),
                                      dim=1)

        # Average the loss over every sample in the batch
        cost = layers.square_error_cost(predict_Q, target_Q)
        cost = layers.reduce_mean(cost)

        # Declare the optimizer (Adam)
        optimizer = fluid.optimizer.Adam(learning_rate=self.lr)
        optimizer.minimize(cost)  # specify the optimization target

        return cost
Example #10
    def train_rnn(self,
                  item_fc,
                  atten_item_fc,
                  h_0,
                  pos,
                  pos_embed,
                  output_type=''):
        shifted_item_fc = fluid_sequence_advance(item_fc, OOV=0)
        drnn = fluid.layers.DynamicRNN()
        with drnn.block():
            cur_item_fc = drnn.step_input(shifted_item_fc)
            cur_pos_embed = drnn.step_input(pos_embed)
            cur_h_0 = drnn.memory(init=h_0, need_reorder=True)

            # step_input will remove lod info
            cur_item_fc = layers.lod_reset(cur_item_fc, cur_h_0)
            cur_pos_embed = layers.lod_reset(cur_pos_embed, cur_h_0)

            next_h_0, hidden_fc = self.sampling_rnn_forward(
                cur_item_fc, cur_h_0, cur_pos_embed)

            if output_type == 'c_Q':
                cur_atten_item_fc = drnn.step_input(atten_item_fc)
                cur_atten_item_fc = layers.lod_reset(cur_atten_item_fc,
                                                     cur_h_0)

                Q = layers.reduce_sum(hidden_fc * cur_atten_item_fc,
                                      dim=1,
                                      keep_dim=True)
                drnn.output(Q)

            elif output_type == 'max_Q':
                cur_pos = drnn.step_input(pos)
                pos = drnn.static_input(pos)
                atten_item_fc = drnn.static_input(atten_item_fc)

                expand_Q = self._dot_attention(hidden_fc, atten_item_fc)

                cur_step_id = layers.slice(cur_pos,
                                           axes=[0, 1],
                                           starts=[0, 0],
                                           ends=[1, 1])
                mask = layers.cast(pos >= cur_step_id, 'float32')
                expand_Q = expand_Q * mask
                max_Q = layers.sequence_pool(expand_Q, 'max')
                drnn.output(max_Q)

            else:
                raise NotImplementedError(output_type)

            # update
            drnn.update_memory(cur_h_0, next_h_0)

        drnn_output = drnn()
        return drnn_output
Example #11
    def train_rnn(self, item_fc, h_0, pos, pos_embed, output_type=''):
        drnn = fluid.layers.DynamicRNN()
        with drnn.block():
            cur_item_fc = drnn.step_input(item_fc)
            cur_pos_embed = drnn.step_input(pos_embed)
            cur_h_0 = drnn.memory(init=h_0, need_reorder=True)

            # step_input will remove lod info
            cur_item_fc = layers.lod_reset(cur_item_fc, cur_h_0)
            cur_pos_embed = layers.lod_reset(cur_pos_embed, cur_h_0)

            next_h_0, Q = self.sampling_rnn_forward(cur_item_fc, cur_h_0,
                                                    cur_pos_embed)

            if output_type == 'c_Q':
                drnn.output(Q)

            elif output_type == 'max_Q':
                # e.g. batch_size = 2
                # cur_h_0: lod = [0,1,2]
                cur_pos = drnn.step_input(pos)
                pos = drnn.static_input(pos)  # lod = [0,4,7]
                item_fc = drnn.static_input(item_fc)  # lod = [0,4,7]

                # expand
                expand_h_0 = layers.sequence_expand(
                    cur_h_0, item_fc)  # lod = [0,1,2,3,4,5,6,7]
                expand_pos_embed = layers.sequence_expand(
                    cur_pos_embed, item_fc)  # lod = [0,1,2,3,4,5,6,7]
                expand_item_fc = layers.lod_reset(item_fc, expand_h_0)
                # forward
                _, expand_scores = self.sampling_rnn_forward(
                    expand_item_fc, expand_h_0, expand_pos_embed)
                # reset result lod
                expand_Q = layers.lod_reset(expand_scores,
                                            item_fc)  # lod = [0,4,7]

                cur_step_id = layers.slice(cur_pos,
                                           axes=[0, 1],
                                           starts=[0, 0],
                                           ends=[1, 1])
                mask = layers.cast(pos >= cur_step_id, 'float32')
                expand_Q = expand_Q * mask
                max_Q = layers.sequence_pool(expand_Q, 'max')  # lod = [0,1,2]
                drnn.output(max_Q)

            else:
                raise NotImplementedError(output_type)

            # update
            drnn.update_memory(cur_h_0, next_h_0)

        drnn_output = drnn()
        return drnn_output
Example #12
    def _critic_learn(self, obs, action, reward, next_obs, terminal):
        next_action = self.target_model.policy(next_obs)
        next_Q = self.target_model.value(next_obs, next_action)

        terminal = layers.cast(terminal, dtype='float32')
        target_Q = reward + (1.0 - terminal) * self.gamma * next_Q
        target_Q.stop_gradient = True

        Q = self.model.value(obs, action)
        cost = layers.square_error_cost(Q, target_Q)
        cost = layers.reduce_mean(cost)
        optimizer = fluid.optimizer.AdamOptimizer(self.critic_lr)
        optimizer.minimize(cost)
        return cost
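For reference, the bootstrapped target built above is target_Q = r + (1 - done) * gamma * Q'(s', pi'(s')), with gradients blocked through the target networks. A tiny NumPy sketch of that arithmetic on a batch of two transitions (all numbers are made up):

import numpy as np

gamma = 0.99
reward = np.array([1.0, 0.5])
terminal = np.array([0.0, 1.0])   # the second transition ends its episode
next_Q = np.array([10.0, 10.0])   # Q'(s', pi'(s')) from the target networks

target_Q = reward + (1.0 - terminal) * gamma * next_Q
print(target_Q)                   # [10.9  0.5]: no bootstrap for the terminal state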
Example #13
    def train(self):
        """train"""
        inputs = self.model.create_inputs(mode='train')
        output_dict = self.model.forward(inputs, mode='train')

        total_loss = 0
        if 'click' in self._output_type:
            click_id = inputs['click_id']
            click_prob = output_dict['click_prob']
            click_loss = layers.reduce_mean(
                layers.cross_entropy(input=click_prob, label=click_id))
            total_loss += click_loss
        if 'credit' in self._output_type:
            credit = inputs['credit'] * self._credit_scale
            credit_pred = output_dict['credit_pred']
            credit_loss = layers.reduce_mean(
                layers.square_error_cost(input=credit_pred, label=credit))
            total_loss += credit_loss
        if 'rate' in self._output_type:
            rate = layers.cast(inputs['click_id'],
                               'float32') * self._rate_scale
            rate_pred = output_dict['rate_pred']
            rate_loss = layers.reduce_mean(
                layers.square_error_cost(input=rate_pred, label=rate))
            total_loss += rate_loss

        if self.optimizer == 'Adam':
            optimizer = fluid.optimizer.Adam(learning_rate=self.lr,
                                             epsilon=1e-4)
        elif self.optimizer == 'SGD':
            optimizer = fluid.optimizer.SGD(learning_rate=self.lr)
        optimizer.minimize(total_loss)

        fetch_dict = OrderedDict()
        fetch_dict['loss'] = total_loss  # don't rename 'loss'; it is used by the parallel executor in the computation task
        if 'click' in self._output_type:
            fetch_dict['click_prob'] = click_prob
            fetch_dict['click_id'] = click_id
            fetch_dict['click_loss'] = click_loss
        if 'credit' in self._output_type:
            fetch_dict['credit_pred'] = credit_pred / self._credit_scale
            fetch_dict['credit'] = credit / self._credit_scale
            fetch_dict['credit_loss'] = credit_loss
        if 'rate' in self._output_type:
            fetch_dict['rate_pred'] = rate_pred / self._rate_scale
            fetch_dict['rate'] = rate / self._rate_scale
            fetch_dict['rate_loss'] = rate_loss
        return {'fetch_dict': fetch_dict}
Example #14
    def test(self):
        """test"""
        inputs = self.model.create_inputs(mode='train')
        click_id = layers.cast(inputs['click_id'], 'float32') * self._reward_scale

        output_dict = self.model.forward(inputs, output_type='c_Q')
        c_Q = output_dict['Q']
        target_Q = self.get_target_Q(inputs, click_id)
        loss = layers.reduce_mean(layers.square_error_cost(c_Q, target_Q))

        fetch_dict = OrderedDict()
        fetch_dict['loss'] = loss
        fetch_dict['c_Q'] = c_Q / self._reward_scale
        fetch_dict['click_id'] = click_id / self._reward_scale
        return {'fetch_dict': fetch_dict}
Example #15
    def test(self):
        """test"""
        inputs = self.model.create_inputs(mode='train')
        reward = layers.cast(inputs['reward'], 'float32')

        c_Q = self.model.forward(inputs, output_type='c_Q')
        max_Q = self.target_model.forward(inputs, output_type='max_Q')
        target_Q = self.get_target_Q(max_Q, reward)

        loss = layers.reduce_mean(layers.square_error_cost(c_Q, target_Q))

        fetch_dict = OrderedDict()
        fetch_dict['loss'] = loss
        fetch_dict['c_Q'] = c_Q
        fetch_dict['reward'] = reward
        return {'fetch_dict': fetch_dict}
Example #16
    def _critic_learn(self, obs, action, reward, next_obs, terminal):
        next_action = self.target_model.policy(next_obs)
        next_Q = self.target_model.value(next_obs, next_action)

        terminal = layers.cast(terminal, dtype='float32')
        target_Q = reward + (1.0 - terminal) * self.gamma * next_Q
        target_Q.stop_gradient = True

        Q = self.model.value(obs, action)
        cost = layers.square_error_cost(Q, target_Q)
        cost = layers.reduce_mean(cost)

        # optimizer = fluid.optimizer.AdamOptimizer(self.critic_lrvalue)
        optimizer = fluid.optimizer.AdamOptimizer(
            learning_rate=fluid.layers.piecewise_decay(
                boundaries=self.boundaries, values=self.critic_lrvalue),
            regularization=fluid.regularizer.L2Decay(1e-4))

        optimizer.minimize(cost)
        return cost
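piecewise_decay gives a stepwise schedule: the learning rate stays at values[i] while the global step is below boundaries[i], and settles on the last value afterwards (so len(values) == len(boundaries) + 1). A plain-Python sketch of that lookup; the boundary and value numbers below are invented:

def piecewise_lr(step, boundaries, values):
    # values has one more entry than boundaries
    for boundary, value in zip(boundaries, values):
        if step < boundary:
            return value
    return values[-1]

boundaries = [10000, 20000]
values = [1e-3, 5e-4, 1e-4]
for step in (0, 10000, 25000):
    print(step, piecewise_lr(step, boundaries, values))  # 1e-3, then 5e-4, then 1e-4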
Example #17
    def train(self):
        """train"""
        inputs = self.model.create_inputs(mode='train')
        reward = layers.cast(inputs['reward'], 'float32')

        c_Q = self.model.forward(inputs, output_type='c_Q')
        max_Q = self.target_model.forward(inputs, output_type='max_Q')
        target_Q = self.get_target_Q(max_Q, reward)
        loss = layers.reduce_mean(layers.square_error_cost(c_Q, target_Q))

        if self.optimizer == 'Adam':
            optimizer = fluid.optimizer.Adam(learning_rate=self.lr, epsilon=1e-4)
        elif self.optimizer == 'SGD':
            optimizer = fluid.optimizer.SGD(learning_rate=self.lr)
        optimizer.minimize(loss)

        fetch_dict = OrderedDict()
        fetch_dict['loss'] = loss             # don't rename 'loss'; it is used by the parallel executor in the computation task
        fetch_dict['c_Q'] = c_Q
        fetch_dict['reward'] = reward
        return {'fetch_dict': fetch_dict}
Example #18
    def train(self):
        """train"""
        inputs = self.model.create_inputs(mode='train')
        click_id = layers.cast(inputs['click_id'], 'float32') * self._reward_scale

        def train_actor(inputs):
            output_dict = self.model.forward(inputs, output_type='max_Q')
            max_Q = output_dict['Q']
            actor_loss = layers.reduce_mean(-1.0 * max_Q)
            actor_lr = self.lr * 0.1    # actor lr should be smaller than critic lr, so critic can learn faster
            if self.optimizer == 'Adam':
                optimizer = fluid.optimizer.Adam(learning_rate=actor_lr, epsilon=1e-4)
            elif self.optimizer == 'SGD':
                optimizer = fluid.optimizer.SGD(learning_rate=actor_lr)
            optimizer.minimize(actor_loss, parameter_list=self.model.actor_param_names)
            return actor_loss

        def train_critic(inputs, click_id):
            output_dict = self.model.forward(inputs, output_type='c_Q')
            c_Q = output_dict['Q']
            target_Q = self.get_target_Q(inputs, click_id)
            target_Q.stop_gradient = True
            critic_loss = layers.reduce_mean(layers.square_error_cost(c_Q, target_Q))
            if self.optimizer == 'Adam':
                optimizer = fluid.optimizer.Adam(learning_rate=self.lr, epsilon=1e-4)
            elif self.optimizer == 'SGD':
                optimizer = fluid.optimizer.SGD(learning_rate=self.lr)
            optimizer.minimize(critic_loss)
            return critic_loss

        actor_loss = train_actor(inputs)
        critic_loss = train_critic(inputs, click_id)
        loss = actor_loss + critic_loss

        fetch_dict = OrderedDict()
        fetch_dict['loss'] = loss             # don't rename 'loss'; it is used by the parallel executor in the computation task
        fetch_dict['actor_loss'] = actor_loss
        fetch_dict['critic_loss'] = critic_loss
        # fetch_dict['click_id'] = click_id / self._reward_scale
        return {'fetch_dict': fetch_dict}
Example #19
    def train(self):
        """train"""
        inputs = self.model.create_inputs(mode='train')
        click_id = layers.cast(inputs['click_id'], 'float32') * self._reward_scale

        output_dict = self.model.forward(inputs, output_type='c_Q')
        c_Q = output_dict['Q']
        target_Q = self.get_target_Q(inputs, click_id)
        target_Q.stop_gradient = True
        loss = layers.reduce_mean(layers.square_error_cost(c_Q, target_Q))

        if self.optimizer == 'Adam':
            optimizer = fluid.optimizer.Adam(learning_rate=self.lr, epsilon=1e-4)
        elif self.optimizer == 'SGD':
            optimizer = fluid.optimizer.SGD(learning_rate=self.lr)
        optimizer.minimize(loss)

        fetch_dict = OrderedDict()
        fetch_dict['loss'] = loss             # don't rename 'loss'; it is used by the parallel executor in the computation task
        fetch_dict['c_Q'] = c_Q / self._reward_scale
        fetch_dict['click_id'] = click_id / self._reward_scale
        return {'fetch_dict': fetch_dict}
Example #20
    def test(self):
        """test"""
        inputs = self.model.create_inputs(mode='test')
        output_dict = self.model.forward(inputs, mode='test')

        fetch_dict = OrderedDict()
        if 'click' in self._output_type:
            fetch_dict['click_prob'] = output_dict['click_prob']
            fetch_dict['click_id'] = inputs['click_id'] + layers.reduce_mean(
                output_dict['click_prob']
            ) * 0  # IMPORTANT!!! effectively label = label; otherwise the parallel executor won't fetch this variable
        if 'credit' in self._output_type:
            fetch_dict['credit_pred'] = output_dict[
                'credit_pred'] / self._credit_scale
            fetch_dict['credit'] = inputs['credit'] + layers.reduce_mean(
                output_dict['credit_pred']) * 0
        if 'rate' in self._output_type:
            fetch_dict[
                'rate_pred'] = output_dict['rate_pred'] / self._rate_scale
            fetch_dict['rate'] = layers.cast(inputs['click_id'], 'float32') \
                                 + layers.reduce_mean(output_dict['rate_pred']) * 0
        return {'fetch_dict': fetch_dict}
Example #21
    def ensemble_predict(self, obs):
        """ ensemble predict:
        1. For actions of all actors, each critic will score them
           and normalize its scores;
        2. For each actor, will calculate its score by 
           average scores given by all critics
        3. choose action of the actor whose score is best
        """
        actor_outputs = []
        for i in range(self.ensemble_num):
            actor_outputs.append(self.models[i].policy(obs))
        batch_actions = layers.concat(actor_outputs, axis=0)
        batch_obs = layers.expand(obs, expand_times=[self.ensemble_num, 1])

        critic_outputs = []
        for i in range(self.ensemble_num):
            critic_output = self.models[i].value(batch_obs, batch_actions)
            critic_output = layers.unsqueeze(critic_output, axes=[1])
            critic_outputs.append(critic_output)
        score_matrix = layers.concat(critic_outputs, axis=1)

        # Normalize scores given by each critic
        sum_critic_score = layers.reduce_sum(score_matrix,
                                             dim=0,
                                             keep_dim=True)
        sum_critic_score = layers.expand(sum_critic_score,
                                         expand_times=[self.ensemble_num, 1])
        norm_score_matrix = score_matrix / sum_critic_score

        actions_mean_score = layers.reduce_mean(norm_score_matrix,
                                                dim=1,
                                                keep_dim=True)
        best_score_id = layers.argmax(actions_mean_score, axis=0)
        best_score_id = layers.cast(best_score_id, dtype='int32')
        ensemble_predict_action = layers.gather(batch_actions, best_score_id)
        return ensemble_predict_action
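A small NumPy sketch of the scoring scheme described in the docstring: normalize each critic's column of scores, average across critics per candidate action, then pick the best actor's action. The score matrix below is invented, with rows as candidate actions (one per actor) and columns as critics:

import numpy as np

# rows: candidate actions (one per actor), columns: critics
score_matrix = np.array([[2.0, 1.0, 3.0],
                         [4.0, 1.5, 2.0],
                         [6.0, 2.5, 5.0]])

# normalize each critic's scores so every column sums to 1
norm_scores = score_matrix / score_matrix.sum(axis=0, keepdims=True)

# each actor's score is the mean of its normalized scores across critics
action_scores = norm_scores.mean(axis=1)

best = int(action_scores.argmax())
print(best, action_scores)  # actor 2 wins in this made-up example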
    def test_param_sharing(self):
        """
        Test case for parameter sharing between layers of the same type
        """
        net = MyNetWork()
        ## we bind the parameters of embedding to those of fc1
        batch_size = 10
        dict_size = 100
        input_cx = np.random.uniform(0, 1, [batch_size, 100]).astype("float32")
        input_x = np.random.randint(dict_size,
                                    size=(batch_size, 1)).astype("int64")
        #################################

        main_program1 = fluid.Program()
        with fluid.program_guard(main_program1):
            x = layers.data(name='x', shape=[100], dtype="float32")
            y1 = net.fc1(input=x)
            y11 = net.fc1(input=x)
            y2 = net.fc2(input=x)
            y3 = net.fc3(input=x)
            y4 = net.fc4(input=x)

        main_program2 = fluid.Program()
        with fluid.program_guard(main_program2):
            x_ = layers.data(name='x', shape=[1], dtype="int64")
            cx_ = layers.cast(x=layers.one_hot(input=x_, depth=dict_size),
                              dtype="float32")
            y1_ = net.fc1(input=cx_)
            y2_ = net.embedding(input=x_)

            x1_ = layers.data(name='x1', shape=[100], dtype="float32")
            y3_ = net.fc1(input=x1_)

        #### we run the startup program only once to make sure
        #### the parameters are initialized only once across the two programs
        place = fluid.CPUPlace()
        exe = fluid.Executor(place)
        exe.run(fluid.default_startup_program())
        ######################################################

        outputs = exe.run(main_program1,
                          feed={"x": input_cx},
                          fetch_list=[y1, y11, y2, y3, y4])
        old_y1 = outputs[0]
        self.assertEqual(np.sum(outputs[0].flatten()),
                         np.sum(outputs[1].flatten()))
        self.assertNotEqual(np.sum(outputs[1].flatten()),
                            np.sum(outputs[2].flatten()))
        self.assertNotEqual(np.sum(outputs[3].flatten()),
                            np.sum(outputs[4].flatten()))

        outputs = exe.run(main_program2,
                          feed={
                              'x': input_x,
                              'x1': input_cx
                          },
                          fetch_list=[y1_, y2_, y3_])

        ### test two different layers sharing the same parameter matrix
        self.assertEqual(np.sum(outputs[0].flatten()),
                         np.sum(outputs[1].flatten()))
        ### test if the same layer can have the same parameters across two different programs
        self.assertEqual(np.sum(outputs[2].flatten()),
                         np.sum(old_y1.flatten()))
Example #23
    def get_greedy_prob(scores_padded, mask_padded):
        s = scores_padded - (mask_padded * (-1) + 1) * self.BIG_VALUE
        max_value = layers.reduce_max(s, dim=1, keep_dim=True)
        greedy_prob = layers.cast(s >= max_value, 'float32')
        return greedy_prob