Example #1
    def create_mask(self, qlen, mlen):
        """
        Creates causal attention mask. Float mask where 1.0 indicates masked, 0.0 indicates not-masked.

        Args:
            qlen: Query sequence length.
            mlen: Memory length.

        ::

                  same_length=False:      same_length=True:
                  <mlen > <  qlen >       <mlen > <  qlen >
               ^ [0 0 0 0 0 1 1 1 1]     [0 0 0 0 0 1 1 1 1]
                 [0 0 0 0 0 0 1 1 1]     [1 0 0 0 0 0 1 1 1]
            qlen [0 0 0 0 0 0 0 1 1]     [1 1 0 0 0 0 0 1 1]
                 [0 0 0 0 0 0 0 0 1]     [1 1 1 0 0 0 0 0 1]
               v [0 0 0 0 0 0 0 0 0]     [1 1 1 1 0 0 0 0 0]

        """
        attn_mask = paddle.ones([qlen, qlen])
        mask_up = paddle.triu(attn_mask, diagonal=1)
        attn_mask_pad = paddle.zeros([qlen, mlen])
        ret = paddle.concat([attn_mask_pad, mask_up], axis=1)
        if self.same_length:
            mask_lo = paddle.tril(attn_mask, diagonal=-1)
            ret = paddle.concat([ret[:, :qlen] + mask_lo, ret[:, qlen:]],
                                axis=1)

        return ret
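
A minimal standalone sketch (not part of the class above) of the same masking pattern with plain paddle ops, assuming qlen=4, mlen=3 and same_length=False:

import paddle

qlen, mlen = 4, 3
attn_mask = paddle.ones([qlen, qlen])
mask_up = paddle.triu(attn_mask, diagonal=1)           # strictly upper triangle marks future positions
attn_mask_pad = paddle.zeros([qlen, mlen])             # memory positions stay visible
ret = paddle.concat([attn_mask_pad, mask_up], axis=1)  # [qlen, mlen + qlen]
print(ret.numpy())  # 1.0 marks masked positions, matching the docstring diagram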
Example #2
    def seq2feats(self, log_seqs, time_matrices):
        seqs = self.item_emb(log_seqs)
        seqs *= self.item_emb._embedding_dim**0.5
        seqs = self.item_emb_dropout(seqs)
        positions = paddle.arange(log_seqs.shape[1]).unsqueeze(0).expand(
            [log_seqs.shape[0], -1])
        abs_pos_K = self.abs_pos_K_emb(positions)
        abs_pos_V = self.abs_pos_V_emb(positions)
        abs_pos_K = self.abs_pos_K_emb_dropout(abs_pos_K)
        abs_pos_V = self.abs_pos_V_emb_dropout(abs_pos_V)

        time_matrix_K = self.time_matrix_K_emb(time_matrices)
        time_matrix_V = self.time_matrix_V_emb(time_matrices)
        time_matrix_K = self.time_matrix_K_dropout(time_matrix_K)
        time_matrix_V = self.time_matrix_V_dropout(time_matrix_V)

        # mask item id 0 (padding placeholder) in log_seqs
        # would be easier if item 0 could be treated as an exception during training
        timeline_mask = log_seqs == 0
        seqs *= (log_seqs != 0).astype(paddle.get_default_dtype()).unsqueeze(
            -1)  # broadcast in last dim

        tl = seqs.shape[1]  # time dimension length, used to enforce causality
        attention_mask = (
            paddle.tril(paddle.ones([tl, tl])) == 0).astype(paddle.bool)

        for i in range(len(self.attention_layers)):
            # Self-attention, Q=layernorm(seqs), K=V=seqs
            # seqs = paddle.transpose(seqs, 0, 1) # (N, T, C) -> (T, N, C)
            Q = self.attention_layernorms[i](seqs)
            mha_outputs = self.attention_layers[i](
                Q, seqs, timeline_mask, attention_mask, time_matrix_K,
                time_matrix_V, abs_pos_K, abs_pos_V)
            seqs = Q + mha_outputs
            # seqs = paddle.transpose(seqs, 0, 1) # (T, N, C) -> (N, T, C)

            # Point-wise feed-forward, implemented as two Conv1D layers for channel-wise fusion
            seqs = self.forward_layernorms[i](seqs)
            seqs = self.forward_layers[i](seqs)

            seqs *= (timeline_mask.astype(int) == 0
                     ).astype(paddle.get_default_dtype()).unsqueeze(-1)

        log_feats = self.last_layernorm(seqs)

        return log_feats
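
A minimal sketch (toy size, assuming tl=3) of the boolean causal mask built above; True marks positions that self-attention is not allowed to attend to:

import paddle

tl = 3
attention_mask = (paddle.tril(paddle.ones([tl, tl])) == 0).astype(paddle.bool)
print(attention_mask.numpy())
# [[False  True  True]
#  [False False  True]
#  [False False False]]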
Example #3
    def _rel_shift(self, x, zero_triu=False):
        x_shape = x.shape
        zero_pad = paddle.zeros(
            [x_shape[0], x_shape[1], x_shape[2], 1], dtype=x.dtype)
        x_padded = paddle.concat([zero_pad, x], axis=-1)

        x_padded = paddle.reshape(
            x_padded,
            shape=[x_shape[0], x_shape[1], x_shape[3] + 1, x_shape[2]])

        x = paddle.reshape(x_padded[:, :, 1:, :], shape=x_shape)

        if zero_triu:
            # zero out entries above the shifted diagonal; the [qlen, klen] mask is
            # unsqueezed to [1, 1, qlen, klen] so it broadcasts over the batch and head dims
            ones = paddle.ones([x_shape[2], x_shape[3]])
            x = x * paddle.tril(
                ones, diagonal=x_shape[3] - x_shape[2]).unsqueeze([0, 1])

        return x
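
A small standalone check of the relative-shift trick above, assuming a toy input of shape [batch=1, heads=1, qlen=2, klen=3]:

import paddle

x = paddle.arange(6, dtype='float32').reshape([1, 1, 2, 3])    # last two dims: [[0, 1, 2], [3, 4, 5]]
zero_pad = paddle.zeros([1, 1, 2, 1], dtype=x.dtype)
x_padded = paddle.concat([zero_pad, x], axis=-1)               # [1, 1, 2, 4]
x_padded = paddle.reshape(x_padded, shape=[1, 1, 4, 2])        # fold the pad column across rows
shifted = paddle.reshape(x_padded[:, :, 1:, :], shape=[1, 1, 2, 3])
print(shifted.numpy())  # last two dims: [[1, 2, 0], [3, 4, 5]]; row i is shifted left by (qlen - 1 - i)

The element that wraps around to the end of a shifted row is junk and is typically masked downstream (e.g. by zero_triu or the attention mask).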
Example #4
def future_mask(time_steps, dtype="bool"):
    """Generate lower triangular mask.
    
    It is used in the transformer decoder to prevent the decoder from seeing
    future information.

    Parameters
    ----------
    time_steps : int
        Decoder time steps.
    dtype : str, optional
        The data type of the generated mask, by default "bool".

    Returns
    -------
    Tensor
        The generated mask.
    """
    mask = paddle.tril(paddle.ones([time_steps, time_steps]))
    return paddle.cast(mask, dtype)
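
A usage sketch for the helper above, with an assumed 4-step decoder:

import paddle

mask = future_mask(4)  # bool, True on and below the diagonal
print(mask.astype('int32').numpy())
# [[1 0 0 0]
#  [1 1 0 0]
#  [1 1 1 0]
#  [1 1 1 1]]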
Example #5
    def forward(self, x, kv_cache=None):
        self.seq_len = x.shape[1]
        x = self.query_key_value(x)
        q, k, v = x.split(num_or_sections=3, axis=2)

        q = self.split_heads(q)
        k = self.split_heads(k)
        v = self.split_heads(v)

        if kv_cache is not None:
            pk, pv = paddle.unstack(kv_cache, axis=1)
            k = paddle.concat([pk, k], axis=-2)
            v = paddle.concat([pv, v], axis=-2)
        cached_kv = paddle.stack([k, v], axis=1)

        attn = paddle.matmul(q, k, transpose_y=True)  # [B, N, L, S]
        attn = attn / math.sqrt(self.size_per_head)

        # [L, S]
        attention_mask = paddle.tril(
            paddle.ones([self.seq_len, self.seq_len], 'float32'))
        attention_mask = attention_mask.reshape(
            [1, 1, self.seq_len, self.seq_len])

        # adding a large negative value before the softmax -> it's like removing those positions entirely
        attn = attn * attention_mask - 10000.0 * (1.0 - attention_mask)
        attn = nn.Softmax(axis=-1)(attn)
        attn = self.attn_drop(attn)

        y = paddle.matmul(attn, v)
        # [B, N, L, S] -> [B, L, N, S]
        y = y.transpose((0, 2, 1, 3))
        y = paddle.reshape(y, [-1, self.seq_len, self.embedding_size])
        y = self.resid_drop(self.dense(y))

        return y, cached_kv
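
A minimal sketch (toy values) of the comment above: multiplying by the lower-triangular mask and subtracting 10000 at the masked positions drives their softmax weights to effectively zero:

import paddle
import paddle.nn as nn

seq_len = 3
attn = paddle.ones([1, 1, seq_len, seq_len])  # uniform toy scores
mask = paddle.tril(paddle.ones([seq_len, seq_len], 'float32'))
mask = mask.reshape([1, 1, seq_len, seq_len])
attn = attn * mask - 10000.0 * (1.0 - mask)
print(nn.Softmax(axis=-1)(attn).numpy())  # upper-triangle weights are ~0; each row still sums to 1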
Example #6
    def _forward(self, dec_inputs, mems=None):
        bsz, qlen = dec_inputs.shape

        word_emb = self.word_emb(dec_inputs)

        mlen = mems[0].shape[1] if mems is not None else 0
        klen = mlen + qlen
        if self.same_length:
            all_ones = paddle.ones(shape=[qlen, klen], dtype=word_emb.dtype)
            mask_len = klen - self.mem_len
            if mask_len > 0:
                mask_shift_len = qlen - mask_len
            else:
                mask_shift_len = qlen
            dec_attn_mask = (paddle.triu(
                all_ones, diagonal=1 + mlen) + paddle.tril(
                    all_ones, -mask_shift_len)).unsqueeze([0, 1])
        else:
            dec_attn_mask = paddle.ones(
                shape=[qlen, klen], dtype=word_emb.dtype)
            dec_attn_mask = paddle.triu(
                dec_attn_mask, diagonal=1 + mlen).unsqueeze([0, 1])

        hids = []
        if self.attn_type == 0:
            pos_seq = paddle.arange(klen - 1, -1, -1.0, dtype=word_emb.dtype)
            if self.clamp_len > 0:
                # TODO: clamp and clip
                pos_seq = paddle.clip(pos_seq, max=self.clamp_len)
            pos_emb = self.pos_emb(pos_seq, bsz)

            core_out = self.drop(word_emb)
            pos_emb = self.drop(pos_emb)

            hids.append(core_out)
            for i, layer in enumerate(self.layers):
                mems_i = None if mems is None else mems[i]
                core_out = layer(
                    core_out,
                    pos_emb,
                    self.r_w_bias,
                    self.r_r_bias,
                    dec_attn_mask=dec_attn_mask,
                    mems=mems_i)
                hids.append(core_out)
        elif self.attn_type == 1:
            core_out = self.drop(word_emb)
            hids.append(core_out)
            for i, layer in enumerate(self.layers):
                if self.clamp_len > 0:
                    r_emb = self.r_emb[i][-self.clamp_len:]
                    r_bias = self.r_bias[i][-self.clamp_len:]
                else:
                    r_emb, r_bias = self.r_emb[i], self.r_bias[i]

                mems_i = None if mems is None else mems[i]
                core_out = layer(
                    core_out,
                    r_emb,
                    self.r_w_bias[i],
                    r_bias,
                    dec_attn_mask=dec_attn_mask,
                    mems=mems_i)
                hids.append(core_out)
        elif self.attn_type == 2:
            pos_seq = paddle.arange(klen - 1, -1, -1.0, dtype=word_emb.dtype)
            if self.clamp_len > 0:
                pos_seq = paddle.clip(pos_seq, max=self.clamp_len)
            pos_emb = self.pos_emb(pos_seq, bsz)

            core_out = self.drop(word_emb + pos_emb[-qlen:])

            hids.append(core_out)
            for i, layer in enumerate(self.layers):
                mems_i = None if mems is None else mems[i]
                if mems_i is not None and i == 0:
                    mems_i += pos_emb[:mlen]
                core_out = layer(
                    core_out, dec_attn_mask=dec_attn_mask, mems=mems_i)
                hids.append(core_out)
        elif self.attn_type == 3:
            core_out = self.drop(word_emb)

            hids.append(core_out)
            for i, layer in enumerate(self.layers):
                mems_i = None if mems is None else mems[i]
                if mems_i is not None and mlen > 0:
                    cur_emb = self.r_emb[i][:-qlen]
                    cur_size = cur_emb.shape[0]
                    if cur_size < mlen:
                        cur_emb_pad = cur_emb[0:1].expand(
                            [mlen - cur_size, -1, -1])
                        cur_emb = paddle.concat([cur_emb_pad, cur_emb], 0)
                    else:
                        cur_emb = cur_emb[-mlen:]
                    mems_i += cur_emb.reshape([mlen, 1, -1])
                core_out += self.r_emb[i][-qlen:].reshape([qlen, 1, -1])

                core_out = layer(
                    core_out, dec_attn_mask=dec_attn_mask, mems=mems_i)
                hids.append(core_out)

        core_out = self.drop(core_out)

        new_mems = self._update_mems(hids, mems, mlen, qlen)

        return core_out, new_mems
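
A standalone sketch of the non-same_length branch above, assuming qlen=3 and mlen=2 (so klen=5); 1 marks key positions a query may not attend to:

import paddle

qlen, mlen = 3, 2
dec_attn_mask = paddle.triu(
    paddle.ones(shape=[qlen, mlen + qlen], dtype='float32'), diagonal=1 + mlen)
print(dec_attn_mask.numpy())
# [[0. 0. 0. 1. 1.]
#  [0. 0. 0. 0. 1.]
#  [0. 0. 0. 0. 0.]]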
Example #7
        def deep_match(item_his_eb, context_his_eb, mask, match_mask,
                       mid_his_batch, item_vectors, item_biases, n_mid):
            query = context_his_eb
            query = self.query_layer(
                query)  # [-1, self.history_length, self.main_embedding_size*2]
            query = self.query_prelu(query)

            inputs = paddle.concat(
                [
                    query, item_his_eb, query - item_his_eb,
                    query * item_his_eb
                ],
                axis=-1)  # B,T,E
            att_layer1 = self.att_layer1_layer(inputs)
            att_layer1 = F.sigmoid(att_layer1)
            att_layer2 = self.att_layer2_layer(att_layer1)
            att_layer2 = F.sigmoid(att_layer2)
            att_layer3 = self.att_layer3_layer(att_layer2)  # B,T,1
            scores = paddle.transpose(att_layer3, [0, 2, 1])  # B,1,T

            # mask
            bool_mask = paddle.equal(mask, paddle.ones_like(mask))  # B,T
            key_masks = paddle.unsqueeze(bool_mask, axis=1)  # B,1,T
            paddings = paddle.ones_like(scores) * (-2**32 + 1)
            scores = paddle.where(key_masks, scores, paddings)

            # tril
            scores_tile = paddle.tile(
                paddle.sum(scores, axis=1),
                [1, paddle.shape(scores)[-1]])  # B, T*T
            scores_tile = paddle.reshape(scores_tile, [
                -1, paddle.shape(scores)[-1], paddle.shape(scores)[-1]
            ])  # B, T, T
            diag_vals = paddle.ones_like(scores_tile)  # B, T, T
            tril = paddle.tril(diag_vals)
            paddings = paddle.ones_like(tril) * (-2**32 + 1)
            scores_tile = paddle.where(
                paddle.equal(tril, paddle.full([1], 0.0, "float32")), paddings,
                scores_tile)  # B, T, T
            scores_tile = F.softmax(scores_tile)  # B, T, T

            att_dm_item_his_eb = paddle.matmul(scores_tile,
                                               item_his_eb)  # B, T, E
            dnn_layer1 = self.dnn_layer1_layer(att_dm_item_his_eb)
            dnn_layer1 = dnn_layer1.reshape(
                [-1, self.history_length, self.main_embedding_size])  ##
            dnn_layer1 = self.dnn_layer1_prelu(dnn_layer1)

            # target mask
            user_vector = dnn_layer1[:, -1, :]  # B, E
            user_vector2 = dnn_layer1[:, -2, :] * paddle.reshape(
                match_mask,
                [-1, paddle.shape(match_mask)[1], 1])[:, -2, :]  # B, E

            num_sampled = 2000
            labels = paddle.reshape(mid_his_batch[:, -1], [-1, 1])  # B, 1

            # no sampling (slow): compute logits over all items
            # [B, E] * [E_size, cate_size]
            logits = paddle.matmul(
                user_vector2, item_vectors, transpose_y=True)
            logits = paddle.add(logits, item_biases)
            loss = F.cross_entropy(input=logits, label=labels)

            return loss, user_vector, scores
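
A minimal sketch (assumed toy shapes, B=1, T=3) of the "tril" step above: positions above the diagonal are replaced with a huge negative value so that, after the softmax, row t attends only to positions <= t:

import paddle
import paddle.nn.functional as F

scores_tile = paddle.ones([1, 3, 3])  # B, T, T
tril = paddle.tril(paddle.ones_like(scores_tile))
paddings = paddle.ones_like(tril) * (-2**32 + 1)
scores_tile = paddle.where(
    paddle.equal(tril, paddle.full([1], 0.0, "float32")), paddings, scores_tile)
print(F.softmax(scores_tile).numpy())  # row t has non-zero weight only at positions <= t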
Example #8
# [N, G]  whether each entry is a real gt
is_gt = np.array([[1, 1, 0], [1, 1, 1]]).astype(np.float32)

is_in_boxes_or_center = np.array([[3, 100, 103, 2, 109],
                                  [3, 100, 103, 2, 109]]).astype(np.float32)

cost = paddle.to_tensor(cost)
dynamic_ks = paddle.to_tensor(dynamic_ks)
is_gt = paddle.to_tensor(is_gt)

max_dynamic_ks = dynamic_ks.max(-1)  # [N, ]  max of dynamic_ks over all gts in each image
max_k = max_dynamic_ks.max()  # [1, ]  max of dynamic_ks over all gts in all images

# lower-triangular matrix of ones
topk_mask = paddle.ones((max_k, max_k), 'float32')  # [max_k, max_k]
topk_mask = paddle.tril(topk_mask, diagonal=0)  # [max_k, max_k]
fill_value = paddle.gather(topk_mask,
                           dynamic_ks.reshape(
                               (-1, )) - 1)  # [N*G, max_k]   values to fill into matching_matrix
fill_value *= is_gt.reshape((-1, 1))  # [N*G, max_k]  also handle fake gts: fill 0 at fake-gt positions
fill_value = fill_value.reshape((-1, ))  # [N*G*max_k, ]   values to fill into matching_matrix
# to be safe, increase the cost of fake gts once more
cost += (1.0 - is_gt.unsqueeze(2)) * 100000.0
min_cost, min_cost_index = paddle.topk(cost,
                                       k=max_k,
                                       axis=2,
                                       largest=False,
                                       sorted=True)

matching_matrix = paddle.zeros([
    N * G * A,
], 'float32')  # [N*G*A, ]
Example #9
    def dynamic_k_matching(self, cost, pair_wise_ious, gt_classes, N, G, A,
                           is_in_boxes_or_center, is_gt):
        # Dynamic K
        # ---------------------------------------------------------------
        # cost.shape = [N, G, A]  pairwise cost between every gt and every predicted box in each image.
        # pair_wise_ious.shape = [N, G, A]  pairwise IoU between every gt and every predicted box in each image.
        # gt_classes.shape = [N*G, ]  class id of every gt in each image.
        # is_in_boxes_or_center.shape = [N, A]  whether each grid cell lies inside any gt box or inside the center region of any gt (not necessarily the same gt); 1 marks a candidate positive sample.
        # is_gt.shape = [N, G]   1 marks a real gt.

        # 4-2-5-1. How many predicted boxes (grid cells) should each gt be assigned to?
        # At most the 10 predicted boxes (grid cells) with the highest IoU with each gt are considered.
        n_candidate_k = 10
        # [N, G, n_candidate_k]  for each gt, pick the n_candidate_k predicted boxes with the highest IoU.
        topk_ious, _ = paddle.topk(pair_wise_ious, n_candidate_k, axis=-1)

        # [N, G]  sum of the IoUs of the top n_candidate_k predicted boxes best matching each gt.
        dynamic_ks = topk_ious.sum(-1)
        dynamic_ks = paddle.clip(
            dynamic_ks, 1.0, np.inf)  # [N, G]   clamp dynamic_ks to the range [1.0, np.inf)
        dynamic_ks = paddle.cast(dynamic_ks,
                                 'int32')  # [N, G]   truncate to int: number of predicted boxes assigned to each gt, at least 1.
        max_dynamic_ks = dynamic_ks.max(-1)  # [N, ]  max of dynamic_ks over all gts in each image
        max_k = max_dynamic_ks.max()  # [1, ]  max of dynamic_ks over all gts in all images

        # 4-2-5-2. Based on step 4-2-5-1, build a matching_matrix of shape [N, G, A]:
        # put 1 at each gt's dynamic_ks lowest-cost predicted boxes, meaning the gt is assigned to those boxes.
        # To be safe, increase the cost of fake gts once more, since fake gts must not determine the final positive samples.
        cost += (1.0 - is_gt.unsqueeze(2)) * 100000.0
        # To be safe, also increase the cost of non-candidate positive samples, since they cannot become final positive samples.
        cost += (1.0 - is_in_boxes_or_center.unsqueeze(1)) * 100000.0
        # min_cost.        [N, G, max_k]  for each gt, the max_k smallest costs.
        # min_cost_index.  [N, G, max_k]  their indices, i.e. which predicted boxes (grid cells) have the smallest cost with this gt.
        min_cost, min_cost_index = paddle.topk(cost,
                                               k=max_k,
                                               axis=2,
                                               largest=False,
                                               sorted=True)

        matching_matrix = paddle.zeros([
            N * G * A,
        ], 'float32')  # [N*G*A, ]
        gt_ind = paddle.arange(end=N * G, dtype='int32').unsqueeze(
            -1)  # [N*G, 1]  index of each gt in matching_matrix.
        min_cost_index = min_cost_index.reshape((N * G, max_k))  # [N*G, max_k]
        min_cost_index = gt_ind * A + min_cost_index  # [N*G, max_k]
        min_cost_index = min_cost_index.flatten()  # [N*G*max_k, ]

        # lower-triangular matrix of ones
        topk_mask = paddle.ones((max_k, max_k), 'float32')  # [max_k, max_k]
        topk_mask = paddle.tril(topk_mask, diagonal=0)  # [max_k, max_k]
        fill_value = paddle.gather(topk_mask,
                                   dynamic_ks.reshape((-1, )) -
                                   1)  # [N*G, max_k]   values to fill into matching_matrix
        fill_value *= is_gt.reshape((-1, 1))  # [N*G, max_k]  also handle fake gts: fill 0 at fake-gt positions
        fill_value = fill_value.reshape(
            (-1, ))  # [N*G*max_k, ]   values to fill into matching_matrix

        # fill into matching_matrix
        matching_matrix = paddle.scatter(matching_matrix,
                                         min_cost_index,
                                         fill_value,
                                         overwrite=True)
        matching_matrix = matching_matrix.reshape((N, G, A))  # [N, G, A]

        # 4-2-5-3. Special handling for anchors (predicted boxes) matched to more than one gt.
        # A single predicted box cannot learn several gts (it only carries 85 values), so it is made to learn the gt with which it has the smallest cost.
        # [N, A]  how many gts is each predicted box (grid cell) matched to?
        anchor_matching_gt = matching_matrix.sum(1)

        # special handling when a predicted box (an ambiguous anchor) is matched to more than one gt
        if paddle.cast(anchor_matching_gt > 1, 'float32').sum() > 0:
            # First, find the gt with the smallest cost for each ambiguous anchor.
            # Get the indices of the ambiguous anchors (indices into anchor_matching_gt.shape=[N, A]); assume there are R of them.
            index = paddle.nonzero(
                anchor_matching_gt >
                1)  # [R, 2]  two coordinates per ambiguous anchor: image index and grid-cell index
            cost_t = cost.transpose(
                (0, 2, 1))  # [N, G, A] -> [N, A, G]  transpose to ease extracting their costs
            cost2 = paddle.gather_nd(
                cost_t, index)  # [R, G]  pairwise cost between the R ambiguous anchors and the gts
            cost2 = cost2.transpose((1, 0))  # [G, R]  pairwise cost between the gts and the R ambiguous anchors
            cost_argmin = cost2.argmin(
                axis=0)  # [R, ]  index of the minimum-cost gt for each ambiguous anchor

            # prepare one-hot vectors
            one_hots = F.one_hot(cost_argmin, num_classes=G)  # [R, G]
            # write the one-hot rows at the ambiguous-anchor positions
            matching_matrix = matching_matrix.transpose(
                (0, 2, 1))  # [N, G, A] -> [N, A, G]  transpose so scatter() can fill it
            matching_matrix = matching_matrix.reshape(
                (N * A, G))  # [N*A, G]  reshape so scatter() can fill it
            index = index[:, 0] * A + index[:, 1]
            matching_matrix = paddle.scatter(
                matching_matrix, index, one_hots,
                overwrite=True)  # [N*A, G]  filled by scatter()

            # restore matching_matrix to its original shape
            matching_matrix = matching_matrix.reshape((N, A, G))  # [N, A, G]
            matching_matrix = matching_matrix.transpose(
                (0, 2, 1))  # [N, A, G] -> [N, G, A]

        # 4-2-5-4. Wrap up: prepare the supervision targets used to compute the losses.
        # Step 1: prepare the supervision needed for the objectness score.
        # [N, A]  whether each anchor is foreground (a final positive sample)
        fg_mask = matching_matrix.sum(1) > 0.0  # [N, A]
        fg_mask = paddle.cast(
            fg_mask, 'float32')  # [N, A]   fg_mask supervises the objectness loss; 1 marks final positive samples.
        num_fg = fg_mask.sum()  # number of foreground samples over all images

        # Step 2: prepare the supervision for the class probabilities, i.e. the class id each final positive sample should learn.
        # coordinates of the final positive samples within fg_mask.shape=[N, A]
        pos_index = paddle.nonzero(fg_mask > 0)  # [num_fg, 2]
        image_id = pos_index[:, 0]  # [num_fg, ]  which image each final positive sample belongs to

        matching_matrix_t = matching_matrix.transpose(
            (0, 2, 1))  # [N, G, A] -> [N, A, G]  transpose for gather_nd()
        matched_gt_inds = paddle.gather_nd(matching_matrix_t,
                                           pos_index)  # [num_fg, G]
        matched_gt_inds = matched_gt_inds.argmax(
            1)  # [num_fg, ]  which gt each final positive sample is matched to (per-image index within [G, ])
        matched_gt_inds += image_id * G  # [num_fg, ]  which gt each final positive sample is matched to (index within gt_classes.shape=[N*G, ])
        # class id each final positive sample should learn
        gt_matched_classes = paddle.gather(gt_classes,
                                           matched_gt_inds)  # [num_fg, ]

        # Step 3: extract the IoU between each final positive sample and its matched gt.
        # [N, G, A]    pairwise IoU between all gts and all predicted boxes. Along dim 1, matching_matrix has at most
        # one non-zero entry, so this becomes the IoU between each final positive sample and its matched gt.
        ious = (matching_matrix * pair_wise_ious)
        # [N, A]       IoU between each final positive sample and its matched gt.
        ious = ious.sum(1)
        # [num_fg, ]   gather that IoU at the final positive positions.
        pred_ious_this_matching = paddle.gather_nd(ious, pos_index)
        # Returns:
        # num_fg.                   [1, ]       number of foreground (final positive) samples over all images
        # gt_matched_classes.       [num_fg, ]  class id each final positive sample should learn
        # pred_ious_this_matching.  [num_fg, ]  IoU between each final positive sample and its matched gt
        # matched_gt_inds.          [num_fg, ]  index of the matched gt (within gt_classes.shape=[N*G, ])
        # fg_mask.                  [N, A]      1 at final positive samples
        return num_fg, gt_matched_classes, pred_ious_this_matching, matched_gt_inds, fg_mask
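
A standalone sketch (the dynamic_ks values are assumed) of the tril + gather trick used above: row k-1 of a lower-triangular matrix of ones contains exactly k ones, so gathering rows with index dynamic_ks - 1 gives, per gt, a row that marks its dynamic_k cheapest candidate boxes:

import paddle

dynamic_ks = paddle.to_tensor([1, 3, 2], dtype='int32')          # e.g. one image with 3 gts
max_k = int(dynamic_ks.max())
topk_mask = paddle.tril(paddle.ones((max_k, max_k), 'float32'))  # [max_k, max_k]
fill_value = paddle.gather(topk_mask, dynamic_ks - 1)            # [3, max_k]
print(fill_value.numpy())
# [[1. 0. 0.]
#  [1. 1. 1.]
#  [1. 1. 0.]]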