Ejemplo n.º 1
0
    def test_arange(test_case):
        """Check flow.arange against numpy for default and explicit step."""
        cases = (
            ((0,), {"end": 5}, np.arange(5)),
            ((0, 20), {"step": 2}, np.arange(0, 20, 2)),
        )
        for args, kwargs, reference in cases:
            produced = flow.arange(*args, **kwargs)
            test_case.assertTrue(np.allclose(produced.numpy(), reference))
Ejemplo n.º 2
0
    def test_arange_v2(test_case):
        """Verify the keyword-argument form of flow.arange against numpy."""
        for kwargs, reference in [
            (dict(start=0, end=20), np.arange(20)),
            (dict(start=0, end=100, step=3), np.arange(0, 100, 3)),
        ]:
            produced = flow.arange(**kwargs)
            test_case.assertTrue(np.allclose(produced.numpy(), reference))
Ejemplo n.º 3
0
 def __init__(self, d_model, max_len=5000):
     """Precompute sinusoidal positional encodings (frequencies in log space).

     Args:
         d_model: embedding dimension of the model.
         max_len: maximum sequence length to precompute encodings for.
     """
     super(PositionalEncoding, self).__init__()
     encodings = flow.zeros(max_len, d_model, requires_grad=False)
     pos = flow.arange(0, max_len).unsqueeze(1).to(dtype=flow.float32)
     freqs = flow.exp(
         flow.arange(0, d_model, 2).to(dtype=flow.float32)
         * -(math.log(10000.0) / d_model)
     )
     angles = pos * freqs  # (max_len, d_model/2) via broadcasting
     encodings[:, 0::2] = flow.sin(angles)
     encodings[:, 1::2] = flow.cos(angles)
     # shape (1, max_len, d_model); buffer so it follows the module's device
     self.register_buffer("pe", encodings.unsqueeze(0))
Ejemplo n.º 4
0
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        """Sinusoidal positional encoding with dropout.

        The table is stored as a frozen parameter of shape (max_len, 1, d_model).
        """
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        positions = flow.arange(0, max_len, dtype=flow.float).unsqueeze(1)
        inv_freq = flow.exp(
            flow.arange(0, d_model, 2).to(flow.float) * (-math.log(10000.0) / d_model)
        ).unsqueeze(0)
        table = flow.zeros((max_len, d_model))
        phase = positions * inv_freq
        table[:, 0::2] = flow.sin(phase)
        table[:, 1::2] = flow.cos(phase)
        table = table.unsqueeze(0).transpose(0, 1)
        # frozen parameter: saved in state_dict but excluded from gradients
        self.pe = flow.nn.Parameter(table, requires_grad=False)
Ejemplo n.º 5
0
    def forward(self, targets, memory, memory_mask):
        """Decode `targets` against encoder states `memory`.

        Args:
            targets: target token ids (assumed [batch, time] — TODO confirm).
            memory: encoder output states.
            memory_mask: padding mask over `memory`.

        Returns:
            Tuple of (logits, attn_weights) where attn_weights is a dict
            keyed by "dec_block_%d" for each decoder block.
        """
        dec_output = self.embedding(targets)
        if self.relative_positional:
            # relative positions span [-(T-1), T-1] for target length T
            position = flow.arange(
                -(dec_output.size(1) - 1), dec_output.size(1), device=dec_output.device
            ).reshape(1, -1)
            pos = self.pos_emb._embedding_from_positions(position)
        else:
            dec_output, pos = self.pos_emb(dec_output)

        # presumably a causal (look-ahead) mask for decoder self-attention
        dec_mask = get_transformer_decoder_mask(targets)

        attn_weights = {}
        for i, block in enumerate(self.blocks):
            dec_output, attn_weight = block(
                dec_output, dec_mask, memory, memory_mask.unsqueeze(1), pos
            )
            attn_weights["dec_block_%d" % i] = attn_weight

        # pre-norm architectures need one final normalization after the stack
        if self.normalize_before:
            dec_output = self.after_norm(dec_output)

        logits = self.output_layer(dec_output)

        return logits, attn_weights
def _test_global_stateful_kernel_with_inpersistent_state(test_case, placement, sbp):
    """logical_slice must stay correct when the input's SBP changes between calls."""
    expected = np.array([[0], [8], [16]])
    x = (
        flow.arange(64)
        .reshape(8, 8)
        .to_global(flow.env.all_device_placement("cpu"), flow.sbp.broadcast)
    )
    x = x.to_global(placement, sbp)
    # slice once with the given sbp, then again after re-splitting on dim 1
    for new_sbp in (None, flow.sbp.split(1)):
        if new_sbp is not None:
            x = x.to_global(sbp=new_sbp)
        sliced = flow._C.logical_slice(x, [0, 0], [3, 1], [1, 1])
        local = (
            sliced.to_global(flow.env.all_device_placement("cpu"), flow.sbp.broadcast)
            .to_local()
            .numpy()
        )
        test_case.assertTrue(np.array_equal(local, expected))
Ejemplo n.º 7
0
 def nllloss_1d(self, input, target):
     """Gather input[i, target[i]] for every row i via the gather_nd op."""
     rows = input.shape[0]
     row_ids = flow.unsqueeze(flow.arange(0, rows, 1), dim=1)
     # pair each row index with its target column: [[0, t0], [1, t1], ...]
     coords = flow.cat([row_ids, flow.unsqueeze(target, dim=1)], dim=1)
     return self._gather_nd_op(input, coords)[0]
Ejemplo n.º 8
0
def _test_arange_backward(test_case, device):
    """An arange tensor with requires_grad must backprop ones through sum().

    Fix: removed the unused local `np_out` (dead code left over from a
    forward-value comparison that this test does not perform).
    """
    x = flow.arange(13, dtype=flow.float32, device=device)
    x.requires_grad = True
    y = x.sum()
    y.backward()
    # d(sum(x))/dx is 1 for every element
    test_case.assertTrue(np.allclose(x.grad.numpy(), np.ones(13), 1e-05, 1e-05))
Ejemplo n.º 9
0
    def forward(
        self,
        input_ids: flow.Tensor,
        token_type_ids: Optional[flow.Tensor] = None,
        position_ids: Optional[flow.Tensor] = None,
    ) -> flow.Tensor:
        """Sum token, position and token-type embeddings; apply LayerNorm + dropout."""
        input_shape = input_ids.size()
        seq_length = input_shape[1]

        # default token types: all zeros (single-segment input)
        if token_type_ids is None:
            token_type_ids = flow.zeros(
                input_shape, dtype=flow.long, device=input_ids.device
            )
        # default positions: 0..seq_length-1, broadcast over the batch
        if position_ids is None:
            position_ids = flow.arange(
                seq_length, dtype=flow.long, device=input_ids.device
            )
            position_ids = position_ids.unsqueeze(0).expand(input_shape)

        embeddings = (
            self.token_embeddings(input_ids)
            + self.position_embeddings(position_ids)
            + self.token_type_embeddings(token_type_ids)
        )
        return self.dropout(self.layer_norm(embeddings))
Ejemplo n.º 10
0
    def __init__(
        self,
        dim,
        window_size,
        num_heads,
        qkv_bias=True,
        qk_scale=None,
        attn_drop=0.0,
        proj_drop=0.0,
    ):
        """Window-based multi-head self-attention with relative position bias.

        Args:
            dim: total embedding dimension.
            window_size: (Wh, Ww) height and width of the attention window.
            num_heads: number of attention heads.
            qkv_bias: add a learnable bias to the qkv projection.
            qk_scale: override for the default head_dim ** -0.5 scaling.
            attn_drop: dropout rate applied to attention weights.
            proj_drop: dropout rate applied to the output projection.
        """
        super().__init__()
        self.dim = dim
        self.window_size = window_size  # Wh, Ww
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim**-0.5

        # define a parameter table of relative position bias
        # Author zzk: we add trunc normal here!
        self.relative_position_bias_table = nn.Parameter(
            flow.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1),
                       num_heads))  # 2*Wh-1 * 2*Ww-1, nH
        self.relative_position_bias_table.trunc_normal_(std=0.02)

        # get pair-wise relative position index for each token inside the window
        coords_h = flow.arange(self.window_size[0])
        coords_w = flow.arange(self.window_size[1])
        coords = flow.stack(flow.meshgrid(*[coords_h, coords_w]))  # 2, Wh, Ww
        coords_flatten = flow.flatten(coords, 1)  # 2, Wh*Ww
        relative_coords = (coords_flatten[:, :, None] -
                           coords_flatten[:, None, :])  # 2, Wh*Ww, Wh*Ww
        relative_coords = relative_coords.permute(1, 2, 0)  # Wh*Ww, Wh*Ww, 2
        relative_coords[:, :,
                        0] += self.window_size[0] - 1  # shift to start from 0
        relative_coords[:, :, 1] += self.window_size[1] - 1
        # scale the row offset so (row, col) pairs map to unique flat indices
        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
        relative_position_index = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
        self.register_buffer("relative_position_index",
                             relative_position_index)

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)
        self.softmax = nn.Softmax(dim=-1)
Ejemplo n.º 11
0
 def _pos_encoding(self, inputs):
     """Return (encoded_output, positional_embedding) for `inputs`."""
     if not self.relative_positional:
         return self.pos_emb(inputs)
     # relative positions span [-(T-1), T-1] for sequence length T
     t = inputs.size(1)
     rel_positions = flow.arange(-(t - 1), t, device=inputs.device).reshape(1, -1)
     pos = self.pos_emb._embedding_from_positions(rel_positions)
     return inputs, pos
Ejemplo n.º 12
0
 def test_tensor_scatter_nd_update_runtime_error(test_case):
     """Mismatched dtypes between the base tensor and updates must raise."""
     with test_case.assertRaises(Exception) as context:
         base = flow.arange(8, dtype=flow.float32, requires_grad=True)
         locations = flow.tensor([[1], [3], [5]])
         new_values = flow.tensor(
             [-1, -2, -3], dtype=flow.float64, requires_grad=True
         )
         flow.tensor_scatter_nd_update(base, locations, new_values)
     test_case.assertTrue(
         "The dtype of tensor and updates must be same." in str(context.exception)
     )
Ejemplo n.º 13
0
def unpad_sequence(
    padded_sequences: Tensor,
    lengths: Tensor,
    batch_first: bool = False,
) -> List[Tensor]:
    """Split a padded Tensor back into a list of variable-length Tensors.

    Inverse of ``pad_sequence``: each output tensor keeps only its first
    ``lengths[i]`` time steps.

    Args:
        padded_sequences (Tensor): padded sequences.
        lengths (Tensor): length of original (unpadded) sequences.
        batch_first (bool, optional): whether batch dimension first or not. Default: False.

    Returns:
        a list of :class:`Tensor` objects

    For example:

    .. code-block:: python

        >>> from oneflow.nn.utils.rnn import pad_sequence, unpad_sequence
        >>> import oneflow as flow
        >>> import numpy as np

        >>> a = flow.ones(25, 300)
        >>> b = flow.ones(22, 300)
        >>> c = flow.ones(15, 300)
        >>> sequences = [a, b, c]
        >>> padded_sequences = pad_sequence(sequences)
        >>> lengths = flow.as_tensor([v.size(0) for v in sequences])
        >>> unpadded_sequences = unpad_sequence(padded_sequences, lengths)
        >>> np.allclose(sequences[0].numpy(), unpadded_sequences[0].numpy())
        True
        >>> np.allclose(sequences[1].numpy(), unpadded_sequences[1].numpy())
        True
        >>> np.allclose(sequences[2].numpy(), unpadded_sequences[2].numpy())
        True
    """
    if not batch_first:
        # (T, B, F) -> (B, T, F) so we can iterate over the batch
        padded_sequences = padded_sequences.permute((1, 0, 2))

    positions = flow.arange(padded_sequences.shape[1])
    # keep the first `n` time steps of each sequence via a boolean mask
    return [seq[positions < n] for seq, n in zip(padded_sequences, lengths)]
Ejemplo n.º 14
0
def invert_permutation(permutation: Optional[Tensor]) -> Optional[Tensor]:
    """Return the inverse permutation, or None when given None."""
    if permutation is None:
        return None
    identity = flow.arange(
        0, permutation.numel(), device=permutation.device, dtype=flow.int32
    )
    # scatter identity[i] into slot permutation[i]: inv[p[i]] = i
    return flow.scatter(flow.zeros_like(permutation), 0, permutation, identity)
 def test_stateful_kernel_with_inpersistent_state(test_case):
     """logical_slice must not reuse stale kernel state after an SBP change."""
     expected = np.array([[0], [2], [0]])
     x = flow.arange(4).reshape(2, 2)
     x = x.to_global(flow.env.all_device_placement("cuda"), flow.sbp.split(0))
     # slice with split(0), then again after re-splitting on dim 1
     for new_sbp in (None, flow.sbp.split(1)):
         if new_sbp is not None:
             x = x.to_global(sbp=new_sbp)
         y = flow._C.logical_slice(x, [0, 0], [3, 1], [1, 1])
         test_case.assertTrue(
             np.array_equal(
                 y.to_global(sbp=flow.sbp.broadcast).to_local().numpy(), expected
             )
         )
Ejemplo n.º 16
0
def _test_arange_with_random_data(test_case, placement, sbp):
    """Compare global flow.arange against torch.arange on random int ranges."""
    lo = random(0, 10).to(int).value()
    hi = lo + random(0, 10).to(int).value()
    stride = random(1, max(2, hi - lo)).to(int).value()
    # scale by 8 so the output length divides evenly across placement ranks
    lo, hi = lo * 8, hi * 8
    x = torch.arange(start=lo, end=hi, step=stride)
    x.oneflow = flow.arange(
        start=lo, end=hi, step=stride, placement=placement, sbp=sbp
    )
    return x
Ejemplo n.º 17
0
    def decode_step(self, preds, memory, memory_mask, cache, scores, flag):
        """Advance every beam-search hypothesis by one decoding step.

        Args:
            preds: current hypothesis token ids per beam (assumed
                [batch * beam_width, step] — TODO confirm).
            memory: encoder output states.
            memory_mask: padding mask over `memory`.
            cache: dict with "decoder" and "lm" sub-caches.
            scores: accumulated log-prob score per hypothesis.
            flag: per-hypothesis finished flag.

        Returns:
            Tuple of (preds_symbol, cache, scores, end_flag).
        """

        batch_size = int(scores.size(0) / self.beam_width)

        batch_log_probs, dec_cache, dec_attn_weights = self.decode(
            preds, memory, memory_mask, cache["decoder"])

        # optionally fuse an external language model's log-probs (shallow fusion)
        if self.lm is not None:
            batch_lm_log_probs, lm_hidden = self.lm_decode(preds, cache["lm"])
            batch_lm_log_probs = batch_lm_log_probs.squeeze(1)
            batch_log_probs = batch_log_probs + self.lm_weight * batch_lm_log_probs
        else:
            lm_hidden = None

        if batch_log_probs.dim() == 3:
            batch_log_probs = batch_log_probs.squeeze(1)

        # per-hypothesis top-k expansion
        last_k_scores, last_k_preds = batch_log_probs.topk(self.beam_width)

        # freeze scores/predictions of hypotheses that already emitted EOS
        last_k_scores = mask_finished_scores(last_k_scores, flag)
        last_k_preds = mask_finished_preds(last_k_preds, flag)

        # update scores
        scores = scores + last_k_scores
        scores = scores.view(batch_size, self.beam_width * self.beam_width)

        # pruning: keep the best beam_width of beam_width^2 candidates per batch
        scores, offset_k_indices = flow.topk(scores, k=self.beam_width)
        scores = scores.view(-1, 1)

        device = scores.device
        # map per-batch winner offsets back to flat (batch * beam^2) indices
        base_k_indices = (flow.arange(batch_size, device=device).view(
            -1, 1).repeat([1, self.beam_width]))
        base_k_indices *= self.beam_width**2
        best_k_indices = base_k_indices.view(-1) + offset_k_indices.view(-1)

        # update predictions
        best_k_preds = flow.index_select(last_k_preds.view(-1),
                                         dim=0,
                                         index=best_k_indices).to(flow.int64)

        # parent hypothesis of each surviving candidate
        preds_index = best_k_indices.floor_divide(self.beam_width)
        preds_symbol = flow.index_select(preds, dim=0, index=preds_index)
        preds_symbol = flow.cat(
            [preds_symbol, best_k_preds.view(-1, 1)], dim=1)

        # finished or not
        # NOTE(review): dec_cache, dec_attn_weights and lm_hidden are computed
        # but never folded back into `cache` — verify this is intentional.
        end_flag = flow.eq(preds_symbol[:, -1], EOS).view(-1, 1).to(flow.uint8)

        return preds_symbol, cache, scores, end_flag
Ejemplo n.º 18
0
def find_pruneable_heads_and_indices(
    heads: List[int], n_heads: int, head_size: int, already_pruned_heads: Set[int]
) -> Tuple[Set[int], flow.Tensor]:
    """Return the heads still to prune and the flat weight indices to keep."""
    keep = flow.ones(n_heads, head_size)
    heads = set(heads) - already_pruned_heads  # drop heads pruned earlier
    for head in heads:
        # shift the index left by the number of already-pruned heads before it
        shifted = head - sum(1 if h < head else 0 for h in already_pruned_heads)
        keep[shifted] = 0
    flat_mask = keep.view(-1).contiguous().eq(1)
    index: flow.Tensor = flow.arange(len(flat_mask), dtype=flow.int64)[flat_mask]
    return heads, index
Ejemplo n.º 19
0
 def forward(self, x: flow.Tensor):
     """Add absolute positional encoding to `x` of shape (batch, time, ...).

     Returns:
         Tuple of (dropout-applied encoded tensor, posemb of shape [1, t, emb_dim]).
     """
     positions = flow.arange(0, x.size(1), device=x.device).reshape(1, -1)
     posemb = self._embedding_from_positions(positions)
     if self.scale_learnable:
         out = x + self.alpha * posemb
     else:
         out = x * self.xscale + posemb
     return self.dropout(out), posemb
Ejemplo n.º 20
0
def _test_arange_with_float_delta(test_case, placement, sbp):
    """Compare global flow.arange with a float step against torch, with grads."""
    begin = random(0, 10).to(int).value()
    finish = begin + random(0, 10).to(int).value()
    delta = random(1, max(2, finish - begin)).to(float).value()
    # scale by 8 so the output length divides evenly across placement ranks
    begin, finish = begin * 8, finish * 8
    x = torch.arange(start=begin, end=finish, step=delta, requires_grad=True)
    x.oneflow = flow.arange(
        start=begin,
        end=finish,
        step=delta,
        placement=placement,
        sbp=sbp,
        requires_grad=True,
    )
    return x
Ejemplo n.º 21
0
    def get_extended_attention_mask(self, attention_mask: flow.Tensor,
                                    input_shape: Tuple[int],
                                    device: flow.device) -> flow.Tensor:
        """Broadcast `attention_mask` into additive 4-D attention-score form.

        A 3-D mask [batch, from_seq, to_seq] becomes [batch, 1, from, to];
        a 2-D padding mask [batch, seq] becomes [batch, 1, 1, seq] (combined
        with a causal mask when `self.is_decoder`). The result is a float
        mask: 0.0 where attention is allowed, -1e9 where it is blocked.

        Raises:
            ValueError: if `attention_mask` is neither 2-D nor 3-D.
        """

        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
        # ourselves in which case we just need to make it broadcastable to all heads.
        if attention_mask.dim() == 3:
            extended_attention_mask = attention_mask[:, None, :, :]
        elif attention_mask.dim() == 2:
            # Provided a padding mask of dimensions [batch_size, seq_length]
            # - if the model is a decoder, apply a causal mask in addition to the padding mask
            # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
            if self.is_decoder:
                batch_size, seq_length = input_shape
                seq_ids = flow.arange(seq_length, device=device)
                # lower-triangular comparison: position j visible from i iff j <= i
                causal_mask = (seq_ids[None, None, :].repeat(
                    batch_size, seq_length, 1) <= seq_ids[None, :, None])
                # in case past_key_values are used we need to add a prefix ones mask to the causal mask
                causal_mask = causal_mask.to(attention_mask.dtype)

                if causal_mask.shape[1] < attention_mask.shape[1]:
                    prefix_seq_len = attention_mask.shape[
                        1] - causal_mask.shape[1]
                    # prefix (cached) positions are always visible
                    causal_mask = flow.cat(
                        [
                            flow.ones(
                                (batch_size, seq_length, prefix_seq_len),
                                device=device,
                                dtype=causal_mask.dtype,
                            ),
                            causal_mask,
                        ],
                        axis=-1,
                    )

                extended_attention_mask = (causal_mask[:, None, :, :] *
                                           attention_mask[:, None, None, :])
            else:
                extended_attention_mask = attention_mask[:, None, None, :]
        else:
            raise ValueError(
                f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})"
            )

        # convert {0, 1} keep-mask into an additive bias: 0 -> -1e9, 1 -> 0.0
        extended_attention_mask = extended_attention_mask.to(dtype=flow.float)
        extended_attention_mask = (1.0 - extended_attention_mask) * -1e9
        return extended_attention_mask
Ejemplo n.º 22
0
    def _prob_in_top_k(
        self, clean_values, noisy_values, noise_stddev, noisy_top_values
    ):
        """Helper function to NoisyTopKGating.
        Computes the probability that value is in top k, given different random noise.
        This gives us a way of backpropagating from a loss that balances the number
        of times each expert is in the top k experts per example.
        In the case of no noise, pass in None for noise_stddev, and the result will
        not be differentiable.
        Args:
        clean_values: a `Tensor` of shape [batch, n].
        noisy_values: a `Tensor` of shape [batch, n].  Equal to clean values plus
          normally distributed noise with standard deviation noise_stddev.
        noise_stddev: a `Tensor` of shape [batch, n], or None
        noisy_top_values: a `Tensor` of shape [batch, m].
           "values" Output of tf.top_k(noisy_top_values, m).  m >= k+1
        Returns:
        a `Tensor` of shape [batch, n].
        """

        batch = clean_values.size(0)
        m = noisy_top_values.size(1)
        top_values_flat = noisy_top_values.flatten()

        # flat index of the (k+1)-th highest noisy value in each batch row
        threshold_positions_if_in = (
            flow.arange(batch, device=noisy_values.device) * m + self.k
        )

        # threshold a value must beat, assuming that value itself is in the top k
        threshold_if_in = flow.unsqueeze(
            flow.gather(top_values_flat, 0, threshold_positions_if_in), 1
        )
        is_in = flow.gt(noisy_values, threshold_if_in)

        # threshold assuming the value is NOT in the top k (one rank higher)
        threshold_positions_if_out = threshold_positions_if_in - 1
        threshold_if_out = flow.unsqueeze(
            flow.gather(top_values_flat, 0, threshold_positions_if_out), 1
        )

        # is each value currently in the top k.
        # P(noise pushes the clean value over the relevant threshold)
        prob_if_in = cdf((clean_values - threshold_if_in) / noise_stddev)
        prob_if_out = cdf((clean_values - threshold_if_out) / noise_stddev)

        prob = flow.where(is_in, prob_if_in, prob_if_out)
        return prob
Ejemplo n.º 23
0
def select_chunk_states_and_mask_based_index(tensor, tensor_mask, index):
    """Pick one chunk per batch element.

    Args:
        tensor: [b, c, t, v] chunked states.
        tensor_mask: [b, c, t] per-chunk mask.
        index: [b] chunk index to select for each batch element.

    Returns:
        Tuple of (selected states [b, t, v], selected mask [b, 1, t]).
    """
    assert tensor.dim() == 4
    assert tensor_mask.dim() == 3
    assert index.dim() == 1

    b, c, t, v = tensor.size()

    # flatten (b, c) and offset each batch's chunk index into the flat dim
    flat_index = (flow.arange(b, device=tensor.device) * c + index).long()

    selected = flow.index_select(tensor.reshape(b * c, t, v), 0, flat_index)
    selected_mask = flow.index_select(
        tensor_mask.reshape(b * c, 1, t), 0, flat_index
    )
    return selected, selected_mask
Ejemplo n.º 24
0
    def __init__(
        self,
        vocab_size,
        type_vocab_size,
        max_position_embeddings,
        hidden_size,
        hidden_dropout_prob,
        seq_length,
    ):
        """Token, position and token-type embedding tables with LayerNorm/dropout."""
        super().__init__()
        self.word_embeddings = nn.Embedding(vocab_size, hidden_size)
        self.position_embeddings = nn.Embedding(max_position_embeddings, hidden_size)
        self.token_type_embeddings = nn.Embedding(type_vocab_size, hidden_size)
        self.LayerNorm = nn.LayerNorm(hidden_size)
        self.dropout = nn.Dropout(hidden_dropout_prob, inplace=True)
        # precomputed position ids of shape (1, max_position_embeddings)
        position_ids = flow.arange(max_position_embeddings).unsqueeze(0)
        self.register_buffer("position_ids", position_ids)
        self.seq_length = seq_length
Ejemplo n.º 25
0
    def forward(self, inputs, mask):
        """Run the encoder blocks; return (enc_output, mask, attn_weights)."""
        if self.relative_positional:
            enc_output = inputs
            # relative positions span [-(T-1), T-1] for sequence length T
            t = inputs.size(1)
            position = flow.arange(-(t - 1), t, device=inputs.device).reshape(1, -1)
            pos = self.pos_emb._embedding_from_positions(position)
        else:
            enc_output, pos = self.pos_emb(inputs)

        attn_weights = {}
        for idx, block in enumerate(self.blocks):
            enc_output, weight = block(enc_output, mask.unsqueeze(1), pos)
            attn_weights["enc_block_%d" % idx] = weight

        # pre-norm architectures need one final normalization after the stack
        if self.normalize_before:
            enc_output = self.norm(enc_output)

        return enc_output, mask, attn_weights
Ejemplo n.º 26
0
 def _embedding_from_positions(self, position):
     """Get the absolute sinusoidal embedding for the given positions.

     Args:
         position (Tensor): position indices of shape (b, t).
     Returns:
         posemb (Tensor): encoded tensor of shape (b, t, emb_dim).
     """
     batch_size, time_step = position.size()
     posemb = flow.zeros(batch_size,
                         time_step,
                         self.emb_dim,
                         device=position.device)
     # inverse frequencies exp(-2i * ln(10000) / emb_dim) for even channels i
     div_term = flow.exp(
         flow.arange(
             0, self.emb_dim, 2, device=position.device, dtype=flow.float32)
         * -(math.log(10000.0) / self.emb_dim))
     # even channels take sin, odd channels take cos of position * frequency
     posemb[:, :,
            0::2] = flow.sin(position.float().unsqueeze(-1) * div_term)
     posemb[:, :,
            1::2] = flow.cos(position.float().unsqueeze(-1) * div_term)
     return posemb
Ejemplo n.º 27
0
    def test_arange_graph(test_case):
        """Eager arange and nn.Graph (lazy) arange must produce identical values."""
        kwargs = dict(start=0, end=100, step=3, device=flow.device("cuda"))
        of_eager_out = flow.arange(**kwargs)

        class ArangeGraph(flow.nn.Graph):
            def __init__(self):
                super().__init__()

            def build(self):
                return flow.arange(**kwargs)

        of_lazy_out = ArangeGraph()()
        test_case.assertTrue(
            np.allclose(of_eager_out.numpy(), of_lazy_out.numpy(), 1e-05, 1e-05)
        )
Ejemplo n.º 28
0
    def lm_rescoring(self, preds, pred_lens):
        """Re-rank beam hypotheses by language-model log-probability.

        Args:
            preds: hypothesis token ids, shape [beam_size, lens].
            pred_lens: valid length of each hypothesis, shape [beam_size].

        Returns:
            Tuple of (sorted_preds, sorted_length), ordered by descending
            length-normalized LM score.
        """
        # preds [beam_size, lens]
        # preds_len [beam_size]

        if self.lm.model_type == "transformer_lm":
            log_probs = self.lm.predict(preds, last_frame=False)
        else:
            # recurrent LM: step through the sequence, carrying hidden state
            log_probs = []
            hidden = None
            for t in range(preds.size(1)):
                log_prob, hidden = self.lm.predict(preds[:, t].unsqueeze(-1),
                                                   hidden)
                log_probs.append(log_prob)

            log_probs = flow.cat(log_probs, dim=1)

        rescores = []
        max_length = log_probs.size(1)
        vocab_size = log_probs.size(-1)

        for b in range(preds.size(0)):
            # flat index of each predicted token inside log_probs[b]
            base_index = flow.arange(max_length, device=preds.device)
            bias_index = preds[b].reshape(-1)

            index = base_index * vocab_size + bias_index
            score = flow.index_select(log_probs[b].reshape(-1),
                                      dim=-1,
                                      index=index)

            # zero out scores past the hypothesis' true length, then normalize
            label_len = min(int(pred_lens[b]), score.size(0))
            score[label_len - 1:] = 0
            rescores.append(flow.sum(score) / label_len)

        rescores = flow.tensor(rescores, dtype=flow.float32)
        _, indices = flow.sort(rescores, dim=-1, descending=True)

        sorted_preds = preds[indices]
        sorted_length = pred_lens[indices]

        return sorted_preds, sorted_length
Ejemplo n.º 29
0
def _make_causal_mask(
    input_ids_shape: flow.Size,
    dtype: flow.dtype,
    device: flow.device,
    past_key_values_length: int = 0,
):
    """
    Make causal mask used for bi-directional self-attention.

    Returns an additive mask of shape
    [bsz, 1, tgt_len, tgt_len + past_key_values_length]: 0 where attention
    is allowed (lower triangle plus any cached prefix), -inf elsewhere.
    """
    bsz, tgt_len = input_ids_shape
    mask = flow.ones((tgt_len, tgt_len)) * float("-inf")
    # zero the lower triangle (including diagonal): position j visible iff j <= i
    mask_cond = flow.arange(mask.size(-1))
    mask = mask.masked_fill(mask_cond < (mask_cond + 1).view(mask.size(-1), 1),
                            0)
    mask = mask.to(dtype)

    # cached (past) positions are always visible: prepend a zero block
    if past_key_values_length > 0:
        mask = flow.cat(
            [flow.zeros(tgt_len, past_key_values_length, dtype=dtype), mask],
            dim=-1)
    return (mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len +
                                          past_key_values_length).to(device))
Ejemplo n.º 30
0
def select_tensor_based_index(tensor, index):
    """Select tensor[i, index[i]] along dim 1 for each batch row i.

    Args:
        tensor: [b, l] or [b, l, v].
        index: [b] position to pick from each row.

    Returns:
        [b] for 2-D input, [b, v] for 3-D input.
    """
    assert tensor.dim() >= 2
    assert index.dim() == 1

    batch_size = tensor.size(0)
    tensor_len = tensor.size(1)

    # offset each row's index into the flattened (b * l) leading dimension
    flat_index = (
        flow.arange(batch_size, device=tensor.device) * tensor_len + index
    ).long()

    if tensor.dim() == 2:
        return flow.index_select(
            tensor.reshape(batch_size * tensor_len), 0, flat_index
        )
    assert tensor.dim() == 3
    return flow.index_select(
        tensor.reshape(batch_size * tensor_len, tensor.size(-1)), 0, flat_index
    )