import numpy as np
import oneflow as flow


def _test_fused_tril_softmax_mask_scale(test_case, seq_length, channel, p,
                                        diagonal, tril_scale_value):
    x = np.random.randn(4, seq_length, channel)
    # The fused version is only supported on GPU.
    fused_x_tensor = flow.Tensor(x).to("cuda")
    fused_x_tensor.requires_grad = True
    fused_out = flow._C.fused_scale_tril_softmax_mask_scale(
        fused_x_tensor,
        p=p,
        diagonal=diagonal,
        tril_scale_value=tril_scale_value)[0]  # The second output is softmax_y

    origin_x_tensor = flow.Tensor(x).to("cuda")
    origin_x_tensor.requires_grad = True
    origin_out = flow.tril(origin_x_tensor, diagonal)
    origin_out = origin_out * tril_scale_value
    origin_out = flow.softmax(origin_out, dim=-1)
    origin_out = flow._C.dropout(origin_out, p=p)

    total_out = fused_out.sum() + origin_out.sum()
    total_out.backward()

    test_case.assertTrue(
        np.allclose(fused_out.numpy(),
                    origin_out.numpy(),
                    atol=1e-4,
                    rtol=1e-4))
    test_case.assertTrue(
        np.allclose(
            fused_x_tensor.grad.numpy(),
            origin_x_tensor.grad.numpy(),
            atol=1e-4,
            rtol=1e-4,
        ))


def _test_fused_scale_tril(
    test_case,
    shape,
    diagonal=0,
    scale=1.0,
):
    x = np.random.rand(*shape)
    # A different dtype would insert a cast op, which makes the fusion pass fail.
    tensor_x = flow.tensor(x, device="cuda", dtype=flow.float32)
    eager_out = flow.tril(tensor_x, diagonal) * scale

    class TestFuseScaleTril(flow.nn.Graph):
        def __init__(self):
            super().__init__()

        def build(self):
            return flow.tril(tensor_x * scale, diagonal)

    lazy_out_0 = TestFuseScaleTril()()
    test_case.assertTrue(np.allclose(eager_out.numpy(), lazy_out_0.numpy()))

    class TestFuseTrilScale(flow.nn.Graph):
        def __init__(self):
            super().__init__()

        def build(self):
            return flow.tril(tensor_x, diagonal) * scale

    lazy_out_1 = TestFuseTrilScale()()
    test_case.assertTrue(np.allclose(eager_out.numpy(), lazy_out_1.numpy()))
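
A minimal sketch of how the two helpers above might be driven from a standard unittest case. The class name and parameter values are illustrative assumptions, not taken from the original test file; p=0.0 keeps dropout deterministic so the fused and reference outputs can be compared element-wise, and both helpers require a CUDA device.

import unittest

class TestFusedTrilOps(unittest.TestCase):
    def test_fused_tril_softmax_mask_scale(self):
        # p=0.0 disables dropout so both paths produce identical values.
        _test_fused_tril_softmax_mask_scale(
            self, seq_length=16, channel=32, p=0.0,
            diagonal=0, tril_scale_value=2.0)

    def test_fused_scale_tril(self):
        _test_fused_scale_tril(self, shape=(8, 8), diagonal=1, scale=0.5)

if __name__ == "__main__":
    unittest.main()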
Example #3
import oneflow as flow
from oneflow import nn

# Conv1D is a GPT-2 style linear projection layer defined elsewhere in the model code.

class GPT2Attention(nn.Module):
    def __init__(self, config):
        super(GPT2Attention, self).__init__()
        max_positions = config.max_position_embeddings

        # Causal mask: a lower-triangular matrix of ones, registered as a
        # buffer so it moves to the right device together with the module.
        self.register_buffer(
            "bias",
            flow.tril(
                flow.ones((max_positions, max_positions),
                          dtype=flow.int8)).view(1, 1, max_positions,
                                                 max_positions),
        )
        # Large negative value used to mask out future positions before softmax.
        self.register_buffer("masked_bias", flow.tensor(-1e4))

        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        assert self.embed_dim % self.num_heads == 0
        self.head_dim = self.embed_dim // self.num_heads
        self.scale_attn_weights = config.scale_attn_weights

        self.c_attn = Conv1D(self.embed_dim * 3, self.embed_dim)
        self.c_proj = Conv1D(self.embed_dim, self.embed_dim)

        self.attn_dropout = nn.Dropout(config.attn_pdrop)
        self.resid_dropout = nn.Dropout(config.resid_pdrop)
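
The bias buffer registered above is the causal mask that is later applied to the raw attention scores. Below is a hedged sketch of how such a buffer is typically used; the method name _apply_causal_mask and the flow.where formulation are assumptions, not taken from the original file.

    def _apply_causal_mask(self, attn_weights, query_length, key_length):
        # Slice the precomputed lower-triangular mask to the current lengths
        # and replace future positions with a large negative value so that
        # softmax assigns them (near-)zero probability.
        causal_mask = self.bias[:, :, key_length - query_length:key_length,
                                :key_length].to(flow.bool)
        return flow.where(causal_mask, attn_weights,
                          self.masked_bias.to(attn_weights.dtype))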
Example #4
def _tril(self, diagonal=0):
    # Tensor-method wrapper that forwards to the functional flow.tril.
    return flow.tril(self, diagonal=diagonal)
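
For illustration, the wrapper can be called directly on a tensor; the input values below are arbitrary.

x = flow.randn(3, 3)
lower = _tril(x)  # equivalent to flow.tril(x, diagonal=0)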
Example #5
def get_seq_mask(targets):
    # Build a per-sample lower-triangular (causal) mask of shape
    # (batch_size, steps, steps), with ones on and below the diagonal.
    batch_size, steps = targets.size()
    seq_mask = flow.ones([batch_size, steps, steps], device=targets.device)
    seq_mask = flow.tril(seq_mask).to(flow.int8)
    return seq_mask
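
A quick usage sketch; the target shape below is an illustrative assumption.

targets = flow.zeros(2, 3, dtype=flow.int64)
mask = get_seq_mask(targets)
print(mask.shape)  # expected: oneflow.Size([2, 3, 3])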