import numpy as np
import oneflow as flow


def _test_fused_tril_softmax_mask_scale(
    test_case, seq_length, channel, p, diagonal, tril_scale_value
):
    x = np.random.randn(4, seq_length, channel)
    # The fused version is only supported on GPU.
    fused_x_tensor = flow.Tensor(x).to("cuda")
    fused_x_tensor.requires_grad = True
    fused_out = flow._C.fused_scale_tril_softmax_mask_scale(
        fused_x_tensor, p=p, diagonal=diagonal, tril_scale_value=tril_scale_value
    )[0]  # The second output is softmax_y.

    # Reference path: tril -> scale -> softmax -> dropout, built from separate ops.
    origin_x_tensor = flow.Tensor(x).to("cuda")
    origin_x_tensor.requires_grad = True
    origin_out = flow.tril(origin_x_tensor, diagonal)
    origin_out = origin_out * tril_scale_value
    origin_out = flow.softmax(origin_out, dim=-1)
    origin_out = flow._C.dropout(origin_out, p=p)

    total_out = fused_out.sum() + origin_out.sum()
    total_out.backward()
    test_case.assertTrue(
        np.allclose(fused_out.numpy(), origin_out.numpy(), atol=1e-4, rtol=1e-4)
    )
    test_case.assertTrue(
        np.allclose(
            fused_x_tensor.grad.numpy(),
            origin_x_tensor.grad.numpy(),
            atol=1e-4,
            rtol=1e-4,
        )
    )
import numpy as np
import oneflow as flow


def _test_fused_scale_tril(
    test_case, shape, diagonal=0, scale=1.0,
):
    x = np.random.rand(*shape)
    # A different dtype would insert a cast op and cause the fusion pass to fail.
    tensor_x = flow.tensor(x, device="cuda", dtype=flow.float32)
    eager_out = flow.tril(tensor_x, diagonal) * scale

    class TestFuseScaleTril(flow.nn.Graph):
        def __init__(self):
            super().__init__()

        def build(self):
            return flow.tril(tensor_x * scale, diagonal)

    lazy_out_0 = TestFuseScaleTril()()
    test_case.assertTrue(np.allclose(eager_out.numpy(), lazy_out_0.numpy()))

    class TestFuseTrilScale(flow.nn.Graph):
        def __init__(self):
            super().__init__()

        def build(self):
            return flow.tril(tensor_x, diagonal) * scale

    lazy_out_1 = TestFuseTrilScale()()
    test_case.assertTrue(np.allclose(eager_out.numpy(), lazy_out_1.numpy()))
def __init__(self, config):
    super(GPT2Attention, self).__init__()
    max_positions = config.max_position_embeddings
    # Lower-triangular causal mask, shaped (1, 1, max_positions, max_positions).
    self.register_buffer(
        "bias",
        flow.tril(
            flow.ones((max_positions, max_positions), dtype=flow.int8)
        ).view(1, 1, max_positions, max_positions),
    )
    self.register_buffer("masked_bias", flow.tensor(-1e4))

    self.embed_dim = config.hidden_size
    self.num_heads = config.num_attention_heads
    assert self.embed_dim % self.num_heads == 0
    self.head_dim = self.embed_dim // self.num_heads
    self.scale_attn_weights = config.scale_attn_weights

    self.c_attn = Conv1D(self.embed_dim * 3, self.embed_dim)
    self.c_proj = Conv1D(self.embed_dim, self.embed_dim)

    self.attn_dropout = nn.Dropout(config.attn_pdrop)
    self.resid_dropout = nn.Dropout(config.resid_pdrop)
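For context, here is a sketch of how the registered `bias` and `masked_bias` buffers are typically consumed when computing attention scores. The `_attn_sketch` helper and its exact masking logic are assumptions modeled on the standard GPT-2 attention pattern, not code from the file above.

# Hypothetical helper (not from the original file): one common way the
# lower-triangular "bias" buffer and "masked_bias" fill value are applied.
def _attn_sketch(self, query, key, value):
    # Raw attention scores: (batch, heads, q_len, k_len).
    attn_weights = flow.matmul(query, key.transpose(-1, -2))
    if self.scale_attn_weights:
        attn_weights = attn_weights / (value.size(-1) ** 0.5)
    q_len, k_len = query.size(-2), key.size(-2)
    # Slice the precomputed causal mask to the current sequence lengths.
    causal_mask = self.bias[:, :, k_len - q_len : k_len, :k_len].to(flow.bool)
    # Keep scores on and below the diagonal; fill the rest with masked_bias.
    attn_weights = flow.where(
        causal_mask, attn_weights, self.masked_bias.to(attn_weights.dtype)
    )
    attn_weights = flow.softmax(attn_weights, dim=-1)
    attn_weights = self.attn_dropout(attn_weights)
    return flow.matmul(attn_weights, value)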
def _tril(self, diagonal=0):
    return flow.tril(self, diagonal=diagonal)
def get_seq_mask(targets):
    batch_size, steps = targets.size()
    # Lower-triangular mask: each step may attend only to itself and earlier steps.
    seq_mask = flow.ones([batch_size, steps, steps], device=targets.device)
    seq_mask = flow.tril(seq_mask).to(flow.int8)
    return seq_mask
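A quick usage sketch; the toy shapes below are made up for illustration.

# Illustrative only: a batch of 2 target sequences, 5 steps each.
targets = flow.randint(0, 100, (2, 5))
mask = get_seq_mask(targets)
# mask has shape (2, 5, 5); mask[b, i, j] == 1 exactly when j <= i.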