import numpy as np
import paddle
import paddle.nn.functional as F
import paddle.incubate.nn.functional as incubate_f
from paddle.static import default_main_program


def check_main(self, x_np, weight_np, bias_np, dtype):
    # Run layer_norm in dygraph mode with scale/bias cast to `dtype`,
    # then return the output and gradients as float32 numpy arrays.
    paddle.disable_static()
    weight_np = weight_np.astype(dtype)
    bias_np = bias_np.astype(dtype)
    x = paddle.to_tensor(x_np)
    weight = paddle.to_tensor(weight_np)
    bias = paddle.to_tensor(bias_np)
    x.stop_gradient = False
    weight.stop_gradient = False
    bias.stop_gradient = False
    y = F.layer_norm(x, x.shape[1:], weight, bias)
    x_g, w_g, b_g = paddle.grad(y, [x, weight, bias])
    y_np = y.numpy().astype('float32')
    x_g_np = x_g.numpy().astype('float32')
    w_g_np = w_g.numpy().astype('float32')
    b_g_np = b_g.numpy().astype('float32')
    paddle.enable_static()
    return y_np, x_g_np, w_g_np, b_g_np
def check_main(self, x_np, weight_np, bias_np, dtype):
    paddle.disable_static()
    x = paddle.to_tensor(x_np)
    weight = paddle.to_tensor(weight_np)
    bias = paddle.to_tensor(bias_np)
    if dtype == "bfloat16":
        x = x.cast(paddle.fluid.core.VarDesc.VarType.BF16)
    x.stop_gradient = False
    weight.stop_gradient = False
    bias.stop_gradient = False
    y = F.layer_norm(x, x.shape[1:], weight, bias)
    x_g, w_g, b_g = paddle.grad(y, [x, weight, bias])
    y_np = y.cast('float32').numpy()
    x_g_np = x_g.cast('float32').numpy()
    w_g_np = w_g.cast('float32').numpy()
    b_g_np = b_g.cast('float32').numpy()
    paddle.enable_static()
    return y_np, x_g_np, w_g_np, b_g_np
def test_static(self):
    # Compare fused_feedforward against a reference FFN built from
    # separate linear / dropout / layer_norm ops in static graph mode.
    paddle.enable_static()
    default_main_program().random_seed = 42
    dtype = "float32"
    layer_norm_dtype = "float32"
    batch_size = 1
    d_model = 8
    dim_feedforward = 8

    x = paddle.static.data(
        name='x', shape=[batch_size, d_model, dim_feedforward], dtype=dtype)
    linear1_weight = paddle.static.data(
        name='linear1_weight', shape=[d_model, dim_feedforward], dtype=dtype)
    linear1_bias = paddle.static.data(
        name='linear1_bias', shape=[dim_feedforward])
    linear2_weight = paddle.static.data(
        name='linear2_weight', shape=[dim_feedforward, d_model], dtype=dtype)
    linear2_bias = paddle.static.data(name='linear2_bias', shape=[d_model])
    ln1_scale = paddle.static.data(name='ln1_scale', shape=[d_model])
    ln1_bias = paddle.static.data(name='ln1_bias', shape=[d_model])
    ln2_scale = paddle.static.data(name='ln2_scale', shape=[d_model])
    ln2_bias = paddle.static.data(name='ln2_bias', shape=[d_model])

    fused_out = incubate_f.fused_feedforward(
        x,
        linear1_weight,
        linear2_weight,
        linear1_bias,
        linear2_bias,
        ln1_scale,
        ln1_bias,
        ln2_scale,
        ln2_bias,
        0.0,
        0.0,
        activation="relu",
        pre_layer_norm=False)

    ######base ffn######
    linear1_out = F.linear(x, linear1_weight, linear1_bias)
    act_out = F.relu(linear1_out)
    dropout1_out = F.dropout(x=act_out, p=0.0, training=False)
    linear2_out = F.linear(dropout1_out, linear2_weight, linear2_bias)
    dropout2_out = x + F.dropout(x=linear2_out, p=0.0, training=False)
    ln_out = F.layer_norm(
        dropout2_out,
        normalized_shape=[d_model],
        weight=ln2_scale,
        bias=ln2_bias)
    ######base ffn######

    exe = paddle.static.Executor(paddle.CUDAPlace(0))

    x_data = np.random.random(
        (batch_size, d_model, dim_feedforward)).astype(dtype)
    linear1_weight_data = np.random.random(
        (d_model, dim_feedforward)).astype(dtype)
    linear1_bias_data = np.zeros((dim_feedforward)).astype(dtype)
    linear2_weight_data = np.random.random(
        (dim_feedforward, d_model)).astype(dtype)
    linear2_bias_data = np.zeros((d_model)).astype(dtype)
    ln1_scale_data = np.ones((d_model)).astype(layer_norm_dtype)
    ln1_bias_data = np.zeros((d_model)).astype(layer_norm_dtype)
    ln2_scale_data = np.ones((d_model)).astype(layer_norm_dtype)
    ln2_bias_data = np.zeros((d_model)).astype(layer_norm_dtype)

    res_list = [fused_out, ln_out]
    real_res = []
    for res in res_list:
        fetch = exe.run(
            feed={
                'x': x_data,
                'linear1_weight': linear1_weight_data,
                'linear1_bias': linear1_bias_data,
                'linear2_weight': linear2_weight_data,
                'linear2_bias': linear2_bias_data,
                'ln1_scale': ln1_scale_data,
                'ln1_bias': ln1_bias_data,
                'ln2_scale': ln2_scale_data,
                'ln2_bias': ln2_bias_data
            },
            fetch_list=[res])
        real_res.append(fetch)
    self.assertTrue(
        np.allclose(real_res[0], real_res[1], atol=1e-3),
        "fused_feedforward output does not match the reference FFN output")