Example #1
    def __init__(self, d_model, nhead, dim_feedforward, dropout=0.1,
                 activation="relu", normalize_before=False,
                 weight_attr=None, bias_attr=None):
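        # Capture the constructor arguments so the layer's configuration can be
        # inspected or re-created later.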
        self._config = locals()
        self._config.pop("self")
        self._config.pop("__class__", None)  # py3

        super(TransformerDecoderLayer, self).__init__()
        self.normalize_before = normalize_before

        weight_attrs = _convert_param_attr_to_list(weight_attr, 3)
        bias_attrs = _convert_param_attr_to_list(bias_attr, 3)

        self.nhead = nhead
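        # Sub-layer 1: self-attention over the decoder input.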
        self.self_attn = MultiHeadAttention(d_model, nhead, dropout=dropout,
                                            weight_attr=weight_attrs[0], bias_attr=bias_attrs[0])
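        # Sub-layer 2: cross-attention over the encoder output (memory).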
        self.multihead_attn = MultiHeadAttention(d_model, nhead, dropout=dropout,
                                             weight_attr=weight_attrs[1], bias_attr=bias_attrs[1])
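        # Sub-layer 3: position-wise feed-forward network
        # (linear1 -> activation -> dropout -> linear2).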
        self.linear1 = Linear(d_model, dim_feedforward, 
                              weight_attrs[2], bias_attr=bias_attrs[2])
        self.dropout = Dropout(dropout, mode="upscale_in_train")
        self.linear2 = Linear(dim_feedforward, d_model, 
                              weight_attrs[2], bias_attr=bias_attrs[2])
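        # One LayerNorm and one residual Dropout per sub-layer; normalize_before
        # selects pre-norm vs. post-norm placement.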
        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)
        self.norm3 = LayerNorm(d_model)
        self.dropout1 = Dropout(dropout, mode="upscale_in_train")
        self.dropout2 = Dropout(dropout, mode="upscale_in_train")
        self.dropout3 = Dropout(dropout, mode="upscale_in_train")
        self.activation = getattr(F, activation)
Example #2
    def setUp(self):
        self.config()
        self.generate_input_data()
        paddle.set_default_dtype(self.x_type)
        self.__class__.op_type = "fused_attention"
        # use autograd to check grad in this unittest.
        self.__class__.no_need_check_grad = True
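        # Plain (non-fused) Q/K/V and output projections, presumably used to build
        # the reference result that the fused_attention op is checked against.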
        self.q_proj = Linear(self.embed_dim,
                             self.embed_dim,
                             self.weight_attr,
                             bias_attr=self.bias_attr)
        self.k_proj = Linear(self.kdim,
                             self.embed_dim,
                             self.weight_attr,
                             bias_attr=self.bias_attr)
        self.v_proj = Linear(self.vdim,
                             self.embed_dim,
                             self.weight_attr,
                             bias_attr=self.bias_attr)
        self.out_proj = Linear(self.embed_dim,
                               self.embed_dim,
                               self.weight_attr,
                               bias_attr=self.bias_attr)
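        # LayerNorm parameters are created in float32 even when the test dtype is
        # float16, presumably to match the fused kernel's layer-norm precision.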
        paddle.set_default_dtype(np.float32)
        self.norm1 = LayerNorm(self.embed_dim)
        self.norm2 = LayerNorm(self.embed_dim)
        paddle.set_default_dtype(self.x_type)
        self.dropout = Dropout(self.dropout_prob, mode="upscale_in_train")
Example #3
    def setUp(self):
        self.config()
        self.generate_input_data()

        self.rtol = 1e-5
        # FIXME(wangxi): Because there is a problem with the test precision
        #  on A100, atol is temporarily set to 1e-2, and it will be
        #  changed back after the precision problem is solved.
        self.atol = 1e-2
        # keep a stricter tolerance for local development on V100
        if "V100" in paddle.device.cuda.get_device_name():
            self.atol = 1e-4
        if self.x_type is np.float16:
            self.atol = 1e-1

        paddle.set_default_dtype(self.x_type)
        self.__class__.op_type = "fused_multi_transformer"
        # use autograd to check grad in this unittest.
        self.__class__.no_need_check_grad = False

        bias_attr = paddle.fluid.ParamAttr(
            initializer=paddle.fluid.initializer.Constant(value=0.0005))
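        # q_proj gets a bias initialized to a small constant; the remaining
        # projections use self.bias_attr.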
        self.q_proj = Linear(self.embed_dim,
                             self.embed_dim,
                             self.weight_attr,
                             bias_attr=bias_attr)
        # (bias_attr=self.bias_attr is not used for q_proj above.)

        self.k_proj = Linear(self.kdim,
                             self.embed_dim,
                             self.weight_attr,
                             bias_attr=self.bias_attr)
        self.v_proj = Linear(self.vdim,
                             self.embed_dim,
                             self.weight_attr,
                             bias_attr=self.bias_attr)
        self.out_proj = Linear(self.embed_dim,
                               self.embed_dim,
                               self.weight_attr,
                               bias_attr=self.bias_attr)
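        # The two FFN projections below expand to 4 * embed_dim and project back.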

        self.ffn1_proj = Linear(self.embed_dim,
                                4 * self.embed_dim,
                                self.weight_attr,
                                bias_attr=self.bias_attr)
        self.ffn2_proj = Linear(4 * self.embed_dim,
                                self.embed_dim,
                                self.weight_attr,
                                bias_attr=self.bias_attr)

        paddle.set_default_dtype(np.float32)
        self.norm = LayerNorm(self.embed_dim)
        self.ffn_norm = LayerNorm(self.embed_dim)

        paddle.set_default_dtype(self.x_type)
        self.dropout = Dropout(self.dropout_prob, mode="upscale_in_train")
        self.activation = getattr(F, self.act_method)
Example #4
    def setUp(self):
        self.config()
        self.generate_input_data()
        paddle.set_default_dtype(self.x_type)
        self.__class__.op_type = "fused_bias_dropout_residual_layer_norm"
        # use autograd to check grad in this unittest.
        self.__class__.no_need_check_grad = True
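        # Reference layers: a single LayerNorm (created in float32) and a Dropout,
        # mirroring the bias + dropout + residual + layer_norm pattern the fused op computes.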
        paddle.set_default_dtype(np.float32)
        self.norm1 = LayerNorm(self.embed_dim)
        paddle.set_default_dtype(self.x_type)
        self.dropout = Dropout(self.dropout_prob, mode="upscale_in_train")
Example #5
    def setUp(self):
        paddle.disable_static()
        self.__class__.op_type = "fused_feedforward"
        # check grad in test_out_and_grad()
        self.__class__.no_need_check_grad = True
        self.getDtype()
        self.getShape()
        self.getDiff()
        self.getActivation()
        self.getNormalizeBefore()
        paddle.set_default_dtype(self.dtype)
        self.weight_attr = None
        self.bias_attr = None

        self.weight_attrs = transformer._convert_param_attr_to_list(
            self.weight_attr, 2)
        self.bias_attrs = transformer._convert_param_attr_to_list(
            self.bias_attr, 2)
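        # Reference feed-forward block: d_model -> dim_feedforward -> d_model.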
        self.linear1 = Linear(self.d_model,
                              self.dim_feedforward,
                              self.weight_attrs[1],
                              bias_attr=self.bias_attrs[1])
        self.linear2 = Linear(self.dim_feedforward,
                              self.d_model,
                              self.weight_attrs[1],
                              bias_attr=self.bias_attrs[1])

        paddle.set_default_dtype(self.layer_norm_dtype)
        self.norm1 = LayerNorm(self.d_model)
        self.norm2 = LayerNorm(self.d_model)
        self.dropout = Dropout(0.0, mode="upscale_in_train")
        self.dropout1 = Dropout(0.0, mode="upscale_in_train")
        self.dropout2 = Dropout(0.0, mode="upscale_in_train")
        self.activation = getattr(F, self.act_method)
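        # Random input and upstream gradient (dout), presumably consumed by the
        # forward/backward comparison against the fused_feedforward op.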

        self.src = np.random.random((self.batch_size, self.query_length,
                                     self.d_model)).astype(self.dtype)
        self.dout = np.random.random((self.batch_size, self.query_length,
                                      self.d_model)).astype(self.dtype)
Example #6
    def setUp(self):
        self.config()
        self.generate_input_data()

        self.rtol = 1e-5
        # FIXME(limin29): Because there is a problem with the test precision
        #  on A100, atol is temporarily set to 1e-2, and it will be
        #  changed back after the precision problem is solved.
        self.atol = 1e-2
        # keep a stricter tolerance for local development on V100
        if "V100" in paddle.device.cuda.get_device_name():
            self.atol = 1e-4
        if self.x_type is np.float16:
            self.atol = 1e-1

        paddle.set_default_dtype(self.x_type)
        self.__class__.op_type = "fused_attention"
        # use autograd to check grad in this unittest.
        self.__class__.no_need_check_grad = True
        self.q_proj = Linear(self.embed_dim,
                             self.embed_dim,
                             self.weight_attr,
                             bias_attr=self.bias_attr)
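        # kdim/vdim may differ from embed_dim, so k_proj and v_proj get their own input sizes.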
        self.k_proj = Linear(self.kdim,
                             self.embed_dim,
                             self.weight_attr,
                             bias_attr=self.bias_attr)
        self.v_proj = Linear(self.vdim,
                             self.embed_dim,
                             self.weight_attr,
                             bias_attr=self.bias_attr)
        self.out_proj = Linear(self.embed_dim,
                               self.embed_dim,
                               self.weight_attr,
                               bias_attr=self.bias_attr)
        paddle.set_default_dtype(np.float32)
        self.norm1 = LayerNorm(self.embed_dim)
        self.norm2 = LayerNorm(self.embed_dim)
        paddle.set_default_dtype(self.x_type)
        self.dropout = Dropout(self.dropout_prob, mode="upscale_in_train")