    def __init__(self, num_layers, d_model, heads, d_ff, copy_attn,
                 self_attn_type, dropout, attention_dropout, embeddings,
                 max_relative_positions, aan_useffn, full_context_alignment,
                 alignment_layer, alignment_heads, args):
        super(TransformerDecoder, self).__init__()

        self.args = args
        self.embeddings = embeddings

        # Decoder State
        self.state = {}

        self.transformer_layers = nn.ModuleList([
            TransformerDecoderLayer(
                d_model,
                heads,
                d_ff,
                dropout,
                attention_dropout,
                self_attn_type=self_attn_type,
                max_relative_positions=max_relative_positions,
                aan_useffn=aan_useffn,
                full_context_alignment=full_context_alignment,
                alignment_heads=alignment_heads) for i in range(num_layers)
        ])

        # previously, there was a GlobalAttention module here for copy
        # attention. But it was never actually used -- the "copy" attention
        # just reuses the context attention.
        self._copy = copy_attn
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)

        self.alignment_layer = alignment_layer
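
A minimal construction sketch for the decoder above. The hyperparameter values, the nn.Embedding stand-in for the ONMT embeddings module, and args=None are illustrative assumptions, not values taken from the original code:

import torch.nn as nn

# Stand-in for the ONMT embeddings module; __init__ above only stores it.
emb = nn.Embedding(32000, 512, padding_idx=1)
decoder = TransformerDecoder(
    num_layers=6, d_model=512, heads=8, d_ff=2048, copy_attn=False,
    self_attn_type="scaled-dot", dropout=0.1, attention_dropout=0.1,
    embeddings=emb, max_relative_positions=0, aan_useffn=False,
    full_context_alignment=False, alignment_layer=-3, alignment_heads=0,
    args=None)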
    def __init__(self, layer_num, head_num, head_size, weights):
        super().__init__()
        self.layer_num = layer_num
        self.hidden_dim = head_num * head_size
        self.decoders = torch.nn.ModuleList()
        for i in range(layer_num):
            self.decoders.append(TransformerDecoderLayer(self.hidden_dim, head_num, 4 * self.hidden_dim, 0, 0))
        for i in range(layer_num):
            self.decoders[i].layer_norm_1.weight.data = weights.w[i][0]
            self.decoders[i].layer_norm_1.bias.data = weights.w[i][1]
            self.decoders[i].self_attn.linear_query.weight.data = weights.w[i][2].transpose(-1, -2).contiguous()
            self.decoders[i].self_attn.linear_keys.weight.data = weights.w[i][3].transpose(-1, -2).contiguous()
            self.decoders[i].self_attn.linear_values.weight.data = weights.w[i][4].transpose(-1, -2).contiguous()
            self.decoders[i].self_attn.linear_query.bias.data = weights.w[i][5]
            self.decoders[i].self_attn.linear_keys.bias.data = weights.w[i][6]
            self.decoders[i].self_attn.linear_values.bias.data = weights.w[i][7]
            self.decoders[i].self_attn.final_linear.weight.data = weights.w[i][8].transpose(-1, -2).contiguous()
            self.decoders[i].self_attn.final_linear.bias.data = weights.w[i][9]
            self.decoders[i].layer_norm_2.weight.data = weights.w[i][10]
            self.decoders[i].layer_norm_2.bias.data = weights.w[i][11]
            self.decoders[i].context_attn.linear_query.weight.data = weights.w[i][12].transpose(-1, -2).contiguous()
            self.decoders[i].context_attn.linear_keys.weight.data = weights.w[i][13].transpose(-1, -2).contiguous()
            self.decoders[i].context_attn.linear_values.weight.data = weights.w[i][14].transpose(-1, -2).contiguous()
            self.decoders[i].context_attn.linear_query.bias.data = weights.w[i][15]
            self.decoders[i].context_attn.linear_keys.bias.data = weights.w[i][16]
            self.decoders[i].context_attn.linear_values.bias.data = weights.w[i][17]
            self.decoders[i].context_attn.final_linear.weight.data = weights.w[i][18].transpose(-1, -2).contiguous()
            self.decoders[i].context_attn.final_linear.bias.data = weights.w[i][19]
            self.decoders[i].feed_forward.layer_norm.weight.data = weights.w[i][20]
            self.decoders[i].feed_forward.layer_norm.bias.data = weights.w[i][21]
            self.decoders[i].feed_forward.w_1.weight.data = weights.w[i][22].transpose(-1, -2).contiguous()
            self.decoders[i].feed_forward.w_1.bias.data = weights.w[i][23]
            self.decoders[i].feed_forward.w_2.weight.data = weights.w[i][24].transpose(-1, -2).contiguous()
            self.decoders[i].feed_forward.w_2.bias.data = weights.w[i][25]
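
The loader above assumes a flat layout of 26 tensors per layer in weights.w[i]: indices 0-1 are layer_norm_1, 2-9 the self-attention block, 10-11 layer_norm_2, 12-19 the context attention block, and 20-25 the feed-forward block. A hypothetical sanity check along these lines (check_weight_layout is illustrative and not part of the original code) can catch a mismatched checkpoint before the assignments run:

def check_weight_layout(weights, layer_num, hidden_dim):
    # Hypothetical helper: verify the flat 26-tensor-per-layer layout used above.
    for i in range(layer_num):
        tensors = weights.w[i]
        assert len(tensors) == 26, \
            f"layer {i}: expected 26 tensors, got {len(tensors)}"
        # LayerNorm weights (indices 0, 10, 20) should hold hidden_dim elements each.
        for idx in (0, 10, 20):
            assert tensors[idx].numel() == hidden_dim, \
                f"layer {i}, tensor {idx}: expected {hidden_dim} elements"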
    @staticmethod
    def from_onmt(transformer_decoder_layer: OnmtTransformerDecoderLayer):
        params = {
            k: v
            for k, v in transformer_decoder_layer.named_parameters()
        }
        # for k, v in transformer_decoder_layer.named_parameters():
        #     print(k, v.size())

        # 12: self_attn.linear_keys.weight torch.Size([1024, 1024])
        # 12: self_attn.linear_keys.bias torch.Size([1024])
        # 12: self_attn.linear_values.weight torch.Size([1024, 1024])
        # 12: self_attn.linear_values.bias torch.Size([1024])
        # 12: self_attn.linear_query.weight torch.Size([1024, 1024])
        # 12: self_attn.linear_query.bias torch.Size([1024])
        # 12: self_attn.final_linear.weight torch.Size([1024, 1024])
        # 12: self_attn.final_linear.bias torch.Size([1024])
        # 12: context_attn.linear_keys.weight torch.Size([1024, 1024])
        # 12: context_attn.linear_keys.bias torch.Size([1024])
        # 12: context_attn.linear_values.weight torch.Size([1024, 1024])
        # 12: context_attn.linear_values.bias torch.Size([1024])
        # 12: context_attn.linear_query.weight torch.Size([1024, 1024])
        # 12: context_attn.linear_query.bias torch.Size([1024])
        # 12: context_attn.final_linear.weight torch.Size([1024, 1024])
        # 12: context_attn.final_linear.bias torch.Size([1024])
        # 12: feed_forward.w_1.weight torch.Size([1, 1024])
        # 12: feed_forward.w_1.bias torch.Size([1])
        # 12: feed_forward.w_2.weight torch.Size([1024, 1])
        # 12: feed_forward.w_2.bias torch.Size([1024])
        # 12: feed_forward.layer_norm.weight torch.Size([1024])
        # 12: feed_forward.layer_norm.bias torch.Size([1024])
        # 12: layer_norm_1.weight torch.Size([1024])
        # 12: layer_norm_1.bias torch.Size([1024])
        # 12: layer_norm_2.weight torch.Size([1024])
        # 12: layer_norm_2.bias torch.Size([1024])
        # 12: w_1.weight torch.Size([1, 1024])
        # 12: w_1.bias torch.Size([1])
        # 12: w_2.weight torch.Size([1024, 1])
        # 12: w_2.bias torch.Size([1024])
        # 12: layer_norm.weight torch.Size([1024])
        # 12: layer_norm.bias torch.Size([1024])

        self_attn = MultiHeadedAttention.from_onmt(
            transformer_decoder_layer.self_attn,
            transformer_decoder_layer.layer_norm_1)
        context_attn = MultiHeadedAttention.from_onmt(
            transformer_decoder_layer.context_attn,
            transformer_decoder_layer.layer_norm_2)
        feed_forward = PositionwiseFeedForward.from_onmt(
            transformer_decoder_layer.feed_forward)

        return TransformerDecoderLayer(self_attn, context_attn, feed_forward)
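
Illustrative use of the converter above, run in the same module that defines it. It assumes OnmtTransformerDecoderLayer is OpenNMT-py's onmt.decoders.transformer.TransformerDecoderLayer (its constructor arguments mirror the ones used in the test further below); the sizes are placeholders:

# Build an ONMT decoder layer in eval mode and convert it; values are placeholders.
onmt_layer = OnmtTransformerDecoderLayer(d_model=512, heads=8, d_ff=2048,
                                         dropout=0.0, attention_dropout=0.0)
onmt_layer.eval()
converted_layer = TransformerDecoderLayer.from_onmt(onmt_layer)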
Example #5
    def __init__(self, num_layers, d_model, heads, head_size, d_ff, copy_attn,
                 self_attn_type, dropout, attention_dropout, embeddings,
                 max_relative_positions, aan_useffn, full_context_alignment,
                 alignment_layer, alignment_heads, args):
        super(TransformerDecoder, self).__init__()

        self.args = args
        if not self.args.model_type:
            raise ValueError("No model_type was supplied.")
        self.embeddings = embeddings

        # relevant to custom cache config
        # self.use_batch_major_op_cache = False
        # self.op_cache_dim_x = 1
        self.is_fp16 = (self.args.data_type == 'fp16')
        self.use_batch_major_op_cache, self.op_cache_dim_x = get_op_cache_config(
            head_size, self.is_fp16)
        self.head_num = heads
        self.size_per_head = head_size

        # Decoder State
        self.state = {}

        self.transformer_layers = nn.ModuleList([
            TransformerDecoderLayer(
                d_model,
                heads,
                d_ff,
                dropout,
                attention_dropout,
                self_attn_type=self_attn_type,
                max_relative_positions=max_relative_positions,
                aan_useffn=aan_useffn,
                full_context_alignment=full_context_alignment,
                alignment_heads=alignment_heads) for i in range(num_layers)
        ])

        # previously, there was a GlobalAttention module here for copy
        # attention. But it was never actually used -- the "copy" attention
        # just reuses the context attention.
        self._copy = copy_attn
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)

        self.alignment_layer = alignment_layer
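
Compared with the first variant, this constructor additionally takes head_size and reads model_type and data_type off args (and it needs get_op_cache_config from the surrounding module). A minimal construction sketch; all values are placeholders and args is faked with argparse.Namespace:

import argparse
import torch.nn as nn

# model_type just has to be truthy here; data_type selects the fp16 cache layout.
args = argparse.Namespace(model_type="transformer", data_type="fp16")
emb = nn.Embedding(32000, 512, padding_idx=1)
decoder = TransformerDecoder(
    num_layers=6, d_model=512, heads=8, head_size=64, d_ff=2048,
    copy_attn=False, self_attn_type="scaled-dot", dropout=0.1,
    attention_dropout=0.1, embeddings=emb, max_relative_positions=0,
    aan_useffn=False, full_context_alignment=False, alignment_layer=-3,
    alignment_heads=0, args=args)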
Example #6
    def __init__(self, config: ModelConfig, data_config: DataConfig, encoder_embeddings):
        super().__init__()

        self.embeddings = Embedding(num_embeddings=data_config.output_translation_vocabulary_sizes[0][0],
                                    embedding_dim=config.encoder_output_size,
                                    padding_idx=pad_token_index)

        self.positional_encoding = PositionalEncoding(config.encoder_output_size)

        if config.decoder_translation_scale_embeddings:
            self.embeddings_scale = math.sqrt(float(config.encoder_output_size))
        else:
            self.embeddings_scale = None

        self.dropout = Dropout(config.decoder_translation_transformer_dropout)

        if config.decoder_translation_share_encoder_embeddings:
            assert self.embeddings.weight.shape == encoder_embeddings.get_lut_embeddings().weight.shape
            self.embeddings.weight = encoder_embeddings.get_lut_embeddings().weight

        self.transformer_layers = ModuleList([TransformerDecoderLayer(d_model=config.encoder_output_size,
                                                                      heads=config.decoder_translation_transformer_heads,
                                                                      d_ff=config.decoder_translation_transformer_hidden_size,
                                                                      dropout=config.decoder_translation_transformer_dropout,
                                                                      attention_dropout=config.decoder_translation_transformer_dropout)
                                              for _ in range(config.decoder_translation_transformer_layers)])

        self.layer_norm = LayerNorm(config.encoder_output_size, eps=1e-6)

        self.linear: Linear = Linear(in_features=config.encoder_output_size, out_features=data_config.output_translation_vocabulary_sizes[0][0])

        if config.decoder_translation_share_embeddings:
            self.linear.weight = self.embeddings.weight

        self.linear_features = None
        if data_config.output_translation_features > 1:
            self.linear_features = ModuleList([Linear(in_features=config.encoder_output_size,
                                                      out_features=data_config.output_translation_vocabulary_sizes[0][i])
                                               for i in range(1, data_config.output_translation_features)])

        self.max_seq_out_len = 150
        self.beam_size = 1
        self.state = {}
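
The embedding-sharing branches above tie Parameter objects rather than copying tensors. A self-contained illustration of that weight tying, with toy sizes independent of the class above:

from torch.nn import Embedding, Linear

emb = Embedding(num_embeddings=100, embedding_dim=16, padding_idx=0)
proj = Linear(in_features=16, out_features=100)
# Assign the Parameter itself, as in the constructor above: both modules now
# share one (100, 16) weight matrix, and gradients from both accumulate into it.
proj.weight = emb.weight
assert proj.weight.data_ptr() == emb.weight.data_ptr()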
    class TestDecoder(unittest.TestCase):
        def init_data(self, use_cuda):
            self.test_device = torch.device('cuda:0') if use_cuda else \
                   torch.device('cpu:0')
            if not use_cuda:
                torch.set_num_threads(4)
                turbo_transformers.set_num_threads(4)

            torch.set_grad_enabled(False)
            self.model_dim = 1024
            self.onmt_decoder = TransformerDecoderLayer(d_model=self.model_dim,
                                                        heads=8,
                                                        d_ff=1024,
                                                        dropout=0.,
                                                        attention_dropout=0.)
            self.onmt_decoder.eval()
            if use_cuda:
                self.onmt_decoder.to(self.test_device)
            self.turbo_decoder = turbo_transformers.TransformerDecoderLayer.from_onmt(
                self.onmt_decoder)

            # https://pytorch.org/docs/stable/quantization.html
            if with_quantize_dynamic and not use_cuda:
                self.quantized_onmt_decoder = torch.quantization.quantize_dynamic(
                    self.onmt_decoder)

        def check_torch_and_turbo(self, use_cuda, num_iter=1):
            device_type = "GPU" if use_cuda else "CPU"
            info = f"\"({device_type}, {batch_size}, {src_length}, {T})\""

            step = 2
            self.init_data(use_cuda=use_cuda)

            self.inputs = torch.rand(batch_size,
                                     T,
                                     self.model_dim,
                                     dtype=torch.float32,
                                     device=self.test_device)
            self.memory_bank = torch.rand(batch_size,
                                          src_length,
                                          self.model_dim,
                                          dtype=torch.float32,
                                          device=self.test_device)

            self.src_pad_mask = torch.zeros(batch_size,
                                            1,
                                            src_length,
                                            dtype=torch.float32,
                                            device=self.test_device).bool()
            self.tgt_pad_mask = torch.zeros(batch_size,
                                            1,
                                            T,
                                            dtype=torch.float32,
                                            device=self.test_device).bool()

            onmt_model = lambda: self.onmt_decoder(self.inputs,
                                                   self.memory_bank,
                                                   self.src_pad_mask,
                                                   self.tgt_pad_mask,
                                                   layer_cache=None,
                                                   step=step,
                                                   future=False)

            onmt_result, torch_qps, torch_time_consume = \
                test_helper.run_model(onmt_model, use_cuda, num_iter)

            onmt_mid, attns, attn_align = onmt_result

            print(
                f"ONMT Decoder {info} ",
                f"{device_type} QPS, {torch_qps}, time, {torch_time_consume}")

            if with_quantize_dynamic and not use_cuda:
                quantized_onmt_model = lambda: self.quantized_onmt_decoder(
                    self.inputs,
                    self.memory_bank,
                    self.src_pad_mask,
                    self.tgt_pad_mask,
                    layer_cache=None,
                    step=step,
                    future=False)

                quantized_onmt_result, quantized_torch_qps, quantized_torch_time_consume = \
                    test_helper.run_model(quantized_onmt_model, use_cuda, num_iter)

                quantized_onmt_mid, quantized_attns, quantized_attn_align = quantized_onmt_result

                print(
                    f"ONMT Quantized Decoder {info} ",
                    f"{device_type} QPS, {quantized_torch_qps}, time, {quantized_torch_time_consume}"
                )

                # print(onmt_mid)
                # print(quantized_onmt_mid)

                # self.assertTrue(
                #     torch.max(torch.abs(onmt_mid -
                #                         quantized_onmt_mid)) < (1e-3 if use_cuda else 1e-4))
                # self.assertTrue(
                #     torch.max(torch.abs(attns - quantized_attns)) < (
                #         1e-3 if use_cuda else 1e-4))

            turbo_model = lambda: self.turbo_decoder(self.inputs,
                                                     self.memory_bank,
                                                     self.src_pad_mask,
                                                     self.tgt_pad_mask,
                                                     layer_cache=None,
                                                     step=step,
                                                     future=False)

            with turbo_transformers.pref_guard(info) as perf:
                turbo_result, turbo_qps, turbo_time_consume = \
                    test_helper.run_model(turbo_model, use_cuda, num_iter)

            turbo_mid, turbo_attns, _ = turbo_result

            print(
                f"Turbo Decoder {info} ",
                f"{device_type} QPS, {turbo_qps}, time, {turbo_time_consume}")

            self.assertTrue(
                torch.max(torch.abs(onmt_mid -
                                    turbo_mid)) < (1e-3 if use_cuda else 1e-4))
            self.assertTrue(
                torch.max(torch.abs(attns - turbo_attns)) < (
                    1e-3 if use_cuda else 1e-4))

            if with_quantize_dynamic and not use_cuda:
                with open(fname, "a") as fh:
                    fh.write(
                        f"{info} {torch_qps}, {quantized_torch_qps}, {turbo_qps}\n"
                    )
            else:
                with open(fname, "a") as fh:
                    fh.write(f"{info} {torch_qps}, {turbo_qps}\n")

        def test_decoder(self):
            self.check_torch_and_turbo(use_cuda=False)
            if torch.cuda.is_available() and \
                turbo_transformers.config.is_compiled_with_cuda():
                self.check_torch_and_turbo(use_cuda=True)
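
The test above references several module-level names (batch_size, src_length, T, with_quantize_dynamic, fname, test_helper) that are defined elsewhere in the original file, and it assumes the ONMT layer import. To run this fragment on its own, one plausible, purely illustrative setup would be:

import unittest
import torch
import turbo_transformers
import test_helper  # benchmark helper shipped alongside the original tests
from onmt.decoders.transformer import TransformerDecoderLayer  # the ONMT layer built in init_data

# Illustrative values only; the original file sets these externally.
batch_size, src_length, T = 4, 40, 1
with_quantize_dynamic = False   # set True on CPU to also time the quantized path
fname = "decoder_results.txt"   # illustrative output file for the QPS numbers

if __name__ == '__main__':
    unittest.main()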