def __init__(self, num_layers, d_model, heads, d_ff, copy_attn,
             self_attn_type, dropout, attention_dropout, embeddings,
             max_relative_positions, aan_useffn, full_context_alignment,
             alignment_layer, alignment_heads, args):
    super(TransformerDecoder, self).__init__()
    self.args = args
    self.embeddings = embeddings

    # Decoder State
    self.state = {}

    self.transformer_layers = nn.ModuleList([
        TransformerDecoderLayer(
            d_model, heads, d_ff, dropout, attention_dropout,
            self_attn_type=self_attn_type,
            max_relative_positions=max_relative_positions,
            aan_useffn=aan_useffn,
            full_context_alignment=full_context_alignment,
            alignment_heads=alignment_heads)
        for i in range(num_layers)
    ])

    # previously, there was a GlobalAttention module here for copy
    # attention. But it was never actually used -- the "copy" attention
    # just reuses the context attention.
    self._copy = copy_attn
    self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
    self.alignment_layer = alignment_layer

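# The empty ``self.state`` dict above is filled in by the decoder's
# state-management helpers before decoding starts. A minimal sketch of how
# OpenNMT-py-style code typically initializes it; the method name and
# signature follow the OpenNMT-py convention and are an assumption here,
# not part of the snippet above:
def init_state(self, src, memory_bank, enc_hidden):
    """Initialize the decoder state before the first decoding step."""
    self.state["src"] = src
    self.state["cache"] = None
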
def __init__(self, layer_num, head_num, head_size, weights):
    super().__init__()
    self.layer_num = layer_num
    self.hidden_dim = head_num * head_size
    self.decoders = torch.nn.ModuleList()
    for i in range(layer_num):
        self.decoders.append(
            TransformerDecoderLayer(self.hidden_dim, head_num,
                                    4 * self.hidden_dim, 0, 0))
    for i in range(layer_num):
        self.decoders[i].layer_norm_1.weight.data = weights.w[i][0]
        self.decoders[i].layer_norm_1.bias.data = weights.w[i][1]
        self.decoders[i].self_attn.linear_query.weight.data = weights.w[i][2].transpose(-1, -2).contiguous()
        self.decoders[i].self_attn.linear_keys.weight.data = weights.w[i][3].transpose(-1, -2).contiguous()
        self.decoders[i].self_attn.linear_values.weight.data = weights.w[i][4].transpose(-1, -2).contiguous()
        self.decoders[i].self_attn.linear_query.bias.data = weights.w[i][5]
        self.decoders[i].self_attn.linear_keys.bias.data = weights.w[i][6]
        self.decoders[i].self_attn.linear_values.bias.data = weights.w[i][7]
        self.decoders[i].self_attn.final_linear.weight.data = weights.w[i][8].transpose(-1, -2).contiguous()
        self.decoders[i].self_attn.final_linear.bias.data = weights.w[i][9]
        self.decoders[i].layer_norm_2.weight.data = weights.w[i][10]
        self.decoders[i].layer_norm_2.bias.data = weights.w[i][11]
        self.decoders[i].context_attn.linear_query.weight.data = weights.w[i][12].transpose(-1, -2).contiguous()
        self.decoders[i].context_attn.linear_keys.weight.data = weights.w[i][13].transpose(-1, -2).contiguous()
        self.decoders[i].context_attn.linear_values.weight.data = weights.w[i][14].transpose(-1, -2).contiguous()
        self.decoders[i].context_attn.linear_query.bias.data = weights.w[i][15]
        self.decoders[i].context_attn.linear_keys.bias.data = weights.w[i][16]
        self.decoders[i].context_attn.linear_values.bias.data = weights.w[i][17]
        self.decoders[i].context_attn.final_linear.weight.data = weights.w[i][18].transpose(-1, -2).contiguous()
        self.decoders[i].context_attn.final_linear.bias.data = weights.w[i][19]
        self.decoders[i].feed_forward.layer_norm.weight.data = weights.w[i][20]
        self.decoders[i].feed_forward.layer_norm.bias.data = weights.w[i][21]
        self.decoders[i].feed_forward.w_1.weight.data = weights.w[i][22].transpose(-1, -2).contiguous()
        self.decoders[i].feed_forward.w_1.bias.data = weights.w[i][23]
        self.decoders[i].feed_forward.w_2.weight.data = weights.w[i][24].transpose(-1, -2).contiguous()
        self.decoders[i].feed_forward.w_2.bias.data = weights.w[i][25]

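# The loader above expects ``weights.w[i]`` to be a list of 26 tensors for
# layer ``i``, in the exact index order consumed above, with linear weights
# stored transposed relative to ``torch.nn.Linear`` (hence the
# ``transpose(-1, -2)`` calls). A minimal illustrative container satisfying
# that contract; the class name and the random initialization are assumptions
# for demonstration only:
import torch

class DecoderWeights:
    def __init__(self, layer_num, hidden_dim):
        h, ff = hidden_dim, 4 * hidden_dim
        self.w = []
        for _ in range(layer_num):
            self.w.append([
                torch.ones(h), torch.zeros(h),                             # 0-1:  layer_norm_1 weight / bias
                torch.randn(h, h), torch.randn(h, h), torch.randn(h, h),   # 2-4:  self_attn q/k/v weights (transposed)
                torch.zeros(h), torch.zeros(h), torch.zeros(h),            # 5-7:  self_attn q/k/v biases
                torch.randn(h, h), torch.zeros(h),                         # 8-9:  self_attn output weight / bias
                torch.ones(h), torch.zeros(h),                             # 10-11: layer_norm_2 weight / bias
                torch.randn(h, h), torch.randn(h, h), torch.randn(h, h),   # 12-14: context_attn q/k/v weights (transposed)
                torch.zeros(h), torch.zeros(h), torch.zeros(h),            # 15-17: context_attn q/k/v biases
                torch.randn(h, h), torch.zeros(h),                         # 18-19: context_attn output weight / bias
                torch.ones(h), torch.zeros(h),                             # 20-21: feed_forward layer_norm weight / bias
                torch.randn(h, ff), torch.zeros(ff),                       # 22-23: feed_forward w_1 weight (transposed) / bias
                torch.randn(ff, h), torch.zeros(h),                        # 24-25: feed_forward w_2 weight (transposed) / bias
            ])
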
def from_onmt(transformer_decoder_layer: OnmtTransformerDecoderLayer):
    params = {
        k: v
        for k, v in transformer_decoder_layer.named_parameters()
    }
    # for k, v in transformer_decoder_layer.named_parameters():
    #     print(k, v.size())
    # 12: self_attn.linear_keys.weight torch.Size([1024, 1024])
    # 12: self_attn.linear_keys.bias torch.Size([1024])
    # 12: self_attn.linear_values.weight torch.Size([1024, 1024])
    # 12: self_attn.linear_values.bias torch.Size([1024])
    # 12: self_attn.linear_query.weight torch.Size([1024, 1024])
    # 12: self_attn.linear_query.bias torch.Size([1024])
    # 12: self_attn.final_linear.weight torch.Size([1024, 1024])
    # 12: self_attn.final_linear.bias torch.Size([1024])
    # 12: context_attn.linear_keys.weight torch.Size([1024, 1024])
    # 12: context_attn.linear_keys.bias torch.Size([1024])
    # 12: context_attn.linear_values.weight torch.Size([1024, 1024])
    # 12: context_attn.linear_values.bias torch.Size([1024])
    # 12: context_attn.linear_query.weight torch.Size([1024, 1024])
    # 12: context_attn.linear_query.bias torch.Size([1024])
    # 12: context_attn.final_linear.weight torch.Size([1024, 1024])
    # 12: context_attn.final_linear.bias torch.Size([1024])
    # 12: feed_forward.w_1.weight torch.Size([1, 1024])
    # 12: feed_forward.w_1.bias torch.Size([1])
    # 12: feed_forward.w_2.weight torch.Size([1024, 1])
    # 12: feed_forward.w_2.bias torch.Size([1024])
    # 12: feed_forward.layer_norm.weight torch.Size([1024])
    # 12: feed_forward.layer_norm.bias torch.Size([1024])
    # 12: layer_norm_1.weight torch.Size([1024])
    # 12: layer_norm_1.bias torch.Size([1024])
    # 12: layer_norm_2.weight torch.Size([1024])
    # 12: layer_norm_2.bias torch.Size([1024])
    # 12: w_1.weight torch.Size([1, 1024])
    # 12: w_1.bias torch.Size([1])
    # 12: w_2.weight torch.Size([1024, 1])
    # 12: w_2.bias torch.Size([1024])
    # 12: layer_norm.weight torch.Size([1024])
    # 12: layer_norm.bias torch.Size([1024])
    self_attn = MultiHeadedAttention.from_onmt(
        transformer_decoder_layer.self_attn,
        transformer_decoder_layer.layer_norm_1)
    context_attn = MultiHeadedAttention.from_onmt(
        transformer_decoder_layer.context_attn,
        transformer_decoder_layer.layer_norm_2)
    feed_forward = PositionwiseFeedForward.from_onmt(
        transformer_decoder_layer.feed_forward)
    return TransformerDecoderLayer(self_attn, context_attn, feed_forward)

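# A minimal usage sketch for the converter above, mirroring how the test code
# later in this file builds a turbo layer from an ONMT layer. The helper name
# and the hyper-parameters are illustrative assumptions, not requirements of
# ``from_onmt``:
def build_turbo_decoder_layer_example():
    onmt_layer = OnmtTransformerDecoderLayer(
        d_model=1024, heads=8, d_ff=1024, dropout=0., attention_dropout=0.)
    onmt_layer.eval()  # inference mode, as in the tests below
    return TransformerDecoderLayer.from_onmt(onmt_layer)
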
def init_data(self, use_cuda):
    self.test_device = torch.device('cuda:0') if use_cuda else \
        torch.device('cpu:0')
    if not use_cuda:
        torch.set_num_threads(4)
        turbo_transformers.set_num_threads(4)

    torch.set_grad_enabled(False)
    self.model_dim = 1024

    self.onmt_decoder = TransformerDecoderLayer(d_model=self.model_dim,
                                                heads=8,
                                                d_ff=1024,
                                                dropout=0.,
                                                attention_dropout=0.)
    self.onmt_decoder.eval()
    if use_cuda:
        self.onmt_decoder.to(self.test_device)
    self.turbo_decoder = turbo_transformers.TransformerDecoderLayer.from_onmt(
        self.onmt_decoder)
    # https://pytorch.org/docs/stable/quantization.html
    if with_quantize_dynamic and not use_cuda:
        self.quantized_onmt_decoder = torch.quantization.quantize_dynamic(
            self.onmt_decoder)

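# Note on the dynamic quantization above: torch.quantization.quantize_dynamic
# rewrites the eligible submodules (by default the Linear and recurrent
# layers) to run with dynamically quantized int8 weights on CPU. A more
# explicit form of the same call -- a sketch of the common pattern, not
# something the test requires -- would be:
#
#     torch.quantization.quantize_dynamic(
#         self.onmt_decoder, {torch.nn.Linear}, dtype=torch.qint8)
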
def __init__(self, num_layers, d_model, heads, head_size, d_ff, copy_attn,
             self_attn_type, dropout, attention_dropout, embeddings,
             max_relative_positions, aan_useffn, full_context_alignment,
             alignment_layer, alignment_heads, args):
    super(TransformerDecoder, self).__init__()
    self.args = args
    if not self.args.model_type:
        raise ValueError("no model_type is supplied.")
    self.embeddings = embeddings

    # relevant to custom cache config
    # self.use_batch_major_op_cache = False
    # self.op_cache_dim_x = 1
    self.is_fp16 = True if self.args.data_type == 'fp16' else False
    self.use_batch_major_op_cache, self.op_cache_dim_x = get_op_cache_config(
        head_size, self.is_fp16)
    self.head_num = heads
    self.size_per_head = head_size

    # Decoder State
    self.state = {}

    self.transformer_layers = nn.ModuleList([
        TransformerDecoderLayer(
            d_model, heads, d_ff, dropout, attention_dropout,
            self_attn_type=self_attn_type,
            max_relative_positions=max_relative_positions,
            aan_useffn=aan_useffn,
            full_context_alignment=full_context_alignment,
            alignment_heads=alignment_heads)
        for i in range(num_layers)
    ])

    # previously, there was a GlobalAttention module here for copy
    # attention. But it was never actually used -- the "copy" attention
    # just reuses the context attention.
    self._copy = copy_attn
    self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
    self.alignment_layer = alignment_layer

def __init__(self, config: ModelConfig, data_config: DataConfig,
             encoder_embeddings):
    super().__init__()
    self.embeddings = Embedding(
        num_embeddings=data_config.output_translation_vocabulary_sizes[0][0],
        embedding_dim=config.encoder_output_size,
        padding_idx=pad_token_index)
    self.positional_encoding = PositionalEncoding(config.encoder_output_size)
    if config.decoder_translation_scale_embeddings:
        self.embeddings_scale = math.sqrt(float(config.encoder_output_size))
    else:
        self.embeddings_scale = None
    self.dropout = Dropout(config.decoder_translation_transformer_dropout)
    if config.decoder_translation_share_encoder_embeddings:
        assert (self.embeddings.weight.shape ==
                encoder_embeddings.get_lut_embeddings().weight.shape)
        self.embeddings.weight = encoder_embeddings.get_lut_embeddings().weight
    self.transformer_layers = ModuleList([
        TransformerDecoderLayer(
            d_model=config.encoder_output_size,
            heads=config.decoder_translation_transformer_heads,
            d_ff=config.decoder_translation_transformer_hidden_size,
            dropout=config.decoder_translation_transformer_dropout,
            attention_dropout=config.decoder_translation_transformer_dropout)
        for _ in range(config.decoder_translation_transformer_layers)
    ])
    self.layer_norm = LayerNorm(config.encoder_output_size, eps=1e-6)
    self.linear: Linear = Linear(
        in_features=config.encoder_output_size,
        out_features=data_config.output_translation_vocabulary_sizes[0][0])
    if config.decoder_translation_share_embeddings:
        self.linear.weight = self.embeddings.weight
    self.linear_features = None
    if data_config.output_translation_features > 1:
        self.linear_features = ModuleList([
            Linear(in_features=config.encoder_output_size,
                   out_features=data_config.output_translation_vocabulary_sizes[0][i])
            for i in range(1, data_config.output_translation_features)
        ])
    self.max_seq_out_len = 150
    self.beam_size = 1
    self.state = {}

class TestDecoder(unittest.TestCase):
    def init_data(self, use_cuda):
        self.test_device = torch.device('cuda:0') if use_cuda else \
            torch.device('cpu:0')
        if not use_cuda:
            torch.set_num_threads(4)
            turbo_transformers.set_num_threads(4)

        torch.set_grad_enabled(False)
        self.model_dim = 1024

        self.onmt_decoder = TransformerDecoderLayer(d_model=self.model_dim,
                                                    heads=8,
                                                    d_ff=1024,
                                                    dropout=0.,
                                                    attention_dropout=0.)
        self.onmt_decoder.eval()
        if use_cuda:
            self.onmt_decoder.to(self.test_device)
        self.turbo_decoder = turbo_transformers.TransformerDecoderLayer.from_onmt(
            self.onmt_decoder)
        # https://pytorch.org/docs/stable/quantization.html
        if with_quantize_dynamic and not use_cuda:
            self.quantized_onmt_decoder = torch.quantization.quantize_dynamic(
                self.onmt_decoder)

    def check_torch_and_turbo(self, use_cuda, num_iter=1):
        device_type = "GPU" if use_cuda else "CPU"
        info = f"\"({device_type}, {batch_size}, {src_length}, {T})\""
        step = 2
        self.init_data(use_cuda=use_cuda)
        self.inputs = torch.rand(batch_size,
                                 T,
                                 self.model_dim,
                                 dtype=torch.float32,
                                 device=self.test_device)
        self.memory_bank = torch.rand(batch_size,
                                      src_length,
                                      self.model_dim,
                                      dtype=torch.float32,
                                      device=self.test_device)
        self.src_pad_mask = torch.zeros(batch_size,
                                        1,
                                        src_length,
                                        dtype=torch.float32,
                                        device=self.test_device).bool()
        self.tgt_pad_mask = torch.zeros(batch_size,
                                        1,
                                        T,
                                        dtype=torch.float32,
                                        device=self.test_device).bool()

        onmt_model = lambda: self.onmt_decoder(self.inputs,
                                               self.memory_bank,
                                               self.src_pad_mask,
                                               self.tgt_pad_mask,
                                               layer_cache=None,
                                               step=step,
                                               future=False)

        onmt_result, torch_qps, torch_time_consume = \
            test_helper.run_model(onmt_model, use_cuda, num_iter)
        onmt_mid, attns, attn_align = onmt_result

        print(f"ONMT Decoder {info} ",
              f"{device_type} QPS, {torch_qps}, time, {torch_time_consume}")

        if with_quantize_dynamic and not use_cuda:
            quantized_onmt_model = lambda: self.quantized_onmt_decoder(
                self.inputs,
                self.memory_bank,
                self.src_pad_mask,
                self.tgt_pad_mask,
                layer_cache=None,
                step=step,
                future=False)

            quantized_onmt_result, quantized_torch_qps, quantized_torch_time_consume = \
                test_helper.run_model(quantized_onmt_model, use_cuda, num_iter)
            quantized_onmt_mid, quantized_attns, quantized_attn_align = quantized_onmt_result

            print(f"ONMT Quantized Decoder {info} ",
                  f"{device_type} QPS, {quantized_torch_qps}, time, {quantized_torch_time_consume}")

            # print(onmt_mid)
            # print(quantized_onmt_mid)
            # self.assertTrue(
            #     torch.max(torch.abs(onmt_mid - quantized_onmt_mid)) < (
            #         1e-3 if use_cuda else 1e-4))
            # self.assertTrue(
            #     torch.max(torch.abs(attns - quantized_attns)) < (
            #         1e-3 if use_cuda else 1e-4))

        turbo_model = lambda: self.turbo_decoder(self.inputs,
                                                 self.memory_bank,
                                                 self.src_pad_mask,
                                                 self.tgt_pad_mask,
                                                 layer_cache=None,
                                                 step=step,
                                                 future=False)

        with turbo_transformers.pref_guard(info) as perf:
            turbo_result, turbo_qps, turbo_time_consume = \
                test_helper.run_model(turbo_model, use_cuda, num_iter)
        turbo_mid, turbo_attns, _ = turbo_result

        print(f"Turbo Decoder {info} ",
              f"{device_type} QPS, {turbo_qps}, time, {turbo_time_consume}")

        self.assertTrue(
            torch.max(torch.abs(onmt_mid - turbo_mid)) < (
                1e-3 if use_cuda else 1e-4))
        self.assertTrue(
            torch.max(torch.abs(attns - turbo_attns)) < (
                1e-3 if use_cuda else 1e-4))

        if with_quantize_dynamic and not use_cuda:
            with open(fname, "a") as fh:
                fh.write(
                    f"{info} {torch_qps}, {quantized_torch_qps}, {turbo_qps}\n")
        else:
            with open(fname, "a") as fh:
                fh.write(f"{info} {torch_qps}, {turbo_qps}\n")

    def test_decoder(self):
        self.check_torch_and_turbo(use_cuda=False)
        if torch.cuda.is_available() and \
                turbo_transformers.config.is_compiled_with_cuda():
            self.check_torch_and_turbo(use_cuda=True)

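# The test class above relies on module-level names that are defined outside
# this snippet (``batch_size``, ``src_length``, ``T``, ``fname``,
# ``with_quantize_dynamic``, and the ``test_helper`` module). A minimal driver
# sketch -- the concrete values and output file name below are assumptions for
# illustration only:
if __name__ == '__main__':
    batch_size, src_length, T = 4, 40, 1   # one decoding step over T target tokens
    with_quantize_dynamic = False          # set True to also benchmark the int8 path on CPU
    fname = "decoder_benchmark.txt"        # per-run QPS numbers are appended here
    unittest.main()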