class TransformerDecoder(FairseqIncrementalDecoder): """ Transformer decoder consisting of *args.decoder_layers* layers. Each layer is a :class:`TransformerDecoderLayer`. Args: args (argparse.Namespace): parsed command-line arguments dictionary (~fairseq.data.Dictionary): decoding dictionary embed_tokens (torch.nn.Embedding): output embedding no_encoder_attn (bool, optional): whether to attend to encoder outputs (default: False). """ def __init__(self, args, dictionary, embed_tokens, decoder_embed_dim, decoder_attention_heads, decoder_ffn_embed_dim, decoder_output_dim, max_target_positions, decoder_layers, encoder_embed_dim=-1, no_encoder_attn=False, dict_size=-1): super().__init__(dictionary) self.register_buffer('version', torch.Tensor([3])) self.dropout = args.dropout self.share_input_output_embed = args.share_decoder_input_output_embed if dict_size != -1: input_embed_dim = decoder_embed_dim self.padding_idx = dict_size - 1 self.embed_tokens = nn.Embedding(dict_size, input_embed_dim, padding_idx=self.padding_idx, _weight=embed_tokens) else: input_embed_dim = embed_tokens.embedding_dim self.padding_idx = embed_tokens.padding_idx self.embed_tokens = embed_tokens embed_dim = decoder_embed_dim self.output_embed_dim = decoder_output_dim self.max_target_positions = max_target_positions self.embed_scale = math.sqrt(embed_dim) # todo: try with input_embed_dim self.project_in_dim = Linear(input_embed_dim, embed_dim, bias=False) if embed_dim != input_embed_dim else None self.embed_positions = PositionalEmbedding( max_target_positions, embed_dim, self.padding_idx, learned=args.decoder_learned_pos, ) if not args.no_token_positional_embeddings else None self.cross_self_attention = getattr(args, 'cross_self_attention', False) self.layer_wise_attention = getattr(args, 'layer_wise_attention', False) self.layers = nn.ModuleList([]) self.layers.extend([ TransformerDecoderLayer(args, decoder_embed_dim, decoder_attention_heads, decoder_ffn_embed_dim, encoder_embed_dim, no_encoder_attn,) for _ in range(decoder_layers) ]) self.adaptive_softmax = None self.project_out_dim = Linear(embed_dim, self.output_embed_dim, bias=False) \ if embed_dim != self.output_embed_dim and not args.tie_adaptive_weights else None if args.adaptive_softmax_cutoff is not None: self.adaptive_softmax = AdaptiveSoftmax( len(dictionary), self.output_embed_dim, options.eval_str_list(args.adaptive_softmax_cutoff, type=int), dropout=args.adaptive_softmax_dropout, adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None, factor=args.adaptive_softmax_factor, tie_proj=args.tie_adaptive_proj, ) elif not self.share_input_output_embed: self.embed_out = nn.Parameter(torch.Tensor(len(dictionary), self.output_embed_dim)) nn.init.normal_(self.embed_out, mean=0, std=self.output_embed_dim ** -0.5) if args.decoder_normalize_before and not getattr(args, 'no_decoder_final_norm', False): self.layer_norm = LayerNorm(embed_dim) else: self.layer_norm = None def forward( self, prev_output_tokens, encoder_out=None, incremental_state=None, features_only=False, **extra_args, ): """ Args: prev_output_tokens (LongTensor): previous decoder outputs of shape `(batch, tgt_len)`, for teacher forcing encoder_out (Tensor, optional): output from the encoder, used for encoder-side attention incremental_state (dict): dictionary used for storing state during :ref:`Incremental decoding` features_only (bool, optional): only return features without applying output layer (default: False). 
Returns: tuple: - the decoder's output of shape `(batch, tgt_len, vocab)` - a dictionary with any model-specific outputs """ x, extra = self.extract_features( prev_output_tokens, encoder_out, incremental_state, **extra_args, ) if not features_only: x = self.output_layer(x) return x, extra def extract_features( self, prev_output_tokens, encoder_out=None, incremental_state=None, full_context_alignment=False, alignment_layer=None, alignment_heads=None, **unused, ): """ Similar to *forward* but only return features. Includes several features from "Jointly Learning to Align and Translate with Transformer Models" (Garg et al., EMNLP 2019). Args: full_context_alignment (bool, optional): don't apply auto-regressive mask to self-attention (default: False). alignment_layer (int, optional): return mean alignment over heads at this layer (default: last layer). alignment_heads (int, optional): only average alignment over this many heads (default: all heads). Returns: tuple: - the decoder's features of shape `(batch, tgt_len, embed_dim)` - a dictionary with any model-specific outputs """ if alignment_layer is None: alignment_layer = len(self.layers) - 1 # embed positions positions = self.embed_positions( prev_output_tokens, incremental_state=incremental_state, ) if self.embed_positions is not None else None cur_tgt_pos = None if incremental_state is None else prev_output_tokens.size(1) if incremental_state is not None: prev_output_tokens = prev_output_tokens[:, -1:] if positions is not None: positions = positions[:, -1:] # embed tokens and positions x = self.embed_scale * self.embed_tokens(prev_output_tokens) if self.project_in_dim is not None: x = self.project_in_dim(x) if positions is not None: x += positions x = F.dropout(x, p=self.dropout, training=self.training) # B x T x C -> T x B x C x = x.transpose(0, 1) self_attn_padding_mask = prev_output_tokens.eq(self.padding_idx) self_attn_padding_mask[:, 0] = False if not self_attn_padding_mask.any() and not self.cross_self_attention: self_attn_padding_mask = None # decoder layers attn = None attns = [] inner_states = [x] global_vector = None for idx, layer in enumerate(self.layers): encoder_state = None if encoder_out is not None: if self.layer_wise_attention: encoder_state = encoder_out['encoder_states'][idx] else: encoder_state = encoder_out['encoder_out'] if 'global_quantize' in encoder_out: global_vector = encoder_out['global_quantize'] # 1 x B x C if incremental_state is None and not full_context_alignment: self_attn_mask = self.buffered_future_mask(x) else: self_attn_mask = None x, layer_attn = layer( x, encoder_state, encoder_out['encoder_padding_mask'] if encoder_out is not None else None, incremental_state, self_attn_mask=self_attn_mask, self_attn_padding_mask=self_attn_padding_mask, need_attn=(idx == alignment_layer or not self.training), need_head_weights=(idx == alignment_layer), cur_tgt_pos=cur_tgt_pos, global_vector=global_vector, ) inner_states.append(x) if layer_attn is not None and idx == alignment_layer: attn = layer_attn.float() attns.append(attn.mean(dim=0)) if attn is not None: if alignment_heads is not None: attn = attn[:alignment_heads] # average probabilities over heads attn = attn.mean(dim=0) if self.layer_norm: x = self.layer_norm(x) # T x B x C -> B x T x C x = x.transpose(0, 1) if self.project_out_dim is not None: x = self.project_out_dim(x) return x, {'attn': attns, 'inner_states': inner_states} def output_layer(self, features, **kwargs): """Project features to the vocabulary size.""" if self.adaptive_softmax is None: # 
project back to size of vocabulary if self.share_input_output_embed: return F.linear(features, self.embed_tokens.weight) else: return F.linear(features, self.embed_out) else: return features def max_positions(self): """Maximum output length supported by the decoder.""" if self.embed_positions is None: return self.max_target_positions return min(self.max_target_positions, self.embed_positions.max_positions()) def buffered_future_mask(self, tensor): dim = tensor.size(0) if ( not hasattr(self, '_future_mask') or self._future_mask is None or self._future_mask.device != tensor.device or self._future_mask.size(0) < dim ): self._future_mask = torch.triu(utils.fill_with_neg_inf(tensor.new(dim, dim)), 1) return self._future_mask[:dim, :dim] def upgrade_state_dict_named(self, state_dict, name): """Upgrade a (possibly old) state dict for new versions of fairseq.""" if isinstance(self.embed_positions, SinusoidalPositionalEmbedding): weights_key = '{}.embed_positions.weights'.format(name) if weights_key in state_dict: del state_dict[weights_key] state_dict['{}.embed_positions._float_tensor'.format(name)] = torch.FloatTensor(1) for i in range(len(self.layers)): # update layer norms layer_norm_map = { '0': 'self_attn_layer_norm', '1': 'encoder_attn_layer_norm', '2': 'final_layer_norm' } for old, new in layer_norm_map.items(): for m in ('weight', 'bias'): k = '{}.layers.{}.layer_norms.{}.{}'.format(name, i, old, m) if k in state_dict: state_dict['{}.layers.{}.{}.{}'.format(name, i, new, m)] = state_dict[k] del state_dict[k] version_key = '{}.version'.format(name) if utils.item(state_dict.get(version_key, torch.Tensor([1]))[0]) <= 2: # earlier checkpoints did not normalize after the stack of layers self.layer_norm = None self.normalize = False state_dict[version_key] = torch.Tensor([1]) return state_dict
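# --- Illustrative sketch (not from the repository): how the buffered causal mask used in
# buffered_future_mask() above works. The cached (dim x dim) matrix holds -inf strictly above the
# diagonal; added to the attention scores before softmax, it gives future positions zero weight.
import torch

def future_mask(dim):
    # -inf above the diagonal, 0 on and below it
    return torch.triu(torch.full((dim, dim), float('-inf')), diagonal=1)

# e.g. future_mask(4)[1] == tensor([0., 0., -inf, -inf]): step 1 may attend to steps 0 and 1 only.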
def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False): self.args = args super().__init__(dictionary) self.register_buffer("version", torch.Tensor([3])) self._future_mask = torch.empty(0) self.dropout = args.dropout self.decoder_layerdrop = args.decoder_layerdrop self.share_input_output_embed = args.share_decoder_input_output_embed input_embed_dim = embed_tokens.embedding_dim embed_dim = args.decoder_embed_dim self.embed_dim = embed_dim self.output_embed_dim = args.decoder_output_dim self.padding_idx = embed_tokens.padding_idx self.max_target_positions = args.max_target_positions self.embed_tokens = embed_tokens self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt( embed_dim) self.project_in_dim = (Linear(input_embed_dim, embed_dim, bias=False) if embed_dim != input_embed_dim else None) self.embed_positions = (PositionalEmbedding( args.max_target_positions, embed_dim, self.padding_idx, learned=args.decoder_learned_pos, ) if not args.no_token_positional_embeddings else None) self.cross_self_attention = getattr(args, "cross_self_attention", False) self.layer_wise_attention = getattr(args, "layer_wise_attention", False) self.layers = nn.ModuleList([]) self.layers.extend([ self.build_decoder_layer(args, no_encoder_attn) for _ in range(args.decoder_layers) ]) self.num_layers = len(self.layers) self.adaptive_softmax = None self.project_out_dim = (Linear( embed_dim, self.output_embed_dim, bias=False) if embed_dim != self.output_embed_dim and not args.tie_adaptive_weights else None) if args.adaptive_softmax_cutoff is not None: self.adaptive_softmax = AdaptiveSoftmax( len(dictionary), self.output_embed_dim, options.eval_str_list(args.adaptive_softmax_cutoff, type=int), dropout=args.adaptive_softmax_dropout, adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None, factor=args.adaptive_softmax_factor, tie_proj=args.tie_adaptive_proj, ) elif not self.share_input_output_embed: self.embed_out = nn.Parameter( torch.Tensor(len(dictionary), self.output_embed_dim)) nn.init.normal_(self.embed_out, mean=0, std=self.output_embed_dim**-0.5) if args.decoder_normalize_before and not getattr( args, "no_decoder_final_norm", False): self.layer_norm = LayerNorm(embed_dim) else: self.layer_norm = None if getattr(args, "layernorm_embedding", False): self.layernorm_embedding = LayerNorm(embed_dim) else: self.layernorm_embedding = None self.n_experts = 1 self.nhidlast = self.embed_dim self.ninp = self.embed_dim self.ntoken = 9744 self.prior = nn.Linear(self.nhidlast, self.n_experts, bias=False) self.latent = nn.Sequential( nn.Linear(self.nhidlast, self.n_experts * self.ninp), nn.Tanh())
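# --- Illustrative sketch (not from the repository): the `prior`/`latent` heads defined above follow
# the mixture-of-softmaxes (MoS) pattern. The matching forward pass is not shown in this excerpt;
# the helper below is a hedged guess at the standard MoS combination, using hypothetical names
# (`features`, `embed_weight`), not necessarily this decoder's exact output layer.
import torch
import torch.nn as nn
import torch.nn.functional as F

def mos_log_probs(features, prior, latent, embed_weight, n_experts):
    # features: (B, T, H); embed_weight: (V, E) shared output embedding
    bsz, tgt_len, _ = features.size()
    mixture = F.softmax(prior(features), dim=-1)                        # (B, T, K) expert weights
    experts = latent(features).view(bsz, tgt_len, n_experts, -1)        # (B, T, K, E)
    expert_probs = F.softmax(F.linear(experts, embed_weight), dim=-1)   # (B, T, K, V)
    probs = torch.einsum('btk,btkv->btv', mixture, expert_probs)        # mix the expert softmaxes
    return probs.clamp_min(1e-8).log()

# toy usage with made-up sizes
H, K, V = 8, 3, 20
prior = nn.Linear(H, K, bias=False)
latent = nn.Sequential(nn.Linear(H, K * H), nn.Tanh())
assert mos_log_probs(torch.randn(2, 5, H), prior, latent, torch.randn(V, H), K).shape == (2, 5, V)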
class TransformerEncoder(FairseqEncoder): def __init__(self, args, dictionary, embed_tokens): super().__init__(dictionary) self.register_buffer('version', torch.Tensor([3])) self.dropout = args.dropout self.encoder_layerdrop = args.encoder_layerdrop embed_dim = embed_tokens.embedding_dim self.padding_idx = embed_tokens.padding_idx self.max_source_positions = args.max_source_positions self.embed_tokens = embed_tokens self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(embed_dim) self.embed_positions = PositionalEmbedding( args.max_source_positions, embed_dim, self.padding_idx, learned=args.encoder_learned_pos, ) if not args.no_token_positional_embeddings else None self.layer_wise_attention = getattr(args, 'layer_wise_attention', False) self.layers = nn.ModuleList([]) self.layers.extend([ TransformerEncoderLayer(args) for i in range(args.encoder_layers) ]) if args.encoder_normalize_before: self.layer_norm = LayerNorm(embed_dim) else: self.layer_norm = None if getattr(args, 'layernorm_embedding', False): self.layernorm_embedding = LayerNorm(embed_dim) else: self.layernorm_embedding = None def forward_embedding(self, src_tokens): # embed tokens and positions x = embed = self.embed_scale * self.embed_tokens(src_tokens) if self.embed_positions is not None: x = embed + self.embed_positions(src_tokens) if self.layernorm_embedding: x = self.layernorm_embedding(x) x = F.dropout(x, p=self.dropout, training=self.training) return x, embed def forward(self, src_tokens, src_lengths, cls_input=None, return_all_hiddens=False, **unused): """ Args: src_tokens (LongTensor): tokens in the source language of shape `(batch, src_len)` src_lengths (torch.LongTensor): lengths of each source sentence of shape `(batch)` return_all_hiddens (bool, optional): also return all of the intermediate hidden states (default: False). Returns: namedtuple: - **encoder_out** (Tensor): the last encoder layer's output of shape `(src_len, batch, embed_dim)` - **encoder_padding_mask** (ByteTensor): the positions of padding elements of shape `(batch, src_len)` - **encoder_embedding** (Tensor): the (scaled) embedding lookup of shape `(batch, src_len, embed_dim)` - **encoder_states** (List[Tensor]): all intermediate hidden states of shape `(src_len, batch, embed_dim)`. Only populated if *return_all_hiddens* is True. """ if self.layer_wise_attention: return_all_hiddens = True x, encoder_embedding = self.forward_embedding(src_tokens) # B x T x C -> T x B x C x = x.transpose(0, 1) # compute padding mask encoder_padding_mask = src_tokens.eq(self.padding_idx) if not encoder_padding_mask.any(): encoder_padding_mask = None encoder_states = [] if return_all_hiddens else None # encoder layers for layer in self.layers: # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) dropout_probability = random.uniform(0, 1) if not self.training or (dropout_probability > self.encoder_layerdrop): x = layer(x, encoder_padding_mask) if return_all_hiddens: encoder_states.append(x) if self.layer_norm: x = self.layer_norm(x) if return_all_hiddens: encoder_states[-1] = x return EncoderOut( encoder_out=x, # T x B x C encoder_padding_mask=encoder_padding_mask, # B x T encoder_embedding=encoder_embedding, # B x T x C encoder_states=encoder_states, # List[T x B x C] ) def reorder_encoder_out(self, encoder_out, new_order): """ Reorder encoder output according to *new_order*. 
Args: encoder_out: output from the ``forward()`` method new_order (LongTensor): desired order Returns: *encoder_out* rearranged according to *new_order* """ if encoder_out.encoder_out is not None: encoder_out = encoder_out._replace( encoder_out=encoder_out.encoder_out.index_select(1, new_order) ) if encoder_out.encoder_padding_mask is not None: encoder_out = encoder_out._replace( encoder_padding_mask=encoder_out.encoder_padding_mask.index_select(0, new_order) ) if encoder_out.encoder_embedding is not None: encoder_out = encoder_out._replace( encoder_embedding=encoder_out.encoder_embedding.index_select(0, new_order) ) if encoder_out.encoder_states is not None: for idx, state in enumerate(encoder_out.encoder_states): encoder_out.encoder_states[idx] = state.index_select(1, new_order) return encoder_out def max_positions(self): """Maximum input length supported by the encoder.""" if self.embed_positions is None: return self.max_source_positions return min(self.max_source_positions, self.embed_positions.max_positions()) def buffered_future_mask(self, tensor): dim = tensor.size(0) if not hasattr(self, '_future_mask') or self._future_mask is None or self._future_mask.device != tensor.device: self._future_mask = torch.triu(utils.fill_with_neg_inf(tensor.new(dim, dim)), 1) if self._future_mask.size(0) < dim: self._future_mask = torch.triu(utils.fill_with_neg_inf(self._future_mask.resize_(dim, dim)), 1) return self._future_mask[:dim, :dim] def upgrade_state_dict_named(self, state_dict, name): """Upgrade a (possibly old) state dict for new versions of fairseq.""" if isinstance(self.embed_positions, SinusoidalPositionalEmbedding): weights_key = '{}.embed_positions.weights'.format(name) if weights_key in state_dict: print('deleting {0}'.format(weights_key)) del state_dict[weights_key] state_dict['{}.embed_positions._float_tensor'.format(name)] = torch.FloatTensor(1) for i in range(len(self.layers)): # update layer norms self.layers[i].upgrade_state_dict_named(state_dict, "{}.layers.{}".format(name, i)) version_key = '{}.version'.format(name) if utils.item(state_dict.get(version_key, torch.Tensor([1]))[0]) < 2: # earlier checkpoints did not normalize after the stack of layers self.layer_norm = None self.normalize = False state_dict[version_key] = torch.Tensor([1]) return state_dict
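# --- Illustrative sketch (not from the repository): the LayerDrop logic used in the encoder loop
# above (https://arxiv.org/abs/1909.11556). During training each layer is skipped independently
# with probability `layerdrop`; at inference every layer always runs.
import random

def run_layers_with_layerdrop(x, layers, layerdrop, training):
    for layer in layers:
        if training and random.uniform(0, 1) <= layerdrop:
            continue  # drop this layer for the current batch
        x = layer(x)
    return x

# e.g. run_layers_with_layerdrop(0.0, [lambda v: v + 1] * 4, layerdrop=0.5, training=True)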
class TransformerAvgEncoder(FairseqEncoder): """ Transformer encoder consisting of *args.encoder_layers* layers. Each layer is a :class:`TransformerEncoderLayer`. Args: args (argparse.Namespace): parsed command-line arguments dictionary (~fairseq.data.Dictionary): encoding dictionary embed_tokens (torch.nn.Embedding): input embedding """ def __init__(self, args, dictionary, embed_tokens): super().__init__(dictionary) self.register_buffer('version', torch.Tensor([3])) self.dropout = args.dropout embed_dim = embed_tokens.embedding_dim self.padding_idx = embed_tokens.padding_idx self.max_source_positions = args.max_source_positions self.embed_tokens = embed_tokens self.embed_scale = math.sqrt(embed_dim) self.embed_positions = PositionalEmbedding( args.max_source_positions, embed_dim, self.padding_idx, learned=args.encoder_learned_pos, ) if not args.no_token_positional_embeddings else None self.layers = nn.ModuleList([]) self.layers.extend([ TransformerEncoderLayer(args) for i in range(args.encoder_layers) ]) if args.encoder_normalize_before: self.layer_norm = LayerNorm(embed_dim) else: self.layer_norm = None #image section self.img_dim = 2048 self.text_dim = embed_dim self.L2norm = args.L2norm self.total_num_img = args.total_num_img self.per_num_img = args.per_num_img # cap2image_file = args.cap2image_file # image_embedding_file = args.image_embedding_file cap2image_file = getattr(args, "cap2image_file", "data/cap2image.pickle") image_embedding_file = getattr( args, "image_embedding_file", "features_resnet50/train-resnet50-avgpool.npy") self.cap2image = pickle.load(open(cap2image_file, "rb")) #cap_id to image_id #print("image embedding processing...") embeding_weights = np.load(image_embedding_file) img_vocab, img_dim = embeding_weights.shape embeddings_matrix = np.zeros((img_vocab + 1, img_dim)) embeddings_matrix[1:] = embeding_weights self.img_embeddings = nn.Embedding.from_pretrained( torch.FloatTensor(embeddings_matrix), freeze=args.image_emb_fix) # update embedding # self.img_embeddings.load_state_dict({'weight': embeddings_matrix}) # if args.image_emb_fix: # self.img_embeddings.weight.requires_grad = False self.merge_option = args.merge_option self.dense = nn.Linear(self.img_dim, self.text_dim) self.mergeImage = nn.Linear(self.total_num_img, 1) if self.merge_option == "att-mul-concat": self.proj_attention = SCAttention(self.text_dim, 128) self.dense2 = nn.Linear(self.text_dim, 384) elif self.merge_option == "att-concat": self.dense2 = nn.Linear(2 * self.text_dim, self.text_dim) elif self.merge_option == "att-gate": self.gate_type = args.gate_type self.proj_attention = SCAttention(self.text_dim, self.text_dim) if self.gate_type == "neural-gate": self.sigmoid = nn.Sigmoid() self.gate_dense = nn.Linear(2 * self.text_dim, self.text_dim) elif self.gate_type == "scalar-gate": self.sigmoid = nn.Sigmoid() self.gate_dense = nn.Linear(2 * self.text_dim, 1) else: self.image_weight = args.image_weight else: self.proj_attention = SCAttention(self.text_dim, self.text_dim) def forward(self, src_tokens, src_lengths): """ Args: src_tokens (LongTensor): tokens in the source language of shape `(batch, src_len)` src_lengths (torch.LongTensor): lengths of each source sentence of shape `(batch)` Returns: dict: - **encoder_out** (Tensor): the last encoder layer's output of shape `(src_len, batch, embed_dim)` - **encoder_padding_mask** (ByteTensor): the positions of padding elements of shape `(batch, src_len)` """ srl_tok_list = src_tokens.tolist() batch_image_ids = [] for batch_idx, sent in 
enumerate(srl_tok_list): # token2image image_ids = [] for cap in sent: if cap in self.cap2image: for id in self.cap2image[cap][:self.per_num_img]: if id != 0: image_ids.append(id) image_freq = Counter(image_ids) image_sort = sorted(image_freq.items(), key=lambda x: x[1], reverse=True) image_ids = [ item[0] for idx, item in enumerate(image_sort) if idx < self.total_num_img ] #image_ids = image_ids[:self.total_num_img] # Zero-pad up to the sequence length. padding_length = self.total_num_img - len(image_ids) image_ids = image_ids + ([0] * padding_length) batch_image_ids.append(image_ids) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") batch_image_ids = torch.LongTensor(batch_image_ids).to(device) #image embedding batch_size, num_img = batch_image_ids.size() #print(batch_image_ids[0]) image_padding_mask = batch_image_ids.eq(0) #print(image_padding_mask[0]) image_mask = ~image_padding_mask # print(image_mask[0]) # embed tokens and positions x = self.embed_scale * self.embed_tokens(src_tokens) if self.embed_positions is not None: x += self.embed_positions(src_tokens) x = F.dropout(x, p=self.dropout, training=self.training) # B x T x C -> T x B x C x = x.transpose(0, 1) # compute padding mask # print(src_tokens) encoder_padding_mask = src_tokens.eq(self.padding_idx) text_mask = ~encoder_padding_mask #print(encoder_padding_mask.size()) if not encoder_padding_mask.any(): encoder_padding_mask = None # encoder layers for layer in self.layers: x = layer(x, encoder_padding_mask) if self.layer_norm: x = self.layer_norm(x) # image_ids_flat = image_ids.view(batch_size, -1) image_ids_flat = batch_image_ids #batch_size x num_image image_embedding = self.img_embeddings(image_ids_flat) image_embedding = image_embedding.view( batch_size, num_img, self.img_dim) # batch_size, num_img, dim #L2 norm if self.L2norm == "true": image_embedding = F.normalize(image_embedding, p=2, dim=1) # attention on each local region text_repr = x.transpose(0, 1) # T x B x C -> batch_size, seq_len, dim image_repr = self.dense( image_embedding) # batch_size, num_img, image_dim - > text_dim if self.merge_option == "biatt": output = self.proj_attention(image_repr, image_mask, text_repr, text_mask) output = self.proj_attention(text_repr, text_mask, output, image_mask) #batch_size, seq_len, dim elif self.merge_option == "att": output = self.proj_attention(text_repr, text_mask, image_repr, image_mask) #batch_size, seq_len, dim elif self.merge_option == "att-sum": output = self.proj_attention( text_repr, text_mask, image_repr, image_mask) # batch_size, seq_len, dim output = text_repr + output #0.5, 1 elif self.merge_option == "att-gate": output = self.proj_attention( text_repr, text_mask, image_repr, image_mask) # batch_size, seq_len, dim if self.gate_type == "neural-gate": merge = torch.cat([text_repr, output], dim=-1) gate = self.sigmoid(self.gate_dense(merge)) output = (1 - gate) * text_repr + gate * output #print("neural-gate") elif self.gate_type == "scalar-gate": merge = torch.cat([text_repr, output], dim=-1) gate = self.sigmoid(self.gate_dense(merge)) output = (1 - gate) * text_repr + gate * output #print("scalar-gate") else: output = 1.0 * text_repr + self.image_weight * output else: output = None #print(output.size()) x = output.transpose(0, 1) # batch_size, seq_len, dim -> T x B x C #print(x) # print(x.size()) # print(encoder_padding_mask.size()) return { 'encoder_out': x, # T x B x C 'encoder_padding_mask': encoder_padding_mask, # B x T } def reorder_encoder_out(self, encoder_out, new_order): """ Reorder 
encoder output according to *new_order*. Args: encoder_out: output from the ``forward()`` method new_order (LongTensor): desired order Returns: *encoder_out* rearranged according to *new_order* """ if encoder_out['encoder_out'] is not None: encoder_out['encoder_out'] = \ encoder_out['encoder_out'].index_select(1, new_order) if encoder_out['encoder_padding_mask'] is not None: encoder_out['encoder_padding_mask'] = \ encoder_out['encoder_padding_mask'].index_select(0, new_order) return encoder_out def max_positions(self): """Maximum input length supported by the encoder.""" if self.embed_positions is None: return self.max_source_positions return min(self.max_source_positions, self.embed_positions.max_positions()) def upgrade_state_dict_named(self, state_dict, name): """Upgrade a (possibly old) state dict for new versions of fairseq.""" if isinstance(self.embed_positions, SinusoidalPositionalEmbedding): weights_key = '{}.embed_positions.weights'.format(name) if weights_key in state_dict: del state_dict[weights_key] state_dict['{}.embed_positions._float_tensor'.format( name)] = torch.FloatTensor(1) for i in range(len(self.layers)): # update layer norms self.layers[i].upgrade_state_dict_named( state_dict, "{}.layers.{}".format(name, i)) version_key = '{}.version'.format(name) if utils.item(state_dict.get(version_key, torch.Tensor([1]))[0]) < 2: # earlier checkpoints did not normalize after the stack of layers self.layer_norm = None self.normalize = False state_dict[version_key] = torch.Tensor([1]) return state_dict
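# --- Illustrative sketch (not from the repository): the "neural-gate" fusion used in
# TransformerAvgEncoder.forward(). A sigmoid gate computed from the concatenation of the text
# representation and the image-attended representation interpolates between the two; the
# "scalar-gate" variant is identical except gate_dense maps to a single scalar per position.
import torch
import torch.nn as nn

class GatedFusion(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.gate_dense = nn.Linear(2 * dim, dim)

    def forward(self, text_repr, image_attended):
        # both inputs: (batch, seq_len, dim)
        gate = torch.sigmoid(self.gate_dense(torch.cat([text_repr, image_attended], dim=-1)))
        return (1 - gate) * text_repr + gate * image_attended

# fusion = GatedFusion(512); out = fusion(torch.randn(2, 7, 512), torch.randn(2, 7, 512))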
class TransformerEncoder(FairseqEncoder): """ Transformer encoder consisting of *args.encoder_layers* layers. Each layer is a :class:`TransformerEncoderLayer`. Args: args (argparse.Namespace): parsed command-line arguments dictionary (~fairseq.data.Dictionary): encoding dictionary embed_tokens (torch.nn.Embedding): input embedding """ def __init__(self, args, dictionary, embed_tokens): super().__init__(dictionary) self.register_buffer('version', torch.Tensor([3])) self.dropout = args.dropout embed_dim = embed_tokens.embedding_dim self.padding_idx = embed_tokens.padding_idx self.max_source_positions = args.max_source_positions self.embed_tokens = embed_tokens self.embed_scale = math.sqrt(embed_dim) self.embed_positions = PositionalEmbedding( args.max_source_positions, embed_dim, self.padding_idx, learned=args.encoder_learned_pos, ) if not args.no_token_positional_embeddings else None self.layers = nn.ModuleList([]) self.layers.extend([ TransformerEncoderLayer(args) for i in range(args.encoder_layers) ]) if args.encoder_normalize_before: self.layer_norm = LayerNorm(embed_dim) else: self.layer_norm = None def forward(self, src_tokens, src_lengths, sent_id): """ Args: src_tokens (LongTensor): tokens in the source language of shape `(batch, src_len)` src_lengths (torch.LongTensor): lengths of each source sentence of shape `(batch)` Returns: dict: - **encoder_out** (Tensor): the last encoder layer's output of shape `(src_len, batch, embed_dim)` - **encoder_padding_mask** (ByteTensor): the positions of padding elements of shape `(batch, src_len)` """ # embed tokens and positions x = self.embed_scale * self.embed_tokens(src_tokens) if self.embed_positions is not None: x += self.embed_positions(src_tokens) x = F.dropout(x, p=self.dropout, training=self.training) # B x T x C -> T x B x C x = x.transpose(0, 1) # compute padding mask encoder_padding_mask = src_tokens.eq(self.padding_idx) if not encoder_padding_mask.any(): encoder_padding_mask = None # encoder layers for layer in self.layers: x = layer(x, encoder_padding_mask) if self.layer_norm: x = self.layer_norm(x) # print(x.size()) # print(encoder_padding_mask.size()) return { 'encoder_out': x, # T x B x C 'encoder_padding_mask': encoder_padding_mask, # B x T } def reorder_encoder_out(self, encoder_out, new_order): """ Reorder encoder output according to *new_order*. 
Args: encoder_out: output from the ``forward()`` method new_order (LongTensor): desired order Returns: *encoder_out* rearranged according to *new_order* """ if encoder_out['encoder_out'] is not None: encoder_out['encoder_out'] = \ encoder_out['encoder_out'].index_select(1, new_order) if encoder_out['encoder_padding_mask'] is not None: encoder_out['encoder_padding_mask'] = \ encoder_out['encoder_padding_mask'].index_select(0, new_order) return encoder_out def max_positions(self): """Maximum input length supported by the encoder.""" if self.embed_positions is None: return self.max_source_positions return min(self.max_source_positions, self.embed_positions.max_positions()) def upgrade_state_dict_named(self, state_dict, name): """Upgrade a (possibly old) state dict for new versions of fairseq.""" if isinstance(self.embed_positions, SinusoidalPositionalEmbedding): weights_key = '{}.embed_positions.weights'.format(name) if weights_key in state_dict: del state_dict[weights_key] state_dict['{}.embed_positions._float_tensor'.format( name)] = torch.FloatTensor(1) for i in range(len(self.layers)): # update layer norms self.layers[i].upgrade_state_dict_named( state_dict, "{}.layers.{}".format(name, i)) version_key = '{}.version'.format(name) if utils.item(state_dict.get(version_key, torch.Tensor([1]))[0]) < 2: # earlier checkpoints did not normalize after the stack of layers self.layer_norm = None self.normalize = False state_dict[version_key] = torch.Tensor([1]) return state_dict
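# --- Illustrative sketch (not from the repository): what reorder_encoder_out() is for. During beam
# search the batch dimension is expanded and reordered, so T x B x C tensors are indexed on dim 1
# and B x T masks on dim 0 with the same `new_order`.
import torch

enc = torch.randn(5, 2, 8)                  # T x B x C for a batch of 2 sentences
new_order = torch.tensor([0, 0, 1, 1])      # each sentence duplicated for 2 beams
assert enc.index_select(1, new_order).shape == (5, 4, 8)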
def __init__( self, padding_idx: int, vocab_size: int, num_encoder_layers: int = 6, embedding_dim: int = 768, ffn_embedding_dim: int = 3072, num_attention_heads: int = 8, dropout: float = 0.1, attention_dropout: float = 0.1, activation_dropout: float = 0.1, layerdrop: float = 0.0, max_seq_len: int = 256, num_segments: int = 2, use_position_embeddings: bool = True, offset_positions_by_padding: bool = True, encoder_normalize_before: bool = False, apply_bert_init: bool = False, activation_fn: str = "relu", learned_pos_embedding: bool = True, embed_scale: float = None, freeze_embeddings: bool = False, n_trans_layers_to_freeze: int = 0, export: bool = False, traceable: bool = False, q_noise: float = 0.0, qn_block_size: int = 8, ) -> None: super().__init__() self.padding_idx = padding_idx self.vocab_size = vocab_size self.dropout_module = FairseqDropout( dropout, module_name=self.__class__.__name__) self.layerdrop = layerdrop self.max_seq_len = max_seq_len self.embedding_dim = embedding_dim self.num_segments = num_segments self.use_position_embeddings = use_position_embeddings self.apply_bert_init = apply_bert_init self.learned_pos_embedding = learned_pos_embedding self.traceable = traceable self.tpu = False # whether we're on TPU self.embed_tokens = self.build_embedding(self.vocab_size, self.embedding_dim, self.padding_idx) self.embed_scale = embed_scale if q_noise > 0: self.quant_noise = apply_quant_noise_( nn.Linear(self.embedding_dim, self.embedding_dim, bias=False), q_noise, qn_block_size, ) else: self.quant_noise = None self.segment_embeddings = (nn.Embedding( self.num_segments, self.embedding_dim, padding_idx=None) if self.num_segments > 0 else None) self.embed_positions = ( PositionalEmbedding( 514, #self.max_seq_len, self.embedding_dim, padding_idx= None, #(self.padding_idx if offset_positions_by_padding else None), learned=self.learned_pos_embedding, ) if self.use_position_embeddings else None) if self.layerdrop > 0.0: self.layers = LayerDropModuleList(p=self.layerdrop) else: self.layers = nn.ModuleList([]) self.layers.extend([ self.build_transformer_sentence_encoder_layer( embedding_dim=self.embedding_dim, ffn_embedding_dim=ffn_embedding_dim, num_attention_heads=num_attention_heads, dropout=self.dropout_module.p, attention_dropout=attention_dropout, activation_dropout=activation_dropout, activation_fn=activation_fn, export=export, q_noise=q_noise, qn_block_size=qn_block_size, ) for _ in range(num_encoder_layers) ]) self.crossthought_layers = nn.ModuleList([]) self.crossthought_layers.extend([ self.build_transformer_sentence_encoder_layer( embedding_dim=self.embedding_dim, ffn_embedding_dim=ffn_embedding_dim, num_attention_heads=num_attention_heads, dropout=self.dropout_module.p, attention_dropout=attention_dropout, activation_dropout=activation_dropout, activation_fn=activation_fn, export=export, q_noise=q_noise, qn_block_size=qn_block_size, ) for _ in range(2) ]) self.pre_train = False if 'CTPRETRAIN' not in os.environ else os.environ[ 'CTPRETRAIN'] == 'True' self.short_seq_len = 64 self.sent_emb_num = 5 if encoder_normalize_before: self.emb_layer_norm = LayerNorm(self.embedding_dim, export=export) else: self.emb_layer_norm = None # Apply initialization of model params after building the model if self.apply_bert_init: self.apply(init_bert_params) def freeze_module_params(m): if m is not None: for p in m.parameters(): p.requires_grad = False if freeze_embeddings: freeze_module_params(self.embed_tokens) freeze_module_params(self.segment_embeddings) 
            freeze_module_params(self.embed_positions)
            freeze_module_params(self.emb_layer_norm)

        for layer in range(n_trans_layers_to_freeze):
            freeze_module_params(self.layers[layer])
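# --- Illustrative sketch (not from the repository): what freeze_module_params() achieves.
# Parameters with requires_grad=False accumulate no gradient, so optimizer steps leave them
# unchanged while the rest of the model keeps training.
import torch
import torch.nn as nn

emb = nn.Embedding(10, 4)
proj = nn.Linear(4, 2)
for p in emb.parameters():
    p.requires_grad = False                     # same effect as freeze_module_params(emb)

loss = proj(emb(torch.tensor([1, 2, 3]))).sum()
loss.backward()
assert emb.weight.grad is None                  # frozen: no gradient
assert proj.weight.grad is not None             # still trainable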
def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False, final_norm=True): super().__init__(dictionary) self.dropout = args.dropout self.share_input_output_embed = args.share_decoder_input_output_embed input_embed_dim = embed_tokens.embedding_dim embed_dim = args.decoder_embed_dim output_embed_dim = args.decoder_output_dim padding_idx = embed_tokens.padding_idx self.max_target_positions = args.max_target_positions self.embed_tokens = embed_tokens self.embed_scale = math.sqrt( embed_dim) # todo: try with input_embed_dim self.project_in_dim = (Linear(input_embed_dim, embed_dim, bias=False) if embed_dim != input_embed_dim else None) self.embed_positions = (PositionalEmbedding( args.max_target_positions, embed_dim, padding_idx, learned=args.decoder_learned_pos, ) if not args.no_token_positional_embeddings else None) self.layers = nn.ModuleList([]) self.layers.extend([ LightConvDecoderLayer(args, no_encoder_attn, kernel_size=args.decoder_kernel_size_list[i]) for i in range(args.decoder_layers) ]) self.adaptive_softmax = None self.project_out_dim = (Linear(embed_dim, output_embed_dim, bias=False) if embed_dim != output_embed_dim and not args.tie_adaptive_weights else None) if args.adaptive_softmax_cutoff is not None: self.adaptive_softmax = AdaptiveSoftmax( len(dictionary), output_embed_dim, options.eval_str_list(args.adaptive_softmax_cutoff, type=int), dropout=args.adaptive_softmax_dropout, adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None, factor=args.adaptive_softmax_factor, tie_proj=args.tie_adaptive_proj, ) elif not self.share_input_output_embed: self.embed_out = nn.Parameter( torch.Tensor(len(dictionary), output_embed_dim)) nn.init.normal_(self.embed_out, mean=0, std=output_embed_dim**-0.5) self.register_buffer("version", torch.Tensor([2])) self.normalize = args.decoder_normalize_before and final_norm if self.normalize: self.layer_norm = LayerNorm(embed_dim)
def __init__(self, args, dictionary, embed_tokens): super().__init__(dictionary) self.dropout = args.embed_dropout self.embed_tokens = embed_tokens self.embed_dim = embed_tokens.embedding_dim self.embed_scale = math.sqrt(self.embed_dim) self.padding_idx = embed_tokens.padding_idx self.max_target_positions = args.max_target_positions self.embed_positions = PositionalEmbedding( self.max_target_positions, self.embed_dim, self.padding_idx, learned=args.decoder_learned_pos, ) if not args.no_token_positional_embeddings else None self.decoder_layers = args.decoder_layers self.convlayers = nn.ModuleList([ DynamicConvDecoderLayer(args.decoder_embed_dim, args.decoder_embed_dim, args.decoder_attention_heads, args.decoder_kernel_size_list[i], input_dropout=args.conv_input_dropout, weight_dropout=args.conv_weight_dropout, dropout=args.conv_output_dropout) for i in range(self.decoder_layers) ]) self.attnlayers = nn.ModuleList([ AttentionDecoderLayer(args.decoder_embed_dim, args.decoder_attention_heads, self_attention=True, attention_dropout=args.attn_weight_dropout, dropout=args.attn_output_dropout) for _ in range(self.decoder_layers) ]) self.encattnlayers = nn.ModuleList([ AttentionDecoderLayer(args.decoder_embed_dim, args.decoder_attention_heads, self_attention=False, attention_dropout=args.attn_weight_dropout, dropout=args.attn_output_dropout) for _ in range(self.decoder_layers) ]) self.fflayers = nn.ModuleList([ FFLayer(args.decoder_embed_dim, args.decoder_ffn_embed_dim, relu_dropout=args.ff_relu_dropout, dropout=args.ff_output_dropout) for _ in range(self.decoder_layers) ]) self.ratios = nn.Parameter(torch.FloatTensor(self.decoder_layers, 1), requires_grad=True) self.ratios.data.fill_(0.5) # self.ratios = [nn.Parameter(torch.tensor([0.5])).cuda() for _ in range(6)] # self.ratios = [nn.Parameter(torch.FloatTensor(1), requires_grad=True).cuda() for _ in range(6)] # for ratio in self.ratios: # ratio.data.fill_(0.5) self.register_buffer('version', torch.Tensor([2])) self.normalize = args.decoder_normalize_before if self.normalize: self.layer_norm = LayerNorm(self.embed_dim) self.embed_out = Linear(self.embed_dim, len(dictionary))
def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False): super().__init__(dictionary) self.dropout = args.decoder_dropout self.share_input_output_embed = args.share_decoder_input_output_embed input_embed_dim = embed_tokens.embedding_dim embed_dim = args.decoder_embed_dim self.output_embed_dim = args.decoder_embed_dim args.encoder_embed_dim = embed_dim self.layerdrop = args.decoder_layerdrop padding_idx = embed_tokens.padding_idx self.max_target_positions = args.max_target_positions self.embed_tokens = embed_tokens self.embed_scale = math.sqrt(embed_dim) # todo: try with input_embed_dim self.project_in_dim = ( Linear(input_embed_dim, embed_dim, bias=False) if embed_dim != input_embed_dim else None ) self.embed_positions = ( PositionalEmbedding( args.max_target_positions, embed_dim, padding_idx, learned=args.decoder_learned_pos, ) if not args.no_token_positional_embeddings else None ) args = copy.deepcopy(args) args.dropout = args.decoder_dropout args.attention_dropout = args.decoder_attention_dropout args.activation_dropout = args.decoder_activation_dropout self.layers = nn.ModuleList([]) self.layers.extend( [ TransformerDecoderLayer(args, no_encoder_attn) for _ in range(args.decoder_layers) ] ) if not self.share_input_output_embed: self.embed_out = nn.Parameter( torch.Tensor(len(dictionary), self.output_embed_dim) ) nn.init.normal_(self.embed_out, mean=0, std=self.output_embed_dim ** -0.5) if args.decoder_normalize_before and not getattr( args, "no_decoder_final_norm", False ): self.layer_norm = LayerNorm(embed_dim) else: self.layer_norm = None
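# --- Illustrative sketch (not from the repository): the args-override pattern used in the __init__
# above. The decoder deep-copies the parsed Namespace and remaps its dropout fields so that
# TransformerDecoderLayer (which reads args.dropout etc.) picks up decoder-specific values without
# affecting the encoder's settings.
import argparse
import copy

args = argparse.Namespace(dropout=0.1, decoder_dropout=0.3)
dec_args = copy.deepcopy(args)
dec_args.dropout = dec_args.decoder_dropout
assert args.dropout == 0.1 and dec_args.dropout == 0.3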
class TransformerDecoder(FairseqIncrementalDecoder): """ Transformer decoder consisting of *args.decoder_layers* layers. Each layer is a :class:`TransformerDecoderLayer`. Args: args (argparse.Namespace): parsed command-line arguments dictionary (~fairseq.data.Dictionary): decoding dictionary embed_tokens (torch.nn.Embedding): output embedding no_encoder_attn (bool, optional): whether to attend to encoder outputs (default: False). final_norm (bool, optional): apply layer norm to the output of the final decoder layer (default: True). """ def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False, final_norm=True): super().__init__(dictionary) self.dropout = args.dropout self.share_input_output_embed = args.share_decoder_input_output_embed input_embed_dim = embed_tokens.embedding_dim embed_dim = args.decoder_embed_dim self.output_embed_dim = args.decoder_output_dim padding_idx = embed_tokens.padding_idx self.max_target_positions = args.max_target_positions self.embed_tokens = embed_tokens self.embed_scale = math.sqrt( embed_dim) # todo: try with input_embed_dim self.project_in_dim = Linear( input_embed_dim, embed_dim, bias=False) if embed_dim != input_embed_dim else None self.embed_positions = PositionalEmbedding( args.max_target_positions, embed_dim, padding_idx, learned=args.decoder_learned_pos, ) if not args.no_token_positional_embeddings else None self.layers = nn.ModuleList([]) self.layers.extend([ TransformerDecoderLayer(args, no_encoder_attn) for _ in range(args.decoder_layers) ]) self.adaptive_softmax = None self.project_out_dim = Linear(embed_dim, self.output_embed_dim, bias=False) \ if embed_dim != self.output_embed_dim and not args.tie_adaptive_weights else None if args.adaptive_softmax_cutoff is not None: self.adaptive_softmax = AdaptiveSoftmax( len(dictionary), self.output_embed_dim, options.eval_str_list(args.adaptive_softmax_cutoff, type=int), dropout=args.adaptive_softmax_dropout, adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None, factor=args.adaptive_softmax_factor, tie_proj=args.tie_adaptive_proj, ) elif not self.share_input_output_embed: self.embed_out = nn.Parameter( torch.Tensor(len(dictionary), self.output_embed_dim)) nn.init.normal_(self.embed_out, mean=0, std=self.output_embed_dim**-0.5) self.register_buffer('version', torch.Tensor([2])) self.normalize = args.decoder_normalize_before and final_norm if self.normalize: self.layer_norm = LayerNorm(embed_dim) self.onnx_trace = False self.decoder_max_order = args.decoder_max_order self.clamp_value = getattr(args, 'clamp_value', 0.01) self.gs_clamp = args.gs_clamp def set_perm_order(self, perm_order=0): assert isinstance(perm_order, int) and 0 <= perm_order <= 5 for layer in self.layers: layer.set_perm_order(perm_order) def forward(self, prev_output_tokens, encoder_out=None, incremental_state=None, **unused): """ Args: prev_output_tokens (LongTensor): previous decoder outputs of shape `(batch, tgt_len)`, for input feeding/teacher forcing encoder_out (Tensor, optional): output from the encoder, used for encoder-side attention incremental_state (dict): dictionary used for storing state during :ref:`Incremental decoding` Returns: tuple: - the decoder's output of shape `(batch, tgt_len, vocab)` - a dictionary with any model-specific outputs """ x, extra = self.extract_features(prev_output_tokens, encoder_out, incremental_state) x = self.output_layer(x, encoder_out) return x, extra def extract_features(self, prev_output_tokens, encoder_out=None, incremental_state=None, **unused): """ Similar to 
*forward* but only return features. Returns: tuple: - the decoder's features of shape `(batch, tgt_len, embed_dim)` - a dictionary with any model-specific outputs """ # embed positions positions = self.embed_positions( prev_output_tokens, incremental_state=incremental_state, ) if self.embed_positions is not None else None if incremental_state is not None: prev_output_tokens = prev_output_tokens[:, -1:] if positions is not None: positions = positions[:, -1:] # embed tokens and positions x = self.embed_scale * self.embed_tokens(prev_output_tokens) if self.project_in_dim is not None: x = self.project_in_dim(x) if positions is not None: x += positions x = F.dropout(x, p=self.dropout, training=self.training) # B x T x C -> T x B x C x = x.transpose(0, 1) attn = None inner_states = [x] # decoder layers for layer in self.layers: x, attn = layer( x, encoder_out['encoder_out'] if encoder_out is not None else None, encoder_out['encoder_padding_mask'] if encoder_out is not None else None, incremental_state, self_attn_mask=self.buffered_future_mask(x) if incremental_state is None else None, ) inner_states.append(x) if self.normalize: x = self.layer_norm(x) # T x B x C -> B x T x C x = x.transpose(0, 1) if self.project_out_dim is not None: x = self.project_out_dim(x) return x, {'attn': attn, 'inner_states': inner_states} def output_layer(self, features, encoder_out, **kwargs): """Project features to the vocabulary size.""" if self.adaptive_softmax is None: # project back to size of vocabulary if self.share_input_output_embed: return [ F.linear(features, self.embed_tokens.weight), encoder_out['encoder_pred_order'] ] else: return F.linear(features, self.embed_out) else: return features def max_positions(self): """Maximum output length supported by the decoder.""" if self.embed_positions is None: return self.max_target_positions return min(self.max_target_positions, self.embed_positions.max_positions()) def buffered_future_mask(self, tensor): dim = tensor.size(0) if not hasattr( self, '_future_mask' ) or self._future_mask is None or self._future_mask.device != tensor.device: self._future_mask = torch.triu( utils.fill_with_neg_inf(tensor.new(dim, dim)), 1) if self._future_mask.size(0) < dim: self._future_mask = torch.triu( utils.fill_with_neg_inf(self._future_mask.resize_(dim, dim)), 1) return self._future_mask[:dim, :dim] def upgrade_state_dict_named(self, state_dict, name): """Upgrade a (possibly old) state dict for new versions of fairseq.""" if isinstance(self.embed_positions, SinusoidalPositionalEmbedding): weights_key = '{}.embed_positions.weights'.format(name) if weights_key in state_dict: del state_dict[weights_key] state_dict['{}.embed_positions._float_tensor'.format( name)] = torch.FloatTensor(1) for i in range(len(self.layers)): # update layer norms layer_norm_map = { '0': 'self_attn_layer_norm', '1': 'encoder_attn_layer_norm', '2': 'final_layer_norm' } for old, new in layer_norm_map.items(): for m in ('weight', 'bias'): k = '{}.layers.{}.layer_norms.{}.{}'.format( name, i, old, m) if k in state_dict: state_dict['{}.layers.{}.{}.{}'.format( name, i, new, m)] = state_dict[k] del state_dict[k] if utils.item( state_dict.get('{}.version'.format(name), torch.Tensor( [1]))[0]) < 2: # earlier checkpoints did not normalize after the stack of layers self.layer_norm = None self.normalize = False state_dict['{}.version'.format(name)] = torch.Tensor([1]) return state_dict def get_normalized_probs(self, net_output, log_probs, sample, gs_tau=0.5, gs_hard=False): """Get normalized probabilities (or log probs) 
from a net's output.""" if hasattr(self, 'adaptive_softmax') and self.adaptive_softmax is not None: if sample is not None: assert 'target' in sample target = sample['target'] else: target = None out = self.adaptive_softmax.get_log_prob(net_output[0], target=target) return out.exp_() if not log_probs else out logits = net_output[0][0] orders = net_output[0][1] if log_probs: return (utils.log_softmax(logits, dim=-1, onnx_trace=self.onnx_trace), *self.gumbel_softmax( orders, gs_tau=gs_tau, gs_hard=gs_hard, dim=-1)) else: return (utils.softmax(logits, dim=-1, onnx_trace=self.onnx_trace), *self.gumbel_softmax( orders, gs_tau=gs_tau, gs_hard=gs_hard, dim=-1)) def gumbel_softmax(self, logits, gs_tau=0.5, gs_hard=False, dim=-1): if not gs_hard: prob = utils.softmax(logits, dim=-1, onnx_trace=self.onnx_trace) prob_clamp = torch.clamp( prob, self.clamp_value, 1. - (self.decoder_max_order - 1) * self.clamp_value) logprob = torch.log(prob_clamp if self.gs_clamp else prob) gs = F.gumbel_softmax( logprob, tau=gs_tau, hard=False, ) else: prob = utils.softmax(logits, dim=-1, onnx_trace=self.onnx_trace) prob_clamp = torch.clamp( prob, self.clamp_value, 1. - (self.decoder_max_order - 1) * self.clamp_value) max_idx = torch.argmax(logits, -1, keepdim=True) one_hot = logits.new_zeros(logits.size()) gs = one_hot.scatter(-1, max_idx, 1) return gs, prob, prob_clamp
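# --- Illustrative sketch (not from the repository): the Gumbel-softmax relaxation used in
# gumbel_softmax() above. With gs_hard=False the code draws a differentiable soft sample via
# F.gumbel_softmax; with gs_hard=True it instead takes a plain argmax one-hot of the logits.
import torch
import torch.nn.functional as F

logits = torch.randn(2, 6)                                # e.g. scores over 6 permutation orders
soft = F.gumbel_softmax(logits, tau=0.5, hard=False)      # rows sum to 1, gradients flow
hard = torch.zeros_like(logits).scatter(-1, logits.argmax(-1, keepdim=True), 1.0)
assert torch.allclose(soft.sum(-1), torch.ones(2))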
class HybridEncoder(FairseqEncoder): def __init__(self, args, dictionary, embed_tokens): super().__init__(dictionary) self.dropout = args.embed_dropout self.embed_tokens = embed_tokens self.embed_dim = embed_tokens.embedding_dim self.embed_scale = math.sqrt(self.embed_dim) self.padding_idx = embed_tokens.padding_idx self.max_source_positions = args.max_source_positions self.embed_positions = PositionalEmbedding( self.max_source_positions, self.embed_dim, self.padding_idx, learned=args.encoder_learned_pos, ) if not args.no_token_positional_embeddings else None self.encoder_layers = args.encoder_layers self.convlayers = nn.ModuleList([ DynamicConvEncoderLayer(args.encoder_embed_dim, args.encoder_embed_dim, args.encoder_attention_heads, args.encoder_kernel_size_list[i], input_dropout=args.conv_input_dropout, weight_dropout=args.conv_weight_dropout, dropout=args.conv_output_dropout) for i in range(self.encoder_layers) ]) self.attnlayers = nn.ModuleList([ AttentionEncoderLayer(args.encoder_embed_dim, args.encoder_attention_heads, self_attention=True, attention_dropout=args.attn_weight_dropout, dropout=args.attn_output_dropout) for _ in range(self.encoder_layers) ]) self.fflayers = nn.ModuleList([ FFLayer(args.encoder_embed_dim, args.encoder_ffn_embed_dim, relu_dropout=args.ff_relu_dropout, dropout=args.ff_output_dropout) for _ in range(self.encoder_layers) ]) self.ratios = nn.Parameter(torch.FloatTensor(self.encoder_layers, 1), requires_grad=True) self.ratios.data.fill_(0.5) # self.ratios = [nn.Parameter(torch.FloatTensor(1), requires_grad=True).cuda() for _ in range(7)] # for ratio in self.ratios: # ratio.data.fill_(0.5) self.register_buffer('version', torch.Tensor([2])) self.normalize = args.encoder_normalize_before if self.normalize: self.layer_norm = LayerNorm(self.embed_dim) def forward(self, src_tokens, **unused): x = self.embed_scale * self.embed_tokens(src_tokens) if self.embed_positions is not None: x += self.embed_positions(src_tokens) x = F.dropout(x, p=self.dropout, training=self.training) x = x.transpose(0, 1) encoder_padding_mask = src_tokens.eq(self.padding_idx) ### I want to keep the mask anyway # if not encoder_padding_mask.any(): # encoder_padding_mask = None encoder_states = [] for i in range(self.encoder_layers): x1, state1 = self.convlayers[i]( x, encoder_padding_mask=encoder_padding_mask) x2, state2 = self.attnlayers[i]( x, encoder_padding_mask=encoder_padding_mask) if state1 is not None: encoder_states.append(state1) if state2 is not None: encoder_states.append(state2) x = x1 * self.ratios[i] + x2 * (1 - self.ratios[i]) # x = 0.5*x1 + 0.5*x2 x, _ = self.fflayers[i](x, encoder_padding_mask=encoder_padding_mask) if self.normalize: x = self.layer_norm(x) return { 'encoder_x': x, 'encoder_padding_mask': encoder_padding_mask, 'encoder_lstm_states': self.get_lstm_states(encoder_states) } def construct_encoder_layer(self, gene): if gene['type'] == 'recurrent': return LSTMEncoderLayer(**gene['param']) elif gene['type'] == 'lightconv': return LightConvEncoderLayer(**gene['param']) elif gene['type'] == 'dynamicconv': return DynamicConvEncoderLayer(**gene['param']) elif gene['type'] == 'self-attention': return AttentionEncoderLayer(**gene['param'], self_attention=True) elif gene['type'] == 'ff': return FFLayer(**gene['param']) else: raise NotImplementedError('Unknown Decoder Gene Type!') def get_lstm_states(self, encoder_states): # only return the state of the topmost lstm layer final_state = None for state in encoder_states: if state is not None and "lstm_hidden_state" in 
state.keys(): final_state = state["lstm_hidden_state"] return final_state def reorder_encoder_out(self, encoder_out, new_order): if encoder_out['encoder_x'] is not None: encoder_out['encoder_x'] = \ encoder_out['encoder_x'].index_select(1, new_order) if encoder_out['encoder_padding_mask'] is not None: encoder_out['encoder_padding_mask'] = \ encoder_out['encoder_padding_mask'].index_select(0, new_order) if encoder_out['encoder_lstm_states'] is not None: hiddens, cells = encoder_out['encoder_lstm_states'] hiddens = hiddens.index_select(1, new_order) cells = cells.index_select(1, new_order) encoder_out['encoder_lstm_states'] = (hiddens, cells) return encoder_out def max_positions(self): if self.embed_positions is None: return self.max_source_positions return min(self.max_source_positions, self.embed_positions.max_positions())
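# --- Illustrative sketch (not from the repository): the learned per-layer interpolation used in
# HybridEncoder.forward(). Each layer mixes a convolution branch and a self-attention branch with a
# trainable scalar ratio (initialised to 0.5; note it is not explicitly constrained to [0, 1]).
import torch
import torch.nn as nn

ratios = nn.Parameter(torch.full((6, 1), 0.5))   # one ratio per layer
x1 = torch.randn(10, 4, 512)                     # conv branch output      (T x B x C)
x2 = torch.randn(10, 4, 512)                     # attention branch output (T x B x C)
layer = 0
mixed = x1 * ratios[layer] + x2 * (1 - ratios[layer])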
class LightConvDecoder(FairseqIncrementalDecoder): """ LightConv decoder consisting of *args.decoder_layers* layers. Each layer is a :class:`LightConvDecoderLayer`. Args: args (argparse.Namespace): parsed command-line arguments dictionary (~fairseq.data.Dictionary): decoding dictionary embed_tokens (torch.nn.Embedding): output embedding no_encoder_attn (bool, optional): whether to attend to encoder outputs. Default: ``False`` """ def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False, final_norm=True): super().__init__(dictionary) self.dropout = args.dropout self.share_input_output_embed = args.share_decoder_input_output_embed input_embed_dim = embed_tokens.embedding_dim embed_dim = args.decoder_embed_dim output_embed_dim = args.decoder_output_dim padding_idx = embed_tokens.padding_idx self.max_target_positions = args.max_target_positions self.embed_tokens = embed_tokens self.embed_scale = math.sqrt( embed_dim) # todo: try with input_embed_dim self.project_in_dim = Linear( input_embed_dim, embed_dim, bias=False) if embed_dim != input_embed_dim else None self.embed_positions = PositionalEmbedding( args.max_target_positions, embed_dim, padding_idx, learned=args.decoder_learned_pos, ) if not args.no_token_positional_embeddings else None self.layers = nn.ModuleList([]) self.layers.extend([ LightConvDecoderLayer(args, no_encoder_attn, kernel_size=args.decoder_kernel_size_list[i]) for i in range(args.decoder_layers) ]) self.adaptive_softmax = None self.project_out_dim = Linear(embed_dim, output_embed_dim, bias=False) \ if embed_dim != output_embed_dim and not args.tie_adaptive_weights else None if args.adaptive_softmax_cutoff is not None: self.adaptive_softmax = AdaptiveSoftmax( len(dictionary), output_embed_dim, options.eval_str_list(args.adaptive_softmax_cutoff, type=int), dropout=args.adaptive_softmax_dropout, adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None, factor=args.adaptive_softmax_factor, tie_proj=args.tie_adaptive_proj, ) elif not self.share_input_output_embed: self.embed_out = nn.Parameter( torch.Tensor(len(dictionary), output_embed_dim)) nn.init.normal_(self.embed_out, mean=0, std=output_embed_dim**-0.5) self.register_buffer('version', torch.Tensor([2])) self.normalize = args.decoder_normalize_before and final_norm if self.normalize: self.layer_norm = LayerNorm(embed_dim) def forward(self, prev_output_tokens, encoder_out=None, incremental_state=None): """ Args: prev_output_tokens (LongTensor): previous decoder outputs of shape `(batch, tgt_len)`, for teacher forcing encoder_out (Tensor, optional): output from the encoder, used for encoder-side attention incremental_state (dict): dictionary used for storing state during :ref:`Incremental decoding` Returns: tuple: - the last decoder layer's output of shape `(batch, tgt_len, vocab)` - the last decoder layer's attention weights of shape `(batch, tgt_len, src_len)` """ # embed positions positions = self.embed_positions( prev_output_tokens, incremental_state=incremental_state, ) if self.embed_positions is not None else None if incremental_state is not None: prev_output_tokens = prev_output_tokens[:, -1:] if positions is not None: positions = positions[:, -1:] # embed tokens and positions x = self.embed_scale * self.embed_tokens(prev_output_tokens) if self.project_in_dim is not None: x = self.project_in_dim(x) if positions is not None: x += positions x = F.dropout(x, p=self.dropout, training=self.training) # B x T x C -> T x B x C x = x.transpose(0, 1) attn = None inner_states = [x] # decoder layers for 
layer in self.layers: x, attn = layer( x, encoder_out['encoder_out'] if encoder_out is not None else None, encoder_out['encoder_padding_mask'] if encoder_out is not None else None, incremental_state, ) inner_states.append(x) if self.normalize: x = self.layer_norm(x) # T x B x C -> B x T x C x = x.transpose(0, 1) if self.project_out_dim is not None: x = self.project_out_dim(x) if self.adaptive_softmax is None: # project back to size of vocabulary if self.share_input_output_embed: x = F.linear(x, self.embed_tokens.weight) else: x = F.linear(x, self.embed_out) return x, {'attn': attn, 'inner_states': inner_states} def max_positions(self): """Maximum output length supported by the decoder.""" if self.embed_positions is None: return self.max_target_positions return min(self.max_target_positions, self.embed_positions.max_positions()) def buffered_future_mask(self, tensor): dim = tensor.size(0) if not hasattr( self, '_future_mask' ) or self._future_mask is None or self._future_mask.device != tensor.device: self._future_mask = torch.triu( utils.fill_with_neg_inf(tensor.new(dim, dim)), 1) if self._future_mask.size(0) < dim: self._future_mask = torch.triu( utils.fill_with_neg_inf(self._future_mask.resize_(dim, dim)), 1) return self._future_mask[:dim, :dim]
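# --- Illustrative sketch (not from the repository): the incremental-decoding slice used in
# forward() above. When an incremental_state dict is supplied, only the newest target token is
# embedded and pushed through the layers; previous keys/values are cached inside the attention
# modules keyed by that dict.
import torch

prev_output_tokens = torch.tensor([[2, 14, 7, 9]])    # full history so far (B x T)
incremental_state = {}                                 # non-None => step-by-step decoding
if incremental_state is not None:
    prev_output_tokens = prev_output_tokens[:, -1:]    # keep only the latest step
assert prev_output_tokens.tolist() == [[9]]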
class LightConvEncoder(FairseqEncoder): """ LightConv encoder consisting of *args.encoder_layers* layers. Each layer is a :class:`LightConvEncoderLayer`. Args: args (argparse.Namespace): parsed command-line arguments dictionary (~fairseq.data.Dictionary): encoding dictionary embed_tokens (torch.nn.Embedding): input embedding """ def __init__(self, args, dictionary, embed_tokens): super().__init__(dictionary) self.dropout = args.dropout embed_dim = embed_tokens.embedding_dim self.padding_idx = embed_tokens.padding_idx self.max_source_positions = args.max_source_positions self.embed_tokens = embed_tokens self.embed_scale = math.sqrt(embed_dim) self.embed_positions = PositionalEmbedding( args.max_source_positions, embed_dim, self.padding_idx, learned=args.encoder_learned_pos, ) if not args.no_token_positional_embeddings else None self.layers = nn.ModuleList([]) self.layers.extend([ LightConvEncoderLayer(args, kernel_size=args.encoder_kernel_size_list[i]) for i in range(args.encoder_layers) ]) self.register_buffer('version', torch.Tensor([2])) self.normalize = args.encoder_normalize_before if self.normalize: self.layer_norm = LayerNorm(embed_dim) def forward(self, src_tokens, **unused): """ Args: src_tokens (LongTensor): tokens in the source language of shape `(batch, src_len)` Returns: dict: - **encoder_out** (Tensor): the last encoder layer's output of shape `(src_len, batch, embed_dim)` - **encoder_padding_mask** (ByteTensor): the positions of padding elements of shape `(batch, src_len)` """ # embed tokens and positions x = self.embed_scale * self.embed_tokens(src_tokens) if self.embed_positions is not None: x += self.embed_positions(src_tokens) x = F.dropout(x, p=self.dropout, training=self.training) # B x T x C -> T x B x C x = x.transpose(0, 1) # compute padding mask encoder_padding_mask = src_tokens.eq(self.padding_idx) if not encoder_padding_mask.any(): encoder_padding_mask = None # encoder layers for layer in self.layers: x = layer(x, encoder_padding_mask) if self.normalize: x = self.layer_norm(x) return { 'encoder_out': x, # T x B x C 'encoder_padding_mask': encoder_padding_mask, # B x T } def reorder_encoder_out(self, encoder_out, new_order): """ Reorder encoder output according to *new_order*. Args: encoder_out: output from the ``forward()`` method new_order (LongTensor): desired order Returns: *encoder_out* rearranged according to *new_order* """ if encoder_out['encoder_out'] is not None: encoder_out['encoder_out'] = \ encoder_out['encoder_out'].index_select(1, new_order) if encoder_out['encoder_padding_mask'] is not None: encoder_out['encoder_padding_mask'] = \ encoder_out['encoder_padding_mask'].index_select(0, new_order) return encoder_out def max_positions(self): """Maximum input length supported by the encoder.""" if self.embed_positions is None: return self.max_source_positions return min(self.max_source_positions, self.embed_positions.max_positions())
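# --- Illustrative sketch (not from the repository): the padding-mask convention shared by the
# encoders in this file. True marks padding positions; when a batch contains no padding the mask
# is replaced by None so the attention layers can skip masking entirely.
import torch

padding_idx = 1
src_tokens = torch.tensor([[5, 6, 7], [5, 6, padding_idx]])
encoder_padding_mask = src_tokens.eq(padding_idx)      # B x T, True at pads
if not encoder_padding_mask.any():
    encoder_padding_mask = None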
class TransformerEncoder(FairseqEncoder): """ Transformer encoder consisting of *args.encoder_layers* layers. Each layer is a :class:`TransformerEncoderLayer`. Args: args (argparse.Namespace): parsed command-line arguments dictionary (~fairseq.data.Dictionary): encoding dictionary embed_tokens (torch.nn.Embedding): input embedding """ def __init__(self, args, dictionary, embed_tokens, max_source_positions, encoder_layers, encoder_embed_dim, encoder_attention_heads, encoder_ffn_embed_dim,): super().__init__(dictionary) self.register_buffer('version', torch.Tensor([3])) self.dropout = args.dropout if embed_tokens is None: self.padding_idx = 0 embed_dim = encoder_embed_dim else: self.padding_idx = embed_tokens.padding_idx embed_dim = embed_tokens.embedding_dim self.max_source_positions = max_source_positions self.embed_tokens = embed_tokens self.embed_scale = math.sqrt(embed_dim) self.embed_positions = PositionalEmbedding( max_source_positions, embed_dim, self.padding_idx, learned=args.encoder_learned_pos, ) if not args.no_token_positional_embeddings else None self.layer_wise_attention = getattr(args, 'layer_wise_attention', False) self.layers = nn.ModuleList([]) self.layers.extend([ TransformerEncoderLayer(encoder_embed_dim, encoder_attention_heads, encoder_ffn_embed_dim, args) for i in range(encoder_layers) ]) if args.encoder_normalize_before: self.layer_norm = LayerNorm(embed_dim) else: self.layer_norm = None self.use_seg_pos_emb = getattr(args, 'use_seg_pos_emb', 0) if self.use_seg_pos_emb: self.seg_pad_idx = 2 self.seg_pos_emb = Embedding(3, embed_dim, padding_idx=self.seg_pad_idx) def forward_embedding(self, src_tokens, seg_pos=-1): # embed tokens and positions embed = self.embed_scale * self.embed_tokens(src_tokens) if self.embed_positions is not None: x = embed + self.embed_positions(src_tokens) if self.use_seg_pos_emb: masked_src_tokens = src_tokens.masked_fill(src_tokens.ne(self.padding_idx), seg_pos) masked_src_tokens = masked_src_tokens.masked_fill(src_tokens.eq(self.padding_idx), self.seg_pad_idx) x = x + self.embed_scale * self.seg_pos_emb(masked_src_tokens) x = F.dropout(x, p=self.dropout, training=self.training) return x, embed def forward(self, src_tokens=None, cls_input=None, return_all_hiddens=False, src_encodings=None, encoder_padding_mask=None, attn_mask=None, auxilary_tokens=None): """ Args: src_tokens (LongTensor): tokens in the source language of shape `(batch, src_len)` src_lengths (torch.LongTensor): lengths of each source sentence of shape `(batch)` src_encodings (torch.FloatTensor): shape of `(T x B x C)` encoder_padding_mask (torch.Boolean): shape of '(B x T)', where paddings are True return_all_hiddens (bool, optional): also return all of the intermediate hidden states (default: False). Returns: dict: - **encoder_out** (Tensor): the last encoder layer's output of shape `(src_len, batch, embed_dim)` - **encoder_padding_mask** (ByteTensor): the positions of padding elements of shape `(batch, src_len)` - **encoder_states** (List[Tensor]): all intermediate hidden states of shape `(src_len, batch, embed_dim)`. Only populated if *return_all_hiddens* is True. 
""" if self.layer_wise_attention: return_all_hiddens = True if self.embed_tokens is not None: x, encoder_embedding = self.forward_embedding(src_tokens, seg_pos=0) aug_x, _ = self.forward_embedding(auxilary_tokens, seg_pos=1) x = torch.cat([x, aug_x], dim=1) # B x T x C -> T x B x C x = x.transpose(0, 1) src_tokens = torch.cat([src_tokens, auxilary_tokens], dim=1) # compute padding mask encoder_padding_mask = (src_tokens.eq(self.padding_idx) | src_tokens.eq(self.dictionary.bos_index)) else: assert encoder_padding_mask is not None src_tokens = encoder_padding_mask.long() encoder_embedding = None x = src_encodings if self.embed_positions is not None: x = x + self.embed_positions(src_tokens).transpose(0, 1) x = F.dropout(x, p=self.dropout, training=self.training) encoder_states = [] if return_all_hiddens else None # encoder layers for layer in self.layers: x = layer(x, encoder_padding_mask, attn_mask=attn_mask) if return_all_hiddens: encoder_states.append(x) if self.layer_norm: x = self.layer_norm(x) if return_all_hiddens: encoder_states[-1] = x return { 'encoder_out': x, # T x B x C 'encoder_padding_mask': encoder_padding_mask, # B x T 'encoder_embedding': encoder_embedding, # B x T x C 'encoder_states': encoder_states, # List[T x B x C] } def reorder_encoder_out(self, encoder_out, new_order): """ Reorder encoder output according to *new_order*. Args: encoder_out: output from the ``forward()`` method new_order (LongTensor): desired order Returns: *encoder_out* rearranged according to *new_order* """ if encoder_out['encoder_out'] is not None: encoder_out['encoder_out'] = \ encoder_out['encoder_out'].index_select(1, new_order) if encoder_out['encoder_padding_mask'] is not None: encoder_out['encoder_padding_mask'] = \ encoder_out['encoder_padding_mask'].index_select(0, new_order) if encoder_out.get('encoder_states', None) is not None: for idx, state in enumerate(encoder_out['encoder_states']): encoder_out['encoder_states'][idx] = state.index_select(1, new_order) return encoder_out def max_positions(self): """Maximum input length supported by the encoder.""" if self.embed_positions is None: return self.max_source_positions return min(self.max_source_positions, self.embed_positions.max_positions()) def buffered_future_mask(self, tensor): dim = tensor.size(0) if not hasattr(self, '_future_mask') or self._future_mask is None or self._future_mask.device != tensor.device: self._future_mask = torch.triu(utils.fill_with_neg_inf(tensor.new(dim, dim)), 1) if self._future_mask.size(0) < dim: self._future_mask = torch.triu(utils.fill_with_neg_inf(self._future_mask.resize_(dim, dim)), 1) return self._future_mask[:dim, :dim] def upgrade_state_dict_named(self, state_dict, name): """Upgrade a (possibly old) state dict for new versions of fairseq.""" if isinstance(self.embed_positions, SinusoidalPositionalEmbedding): weights_key = '{}.embed_positions.weights'.format(name) if weights_key in state_dict: del state_dict[weights_key] state_dict['{}.embed_positions._float_tensor'.format(name)] = torch.FloatTensor(1) for i in range(len(self.layers)): # update layer norms self.layers[i].upgrade_state_dict_named(state_dict, "{}.layers.{}".format(name, i)) version_key = '{}.version'.format(name) if utils.item(state_dict.get(version_key, torch.Tensor([1]))[0]) < 2: # earlier checkpoints did not normalize after the stack of layers self.layer_norm = None self.normalize = False state_dict[version_key] = torch.Tensor([1]) return state_dict
def __init__(self, args, dictionary, embed_tokens): super().__init__(dictionary) self.register_buffer("version", torch.Tensor([3])) self.dropout_module = FairseqDropout( args.dropout, module_name=self.__class__.__name__ ) self.encoder_layerdrop = args.encoder_layerdrop embed_dim = embed_tokens.embedding_dim self.padding_idx = embed_tokens.padding_idx self.max_source_positions = args.max_source_positions # creating constant positional encoding table with the max source positions pe = torch.zeros(self.max_source_positions, embed_dim) position = torch.arange(0, self.max_source_positions).unsqueeze(1) div_term = torch.exp((torch.arange(0, embed_dim, 2, dtype=torch.float) * -(math.log(10000.0) / embed_dim))) pe[:, 0::2] = torch.sin(position.float() * div_term) pe[:, 1::2] = torch.cos(position.float() * div_term) self.constant_positional_encoding = pe self.embed_tokens = embed_tokens self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(embed_dim) self.embed_positions = ( PositionalEmbedding( args.max_source_positions, embed_dim, self.padding_idx, learned=args.encoder_learned_pos, ) if not args.no_token_positional_embeddings else None ) if getattr(args, "layernorm_embedding", False): self.layernorm_embedding = LayerNorm(embed_dim) else: self.layernorm_embedding = None if not args.adaptive_input and args.quant_noise_pq > 0: self.quant_noise = apply_quant_noise_( nn.Linear(embed_dim, embed_dim, bias=False), args.quant_noise_pq, args.quant_noise_pq_block_size, ) else: self.quant_noise = None if self.encoder_layerdrop > 0.0: self.layers = LayerDropModuleList(p=self.encoder_layerdrop) else: self.layers = nn.ModuleList([]) self.layers.extend( [self.build_encoder_layer(args) for i in range(args.encoder_layers)] ) self.num_layers = len(self.layers) if args.encoder_normalize_before: self.layer_norm = LayerNorm(embed_dim) else: self.layer_norm = None self.positional_encoding = args.positional_encoding
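The constant table built above is the standard sinusoidal encoding, pe[pos, 2i] = sin(pos / 10000^(2i/d)) and pe[pos, 2i+1] = cos(pos / 10000^(2i/d)). A hedged, down-scaled sketch of the same construction (max_pos and d are toy values):

import math
import torch

max_pos, d = 6, 4
pe = torch.zeros(max_pos, d)
position = torch.arange(0, max_pos).unsqueeze(1).float()
div_term = torch.exp(torch.arange(0, d, 2, dtype=torch.float) * -(math.log(10000.0) / d))
pe[:, 0::2] = torch.sin(position * div_term)  # even channels
pe[:, 1::2] = torch.cos(position * div_term)  # odd channels
print(pe.shape)  # torch.Size([6, 4])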
def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False): super().__init__(dictionary) self.register_buffer("version", torch.Tensor([3])) self.dropout = args.dropout self.share_input_output_embed = args.share_decoder_input_output_embed input_embed_dim = embed_tokens.embedding_dim embed_dim = args.decoder_embed_dim self.output_embed_dim = args.decoder_output_dim self.padding_idx = embed_tokens.padding_idx self.max_target_positions = args.max_target_positions self.embed_tokens = embed_tokens self.embed_scale = math.sqrt( embed_dim) # todo: try with input_embed_dim self.project_in_dim = (Linear(input_embed_dim, embed_dim, bias=False) if embed_dim != input_embed_dim else None) self.embed_positions = (PositionalEmbedding( args.max_target_positions, embed_dim, self.padding_idx, learned=args.decoder_learned_pos, ) if not args.no_token_positional_embeddings else None) self.cross_self_attention = getattr(args, "cross_self_attention", False) self.layer_wise_attention = getattr(args, "layer_wise_attention", False) # We combine the q/k/v flows of decoder self-attention with the q flow # of encoder-decoder attention. So there are four flows in total. self.proj_bias_selfattn = nn.Parameter( torch.randn(4, args.decoder_layers, embed_dim)) # The remaining k/v flows of encoder-decoder attention. self.proj_bias_encattn_kv = nn.Parameter( torch.randn(2, args.decoder_layers, embed_dim)) multiplier = 2 max_seq_len = 255 #255 or 490 self.register_buffer("time_dt", torch.tensor(0.01)) self.register_buffer("max_seq_len", torch.tensor(max_seq_len)) #self.register_buffer("max_seq_len", torch.tensor(490)) #self.register_buffer("update_flow_every", torch.tensor(100)) self.register_buffer("update_flow_every", torch.tensor(400)) self.register_buffer("k_updates", torch.tensor(0)) self.register_buffer( "cached_bias_selfattn", torch.randn(max_seq_len, 4, args.decoder_layers, embed_dim)) self.register_buffer( "cached_bias_encattn", torch.randn(max_seq_len, 2, args.decoder_layers, embed_dim)) self.proj_flow_selfattn = flow_func_linear(args.decoder_layers, multiplier, embed_dim) self.proj_flow_encattn_kv = flow_func_linear(args.decoder_layers, multiplier, embed_dim) #self.proj_flow_selfattn = id_flow(args.decoder_layers, multiplier, embed_dim) #self.proj_flow_encattn_kv = id_flow( # args.decoder_layers, multiplier, embed_dim #) self.layers = nn.ModuleList([]) self.layers.extend([ FlowTransformerDecoderLayer(args, no_encoder_attn) for _ in range(args.decoder_layers) ]) self.adaptive_softmax = None self.project_out_dim = (Linear( embed_dim, self.output_embed_dim, bias=False) if embed_dim != self.output_embed_dim and not args.tie_adaptive_weights else None) if args.adaptive_softmax_cutoff is not None: self.adaptive_softmax = AdaptiveSoftmax( len(dictionary), self.output_embed_dim, options.eval_str_list(args.adaptive_softmax_cutoff, type=int), dropout=args.adaptive_softmax_dropout, adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None, factor=args.adaptive_softmax_factor, tie_proj=args.tie_adaptive_proj, ) elif not self.share_input_output_embed: self.embed_out = nn.Parameter( torch.Tensor(len(dictionary), self.output_embed_dim)) nn.init.normal_(self.embed_out, mean=0, std=self.output_embed_dim**-0.5) if args.decoder_normalize_before and not getattr( args, "no_decoder_final_norm", False): self.layer_norm = LayerNorm(embed_dim) else: self.layer_norm = None self.ode_args = { "method": "rk4", "options": { "step_size": self.time_dt.item() / 5.0 }, } self.reset_parameters()
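A hedged toy contrast of the two registration mechanisms used above: the projection biases are nn.Parameters and receive gradients, while the cached per-position biases are buffers, which follow .to() and state_dict but are not optimized. Shapes are toy values.

import torch
import torch.nn as nn

class Toy(nn.Module):
    def __init__(self, layers=2, dim=4, max_len=8):
        super().__init__()
        self.proj_bias = nn.Parameter(torch.randn(4, layers, dim))                  # trained
        self.register_buffer('cached_bias', torch.randn(max_len, 4, layers, dim))   # not trained

m = Toy()
print([n for n, _ in m.named_parameters()])  # ['proj_bias']
print([n for n, _ in m.named_buffers()])     # ['cached_bias']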
class TransformerDecoder(FairseqIncrementalDecoder): """ Transformer decoder consisting of *args.decoder_layers* layers. Each layer is a :class:`TransformerDecoderLayer`. Args: args (argparse.Namespace): parsed command-line arguments dictionary (~fairseq.data.Dictionary): decoding dictionary embed_tokens (torch.nn.Embedding): output embedding no_encoder_attn (bool, optional): whether to attend to encoder outputs (default: False). final_norm (bool, optional): apply layer norm to the output of the final decoder layer (default: True). """ def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False, final_norm=True): super().__init__(dictionary) self.dropout = args.dropout self.share_input_output_embed = args.share_decoder_input_output_embed input_embed_dim = embed_tokens.embedding_dim embed_dim = args.decoder_embed_dim output_embed_dim = args.decoder_output_dim padding_idx = embed_tokens.padding_idx self.max_target_positions = args.max_target_positions self.embed_tokens = embed_tokens self.embed_scale = math.sqrt( embed_dim) # todo: try with input_embed_dim self.project_in_dim = Linear( input_embed_dim, embed_dim, bias=False) if embed_dim != input_embed_dim else None self.embed_positions = PositionalEmbedding( args.max_target_positions, embed_dim, padding_idx, learned=args.decoder_learned_pos, ) if not args.no_token_positional_embeddings else None self.layers = nn.ModuleList([]) self.layers.extend([ TransformerDecoderLayer(args, no_encoder_attn) for _ in range(args.decoder_layers) ]) self.adaptive_softmax = None self.project_out_dim = Linear(embed_dim, output_embed_dim, bias=False) \ if embed_dim != output_embed_dim and not args.tie_adaptive_weights else None if args.adaptive_softmax_cutoff is not None: self.adaptive_softmax = AdaptiveSoftmax( len(dictionary), output_embed_dim, options.eval_str_list(args.adaptive_softmax_cutoff, type=int), dropout=args.adaptive_softmax_dropout, adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None, factor=args.adaptive_softmax_factor, tie_proj=args.tie_adaptive_proj, ) elif not self.share_input_output_embed: self.embed_out = nn.Parameter( torch.Tensor(len(dictionary), output_embed_dim)) nn.init.normal_(self.embed_out, mean=0, std=output_embed_dim**-0.5) self.register_buffer('version', torch.Tensor([2])) self.normalize = args.decoder_normalize_before and final_norm if self.normalize: self.layer_norm = LayerNorm(embed_dim) def forward(self, prev_output_tokens, encoder_out=None, incremental_state=None): """ Args: prev_output_tokens (LongTensor): previous decoder outputs of shape `(batch, tgt_len)`, for input feeding/teacher forcing encoder_out (Tensor, optional): output from the encoder, used for encoder-side attention incremental_state (dict): dictionary used for storing state during :ref:`Incremental decoding` Returns: tuple: - the last decoder layer's output of shape `(batch, tgt_len, vocab)` - the last decoder layer's attention weights of shape `(batch, tgt_len, src_len)` """ # embed positions positions = self.embed_positions( prev_output_tokens, incremental_state=incremental_state, ) if self.embed_positions is not None else None if incremental_state is not None: prev_output_tokens = prev_output_tokens[:, -1:] if positions is not None: positions = positions[:, -1:] # embed tokens and positions x = self.embed_scale * self.embed_tokens(prev_output_tokens) if self.project_in_dim is not None: x = self.project_in_dim(x) if positions is not None: x += positions x = F.dropout(x, p=self.dropout, training=self.training) # B x T x C -> T x B 
x C x = x.transpose(0, 1) attn = None inner_states = [x] # decoder layers for layer in self.layers: x, attn = layer( x, encoder_out['encoder_out'] if encoder_out is not None else None, encoder_out['encoder_padding_mask'] if encoder_out is not None else None, incremental_state, self_attn_mask=self.buffered_future_mask(x) if incremental_state is None else None, ) inner_states.append(x) if self.normalize: x = self.layer_norm(x) # T x B x C -> B x T x C x = x.transpose(0, 1) if self.project_out_dim is not None: x = self.project_out_dim(x) if self.adaptive_softmax is None: # project back to size of vocabulary if self.share_input_output_embed: x = F.linear(x, self.embed_tokens.weight) else: x = F.linear(x, self.embed_out) return x, {'attn': attn, 'inner_states': inner_states} def max_positions(self): """Maximum output length supported by the decoder.""" if self.embed_positions is None: return self.max_target_positions return min(self.max_target_positions, self.embed_positions.max_positions()) def buffered_future_mask(self, tensor): dim = tensor.size(0) if not hasattr( self, '_future_mask' ) or self._future_mask is None or self._future_mask.device != tensor.device: self._future_mask = torch.triu( utils.fill_with_neg_inf(tensor.new(dim, dim)), 1) if self._future_mask.size(0) < dim: self._future_mask = torch.triu( utils.fill_with_neg_inf(self._future_mask.resize_(dim, dim)), 1) return self._future_mask[:dim, :dim] def upgrade_state_dict_named(self, state_dict, name): """Upgrade a (possibly old) state dict for new versions of fairseq.""" if isinstance(self.embed_positions, SinusoidalPositionalEmbedding): weights_key = '{}.embed_positions.weights'.format(name) if weights_key in state_dict: del state_dict[weights_key] state_dict['{}.embed_positions._float_tensor'.format( name)] = torch.FloatTensor(1) for i in range(len(self.layers)): # update layer norms layer_norm_map = { '0': 'self_attn_layer_norm', '1': 'encoder_attn_layer_norm', '2': 'final_layer_norm' } for old, new in layer_norm_map.items(): for m in ('weight', 'bias'): k = '{}.layers.{}.layer_norms.{}.{}'.format( name, i, old, m) if k in state_dict: state_dict['{}.layers.{}.{}.{}'.format( name, i, new, m)] = state_dict[k] del state_dict[k] if utils.item( state_dict.get('{}.version'.format(name), torch.Tensor( [1]))[0]) < 2: # earlier checkpoints did not normalize after the stack of layers self.layer_norm = None self.normalize = False state_dict['{}.version'.format(name)] = torch.Tensor([1]) return state_dict
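A hedged toy run of the key migration performed in upgrade_state_dict_named above: old checkpoints stored the three per-layer norms under layer_norms.{0,1,2}, so those entries are moved to the new module names and the old keys deleted. Strings stand in for the real tensors.

layer_norm_map = {'0': 'self_attn_layer_norm', '1': 'encoder_attn_layer_norm', '2': 'final_layer_norm'}
state_dict = {
    'decoder.layers.0.layer_norms.0.weight': 'W',
    'decoder.layers.0.layer_norms.0.bias': 'b',
}
name, i = 'decoder', 0
for old, new in layer_norm_map.items():
    for m in ('weight', 'bias'):
        k = '{}.layers.{}.layer_norms.{}.{}'.format(name, i, old, m)
        if k in state_dict:
            state_dict['{}.layers.{}.{}.{}'.format(name, i, new, m)] = state_dict.pop(k)
print(sorted(state_dict))
# ['decoder.layers.0.self_attn_layer_norm.bias', 'decoder.layers.0.self_attn_layer_norm.weight']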
def __init__( self, padding_idx: int, vocab_size: int, num_encoder_layers: int = 6, embedding_dim: int = 768, ffn_embedding_dim: int = 3072, num_attention_heads: int = 8, dropout: float = 0.1, attention_dropout: float = 0.1, activation_dropout: float = 0.1, max_seq_len: int = 256, num_segments: int = 2, use_position_embeddings: bool = True, encoder_normalize_before: bool = False, use_bert_layer_norm: bool = False, use_gelu: bool = True, apply_bert_init: bool = False, learned_pos_embedding: bool = True, ) -> None: super().__init__() self.padding_idx = padding_idx self.vocab_size = vocab_size self.dropout = dropout self.max_seq_len = max_seq_len self.embedding_dim = embedding_dim self.num_segments = num_segments self.use_position_embeddings = use_position_embeddings self.apply_bert_init = apply_bert_init self.learned_pos_embedding = learned_pos_embedding self.embed_tokens = nn.Embedding(self.vocab_size, self.embedding_dim, self.padding_idx) self.segment_embeddings = (nn.Embedding( self.num_segments, self.embedding_dim, padding_idx=None) if self.num_segments > 0 else None) self.embed_positions = (PositionalEmbedding( self.max_seq_len, self.embedding_dim, self.padding_idx, learned=self.learned_pos_embedding, ) if self.use_position_embeddings else None) self.layers = nn.ModuleList([ TransformerSentenceEncoderLayer( embedding_dim=self.embedding_dim, ffn_embedding_dim=ffn_embedding_dim, num_attention_heads=num_attention_heads, dropout=self.dropout, attention_dropout=attention_dropout, activation_dropout=activation_dropout, encoder_normalize_before=encoder_normalize_before, use_bert_layer_norm=use_bert_layer_norm, use_gelu=use_gelu, ) for _ in range(num_encoder_layers) ]) # Apply initialization of model params after building the model if self.apply_bert_init: self.apply(init_bert_params)
def __init__( self, padding_idx: int, vocab_size: int, num_encoder_layers: int = 6, embedding_dim: int = 768, ffn_embedding_dim: int = 3072, num_attention_heads: int = 8, dropout: float = 0.1, attention_dropout: float = 0.1, activation_dropout: float = 0.1, layerdrop : float = 0.0, max_seq_len: int = 256, num_segments: int = 2, use_position_embeddings: bool = True, offset_positions_by_padding: bool = True, encoder_normalize_before: bool = False, apply_bert_init: bool = False, activation_fn: str = "relu", learned_pos_embedding: bool = True, embed_scale: float = None, freeze_embeddings: bool = False, n_trans_layers_to_freeze: int = 0, export: bool = False, traceable: bool = False, q_noise: float = 0.0, qn_block_size: int = 8, ) -> None: super().__init__() self.padding_idx = padding_idx self.vocab_size = vocab_size self.dropout = dropout self.layerdrop = layerdrop self.max_seq_len = max_seq_len self.embedding_dim = embedding_dim self.num_segments = num_segments self.use_position_embeddings = use_position_embeddings self.apply_bert_init = apply_bert_init self.learned_pos_embedding = learned_pos_embedding self.traceable = traceable self.embed_tokens = nn.Embedding( self.vocab_size, self.embedding_dim, self.padding_idx ) self.embed_scale = embed_scale if q_noise > 0: self.quant_noise = apply_quant_noise_( nn.Linear(self.embedding_dim, self.embedding_dim, bias=False), q_noise, qn_block_size, ) else: self.quant_noise = None self.segment_embeddings = ( nn.Embedding(self.num_segments, self.embedding_dim, padding_idx=None) if self.num_segments > 0 else None ) self.embed_positions = ( PositionalEmbedding( self.max_seq_len, self.embedding_dim, padding_idx=(self.padding_idx if offset_positions_by_padding else None), learned=self.learned_pos_embedding, ) if self.use_position_embeddings else None ) self.layers = nn.ModuleList( [ TransformerSentenceEncoderLayer( embedding_dim=self.embedding_dim, ffn_embedding_dim=ffn_embedding_dim, num_attention_heads=num_attention_heads, dropout=self.dropout, attention_dropout=attention_dropout, activation_dropout=activation_dropout, activation_fn=activation_fn, export=export, q_noise=q_noise, qn_block_size=qn_block_size ) for _ in range(num_encoder_layers) ] ) if encoder_normalize_before: self.emb_layer_norm = LayerNorm(self.embedding_dim, export=export) else: self.emb_layer_norm = None # Apply initialization of model params after building the model if self.apply_bert_init: self.apply(init_bert_params) def freeze_module_params(m): if m is not None: for p in m.parameters(): p.requires_grad = False if freeze_embeddings: freeze_module_params(self.embed_tokens) freeze_module_params(self.segment_embeddings) freeze_module_params(self.embed_positions) freeze_module_params(self.emb_layer_norm) for layer in range(n_trans_layers_to_freeze): freeze_module_params(self.layers[layer])
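A hedged sketch of the freeze_embeddings / n_trans_layers_to_freeze behaviour above: freeze_module_params simply turns off requires_grad, so the frozen modules keep their pretrained weights while the rest of the encoder trains.

import torch.nn as nn

def freeze_module_params(m):
    if m is not None:
        for p in m.parameters():
            p.requires_grad = False

emb = nn.Embedding(10, 4)
freeze_module_params(emb)
print(all(not p.requires_grad for p in emb.parameters()))  # True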
def __init__( self, cfg, dictionary, embed_tokens, no_encoder_attn=False, output_projection=None, ): self.cfg = cfg super().__init__(dictionary) self.register_buffer("version", torch.Tensor([3])) self._future_mask = torch.empty(0) self.dropout_module = FairseqDropout( cfg.dropout, module_name=module_name_fordropout(self.__class__.__name__) ) self.decoder_layerdrop = cfg.decoder.layerdrop self.share_input_output_embed = cfg.share_decoder_input_output_embed input_embed_dim = embed_tokens.embedding_dim embed_dim = cfg.decoder.embed_dim self.embed_dim = embed_dim self.output_embed_dim = cfg.decoder.output_dim self.padding_idx = embed_tokens.padding_idx self.max_target_positions = cfg.max_target_positions self.embed_tokens = embed_tokens self.embed_scale = 1.0 if cfg.no_scale_embedding else math.sqrt(embed_dim) if not cfg.adaptive_input and cfg.quant_noise.pq > 0: self.quant_noise = apply_quant_noise_( nn.Linear(embed_dim, embed_dim, bias=False), cfg.quant_noise.pq, cfg.quant_noise.pq_block_size, ) else: self.quant_noise = None self.project_in_dim = ( Linear(input_embed_dim, embed_dim, bias=False) if embed_dim != input_embed_dim else None ) self.embed_positions = ( PositionalEmbedding( self.max_target_positions, embed_dim, self.padding_idx, learned=cfg.decoder.learned_pos, ) if not cfg.no_token_positional_embeddings else None ) if cfg.layernorm_embedding: self.layernorm_embedding = LayerNorm(embed_dim, export=cfg.export) else: self.layernorm_embedding = None self.cross_self_attention = cfg.cross_self_attention if self.decoder_layerdrop > 0.0: self.layers = LayerDropModuleList(p=self.decoder_layerdrop) else: self.layers = nn.ModuleList([]) self.layers.extend( [ self.build_decoder_layer(cfg, no_encoder_attn) for _ in range(cfg.decoder.layers) ] ) self.num_layers = len(self.layers) if cfg.decoder.normalize_before and not cfg.no_decoder_final_norm: self.layer_norm = LayerNorm(embed_dim, export=cfg.export) else: self.layer_norm = None self.project_out_dim = ( Linear(embed_dim, self.output_embed_dim, bias=False) if embed_dim != self.output_embed_dim and not cfg.tie_adaptive_weights else None ) self.adaptive_softmax = None self.output_projection = output_projection if self.output_projection is None: self.build_output_projection(cfg, dictionary, embed_tokens)
def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False): self.args = args super().__init__(dictionary) self.register_buffer("version", torch.Tensor([3])) self._future_mask = torch.empty(0) self.dropout_module = FairseqDropout( args.dropout, module_name=self.__class__.__name__ ) self.decoder_layerdrop = args.decoder_layerdrop self.share_input_output_embed = args.share_decoder_input_output_embed input_embed_dim = embed_tokens.embedding_dim embed_dim = args.decoder_embed_dim self.embed_dim = embed_dim self.output_embed_dim = args.decoder_output_dim self.padding_idx = embed_tokens.padding_idx self.max_target_positions = args.max_target_positions self.embed_tokens = embed_tokens self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(embed_dim) if not args.adaptive_input and args.quant_noise_pq > 0: self.quant_noise = apply_quant_noise_( nn.Linear(embed_dim, embed_dim, bias=False), args.quant_noise_pq, args.quant_noise_pq_block_size, ) else: self.quant_noise = None self.project_in_dim = ( Linear(input_embed_dim, embed_dim, bias=False) if embed_dim != input_embed_dim else None ) self.embed_positions = ( PositionalEmbedding( self.max_target_positions, embed_dim, self.padding_idx, learned=args.decoder_learned_pos, ) if not args.no_token_positional_embeddings else None ) if getattr(args, "layernorm_embedding", False): self.layernorm_embedding = LayerNorm(embed_dim) else: self.layernorm_embedding = None self.cross_self_attention = getattr(args, "cross_self_attention", False) if self.decoder_layerdrop > 0.0: self.layers = LayerDropModuleList(p=self.decoder_layerdrop) else: self.layers = nn.ModuleList([]) self.layers.extend( [ self.build_decoder_layer(args, no_encoder_attn) for _ in range(args.decoder_layers) ] ) self.num_layers = len(self.layers) if args.decoder_normalize_before and not getattr( args, "no_decoder_final_norm", False ): self.layer_norm = LayerNorm(embed_dim) else: self.layer_norm = None self.project_out_dim = ( Linear(embed_dim, self.output_embed_dim, bias=False) if embed_dim != self.output_embed_dim and not args.tie_adaptive_weights else None ) self.adaptive_softmax = None self.output_projection = None if args.adaptive_softmax_cutoff is not None: self.adaptive_softmax = AdaptiveSoftmax( len(dictionary), self.output_embed_dim, utils.eval_str_list(args.adaptive_softmax_cutoff, type=int), dropout=args.adaptive_softmax_dropout, adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None, factor=args.adaptive_softmax_factor, tie_proj=args.tie_adaptive_proj, ) elif self.share_input_output_embed: self.output_projection = nn.Linear( self.embed_tokens.weight.shape[1], self.embed_tokens.weight.shape[0], bias=False, ) self.output_projection.weight = self.embed_tokens.weight else: self.output_projection = nn.Linear( self.output_embed_dim, len(dictionary), bias=False ) nn.init.normal_( self.output_projection.weight, mean=0, std=self.output_embed_dim ** -0.5 )
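A hedged sketch of the share_input_output_embed branch above: the output projection's weight is pointed at the embedding matrix itself, so the decoder's logits are dot products with the input embeddings and no extra parameters are added. Sizes are toy values.

import torch
import torch.nn as nn

vocab, dim = 100, 16
embed_tokens = nn.Embedding(vocab, dim)
output_projection = nn.Linear(dim, vocab, bias=False)
output_projection.weight = embed_tokens.weight  # shared storage, trained once
print(output_projection.weight.data_ptr() == embed_tokens.weight.data_ptr())  # True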
def __init__( self, padding_idx: int, vocab_size: int, num_encoder_layers: int = 12, embedding_dim: int = 768, ffn_embedding_dim: int = 3072, num_attention_heads: int = 12, dropout: float = 0.1, attention_dropout: float = 0.1, activation_dropout: float = 0.0, layerdrop: float = 0.0, max_seq_len: int = 512, num_segments: int = 0, use_position_embeddings: bool = True, offset_positions_by_padding: bool = True, encoder_normalize_before: bool = True, apply_bert_init: bool = True, activation_fn: str = "gelu", learned_pos_embedding: bool = True, add_bias_kv: bool = False, add_zero_attn: bool = False, embed_scale: float = None, freeze_embeddings: bool = False, n_trans_layers_to_freeze: int = 0, export: bool = False, traceable: bool = False, q_noise: float = 0.0, qn_block_size: int = 8, ): super().__init__() self.padding_idx = padding_idx self.vocab_size = vocab_size self.dropout = dropout self.layerdrop = layerdrop self.max_seq_len = max_seq_len self.embedding_dim = embedding_dim self.num_segments = num_segments self.use_position_embeddings = use_position_embeddings self.apply_bert_init = apply_bert_init self.learned_pos_embedding = learned_pos_embedding self.traceable = traceable self.num_encoder_layers = num_encoder_layers self.num_attention_heads = num_attention_heads self.embed_tokens = nn.Embedding(self.vocab_size, self.embedding_dim, self.padding_idx) self.dense = nn.Sequential( nn.Linear(embedding_dim, embedding_dim, bias=True), nn.Tanh()) self.layer_norm = LayerNorm(embedding_dim) self.dense_lm = nn.Linear(embedding_dim, embedding_dim) self.activation_fn_lm = gelu self.layer_norm_lm = LayerNorm(embedding_dim) self.embed_scale = embed_scale if q_noise > 0: self.quant_noise = apply_quant_noise_( nn.Linear(self.embedding_dim, self.embedding_dim, bias=False), q_noise, qn_block_size, ) else: self.quant_noise = None self.segment_embeddings = (nn.Embedding( self.num_segments, self.embedding_dim, padding_idx=None) if self.num_segments > 0 else None) self.embed_positions = ( PositionalEmbedding( self.max_seq_len, self.embedding_dim, padding_idx=( self.padding_idx if offset_positions_by_padding else None), #padding_idx=None, learned=self.learned_pos_embedding, ) if self.use_position_embeddings else None) self.layers = nn.ModuleList([ TransformerSentenceEncoderLayer( embedding_dim=self.embedding_dim, ffn_embedding_dim=ffn_embedding_dim, num_attention_heads=num_attention_heads, dropout=self.dropout, attention_dropout=attention_dropout, activation_dropout=activation_dropout, activation_fn=activation_fn, # add_bias_kv=add_bias_kv, # add_zero_attn=add_zero_attn, q_noise=q_noise, qn_block_size=qn_block_size, export=export, ) for _ in range(self.num_encoder_layers) ]) #self.roberta = torch.hub.load('pytorch/fairseq', load_model) # self.roberta = RobertaModel.from_pretrained('model/roberta.base/',checkpoint_file='model.pt') # self.roberta=RobertaModel() # print(self.roberta.encode('Hello world!')) #self.score = nn.Linear(embedding_dim*2, 1, bias=True) # self.score2 = nn.Sequential(nn.Linear(embedding_dim*2, 200, bias=True),nn.Tanh()) # self.score3 = nn.Linear(200, 1, bias=True) if encoder_normalize_before: self.emb_layer_norm = LayerNorm(self.embedding_dim, export=export) else: self.emb_layer_norm = None if self.apply_bert_init: self.apply(init_bert_params) def freeze_module_params(m): if m is not None: for p in m.parameters(): p.requires_grad = False if freeze_embeddings: freeze_module_params(self.embed_tokens) freeze_module_params(self.segment_embeddings) freeze_module_params(self.embed_positions) 
freeze_module_params(self.emb_layer_norm) for layer in range(n_trans_layers_to_freeze): freeze_module_params(self.layers[layer])
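A hedged sketch, purely an assumption since the composition happens outside this __init__, of how a BERT-style head such as dense_lm / gelu / layer_norm_lm above is typically chained before projecting back onto the vocabulary (tied here to embed_tokens for illustration).

import torch
import torch.nn as nn
import torch.nn.functional as F

dim, vocab = 16, 100
dense_lm = nn.Linear(dim, dim)
layer_norm_lm = nn.LayerNorm(dim)
embed_tokens = nn.Embedding(vocab, dim)

features = torch.randn(2, 7, dim)             # B x T x C
x = layer_norm_lm(F.gelu(dense_lm(features)))
logits = F.linear(x, embed_tokens.weight)     # project to vocabulary
print(logits.shape)  # torch.Size([2, 7, 100])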
def __init__(self, args, dictionary, embed_tokens): super().__init__(dictionary) self.register_buffer('version', torch.Tensor([3])) self.dropout = args.dropout embed_dim = embed_tokens.embedding_dim self.padding_idx = embed_tokens.padding_idx self.max_source_positions = args.max_source_positions self.embed_tokens = embed_tokens self.embed_scale = math.sqrt(embed_dim) self.embed_positions = PositionalEmbedding( args.max_source_positions, embed_dim, self.padding_idx, learned=args.encoder_learned_pos, ) if not args.no_token_positional_embeddings else None self.layers = nn.ModuleList([]) self.layers.extend([ TransformerEncoderLayer(args) for i in range(args.encoder_layers) ]) if args.encoder_normalize_before: self.layer_norm = LayerNorm(embed_dim) else: self.layer_norm = None #image section self.img_dim = 2048 self.text_dim = embed_dim self.L2norm = args.L2norm self.total_num_img = args.total_num_img self.per_num_img = args.per_num_img # cap2image_file = args.cap2image_file # image_embedding_file = args.image_embedding_file cap2image_file = getattr(args, "cap2image_file", "data/cap2image.pickle") image_embedding_file = getattr( args, "image_embedding_file", "features_resnet50/train-resnet50-avgpool.npy") self.cap2image = pickle.load(open(cap2image_file, "rb")) #cap_id to image_id #print("image embedding processing...") embeding_weights = np.load(image_embedding_file) img_vocab, img_dim = embeding_weights.shape embeddings_matrix = np.zeros((img_vocab + 1, img_dim)) embeddings_matrix[1:] = embeding_weights self.img_embeddings = nn.Embedding.from_pretrained( torch.FloatTensor(embeddings_matrix), freeze=args.image_emb_fix) # update embedding # self.img_embeddings.load_state_dict({'weight': embeddings_matrix}) # if args.image_emb_fix: # self.img_embeddings.weight.requires_grad = False self.merge_option = args.merge_option self.dense = nn.Linear(self.img_dim, self.text_dim) self.mergeImage = nn.Linear(self.total_num_img, 1) if self.merge_option == "att-mul-concat": self.proj_attention = SCAttention(self.text_dim, 128) self.dense2 = nn.Linear(self.text_dim, 384) elif self.merge_option == "att-concat": self.dense2 = nn.Linear(2 * self.text_dim, self.text_dim) elif self.merge_option == "att-gate": self.gate_type = args.gate_type self.proj_attention = SCAttention(self.text_dim, self.text_dim) if self.gate_type == "neural-gate": self.sigmoid = nn.Sigmoid() self.gate_dense = nn.Linear(2 * self.text_dim, self.text_dim) elif self.gate_type == "scalar-gate": self.sigmoid = nn.Sigmoid() self.gate_dense = nn.Linear(2 * self.text_dim, 1) else: self.image_weight = args.image_weight else: self.proj_attention = SCAttention(self.text_dim, self.text_dim)
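A hedged sketch, an assumption since the actual merge lives in the encoder's forward, of the "neural-gate" fusion parameterized above: a sigmoid gate over the concatenated text and image features decides, per position and channel, how much image signal to mix in.

import torch
import torch.nn as nn

dim = 8
gate_dense = nn.Linear(2 * dim, dim)
sigmoid = nn.Sigmoid()

text = torch.randn(2, 5, dim)   # B x T x C text features
image = torch.randn(2, 5, dim)  # attended image features, same shape
gate = sigmoid(gate_dense(torch.cat([text, image], dim=-1)))
fused = (1.0 - gate) * text + gate * image
print(fused.shape)  # torch.Size([2, 5, 8])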
def __init__(self, args, dictionary, embed_tokens, code_embed_tokens_init=None): super().__init__(args, dictionary, embed_tokens) self.register_buffer('version', torch.Tensor([3])) self.dropout = args.dropout embed_dim = embed_tokens.embedding_dim self.padding_idx = embed_tokens.padding_idx self.max_source_positions = args.max_source_positions self.embed_tokens = embed_tokens self.embed_scale = math.sqrt(embed_dim) self.embed_positions = PositionalEmbedding( args.max_source_positions, embed_dim, self.padding_idx, learned=args.encoder_learned_pos, ) if not args.no_token_positional_embeddings else None self.layer_wise_attention = getattr(args, 'layer_wise_attention', False) self.layers = nn.ModuleList([]) self.layers.extend([ TransformerEncoderLayer(args) for i in range(args.encoder_layers) ]) if args.encoder_normalize_before: self.layer_norm = LayerNorm(embed_dim) else: self.layer_norm = None if code_embed_tokens_init is not None: self.code_vocab_size, self.code_dim = code_embed_tokens_init.size( 0), code_embed_tokens_init.size(1) # if args.encode_code: # self.code_embed_tokens = nn.Parameter(code_embed_tokens_init, requires_grad=not args.fix_code_book) # else: # self.code_embed_tokens = nn.Parameter(torch.normal(mean=0, std=self.code_dim ** -0.5, # size=(self.code_vocab_size, self.code_dim)), requires_grad=True) if args.encode_code: self.code_embed_tokens = Embedding( self.code_vocab_size, self.code_dim, padding_idx=None, weight=code_embed_tokens_init) else: self.code_embed_tokens = Embedding(args.codebook_size, args.encoder_embed_dim, None) if args.fix_code_book: self.code_embed_tokens.weight.requires_grad = False else: self.code_embed_tokens.weight.requires_grad = True else: self.code_embed_tokens = None self.input_form = args.input_form self.context_form = args.context_form self.context_compress = args.context_compress self.use_seg_pos_emb = getattr(args, 'use_seg_pos_emb', 0) if self.use_seg_pos_emb: self.seg_pad_idx = 2 self.seg_pos_emb = Embedding(3, embed_dim, padding_idx=self.seg_pad_idx)
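A hedged sketch of how the use_seg_pos_emb indices are derived, mirroring the masked_fill logic in the encoder's forward_embedding earlier in this document: non-padding tokens take the segment id of their span, padding takes the dedicated seg_pad_idx. pad_idx=1 is an assumed value.

import torch

pad_idx, seg_pad_idx, seg_pos = 1, 2, 0
src_tokens = torch.tensor([[5, 7, 9, 1, 1]])  # 1 = padding
seg_ids = src_tokens.masked_fill(src_tokens.ne(pad_idx), seg_pos)
seg_ids = seg_ids.masked_fill(src_tokens.eq(pad_idx), seg_pad_idx)
print(seg_ids)  # tensor([[0, 0, 0, 2, 2]])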
def __init__( self, cfg: Wav2Vec2Seq2SeqConfig, dictionary, embed_tokens, no_encoder_attn=False, ): super().__init__(dictionary) self.dropout = cfg.decoder_dropout self.share_input_output_embed = cfg.share_decoder_input_output_embed input_embed_dim = embed_tokens.embedding_dim embed_dim = cfg.decoder_embed_dim self.output_embed_dim = cfg.decoder_embed_dim self.layerdrop = cfg.decoder_layerdrop padding_idx = embed_tokens.padding_idx self.max_target_positions = cfg.max_target_positions self.embed_tokens = embed_tokens self.embed_scale = math.sqrt( embed_dim) # todo: try with input_embed_dim self.project_in_dim = (Linear(input_embed_dim, embed_dim, bias=False) if embed_dim != input_embed_dim else None) self.embed_positions = (PositionalEmbedding( cfg.max_target_positions, embed_dim, padding_idx, learned=cfg.decoder_learned_pos, ) if not cfg.no_token_positional_embeddings else None) # TODO: update this when transformer gets converted to dataclass configs transformer_cfg = copy.deepcopy(cfg) with open_dict(transformer_cfg): transformer_cfg.dropout = transformer_cfg.decoder_dropout transformer_cfg.attention_dropout = ( transformer_cfg.decoder_attention_dropout) transformer_cfg.activation_dropout = ( transformer_cfg.decoder_activation_dropout) self.layers = nn.ModuleList([]) self.layers.extend([ TransformerDecoderLayer(transformer_cfg, no_encoder_attn) for _ in range(transformer_cfg.decoder_layers) ]) if not self.share_input_output_embed: self.embed_out = nn.Parameter( torch.Tensor(len(dictionary), self.output_embed_dim)) nn.init.normal_(self.embed_out, mean=0, std=self.output_embed_dim**-0.5) if transformer_cfg.decoder_normalize_before: self.layer_norm = LayerNorm(embed_dim) else: self.layer_norm = None
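A hedged sketch of the open_dict copy above: the decoder-specific dropout fields are remapped onto the generic names that TransformerDecoderLayer reads, on a deep copy so the original cfg is untouched (open_dict lifts struct mode if it is set). Field values are toy numbers.

import copy
from omegaconf import OmegaConf, open_dict

cfg = OmegaConf.create({'decoder_dropout': 0.1, 'decoder_attention_dropout': 0.0})
transformer_cfg = copy.deepcopy(cfg)
with open_dict(transformer_cfg):
    transformer_cfg.dropout = transformer_cfg.decoder_dropout
    transformer_cfg.attention_dropout = transformer_cfg.decoder_attention_dropout
print(transformer_cfg.dropout, transformer_cfg.attention_dropout)  # 0.1 0.0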
def __init__(self, args, dictionary, embed_tokens, mem_len=None, ext_len=None, left_pad=False, final_norm=True): super().__init__(dictionary) self.dropout = args.dropout self.share_input_output_embed = args.share_decoder_input_output_embed self.kernel_size_list = args.kernel_size_list input_embed_dim = embed_tokens.embedding_dim embed_dim = args.decoder_embed_dim output_embed_dim = args.decoder_output_dim padding_idx = embed_tokens.padding_idx self.max_target_positions = args.max_target_positions self.embed_tokens = embed_tokens self.embed_scale = math.sqrt(embed_dim) # if the input embed dim coming from encoder is (Christine) self.project_in_dim = Linear( input_embed_dim, embed_dim, bias=False) if embed_dim != input_embed_dim else None self.mem_len = mem_len self.ext_len = 0 #Carlos 25-2-2020 ''' self.embed_positions = PositionalEmbedding( args.max_target_positions, embed_dim, padding_idx, learned=args.decoder_learned_pos, ) if not args.no_token_positional_embeddings else None ''' self.embed_language = LanguageEmbedding( embed_dim) if args.language_embeddings else None #Christine 6-3-2020 self.n_dec_layers = args.decoder_layers self.device = torch.device( "cuda" if torch.cuda.is_available() else "cpu") # self.layers = nn.ModuleList([]) self.layers.extend([ #we removed all the parameters , eplace it by args and then in other forward functions, we should take care of this #Carlos 25-2-2020 RelPartialLearnableDecoderLayer(args) for _ in range(self.n_dec_layers) ]) self.project_out_dim = Linear(embed_dim, output_embed_dim, bias=False) \ if embed_dim != output_embed_dim and not args.tie_adaptive_weights else None #???? (Christine) if not self.share_input_output_embed: self.embed_out = nn.Parameter( torch.Tensor(len(dictionary), output_embed_dim)) nn.init.normal_(self.embed_out, mean=0, std=output_embed_dim**-0.5) self.register_buffer('version', torch.Tensor([2])) self.normalize = args.decoder_normalize_before and final_norm if self.normalize: self.layer_norm = LayerNorm(embed_dim) # initializing these embeddings for the relative embeddings # we take these from args by Carlos self.pos_emb = PositionalEmbedding(embed_dim) #change parameters n_heads = args.decoder_attention_heads d_heads = int(embed_dim / n_heads) self.r_w_bias = nn.Parameter(torch.Tensor(n_heads, d_heads)) self.r_r_bias = nn.Parameter(torch.Tensor(n_heads, d_heads)) dropout = args.attention_dropout self.drop = nn.Dropout(dropout)
def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False): super().__init__(dictionary) self.register_buffer('version', torch.Tensor([3])) self.dropout = args.dropout self.decoder_layerdrop = args.decoder_layerdrop self.share_input_output_embed = args.share_decoder_input_output_embed input_embed_dim = embed_tokens.embedding_dim embed_dim = args.decoder_embed_dim self.output_embed_dim = args.decoder_output_dim self.padding_idx = embed_tokens.padding_idx self.max_target_positions = args.max_target_positions self.embed_tokens = embed_tokens self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(embed_dim) self.project_in_dim = Linear(input_embed_dim, embed_dim, bias=False) if embed_dim != input_embed_dim else None self.embed_positions = PositionalEmbedding( args.max_target_positions, embed_dim, self.padding_idx, learned=args.decoder_learned_pos, ) if not args.no_token_positional_embeddings else None self.cross_self_attention = getattr(args, 'cross_self_attention', False) self.layer_wise_attention = getattr(args, 'layer_wise_attention', False) self.layers = nn.ModuleList([]) self.layers.extend([ TransformerDecoderLayer(args, no_encoder_attn) for _ in range(args.decoder_layers) ]) self.adaptive_softmax = None self.project_out_dim = Linear(embed_dim, self.output_embed_dim, bias=False) \ if embed_dim != self.output_embed_dim and not args.tie_adaptive_weights else None if args.adaptive_softmax_cutoff is not None: self.adaptive_softmax = AdaptiveSoftmax( len(dictionary), self.output_embed_dim, options.eval_str_list(args.adaptive_softmax_cutoff, type=int), dropout=args.adaptive_softmax_dropout, adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None, factor=args.adaptive_softmax_factor, tie_proj=args.tie_adaptive_proj, ) elif not self.share_input_output_embed: self.embed_out = nn.Parameter(torch.Tensor(len(dictionary), self.output_embed_dim)) nn.init.normal_(self.embed_out, mean=0, std=self.output_embed_dim ** -0.5) if args.decoder_normalize_before and not getattr(args, 'no_decoder_final_norm', False): self.layer_norm = LayerNorm(embed_dim) else: self.layer_norm = None if getattr(args, 'layernorm_embedding', False): self.layernorm_embedding = LayerNorm(embed_dim) else: self.layernorm_embedding = None
class GraphTransformerEncoder(FairseqEncoder): """ Transformer encoder consisting of *args.encoder_layers* layers. Each layer is a :class:`TransformerEncoderLayer`. Args: args (argparse.Namespace): parsed command-line arguments dictionary (~fairseq.data.Dictionary): encoding dictionary embed_tokens (torch.nn.Embedding): input embedding """ def __init__(self, args, dictionary, embed_tokens, embed_edges): super().__init__(dictionary) self.dropout = args.dropout embed_dim = embed_tokens.embedding_dim self.embed_dim = embed_dim self.padding_idx = embed_tokens.padding_idx self.max_source_positions = args.max_source_positions self.embed_tokens = embed_tokens self.embed_edges = embed_edges self.embed_scale = math.sqrt(embed_dim) self.embed_positions = PositionalEmbedding( args.max_source_positions, embed_dim, self.padding_idx, learned=args.encoder_learned_pos, ) if not args.no_token_positional_embeddings else None self.l1_gate = nn.Sequential(nn.Linear(embed_dim, embed_dim), nn.SELU(), nn.Dropout(self.dropout)) self.l2_gate = nn.Sequential(nn.Linear(embed_dim, embed_dim), nn.SELU(), nn.Dropout(self.dropout)) # self.e1_gate = nn.Linear(embed_dim*3, embed_dim) # self.e2_gate = nn.Linear(embed_dim * 3, embed_dim) self.e1_gate = nn.Sequential(nn.Linear(embed_dim*3, embed_dim), nn.SELU(), nn.Dropout(self.dropout)) self.e2_gate = nn.Sequential(nn.Linear(embed_dim*3, embed_dim), nn.SELU(), nn.Dropout(self.dropout)) self.layers = nn.ModuleList([]) self.layers.extend([ TransformerEncoderLayer(args) for i in range(args.encoder_layers) ]) self.graph_layers = nn.ModuleList([]) self.graph_layers.extend([ GraphTransformerEncoderLayer(args) for i in range(args.graph_layers) ]) self.rnn = nn.LSTM(input_size=embed_dim, hidden_size=embed_dim, batch_first=False, num_layers=1, bidirectional=True) self.register_buffer('version', torch.Tensor([2])) self.normalize = args.encoder_normalize_before if self.normalize: self.layer_norm = LayerNorm(embed_dim) self.graph_layer_norm = LayerNorm(embed_dim) def forward(self, src_tokens, src_lengths, enc_edge_ids, enc_edge_links1, enc_edge_links2, graph_mask, graph_mask_rev): """ Args: src_tokens (LongTensor): tokens in the source language of shape `(batch, src_len)` src_lengths (torch.LongTensor): lengths of each source sentence of shape `(batch)` Returns: dict: - **encoder_out** (Tensor): the last encoder layer's output of shape `(src_len, batch, embed_dim)` - **encoder_padding_mask** (ByteTensor): the positions of padding elements of shape `(batch, src_len)` """ # embed tokens and positions x = self.embed_scale * self.embed_tokens(src_tokens) if self.embed_positions is not None: x += self.embed_positions(src_tokens) x = F.dropout(x, p=self.dropout, training=self.training) # B x T x C -> T x B x C x = x.transpose(0, 1) # compute padding mask encoder_padding_mask = src_tokens.eq(self.padding_idx) if not encoder_padding_mask.any(): encoder_padding_mask = None enc_edge_padding_mask = enc_edge_ids.eq(self.padding_idx) # embed edges enc_edge_emb = self.embed_scale * self.embed_edges(enc_edge_ids) enc_edge_emb = F.dropout(enc_edge_emb, p=self.dropout, training=self.training) # B x L x C -> L x B x C enc_edge_emb = enc_edge_emb.transpose(0, 1) # B x L -> L x B x C idx_pairs1 = enc_edge_links1.unsqueeze(-1).repeat(1, 1, self.embed_dim) idx_pairs2 = enc_edge_links2.unsqueeze(-1).repeat(1, 1, self.embed_dim) idx_pairs1 = idx_pairs1.transpose(0, 1) idx_pairs2 = idx_pairs2.transpose(0, 1) # encoder layers for layer in self.layers: x = layer(x, encoder_padding_mask) if self.normalize: x = 
self.layer_norm(x) # graph layers enc_states = [] enc_states.append(x) for layer in self.graph_layers: x = layer(x, self.l1_gate, self.l2_gate, self.e1_gate, self.e2_gate, idx_pairs1, idx_pairs2, encoder_padding_mask, enc_edge_padding_mask, graph_mask, graph_mask_rev, enc_edge_emb) enc_states.append(x) enc_states = torch.stack(enc_states, dim=3) enc_states = torch.transpose(torch.transpose(enc_states, 2, 3), 0, 2) bsz = enc_states.size(1) seq_len = enc_states.size(2) enc_states = enc_states.reshape(len(self.graph_layers) + 1, bsz*seq_len, self.embed_dim) outputs, state = self.rnn(enc_states) x = state[0][::2] + state[1][::2] x = x.reshape(bsz, -1, self.embed_dim) x = torch.transpose(x, 0, 1) if self.normalize: x = self.graph_layer_norm(x) return { 'encoder_out': x, # T x B x C 'encoder_padding_mask': encoder_padding_mask, # B x T } def reorder_encoder_out(self, encoder_out, new_order): """ Reorder encoder output according to *new_order*. Args: encoder_out: output from the ``forward()`` method new_order (LongTensor): desired order Returns: *encoder_out* rearranged according to *new_order* """ if encoder_out['encoder_out'] is not None: encoder_out['encoder_out'] = \ encoder_out['encoder_out'].index_select(1, new_order) if encoder_out['encoder_padding_mask'] is not None: encoder_out['encoder_padding_mask'] = \ encoder_out['encoder_padding_mask'].index_select(0, new_order) return encoder_out def reorder_encoder_input(self, encoder_input, new_order): """ Reorder encoder output according to *new_order*. Args: encoder_out: output from the ``forward()`` method new_order (LongTensor): desired order Returns: *encoder_out* rearranged according to *new_order* """ # print('reorder') if encoder_input['src_tokens'] is not None: encoder_input['src_tokens'] = \ encoder_input['src_tokens'].index_select(0, new_order) # print('reorder_finish') # if encoder_input['net_input'] is not None: # encoder_input['net_input']['src_tokens'] = \ # encoder_input['net_input']['src_tokens'].index_select(0, new_order) return encoder_input def max_positions(self): """Maximum input length supported by the encoder.""" if self.embed_positions is None: return self.max_source_positions return min(self.max_source_positions, self.embed_positions.max_positions()) def upgrade_state_dict_named(self, state_dict, name): """Upgrade a (possibly old) state dict for new versions of fairseq.""" if isinstance(self.embed_positions, SinusoidalPositionalEmbedding): weights_key = '{}.embed_positions.weights'.format(name) if weights_key in state_dict: del state_dict[weights_key] state_dict['{}.embed_positions._float_tensor'.format(name)] = torch.FloatTensor(1) for i in range(len(self.layers)): # update layer norms self.layers[i].upgrade_state_dict_named(state_dict, f"{name}.layers.{i}") version_key = '{}.version'.format(name) if utils.item(state_dict.get(version_key, torch.Tensor([1]))[0]) < 2: # earlier checkpoints did not normalize after the stack of layers self.layer_norm = None self.normalize = False state_dict[version_key] = torch.Tensor([1]) return state_dict
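A hedged, simplified sketch of the layer-aggregation step above: the per-layer encoder states are stacked so the layer axis plays the role of time, a bidirectional LSTM runs over that axis, and its two directional summaries are combined and reshaped back to T x B x C. This simplifies the exact reshaping done in forward; sizes are toy values.

import torch
import torch.nn as nn

L, T, B, C = 3, 5, 2, 8
enc_states = torch.randn(L + 1, B * T, C)  # layer axis as the "sequence"
rnn = nn.LSTM(input_size=C, hidden_size=C, num_layers=1, bidirectional=True)
outputs, (h_n, c_n) = rnn(enc_states)      # h_n: (2, B*T, C), one row per direction
x = h_n[0] + h_n[1]                        # combine forward/backward summaries
x = x.reshape(B, T, C).transpose(0, 1)     # back to T x B x C
print(x.shape)  # torch.Size([5, 2, 8])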
class TransformerDecoder(FairseqIncrementalDecoder): """ Transformer decoder consisting of *args.decoder_layers* layers. Each layer is a :class:`TransformerDecoderLayer`. Args: args (argparse.Namespace): parsed command-line arguments dictionary (~fairseq.data.Dictionary): decoding dictionary embed_tokens (torch.nn.Embedding): output embedding no_encoder_attn (bool, optional): whether to attend to encoder outputs (default: False). """ def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False): super().__init__(dictionary) self.register_buffer('version', torch.Tensor([3])) self.dropout = args.dropout self.decoder_layerdrop = args.decoder_layerdrop self.share_input_output_embed = args.share_decoder_input_output_embed input_embed_dim = embed_tokens.embedding_dim embed_dim = args.decoder_embed_dim self.output_embed_dim = args.decoder_output_dim self.padding_idx = embed_tokens.padding_idx self.max_target_positions = args.max_target_positions self.embed_tokens = embed_tokens self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(embed_dim) self.project_in_dim = Linear(input_embed_dim, embed_dim, bias=False) if embed_dim != input_embed_dim else None self.embed_positions = PositionalEmbedding( args.max_target_positions, embed_dim, self.padding_idx, learned=args.decoder_learned_pos, ) if not args.no_token_positional_embeddings else None self.cross_self_attention = getattr(args, 'cross_self_attention', False) self.layer_wise_attention = getattr(args, 'layer_wise_attention', False) self.layers = nn.ModuleList([]) self.layers.extend([ TransformerDecoderLayer(args, no_encoder_attn) for _ in range(args.decoder_layers) ]) self.adaptive_softmax = None self.project_out_dim = Linear(embed_dim, self.output_embed_dim, bias=False) \ if embed_dim != self.output_embed_dim and not args.tie_adaptive_weights else None if args.adaptive_softmax_cutoff is not None: self.adaptive_softmax = AdaptiveSoftmax( len(dictionary), self.output_embed_dim, options.eval_str_list(args.adaptive_softmax_cutoff, type=int), dropout=args.adaptive_softmax_dropout, adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None, factor=args.adaptive_softmax_factor, tie_proj=args.tie_adaptive_proj, ) elif not self.share_input_output_embed: self.embed_out = nn.Parameter(torch.Tensor(len(dictionary), self.output_embed_dim)) nn.init.normal_(self.embed_out, mean=0, std=self.output_embed_dim ** -0.5) if args.decoder_normalize_before and not getattr(args, 'no_decoder_final_norm', False): self.layer_norm = LayerNorm(embed_dim) else: self.layer_norm = None if getattr(args, 'layernorm_embedding', False): self.layernorm_embedding = LayerNorm(embed_dim) else: self.layernorm_embedding = None def forward( self, prev_output_tokens, encoder_out=None, incremental_state=None, features_only=False, **extra_args ): """ Args: prev_output_tokens (LongTensor): previous decoder outputs of shape `(batch, tgt_len)`, for teacher forcing encoder_out (optional): output from the encoder, used for encoder-side attention incremental_state (dict): dictionary used for storing state during :ref:`Incremental decoding` features_only (bool, optional): only return features without applying output layer (default: False). 
Returns: tuple: - the decoder's output of shape `(batch, tgt_len, vocab)` - a dictionary with any model-specific outputs """ x, extra = self.extract_features( prev_output_tokens, encoder_out=encoder_out, incremental_state=incremental_state, **extra_args ) if not features_only: x = self.output_layer(x) return x, extra def extract_features( self, prev_output_tokens, encoder_out=None, incremental_state=None, **unused, ): # embed positions positions = self.embed_positions( prev_output_tokens, incremental_state=incremental_state, ) if self.embed_positions is not None else None if incremental_state is not None: prev_output_tokens = prev_output_tokens[:, -1:] if positions is not None: positions = positions[:, -1:] # embed tokens and positions x = self.embed_scale * self.embed_tokens(prev_output_tokens) if self.project_in_dim is not None: x = self.project_in_dim(x) if positions is not None: x += positions if self.layernorm_embedding: x = self.layernorm_embedding(x) x = F.dropout(x, p=self.dropout, training=self.training) # B x T x C -> T x B x C x = x.transpose(0, 1) self_attn_padding_mask = None if self.cross_self_attention or prev_output_tokens.eq(self.padding_idx).any(): self_attn_padding_mask = prev_output_tokens.eq(self.padding_idx) # decoder layers attn = None inner_states = [x] for idx, layer in enumerate(self.layers): encoder_state = None if encoder_out is not None: if self.layer_wise_attention: encoder_state = encoder_out.encoder_states[idx] else: encoder_state = encoder_out.encoder_out if incremental_state is None: self_attn_mask = self.buffered_future_mask(x) else: self_attn_mask = None # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) dropout_probability = random.uniform(0, 1) if not self.training or (dropout_probability > self.decoder_layerdrop): x, attn = layer( x, encoder_state, encoder_out.encoder_padding_mask if encoder_out is not None else None, incremental_state, self_attn_mask=self_attn_mask, self_attn_padding_mask=self_attn_padding_mask, ) inner_states.append(x) if self.layer_norm: x = self.layer_norm(x) # T x B x C -> B x T x C x = x.transpose(0, 1) if self.project_out_dim is not None: x = self.project_out_dim(x) return x, {'attn': attn, 'inner_states': inner_states} def output_layer(self, features, **kwargs): """Project features to the vocabulary size.""" if self.adaptive_softmax is None: # project back to size of vocabulary if self.share_input_output_embed: return F.linear(features, self.embed_tokens.weight) else: return F.linear(features, self.embed_out) else: return features def max_positions(self): """Maximum output length supported by the decoder.""" if self.embed_positions is None: return self.max_target_positions return min(self.max_target_positions, self.embed_positions.max_positions()) def buffered_future_mask(self, tensor): dim = tensor.size(0) if ( not hasattr(self, '_future_mask') or self._future_mask is None or self._future_mask.device != tensor.device or self._future_mask.size(0) < dim ): self._future_mask = torch.triu(utils.fill_with_neg_inf(tensor.new(dim, dim)), 1) return self._future_mask[:dim, :dim] def upgrade_state_dict_named(self, state_dict, name): """Upgrade a (possibly old) state dict for new versions of fairseq.""" if isinstance(self.embed_positions, SinusoidalPositionalEmbedding): weights_key = '{}.embed_positions.weights'.format(name) if weights_key in state_dict: del state_dict[weights_key] state_dict['{}.embed_positions._float_tensor'.format(name)] = torch.FloatTensor(1) for i in range(len(self.layers)): # update layer norms 
layer_norm_map = { '0': 'self_attn_layer_norm', '1': 'encoder_attn_layer_norm', '2': 'final_layer_norm' } for old, new in layer_norm_map.items(): for m in ('weight', 'bias'): k = '{}.layers.{}.layer_norms.{}.{}'.format(name, i, old, m) if k in state_dict: state_dict['{}.layers.{}.{}.{}'.format(name, i, new, m)] = state_dict[k] del state_dict[k] version_key = '{}.version'.format(name) if utils.item(state_dict.get(version_key, torch.Tensor([1]))[0]) <= 2: # earlier checkpoints did not normalize after the stack of layers self.layer_norm = None self.normalize = False state_dict[version_key] = torch.Tensor([1]) return state_dict
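A hedged sketch of the LayerDrop rule applied in extract_features above (https://arxiv.org/abs/1909.11556): during training each layer is skipped with probability decoder_layerdrop, while at inference every layer runs. run_layers is an illustrative name, not part of the module.

import random

def run_layers(x, layers, layerdrop, training):
    for layer in layers:
        if training and random.uniform(0, 1) < layerdrop:
            continue  # skip this layer for the current batch
        x = layer(x)
    return x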
def __init__(self, args, dictionary, embed_tokens): super().__init__(dictionary) self.register_buffer('version', torch.Tensor([3])) self.dropout = args.dropout self.encoder_layerdrop = args.encoder_layerdrop embed_dim = embed_tokens.embedding_dim self.padding_idx = embed_tokens.padding_idx self.max_source_positions = args.max_source_positions self.embed_tokens = embed_tokens self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt( embed_dim) self.embed_positions = PositionalEmbedding( args.max_source_positions, embed_dim, self.padding_idx, learned=args.encoder_learned_pos, ) if not args.no_token_positional_embeddings else None self.layer_wise_attention = getattr(args, 'layer_wise_attention', False) self.layers = nn.ModuleList([]) self.layers.extend([ TransformerEncoderLayer(args) for i in range(args.encoder_layers) ]) self.encoder_combination = args.encoder_combination self.encoder_rezero = args.encoder_rezero if self.encoder_combination: self.weight_ffn = nn.Sequential( nn.Linear(embed_dim, args.encoder_ffn_embed_dim), nn.ReLU(), nn.Linear(args.encoder_ffn_embed_dim, embed_dim), ) if args.encoder_combination == "dynamic": self.proj = nn.ModuleList([ nn.Sequential( nn.Linear(embed_dim * args.encoder_layers, embed_dim * 2), nn.ReLU(), nn.Linear(embed_dim * 2, embed_dim), ) for _ in range(args.encoder_layers) ]) if args.encoder_combination == "linear": self.weights = nn.ParameterList([ nn.Parameter(torch.randn(1, 1, embed_dim), requires_grad=True) for _ in range(args.encoder_layers) ]) if args.encoder_rezero: self.rezero_weights = nn.ParameterList([ nn.Parameter(torch.zeros(1)) for _ in range(args.encoder_layers) ]) if args.encoder_normalize_before: self.layer_norm = LayerNorm(embed_dim) else: self.layer_norm = None if getattr(args, 'layernorm_embedding', False): self.layernorm_embedding = LayerNorm(embed_dim) else: self.layernorm_embedding = None
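A hedged sketch, an assumption since the combination itself happens in forward, of the "linear" encoder_combination parameterized above: each layer's output is scaled by a learned per-channel weight and the scaled outputs are summed. Sizes are toy values.

import torch
import torch.nn as nn

num_layers, T, B, C = 3, 5, 2, 8
weights = nn.ParameterList([nn.Parameter(torch.randn(1, 1, C)) for _ in range(num_layers)])
layer_outputs = [torch.randn(T, B, C) for _ in range(num_layers)]
combined = sum(w * h for w, h in zip(weights, layer_outputs))
print(combined.shape)  # torch.Size([5, 2, 8])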