def __init__(self, out_embed_dims, vocab_size, vocab_reduction_module=None):
    """Create the linear layer that maps concatenated outputs to per-entry values.

    Args:
        out_embed_dims: Sequence of integer dimensionalities. Their sum is
            the input width of ``weight_projection`` and their count is its
            output width.
        vocab_size: Forwarded unchanged to the parent constructor.
        vocab_reduction_module: Forwarded unchanged to the parent
            constructor.
    """
    super().__init__(out_embed_dims, vocab_size, vocab_reduction_module)
    concat_width = sum(out_embed_dims)
    num_inputs = len(out_embed_dims)
    self.weight_projection = Linear(concat_width, num_inputs, bias=True)
def __init__(self, decoder_hidden_state_dim, context_dim, **kwargs):
    """Build the three linear maps used to score attention.

    Args:
        decoder_hidden_state_dim: Input width of ``decoder_proj``.
        context_dim: Input width of ``encoder_proj``; also the default
            attention width.
        **kwargs: Optional settings read here:
            ``attention_dim`` (defaults to ``context_dim``) and
            ``src_length_masking`` (defaults to ``True``).
    """
    super().__init__(decoder_hidden_state_dim, context_dim)

    attention_dim = kwargs.get("attention_dim", context_dim)
    self.context_dim = context_dim
    self.attention_dim = attention_dim
    self.src_length_masking = kwargs.get("src_length_masking", True)

    # The projections are created in the same order as before so that
    # parameter registration and random initialisation are unchanged.
    # W_ae and b_a
    self.encoder_proj = Linear(context_dim, attention_dim, bias=True)
    # W_ad
    self.decoder_proj = Linear(
        decoder_hidden_state_dim, attention_dim, bias=False
    )
    # V_a
    self.to_scores = Linear(attention_dim, 1, bias=False)
def __init__(self, out_embed_dims, vocab_size, vocab_reduction_module=None):
    """Create a bottleneck layer followed by an output projection.

    The concatenated inputs (width ``sum(out_embed_dims)``) are reduced to
    the width of the first entry of ``out_embed_dims``; the output
    projection is built on top of that reduced width.

    Args:
        out_embed_dims: Non-empty sequence of integer dimensionalities.
        vocab_size: Passed to the parent constructor and to
            ``OutputProjection``.
        vocab_reduction_module: Passed to the parent constructor and to
            ``OutputProjection``.
    """
    super().__init__(out_embed_dims, vocab_size, vocab_reduction_module)
    bottleneck_width = out_embed_dims[0]
    concat_width = sum(out_embed_dims)
    self.bottleneck = Linear(concat_width, bottleneck_width)
    self.output_projection = OutputProjection(
        bottleneck_width, vocab_size, vocab_reduction_module
    )
def __init__(self, decoder_hidden_state_dim, encoder_output_dim, **kwargs):
    """Set up the optional input projection and the masking flag.

    Args:
        decoder_hidden_state_dim: Width of the decoder state.
        encoder_output_dim: Width of the encoder outputs.
        **kwargs: Optional settings read here:
            ``force_projection`` (defaults to ``False``) and
            ``src_length_masking`` (defaults to ``True``).
    """
    super().__init__(decoder_hidden_state_dim, encoder_output_dim)

    # A projection is created when explicitly requested or when the two
    # widths differ; otherwise ``input_proj`` stays ``None``.
    needs_projection = (
        kwargs.get("force_projection", False)
        or decoder_hidden_state_dim != encoder_output_dim
    )
    self.input_proj = (
        Linear(decoder_hidden_state_dim, encoder_output_dim, bias=True)
        if needs_projection
        else None
    )
    self.src_length_masking = kwargs.get("src_length_masking", True)
def __init__(
    self,
    out_embed_dims,
    vocab_size,
    vocab_reduction_module=None,
    fixed_weights=None,
    hidden_layer_size=32,
    activation_fn=torch.nn.ReLU,
    norm_fn=torch.exp,
):
    """Initializes a combination strategy with explicit weights.

    Either a gating network is built (when ``fixed_weights`` is None) or
    the supplied weights are stored as a ``(1, 1, len(fixed_weights))``
    tensor. In the fixed-weight case ``gating_network`` and ``norm_fn``
    are not set on the instance.

    Args:
        out_embed_dims (list): List of output dimensionalities of the
            decoders.
        vocab_size (int): Size of the output projection.
        vocab_reduction_module: For vocabulary reduction
        fixed_weights (list): If not None, use these fixed weights
            rather than a gating network.
        hidden_layer_size (int): Size of the hidden layer of the gating
            network.
        activation_fn: Non-linearity at the hidden layer.
        norm_fn: Function to use for normalization (exp or sigmoid).
    """
    super().__init__(out_embed_dims, vocab_size, vocab_reduction_module)

    if fixed_weights is not None:
        assert len(fixed_weights) == len(out_embed_dims)
        weights = torch.Tensor(fixed_weights).view(1, 1, -1)
        self.fixed_weights = maybe_cuda(weights)
        return

    self.fixed_weights = None
    # Layers are instantiated in the original order (input layer,
    # activation, output layer) so initialisation is unchanged.
    gate_in = Linear(sum(out_embed_dims), hidden_layer_size, bias=True)
    gate_act = activation_fn()
    gate_out = Linear(hidden_layer_size, len(out_embed_dims), bias=True)
    self.gating_network = nn.Sequential(gate_in, gate_act, gate_out)
    self.norm_fn = norm_fn
def __init__(
    self,
    src_dict,
    dst_dict,
    n=4,
    encoder_hidden_dim=512,
    embed_dim=512,
    freeze_embed=False,
    hidden_dim=512,
    out_embed_dim=512,
    num_layers=1,
    dropout_in=0.1,
    dropout_out=0.1,
    attention_type="dot",
    residual_level=None,
    activation_fn=nn.ReLU,
):
    """Build an n-gram style decoder with its own output projection.

    Args:
        src_dict: Not used by this constructor.
        dst_dict: Target dictionary; ``len(dst_dict)`` gives the number of
            embeddings and ``dst_dict.pad()`` the padding index.
        n: The history convolution uses a kernel of size ``n - 1``.
        encoder_hidden_dim: Encoder width handed to the attention builder.
        embed_dim: Target embedding width.
        freeze_embed: Forwarded to ``Embedding``.
        hidden_dim: Width of the convolution output and of every
            ``NonlinearLayer``.
        out_embed_dim: Width of the vectors fed to the output projection.
        num_layers: Number of ``NonlinearLayer`` modules.
        dropout_in: Stored on the instance.
        dropout_out: Stored on the instance.
        attention_type: Forwarded to ``attention.build_attention``.
        residual_level: Stored on the instance.
        activation_fn: Module class instantiated after the convolution and
            passed to every ``NonlinearLayer``.
    """
    super().__init__(dst_dict)

    # Plain configuration attributes.
    self.history_len = n - 1
    self.encoder_hidden_dim = encoder_hidden_dim
    self.embed_dim = embed_dim
    self.hidden_dim = hidden_dim
    self.out_embed_dim = out_embed_dim
    self.dropout_in = dropout_in
    self.dropout_out = dropout_out
    self.attention_type = attention_type
    self.residual_level = residual_level
    self.dst_dict = dst_dict
    self.activation_fn = activation_fn

    vocab_len = len(dst_dict)
    pad_index = dst_dict.pad()
    self.embed_tokens = Embedding(
        num_embeddings=vocab_len,
        embedding_dim=embed_dim,
        padding_idx=pad_index,
        freeze_embed=freeze_embed,
    )

    history_kernel = torch.nn.Conv1d(embed_dim, hidden_dim, self.history_len)
    self.history_conv = nn.Sequential(history_kernel, activation_fn())

    stacked = []
    for _ in range(num_layers):
        stacked.append(
            NonlinearLayer(hidden_dim, hidden_dim, activation_fn=activation_fn)
        )
    self.layers = nn.ModuleList(stacked)

    self.attention = attention.build_attention(
        attention_type=attention_type,
        decoder_hidden_state_dim=hidden_dim,
        encoder_output_dim=encoder_hidden_dim,
        force_projection=True,
    )

    combined_dim = self.attention.encoder_output_dim + hidden_dim
    self.combined_output_and_context_dim = combined_dim
    # ``additional_fc`` only exists when the combined width has to be
    # mapped onto ``out_embed_dim``.
    if combined_dim != out_embed_dim:
        self.additional_fc = Linear(combined_dim, out_embed_dim)

    projection_weight = torch.FloatTensor(vocab_len, out_embed_dim)
    self.output_projection_w = nn.Parameter(
        projection_weight.uniform_(-0.1, 0.1)
    )
    projection_bias = torch.FloatTensor(vocab_len)
    self.output_projection_b = nn.Parameter(projection_bias.zero_())
def __init__(
    self,
    src_dict,
    dst_dict,
    vocab_reduction_params=None,
    encoder_hidden_dim=512,
    embed_dim=512,
    freeze_embed=False,
    hidden_dim=512,
    out_embed_dim=512,
    cell_type="lstm",
    num_layers=1,
    dropout_in=0.1,
    dropout_out=0.1,
    attention_type="dot",
    residual_level=None,
    averaging_encoder=False,
    project_output=True,
    tie_embeddings=False,
    pretrained_embed=None,
    projection_pretrained_embed=None,
):
    """Build a recurrent decoder with attention.

    Args:
        src_dict: Forwarded to the parent constructor.
        dst_dict: Target dictionary; ``len(dst_dict)`` gives the number of
            embeddings and ``dst_dict.pad()`` the padding index.
        vocab_reduction_params: Forwarded to the parent constructor.
        encoder_hidden_dim: Encoder width; values below 1 are clamped to 1.
        embed_dim: Target embedding width.
        freeze_embed: Forwarded to ``Embedding``.
        hidden_dim: Hidden width of every recurrent cell.
        out_embed_dim: Width forwarded to the parent constructor; an
            ``additional_fc`` layer is created when the combined
            hidden-plus-context width differs from it.
        cell_type: One of ``"lstm"``, ``"milstm"`` or
            ``"layer_norm_lstm"``.
        num_layers: Number of recurrent cells.
        dropout_in: Stored on the instance.
        dropout_out: Stored on the instance.
        attention_type: Forwarded to ``attention.build_attention``.
        residual_level: Stored on the instance.
        averaging_encoder: Stored on the instance.
        project_output: Forwarded to the parent constructor.
        tie_embeddings: If true, the input embedding weight is replaced by
            ``self.output_projection_w`` (which this constructor does not
            create itself) instead of loading ``pretrained_embed``.
        pretrained_embed: Passed to
            ``pytorch_translate_utils.load_embedding`` for the input
            embeddings when ``tie_embeddings`` is false.
        projection_pretrained_embed: Forwarded to the parent constructor
            as ``pretrained_embed``.

    Raises:
        ValueError: If ``cell_type`` is not a supported value.
        AssertionError: If ``tie_embeddings`` is set while ``embed_dim``
            differs from ``out_embed_dim``.
    """
    super().__init__(
        src_dict,
        dst_dict,
        vocab_reduction_params,
        out_embed_dim,
        project_output=project_output,
        pretrained_embed=projection_pretrained_embed,
    )
    encoder_hidden_dim = max(1, encoder_hidden_dim)
    self.encoder_hidden_dim = encoder_hidden_dim
    self.embed_dim = embed_dim
    self.hidden_dim = hidden_dim
    self.out_embed_dim = out_embed_dim
    self.dropout_in = dropout_in
    self.dropout_out = dropout_out
    self.attention_type = attention_type
    self.residual_level = residual_level
    self.tie_embeddings = tie_embeddings
    self.averaging_encoder = averaging_encoder

    num_embeddings = len(dst_dict)
    padding_idx = dst_dict.pad()
    self.embed_tokens = Embedding(
        num_embeddings=num_embeddings,
        embedding_dim=embed_dim,
        padding_idx=padding_idx,
        freeze_embed=freeze_embed,
    )
    if self.tie_embeddings:
        assert self.embed_dim == self.out_embed_dim, (
            "Input embeddings and output projections must have the same "
            "dimension for the weights to be tied")
        self.embed_tokens.weight = self.output_projection_w
    else:
        pytorch_translate_utils.load_embedding(
            embedding=self.embed_tokens,
            dictionary=dst_dict,
            pretrained_embed=pretrained_embed,
        )

    if cell_type == "lstm":
        cell_class = rnn_cell.LSTMCell
    elif cell_type == "milstm":
        cell_class = rnn_cell.MILSTMCell
    elif cell_type == "layer_norm_lstm":
        cell_class = rnn_cell.LayerNormLSTMCell
    else:
        # Previously an unknown value left ``cell_class`` unbound and only
        # failed later with an opaque UnboundLocalError.
        raise ValueError(
            "Unsupported cell_type {!r}; expected 'lstm', 'milstm' or "
            "'layer_norm_lstm'".format(cell_type)
        )

    if hidden_dim != encoder_hidden_dim:
        # Per-layer maps from the encoder width to the decoder width. The
        # two lists are filled in an interleaved fashion so that the order
        # in which layers are initialised stays the same as before.
        hidden_init_fc_list = []
        cell_init_fc_list = []
        for _ in range(num_layers):
            hidden_init_fc_list.append(Linear(encoder_hidden_dim, hidden_dim))
            cell_init_fc_list.append(Linear(encoder_hidden_dim, hidden_dim))
        self.hidden_init_fc_list = nn.ModuleList(hidden_init_fc_list)
        self.cell_init_fc_list = nn.ModuleList(cell_init_fc_list)

    self.attention = attention.build_attention(
        attention_type=attention_type,
        decoder_hidden_state_dim=hidden_dim,
        context_dim=encoder_hidden_dim,
    )
    # The learned initial context only exists when the attention module
    # reports a non-zero context width.
    if self.attention.context_dim:
        self.initial_attn_context = nn.Parameter(
            torch.Tensor(self.attention.context_dim).zero_())
    self.combined_output_and_context_dim = (
        self.attention.context_dim + hidden_dim
    )

    layers = []
    for layer in range(num_layers):
        # Only the first cell receives the attention context concatenated
        # with the token embedding; deeper cells take the hidden state.
        if layer == 0:
            cell_input_dim = embed_dim + self.attention.context_dim
        else:
            cell_input_dim = hidden_dim
        layers.append(
            cell_class(input_dim=cell_input_dim, hidden_dim=hidden_dim))
    self.layers = nn.ModuleList(layers)

    if self.combined_output_and_context_dim != out_embed_dim:
        self.additional_fc = Linear(
            self.combined_output_and_context_dim, out_embed_dim
        )
def __init__(
    self,
    src_dict,
    dst_dict,
    vocab_reduction_params=None,
    encoder_hidden_dim=512,
    embed_dim=512,
    freeze_embed=False,
    hidden_dim=512,
    out_embed_dim=512,
    cell_type="lstm",
    num_layers=1,
    dropout_in=0.1,
    dropout_out=0.1,
    attention_type="dot",
    residual_level=None,
    averaging_encoder=False,
):
    """Build a recurrent decoder with optional attention and its own projection.

    Args:
        src_dict: Only used to construct the vocabulary reduction module.
        dst_dict: Target dictionary; ``len(dst_dict)`` gives the number of
            embeddings and ``dst_dict.pad()`` the padding index.
        vocab_reduction_params: When truthy, a
            ``vocab_reduction.VocabReduction`` module is created from it;
            otherwise ``vocab_reduction_module`` is ``None``.
        encoder_hidden_dim: Encoder width.
        embed_dim: Target embedding width.
        freeze_embed: Forwarded to ``Embedding``.
        hidden_dim: Hidden width of every recurrent cell.
        out_embed_dim: Width of the vectors fed to the output projection.
        cell_type: One of ``"lstm"``, ``"milstm"`` or
            ``"layer_norm_lstm"``.
        num_layers: Number of recurrent cells.
        dropout_in: Stored on the instance.
        dropout_out: Stored on the instance.
        attention_type: Forwarded to ``attention.build_attention``; when
            ``None`` no attention module is built.
        residual_level: Stored on the instance.
        averaging_encoder: Stored on the instance.

    Raises:
        ValueError: If ``cell_type`` is not a supported value.
    """
    super().__init__(dst_dict)
    self.encoder_hidden_dim = encoder_hidden_dim
    self.embed_dim = embed_dim
    self.hidden_dim = hidden_dim
    self.out_embed_dim = out_embed_dim
    self.dropout_in = dropout_in
    self.dropout_out = dropout_out
    self.attention_type = attention_type
    self.residual_level = residual_level
    self.averaging_encoder = averaging_encoder

    num_embeddings = len(dst_dict)
    padding_idx = dst_dict.pad()
    self.embed_tokens = Embedding(
        num_embeddings=num_embeddings,
        embedding_dim=embed_dim,
        padding_idx=padding_idx,
        freeze_embed=freeze_embed,
    )

    if cell_type == "lstm":
        cell_class = rnn_cell.LSTMCell
    elif cell_type == "milstm":
        cell_class = rnn_cell.MILSTMCell
    elif cell_type == "layer_norm_lstm":
        cell_class = rnn_cell.LayerNormLSTMCell
    else:
        # Previously an unknown value left ``cell_class`` unbound and only
        # failed later with an opaque UnboundLocalError.
        raise ValueError(
            "Unsupported cell_type {!r}; expected 'lstm', 'milstm' or "
            "'layer_norm_lstm'".format(cell_type)
        )

    if hidden_dim != encoder_hidden_dim:
        # Per-layer maps from the encoder width to the decoder width. The
        # two lists are filled in an interleaved fashion so that the order
        # in which layers are initialised stays the same as before.
        hidden_init_fc_list = []
        cell_init_fc_list = []
        for _ in range(num_layers):
            hidden_init_fc_list.append(Linear(encoder_hidden_dim, hidden_dim))
            cell_init_fc_list.append(Linear(encoder_hidden_dim, hidden_dim))
        self.hidden_init_fc_list = nn.ModuleList(hidden_init_fc_list)
        self.cell_init_fc_list = nn.ModuleList(cell_init_fc_list)

    # NOTE(review): the source was whitespace-collapsed; this parameter is
    # taken to be created unconditionally (outside the branch above).
    self.initial_attn_context = nn.Parameter(
        torch.Tensor(encoder_hidden_dim).zero_(),
    )

    if attention_type is not None:
        self.attention = attention.build_attention(
            attention_type=attention_type,
            decoder_hidden_state_dim=hidden_dim,
            encoder_output_dim=encoder_hidden_dim,
        )
        self.combined_output_and_context_dim = encoder_hidden_dim + hidden_dim
    else:
        self.attention = None
        self.combined_output_and_context_dim = hidden_dim

    layers = []
    for layer in range(num_layers):
        # The first cell takes the token embedding, widened by the encoder
        # width when attention is enabled; deeper cells take the hidden
        # state of the previous cell.
        if layer == 0:
            if self.attention is not None:
                cell_input_dim = encoder_hidden_dim + embed_dim
            else:
                cell_input_dim = embed_dim
        else:
            cell_input_dim = hidden_dim
        layers.append(
            cell_class(input_dim=cell_input_dim, hidden_dim=hidden_dim))
    self.layers = nn.ModuleList(layers)

    if self.combined_output_and_context_dim != out_embed_dim:
        self.additional_fc = Linear(
            self.combined_output_and_context_dim, out_embed_dim
        )

    self.vocab_reduction_module = None
    if vocab_reduction_params:
        self.vocab_reduction_module = vocab_reduction.VocabReduction(
            src_dict, dst_dict, vocab_reduction_params)

    self.output_projection_w = nn.Parameter(
        torch.FloatTensor(num_embeddings, out_embed_dim).uniform_(-0.1, 0.1))
    self.output_projection_b = nn.Parameter(
        torch.FloatTensor(num_embeddings).zero_())
def __init__(
    self,
    src_dict,
    dst_dict,
    vocab_reduction_params=None,
    n=4,
    encoder_hidden_dim=512,
    embed_dim=512,
    freeze_embed=False,
    hidden_dim=512,
    out_embed_dim=512,
    num_layers=1,
    dropout_in=0.1,
    dropout_out=0.1,
    attention_type="dot",
    residual_level=None,
    activation_fn=nn.ReLU,
    project_output=True,
    pretrained_embed=None,
    projection_pretrained_embed=None,
):
    """Build an n-gram style decoder whose projection is set up by the parent.

    Args:
        src_dict: Forwarded to the parent constructor.
        dst_dict: Target dictionary; ``len(dst_dict)`` gives the number of
            embeddings and ``dst_dict.pad()`` the padding index.
        vocab_reduction_params: Forwarded to the parent constructor.
        n: The history convolution uses a kernel of size ``n - 1``.
        encoder_hidden_dim: Encoder width handed to the attention builder
            as ``context_dim``.
        embed_dim: Target embedding width.
        freeze_embed: Forwarded to ``Embedding``.
        hidden_dim: Width of the convolution output and of every
            ``NonlinearLayer``.
        out_embed_dim: Forwarded to the parent constructor; an
            ``additional_fc`` layer is created when the combined
            hidden-plus-context width differs from it.
        num_layers: Number of ``NonlinearLayer`` modules.
        dropout_in: Stored on the instance.
        dropout_out: Stored on the instance.
        attention_type: Forwarded to ``attention.build_attention``.
        residual_level: Stored on the instance.
        activation_fn: Module class instantiated after the convolution and
            passed to every ``NonlinearLayer``.
        project_output: Forwarded to the parent constructor.
        pretrained_embed: Passed to
            ``pytorch_translate_utils.load_embedding`` for the input
            embeddings.
        projection_pretrained_embed: Forwarded to the parent constructor
            as ``pretrained_embed``.
    """
    super().__init__(
        src_dict,
        dst_dict,
        vocab_reduction_params,
        out_embed_dim,
        project_output=project_output,
        pretrained_embed=projection_pretrained_embed,
    )

    # Plain configuration attributes.
    self.history_len = n - 1
    self.encoder_hidden_dim = encoder_hidden_dim
    self.embed_dim = embed_dim
    self.hidden_dim = hidden_dim
    self.out_embed_dim = out_embed_dim
    self.dropout_in = dropout_in
    self.dropout_out = dropout_out
    self.attention_type = attention_type
    self.residual_level = residual_level
    self.dst_dict = dst_dict
    self.activation_fn = activation_fn

    vocab_len = len(dst_dict)
    pad_index = dst_dict.pad()
    self.embed_tokens = Embedding(
        num_embeddings=vocab_len,
        embedding_dim=embed_dim,
        padding_idx=pad_index,
        freeze_embed=freeze_embed,
    )
    pytorch_translate_utils.load_embedding(
        embedding=self.embed_tokens,
        dictionary=dst_dict,
        pretrained_embed=pretrained_embed,
    )

    history_kernel = torch.nn.Conv1d(embed_dim, hidden_dim, self.history_len)
    self.history_conv = nn.Sequential(history_kernel, activation_fn())

    stacked = []
    for _ in range(num_layers):
        stacked.append(
            NonlinearLayer(hidden_dim, hidden_dim, activation_fn=activation_fn)
        )
    self.layers = nn.ModuleList(stacked)

    self.attention = attention.build_attention(
        attention_type=attention_type,
        decoder_hidden_state_dim=hidden_dim,
        context_dim=encoder_hidden_dim,
        force_projection=True,
    )

    combined_dim = self.attention.context_dim + hidden_dim
    self.combined_output_and_context_dim = combined_dim
    # ``additional_fc`` only exists when the combined width has to be
    # mapped onto ``out_embed_dim``.
    if combined_dim != out_embed_dim:
        self.additional_fc = Linear(combined_dim, out_embed_dim)