Example No. 1
    def __init__(self,
                 out_embed_dims,
                 vocab_size,
                 vocab_reduction_module=None):
        super().__init__(out_embed_dims, vocab_size, vocab_reduction_module)
        self.weight_projection = Linear(sum(out_embed_dims),
                                        len(out_embed_dims),
                                        bias=True)
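Example No. 1 only defines the projection. Below is a minimal sketch of how such a projection could turn concatenated decoder outputs into per-decoder mixture weights; the helper name combine_weighted, the softmax normalization, and the assumption that all decoders share one output dimensionality are illustrative additions, not the library's code.

import torch
import torch.nn.functional as F

# Illustrative sketch only: concatenate decoder outputs of shape
# (seq_len, batch, dim), project to one logit per decoder, and use the
# softmax-normalized logits as mixture weights.
def combine_weighted(outputs, weight_projection):
    concat = torch.cat(outputs, dim=-1)                     # (T, B, sum(out_embed_dims))
    weights = F.softmax(weight_projection(concat), dim=-1)  # (T, B, n_decoders)
    stacked = torch.stack(outputs, dim=-1)                  # (T, B, dim, n_decoders)
    return (stacked * weights.unsqueeze(2)).sum(dim=-1)     # (T, B, dim)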
Example No. 2
    def __init__(self, decoder_hidden_state_dim, context_dim, **kwargs):
        super().__init__(decoder_hidden_state_dim, context_dim)

        self.context_dim = context_dim
        self.attention_dim = kwargs.get("attention_dim", context_dim)
        # W_ae and b_a
        self.encoder_proj = Linear(
            context_dim, self.attention_dim, bias=True
        )
        # W_ad
        self.decoder_proj = Linear(
            decoder_hidden_state_dim, self.attention_dim, bias=False
        )
        # V_a
        self.to_scores = Linear(
            self.attention_dim, 1, bias=False
        )
        self.src_length_masking = kwargs.get("src_length_masking", True)
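The projections in Example No. 2 correspond to Bahdanau-style MLP attention. A minimal sketch of how they could be combined into scores and a context vector follows; the function mlp_attention_scores, the assumed tensor shapes, and the omission of src_length_masking are assumptions, not the library's forward().

import torch
import torch.nn.functional as F

# Illustrative sketch: score_j = V_a^T tanh(W_ae e_j + W_ad d + b_a).
def mlp_attention_scores(decoder_state, encoder_out,
                         encoder_proj, decoder_proj, to_scores):
    # decoder_state: (batch, decoder_hidden_state_dim)
    # encoder_out:   (src_len, batch, context_dim)
    hidden = torch.tanh(
        encoder_proj(encoder_out) + decoder_proj(decoder_state).unsqueeze(0)
    )                                       # (src_len, batch, attention_dim)
    scores = to_scores(hidden).squeeze(2)   # (src_len, batch)
    attn = F.softmax(scores, dim=0)         # normalize over source positions
    context = (attn.unsqueeze(2) * encoder_out).sum(dim=0)  # (batch, context_dim)
    return context, attn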
Example No. 3
    def __init__(self,
                 out_embed_dims,
                 vocab_size,
                 vocab_reduction_module=None):
        super().__init__(out_embed_dims, vocab_size, vocab_reduction_module)
        dim = out_embed_dims[0]
        self.bottleneck = Linear(sum(out_embed_dims), dim)
        self.output_projection = OutputProjection(dim, vocab_size,
                                                  vocab_reduction_module)
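Example No. 3 concatenates the decoder outputs and squeezes them through a bottleneck before the shared output projection. A minimal, hedged sketch of that first step (the helper name and shapes are assumptions):

import torch

# Illustrative sketch: reduce the concatenated decoder states to the first
# decoder's dimensionality before the vocabulary projection.
def combine_bottleneck(outputs, bottleneck):
    concat = torch.cat(outputs, dim=-1)   # (T, B, sum(out_embed_dims))
    return bottleneck(concat)             # (T, B, out_embed_dims[0])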
Example No. 4
    def __init__(self, decoder_hidden_state_dim, encoder_output_dim, **kwargs):
        super().__init__(decoder_hidden_state_dim, encoder_output_dim)

        self.input_proj = None
        force_projection = kwargs.get("force_projection", False)
        if force_projection or decoder_hidden_state_dim != encoder_output_dim:
            self.input_proj = Linear(
                decoder_hidden_state_dim, encoder_output_dim, bias=True
            )
        self.src_length_masking = kwargs.get("src_length_masking", True)
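Example No. 4 sets up dot-product attention, projecting the decoder state only when its dimensionality differs from the encoder's (or when force_projection is set). A minimal sketch of the scoring, with assumed shapes and without length masking:

import torch
import torch.nn.functional as F

# Illustrative sketch of dot-product attention.
def dot_attention(decoder_state, encoder_out, input_proj=None):
    # decoder_state: (batch, decoder_hidden_state_dim)
    # encoder_out:   (src_len, batch, encoder_output_dim)
    if input_proj is not None:
        decoder_state = input_proj(decoder_state)   # (batch, encoder_output_dim)
    scores = (encoder_out * decoder_state.unsqueeze(0)).sum(dim=2)  # (src_len, batch)
    attn = F.softmax(scores, dim=0)
    context = (attn.unsqueeze(2) * encoder_out).sum(dim=0)  # (batch, encoder_output_dim)
    return context, attn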
Example No. 5
    def __init__(
        self,
        out_embed_dims,
        vocab_size,
        vocab_reduction_module=None,
        fixed_weights=None,
        hidden_layer_size=32,
        activation_fn=torch.nn.ReLU,
        norm_fn=torch.exp,
    ):
        """Initializes a combination strategy with explicit weights.

        Args:
            out_embed_dims (list): List of output dimensionalities of the
                decoders.
            vocab_size (int): Size of the output projection.
            vocab_reduction_module: Module used for vocabulary reduction, or
                None to disable it.
            fixed_weights (list): If not None, use these fixed weights rather
                than a gating network.
            hidden_layer_size (int): Size of the hidden layer of the gating
                network.
            activation_fn: Non-linearity at the hidden layer.
            norm_fn: Function to use for normalization (exp or sigmoid).
        """
        super().__init__(out_embed_dims, vocab_size, vocab_reduction_module)
        if fixed_weights is None:
            self.fixed_weights = None
            self.gating_network = nn.Sequential(
                Linear(sum(out_embed_dims), hidden_layer_size, bias=True),
                activation_fn(),
                Linear(hidden_layer_size, len(out_embed_dims), bias=True),
            )
            self.norm_fn = norm_fn
        else:
            assert len(fixed_weights) == len(out_embed_dims)
            self.fixed_weights = maybe_cuda(
                torch.Tensor(fixed_weights).view(1, 1, -1))
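In Example No. 5 the per-decoder weights either come from fixed_weights or from the gating network. A rough sketch of that choice follows; the helper name decoder_weights and the renormalization of norm_fn's output to sum to one are assumptions, not the library's behavior.

import torch

# Illustrative sketch: fixed weights, or gating-network logits passed through
# norm_fn (e.g. torch.exp or torch.sigmoid) and renormalized.
def decoder_weights(outputs, fixed_weights, gating_network=None,
                    norm_fn=torch.exp):
    if fixed_weights is not None:
        return fixed_weights                 # (1, 1, n_decoders), broadcastable
    concat = torch.cat(outputs, dim=-1)      # (T, B, sum(out_embed_dims))
    unnormalized = norm_fn(gating_network(concat))   # (T, B, n_decoders)
    return unnormalized / unnormalized.sum(dim=-1, keepdim=True)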
Example No. 6
    def __init__(
        self,
        src_dict,
        dst_dict,
        n=4,
        encoder_hidden_dim=512,
        embed_dim=512,
        freeze_embed=False,
        hidden_dim=512,
        out_embed_dim=512,
        num_layers=1,
        dropout_in=0.1,
        dropout_out=0.1,
        attention_type="dot",
        residual_level=None,
        activation_fn=nn.ReLU,
    ):
        super().__init__(dst_dict)
        self.history_len = n - 1
        self.encoder_hidden_dim = encoder_hidden_dim
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.out_embed_dim = out_embed_dim
        self.dropout_in = dropout_in
        self.dropout_out = dropout_out
        self.attention_type = attention_type
        self.residual_level = residual_level
        self.dst_dict = dst_dict
        self.activation_fn = activation_fn

        num_embeddings = len(dst_dict)
        padding_idx = dst_dict.pad()
        self.embed_tokens = Embedding(
            num_embeddings=num_embeddings,
            embedding_dim=embed_dim,
            padding_idx=padding_idx,
            freeze_embed=freeze_embed,
        )

        self.history_conv = nn.Sequential(
            torch.nn.Conv1d(embed_dim, hidden_dim, self.history_len),
            activation_fn())

        self.hidden_dim = hidden_dim
        self.layers = nn.ModuleList([
            NonlinearLayer(hidden_dim, hidden_dim, activation_fn=activation_fn)
            for _ in range(num_layers)
        ])

        self.attention = attention.build_attention(
            attention_type=attention_type,
            decoder_hidden_state_dim=hidden_dim,
            encoder_output_dim=encoder_hidden_dim,
            force_projection=True,
        )
        self.combined_output_and_context_dim = (
            self.attention.encoder_output_dim + hidden_dim)
        if self.combined_output_and_context_dim != out_embed_dim:
            self.additional_fc = Linear(self.combined_output_and_context_dim,
                                        out_embed_dim)

        self.output_projection_w = nn.Parameter(
            torch.FloatTensor(num_embeddings,
                              out_embed_dim).uniform_(-0.1, 0.1))
        self.output_projection_b = nn.Parameter(
            torch.FloatTensor(num_embeddings).zero_())
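Example No. 6 keeps the output projection as explicit weight/bias parameters rather than an nn.Linear module. A minimal sketch of how they could be applied (the helper name is an assumption; F.linear is standard PyTorch):

import torch.nn.functional as F

# Illustrative sketch: an affine map from features to vocabulary logits.
def project_to_vocab(features, output_projection_w, output_projection_b):
    # features: (T, B, out_embed_dim) -> logits: (T, B, num_embeddings)
    return F.linear(features, output_projection_w, output_projection_b)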
Example No. 7
    def __init__(
        self,
        src_dict,
        dst_dict,
        vocab_reduction_params=None,
        encoder_hidden_dim=512,
        embed_dim=512,
        freeze_embed=False,
        hidden_dim=512,
        out_embed_dim=512,
        cell_type="lstm",
        num_layers=1,
        dropout_in=0.1,
        dropout_out=0.1,
        attention_type="dot",
        residual_level=None,
        averaging_encoder=False,
        project_output=True,
        tie_embeddings=False,
        pretrained_embed=None,
        projection_pretrained_embed=None,
    ):
        super().__init__(
            src_dict,
            dst_dict,
            vocab_reduction_params,
            out_embed_dim,
            project_output=project_output,
            pretrained_embed=projection_pretrained_embed,
        )
        encoder_hidden_dim = max(1, encoder_hidden_dim)
        self.encoder_hidden_dim = encoder_hidden_dim
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.out_embed_dim = out_embed_dim
        self.dropout_in = dropout_in
        self.dropout_out = dropout_out
        self.attention_type = attention_type
        self.residual_level = residual_level
        self.tie_embeddings = tie_embeddings

        num_embeddings = len(dst_dict)
        padding_idx = dst_dict.pad()
        self.embed_tokens = Embedding(
            num_embeddings=num_embeddings,
            embedding_dim=embed_dim,
            padding_idx=padding_idx,
            freeze_embed=freeze_embed,
        )
        if self.tie_embeddings:
            assert self.embed_dim == self.out_embed_dim, (
                "Input embeddings and output projections must have the same "
                "dimension for the weights to be tied")
            self.embed_tokens.weight = self.output_projection_w
        else:
            pytorch_translate_utils.load_embedding(
                embedding=self.embed_tokens,
                dictionary=dst_dict,
                pretrained_embed=pretrained_embed,
            )

        self.hidden_dim = hidden_dim
        self.averaging_encoder = averaging_encoder

        if cell_type == "lstm":
            cell_class = rnn_cell.LSTMCell
        elif cell_type == "milstm":
            cell_class = rnn_cell.MILSTMCell
        elif cell_type == "layer_norm_lstm":
            cell_class = rnn_cell.LayerNormLSTMCell

        if hidden_dim != encoder_hidden_dim:
            hidden_init_fc_list = []
            cell_init_fc_list = []
            for _ in range(num_layers):
                hidden_init_fc_list.append(
                    Linear(encoder_hidden_dim, hidden_dim))
                cell_init_fc_list.append(Linear(encoder_hidden_dim,
                                                hidden_dim))
            self.hidden_init_fc_list = nn.ModuleList(hidden_init_fc_list)
            self.cell_init_fc_list = nn.ModuleList(cell_init_fc_list)

        self.attention = attention.build_attention(
            attention_type=attention_type,
            decoder_hidden_state_dim=hidden_dim,
            context_dim=encoder_hidden_dim,
        )
        if self.attention.context_dim:
            self.initial_attn_context = nn.Parameter(
                torch.Tensor(self.attention.context_dim).zero_())
        self.combined_output_and_context_dim = self.attention.context_dim + hidden_dim

        layers = []
        for layer in range(num_layers):
            if layer == 0:
                cell_input_dim = embed_dim + self.attention.context_dim
            else:
                cell_input_dim = hidden_dim
            layers.append(
                cell_class(input_dim=cell_input_dim, hidden_dim=hidden_dim))
        self.layers = nn.ModuleList(layers)

        if self.combined_output_and_context_dim != out_embed_dim:
            self.additional_fc = Linear(self.combined_output_and_context_dim,
                                        out_embed_dim)
Example No. 8
    def __init__(
        self,
        src_dict,
        dst_dict,
        vocab_reduction_params=None,
        encoder_hidden_dim=512,
        embed_dim=512,
        freeze_embed=False,
        hidden_dim=512,
        out_embed_dim=512,
        cell_type="lstm",
        num_layers=1,
        dropout_in=0.1,
        dropout_out=0.1,
        attention_type="dot",
        residual_level=None,
        averaging_encoder=False,
    ):
        super().__init__(dst_dict)
        self.encoder_hidden_dim = encoder_hidden_dim
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.out_embed_dim = out_embed_dim
        self.dropout_in = dropout_in
        self.dropout_out = dropout_out
        self.attention_type = attention_type
        self.residual_level = residual_level

        num_embeddings = len(dst_dict)
        padding_idx = dst_dict.pad()
        self.embed_tokens = Embedding(
            num_embeddings=num_embeddings,
            embedding_dim=embed_dim,
            padding_idx=padding_idx,
            freeze_embed=freeze_embed,
        )
        self.hidden_dim = hidden_dim
        self.averaging_encoder = averaging_encoder

        if cell_type == "lstm":
            cell_class = rnn_cell.LSTMCell
        elif cell_type == "milstm":
            cell_class = rnn_cell.MILSTMCell
        elif cell_type == "layer_norm_lstm":
            cell_class = rnn_cell.LayerNormLSTMCell

        if hidden_dim != encoder_hidden_dim:
            hidden_init_fc_list = []
            cell_init_fc_list = []
            for _ in range(num_layers):
                hidden_init_fc_list.append(
                    Linear(encoder_hidden_dim, hidden_dim))
                cell_init_fc_list.append(Linear(encoder_hidden_dim,
                                                hidden_dim))
            self.hidden_init_fc_list = nn.ModuleList(hidden_init_fc_list)
            self.cell_init_fc_list = nn.ModuleList(cell_init_fc_list)
        self.initial_attn_context = nn.Parameter(
            torch.Tensor(encoder_hidden_dim).zero_(), )

        if attention_type is not None:
            self.attention = attention.build_attention(
                attention_type=attention_type,
                decoder_hidden_state_dim=hidden_dim,
                encoder_output_dim=encoder_hidden_dim,
            )
            self.combined_output_and_context_dim = encoder_hidden_dim + hidden_dim
        else:
            self.attention = None
            self.combined_output_and_context_dim = hidden_dim

        layers = []
        for layer in range(num_layers):
            if layer == 0:
                if self.attention is not None:
                    cell_input_dim = encoder_hidden_dim + embed_dim
                else:
                    cell_input_dim = embed_dim
            else:
                cell_input_dim = hidden_dim
            layers.append(
                cell_class(input_dim=cell_input_dim, hidden_dim=hidden_dim))
        self.layers = nn.ModuleList(layers)

        if self.combined_output_and_context_dim != out_embed_dim:
            self.additional_fc = Linear(self.combined_output_and_context_dim,
                                        out_embed_dim)

        self.vocab_reduction_module = None
        if vocab_reduction_params:
            self.vocab_reduction_module = vocab_reduction.VocabReduction(
                src_dict, dst_dict, vocab_reduction_params)

        self.output_projection_w = nn.Parameter(
            torch.FloatTensor(num_embeddings,
                              out_embed_dim).uniform_(-0.1, 0.1))
        self.output_projection_b = nn.Parameter(
            torch.FloatTensor(num_embeddings).zero_())
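Example No. 8 only builds the per-layer init projections when hidden_dim differs from encoder_hidden_dim. A hedged sketch of how those lists might map an encoder summary into the decoder's initial states (function name and shapes are assumptions):

# Illustrative sketch: one Linear per decoder layer maps the encoder's final
# (or averaged) hidden state into that layer's initial hidden and cell states.
def init_decoder_states(encoder_hidden, hidden_init_fc_list, cell_init_fc_list):
    # encoder_hidden: (batch, encoder_hidden_dim)
    prev_hiddens = [fc(encoder_hidden) for fc in hidden_init_fc_list]
    prev_cells = [fc(encoder_hidden) for fc in cell_init_fc_list]
    return prev_hiddens, prev_cells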
Example No. 9
    def __init__(
        self,
        src_dict,
        dst_dict,
        vocab_reduction_params=None,
        n=4,
        encoder_hidden_dim=512,
        embed_dim=512,
        freeze_embed=False,
        hidden_dim=512,
        out_embed_dim=512,
        num_layers=1,
        dropout_in=0.1,
        dropout_out=0.1,
        attention_type="dot",
        residual_level=None,
        activation_fn=nn.ReLU,
        project_output=True,
        pretrained_embed=None,
        projection_pretrained_embed=None,
    ):
        super().__init__(
            src_dict,
            dst_dict,
            vocab_reduction_params,
            out_embed_dim,
            project_output=project_output,
            pretrained_embed=projection_pretrained_embed,
        )
        self.history_len = n - 1
        self.encoder_hidden_dim = encoder_hidden_dim
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.out_embed_dim = out_embed_dim
        self.dropout_in = dropout_in
        self.dropout_out = dropout_out
        self.attention_type = attention_type
        self.residual_level = residual_level
        self.dst_dict = dst_dict
        self.activation_fn = activation_fn

        num_embeddings = len(dst_dict)
        padding_idx = dst_dict.pad()
        self.embed_tokens = Embedding(
            num_embeddings=num_embeddings,
            embedding_dim=embed_dim,
            padding_idx=padding_idx,
            freeze_embed=freeze_embed,
        )
        pytorch_translate_utils.load_embedding(
            embedding=self.embed_tokens,
            dictionary=dst_dict,
            pretrained_embed=pretrained_embed,
        )

        self.history_conv = nn.Sequential(
            torch.nn.Conv1d(embed_dim, hidden_dim, self.history_len),
            activation_fn())

        self.hidden_dim = hidden_dim
        self.layers = nn.ModuleList([
            NonlinearLayer(hidden_dim, hidden_dim, activation_fn=activation_fn)
            for _ in range(num_layers)
        ])

        self.attention = attention.build_attention(
            attention_type=attention_type,
            decoder_hidden_state_dim=hidden_dim,
            context_dim=encoder_hidden_dim,
            force_projection=True,
        )
        self.combined_output_and_context_dim = self.attention.context_dim + hidden_dim
        if self.combined_output_and_context_dim != out_embed_dim:
            self.additional_fc = Linear(self.combined_output_and_context_dim,
                                        out_embed_dim)
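Example No. 9 collapses the previous n-1 target tokens with a Conv1d whose kernel spans the whole history window. A minimal sketch of that step, with assumed shapes (the helper name encode_history is not from the library):

import torch

# Illustrative sketch: embed the history tokens and run the Conv1d + activation
# over the window, yielding one hidden vector per batch element.
def encode_history(prev_tokens, embed_tokens, history_conv):
    # prev_tokens: (batch, history_len) of token ids
    x = embed_tokens(prev_tokens)      # (batch, history_len, embed_dim)
    x = x.transpose(1, 2)              # Conv1d expects (batch, channels, length)
    return history_conv(x).squeeze(2)  # (batch, hidden_dim)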