def __init__(
    self,
    input_dim,
    hidden_dim,
    projection_dim,
    feedforward_hidden_dim,
    num_layers,
    num_attention_heads,
    use_positional_encoding=True,
    dropout_prob=0.2,
):
    super(MaskedStackedSelfAttentionEncoder, self).__init__()

    self._use_positional_encoding = use_positional_encoding
    self._attention_layers = []
    self._feedfoward_layers = []
    self._layer_norm_layers = []
    self._feed_forward_layer_norm_layers = []

    feedfoward_input_dim = input_dim
    for i in range(num_layers):
        feedfoward = FeedForward(
            feedfoward_input_dim,
            activations=[
                Activation.by_name("relu")(),
                Activation.by_name("linear")(),
            ],
            hidden_dims=[feedforward_hidden_dim, hidden_dim],
            num_layers=2,
            dropout=dropout_prob,
        )
        # Register each sub-module under a unique name so its parameters are tracked.
        self.add_module(f"feedforward_{i}", feedfoward)
        self._feedfoward_layers.append(feedfoward)

        feedforward_layer_norm = LayerNorm(feedfoward.get_input_dim())
        self.add_module(f"feedforward_layer_norm_{i}", feedforward_layer_norm)
        self._feed_forward_layer_norm_layers.append(feedforward_layer_norm)

        self_attention = MaskedMultiHeadSelfAttention(
            num_heads=num_attention_heads,
            input_dim=hidden_dim,
            attention_dim=projection_dim,
            values_dim=projection_dim,
        )
        self.add_module(f"self_attention_{i}", self_attention)
        self._attention_layers.append(self_attention)

        layer_norm = LayerNorm(self_attention.get_input_dim())
        self.add_module(f"layer_norm_{i}", layer_norm)
        self._layer_norm_layers.append(layer_norm)

        feedfoward_input_dim = hidden_dim

    self.dropout = torch.nn.Dropout(dropout_prob)
    self._input_dim = input_dim
    self._output_dim = self._attention_layers[-1].get_output_dim()
    self._output_layer_norm = LayerNorm(self._output_dim)
def __init__(self, input_dim: int, summary_dim: int, feedforward: FeedForward):
    super().__init__()
    self.input_dim = input_dim
    self.summary_dim = summary_dim
    self.feedforward = feedforward
    # Make sure the feedforward's input dimension matches the concatenation
    # of the input vector and the summary vector.
    assert input_dim + summary_dim == feedforward.get_input_dim()
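# A minimal usage sketch for the constructor above (assuming the AllenNLP
# FeedForward API and hypothetical dimensions): the wrapped feedforward must
# consume the concatenation of an input vector and a summary vector, which is
# exactly what the assertion checks.
from allennlp.modules import FeedForward
from allennlp.nn import Activation

input_dim, summary_dim = 200, 100
feedforward = FeedForward(
    input_dim=input_dim + summary_dim,  # must equal input_dim + summary_dim
    num_layers=1,
    hidden_dims=[128],
    activations=[Activation.by_name("relu")()],
)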
def __init__(self,
             input_dim: int,
             hidden_dim: int,
             projection_dim: int,
             feedforward_hidden_dim: int,
             num_layers: int,
             num_attention_heads: int,
             use_positional_encoding: bool = True,
             dropout_prob: float = 0.2) -> None:
    super(StackedSelfAttentionEncoder, self).__init__()

    self._use_positional_encoding = use_positional_encoding
    self._attention_layers: List[MultiHeadSelfAttention] = []
    self._feedfoward_layers: List[FeedForward] = []
    self._layer_norm_layers: List[LayerNorm] = []
    self._feed_forward_layer_norm_layers: List[LayerNorm] = []

    feedfoward_input_dim = input_dim
    for i in range(num_layers):
        feedfoward = FeedForward(feedfoward_input_dim,
                                 activations=[Activation.by_name('relu')(),
                                              Activation.by_name('linear')()],
                                 hidden_dims=[feedforward_hidden_dim, hidden_dim],
                                 num_layers=2,
                                 dropout=dropout_prob)
        self.add_module(f"feedforward_{i}", feedfoward)
        self._feedfoward_layers.append(feedfoward)

        feedforward_layer_norm = LayerNorm(feedfoward.get_input_dim())
        self.add_module(f"feedforward_layer_norm_{i}", feedforward_layer_norm)
        self._feed_forward_layer_norm_layers.append(feedforward_layer_norm)

        self_attention = MultiHeadSelfAttention(num_heads=num_attention_heads,
                                                input_dim=hidden_dim,
                                                attention_dim=projection_dim,
                                                values_dim=projection_dim)
        self.add_module(f"self_attention_{i}", self_attention)
        self._attention_layers.append(self_attention)

        layer_norm = LayerNorm(self_attention.get_input_dim())
        self.add_module(f"layer_norm_{i}", layer_norm)
        self._layer_norm_layers.append(layer_norm)

        feedfoward_input_dim = hidden_dim

    self.dropout = Dropout(dropout_prob)
    self._input_dim = input_dim
    self._output_dim = self._attention_layers[-1].get_output_dim()
    self._output_layer_norm = LayerNorm(self._output_dim)
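# A minimal usage sketch for the encoder above (hypothetical dimensions; assumes
# the standard AllenNLP Seq2SeqEncoder interface, where forward takes a
# (batch, sequence_length, input_dim) tensor and a mask over the time dimension):
import torch

encoder = StackedSelfAttentionEncoder(
    input_dim=64,
    hidden_dim=128,
    projection_dim=32,
    feedforward_hidden_dim=256,
    num_layers=2,
    num_attention_heads=4,
)
tokens = torch.randn(8, 20, 64)             # (batch, sequence_length, input_dim)
mask = torch.ones(8, 20, dtype=torch.bool)  # all positions unmasked
encoded = encoder(tokens, mask)             # (batch, sequence_length, hidden_dim)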
def __init__(
    self,
    vocab: Vocabulary,
    source_embedder: TextFieldEmbedder,
    source_encoder: Seq2SeqEncoder,
    max_decoding_steps: int,
    dialog_acts_encoder: FeedForward = None,
    attention: Attention = None,
    attention_function: SimilarityFunction = None,
    n_dialog_acts: int = None,
    beam_size: int = None,
    target_namespace: str = "tokens",
    target_embedding_dim: int = None,
    scheduled_sampling_ratio: float = 0.0,
    use_bleu: bool = True,
    use_dialog_acts: bool = True,
    regularizers: Optional[RegularizerApplicator] = None,
) -> None:
    super().__init__(vocab, regularizers)
    self._target_namespace = target_namespace
    self._scheduled_sampling_ratio = scheduled_sampling_ratio

    # We need the start symbol to provide as the input at the first
    # timestep of decoding, and the end symbol as a way to indicate the end
    # of the decoded sequence.
    self._start_index = self.vocab.get_token_index(START_SYMBOL, self._target_namespace)
    self._end_index = self.vocab.get_token_index(END_SYMBOL, self._target_namespace)

    if use_bleu:
        pad_index = self.vocab.get_token_index(self.vocab._padding_token,
                                               self._target_namespace)
        self._bleu = BLEU(exclude_indices={pad_index, self._end_index, self._start_index})
    else:
        self._bleu = None

    # At prediction time, we use a beam search to find the most
    # likely sequence of target tokens.
    beam_size = beam_size or 1
    self._max_decoding_steps = max_decoding_steps
    self._beam_search = BeamSearch(self._end_index,
                                   max_steps=max_decoding_steps,
                                   beam_size=beam_size)

    # Dense embedding of source (Facts) vocab tokens.
    self._source_embedder = source_embedder

    # Encodes the sequence of source embeddings into a sequence of hidden states.
    self._source_encoder = source_encoder

    if use_dialog_acts:
        # Dense embedding of dialog acts.
        da_embedding_dim = dialog_acts_encoder.get_input_dim()
        self._dialog_acts_embedder = EmbeddingBag(n_dialog_acts, da_embedding_dim)
        # Encodes dialog acts.
        self._dialog_acts_encoder = dialog_acts_encoder
    else:
        self._dialog_acts_embedder = None
        self._dialog_acts_encoder = None

    num_classes = self.vocab.get_vocab_size(self._target_namespace)

    # Attention mechanism applied to the encoder output for each step.
    if attention:
        if attention_function:
            raise ConfigurationError("You can only specify an attention module or an "
                                     "attention function, but not both.")
        self._attention = attention
    elif attention_function:
        self._attention = LegacyAttention(attention_function)
    else:
        self._attention = None

    # Dense embedding of vocab words in the target space.
    target_embedding_dim = target_embedding_dim or source_embedder.get_output_dim()
    self._target_embedder = Embedding(num_classes, target_embedding_dim)

    # The decoder output dim needs to be the same as the encoder output dim
    # since we initialize the hidden state of the decoder with the final
    # hidden state of the encoder.
    self._encoder_output_dim = self._source_encoder.get_output_dim()
    if use_dialog_acts:
        self._merge_encoder = Sequential(
            Linear(
                self._source_encoder.get_output_dim() +
                self._dialog_acts_encoder.get_output_dim(),
                self._encoder_output_dim,
            ))
    self._decoder_output_dim = self._encoder_output_dim

    if self._attention:
        # If using attention, a weighted average over encoder outputs will
        # be concatenated to the previous target embedding to form the input
        # to the decoder at each time step.
        self._decoder_input_dim = self._decoder_output_dim + target_embedding_dim
    else:
        # Otherwise, the input to the decoder is just the previous target embedding.
        self._decoder_input_dim = target_embedding_dim

    # We'll use an LSTM cell as the recurrent cell that produces a hidden state
    # for the decoder at each time step.
    # TODO (pradeep): Do not hardcode decoder cell type.
    self._decoder_cell = LSTMCell(self._decoder_input_dim, self._decoder_output_dim)

    # We project the hidden state from the decoder into the output vocabulary space
    # in order to get log probabilities of each target token, at each time step.
    self._output_projection_layer = Linear(self._decoder_output_dim, num_classes)
def __init__(self, feed_forward: FeedForward) -> None:
    super(TimeDistributedFeedForwardEncoder, self).__init__()
    self._input_dim = feed_forward.get_input_dim()
    self._output_dim = feed_forward.get_output_dim()
    self._time_distributed_fnn = TimeDistributed(feed_forward)
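# A minimal usage sketch for the encoder above (hypothetical dimensions; assumes
# the AllenNLP FeedForward and TimeDistributed APIs). TimeDistributed folds the
# batch and time dimensions together, applies the wrapped module, and unfolds
# again, so the feedforward is applied independently at every time step. The
# forward method is not shown here; the call below presumes it simply applies
# self._time_distributed_fnn to (batch, sequence_length, input_dim) inputs.
import torch
from allennlp.modules import FeedForward
from allennlp.nn import Activation

feed_forward = FeedForward(
    input_dim=64,
    num_layers=2,
    hidden_dims=[128, 64],
    activations=[Activation.by_name("relu")(), Activation.by_name("linear")()],
)
encoder = TimeDistributedFeedForwardEncoder(feed_forward)
inputs = torch.randn(8, 20, 64)  # (batch, sequence_length, input_dim)
outputs = encoder(inputs)        # presumably (batch, sequence_length, 64)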