def __init__(self,
             input_dim: int,
             hidden_dim: int,
             projection_dim: int,
             feedforward_hidden_dim: int,
             num_layers: int,
             num_attention_heads: int,
             dropout_prob: float = 0.1,
             residual_dropout_prob: float = 0.2,
             attention_dropout_prob: float = 0.1) -> None:
    super().__init__()

    self._attention_layers: List[MultiHeadSelfAttention] = []
    self._feedfoward_layers: List[FeedForward] = []
    self._layer_norm_layers: List[LayerNorm] = []
    self._feed_forward_layer_norm_layers: List[LayerNorm] = []
    self._reset_gate_layers: List[FeedForward] = []

    feedfoward_input_dim = input_dim
    for i in range(num_layers):
        feedfoward = FeedForward(feedfoward_input_dim,
                                 activations=[Activation.by_name('relu')(),
                                              Activation.by_name('linear')()],
                                 hidden_dims=[feedforward_hidden_dim, hidden_dim],
                                 num_layers=2,
                                 dropout=dropout_prob)

        # Note: Please use `ModuleList` in new code. It provides better
        # support for running on multiple GPUs. We've kept `add_module` here
        # solely for backwards compatibility with existing serialized models.
        self.add_module(f"feedforward_{i}", feedfoward)
        self._feedfoward_layers.append(feedfoward)

        feedforward_layer_norm = LayerNorm(feedfoward.get_output_dim())
        self.add_module(f"feedforward_layer_norm_{i}", feedforward_layer_norm)
        self._feed_forward_layer_norm_layers.append(feedforward_layer_norm)

        self_attention = MultiHeadSelfAttention(num_heads=num_attention_heads,
                                                input_dim=hidden_dim,
                                                attention_dim=projection_dim,
                                                values_dim=projection_dim,
                                                attention_dropout_prob=attention_dropout_prob)
        self.add_module(f"self_attention_{i}", self_attention)
        self._attention_layers.append(self_attention)

        reset_gate = FeedForward(feedforward_hidden_dim,
                                 activations=Activation.by_name('sigmoid')(),
                                 hidden_dims=hidden_dim,
                                 num_layers=1,
                                 dropout=dropout_prob)
        self.add_module(f"reset_gate_{i}", reset_gate)
        self._reset_gate_layers.append(reset_gate)

        layer_norm = LayerNorm(self_attention.get_output_dim())
        self.add_module(f"layer_norm_{i}", layer_norm)
        self._layer_norm_layers.append(layer_norm)

        feedfoward_input_dim = hidden_dim

    self.dropout = Dropout(residual_dropout_prob)
    self._input_dim = input_dim
    self._output_dim = self._attention_layers[-1].get_output_dim()
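# A minimal usage sketch (hypothetical; the class name `StackedSelfAttentionEncoder`
# and the concrete sizes below are assumptions for illustration, not values taken
# from the original code or its configs):
#
#     encoder = StackedSelfAttentionEncoder(input_dim=200,
#                                           hidden_dim=200,
#                                           projection_dim=64,
#                                           feedforward_hidden_dim=400,
#                                           num_layers=2,
#                                           num_attention_heads=8)
#
# Each of the `num_layers` blocks registers, in order, a two-layer FeedForward,
# a LayerNorm, a MultiHeadSelfAttention, a sigmoid reset gate, and a second
# LayerNorm; get_output_dim() is the output dim of the final attention layer.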
def __init__(self,
             vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             num_highway_layers: int,
             phrase_layer: Seq2SeqEncoder,
             similarity_function: SimilarityFunction,
             multi_headed_attention_layer: MultiHeadSelfAttention,
             modeling_layer: Seq2SeqEncoder,
             span_end_encoder: Seq2SeqEncoder,
             dropout: float = 0.2,
             mask_lstms: bool = True,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None) -> None:
    super(BidirectionalAttentionFlow, self).__init__(vocab, regularizer)

    self._text_field_embedder = text_field_embedder
    self._highway_layer = TimeDistributed(Highway(text_field_embedder.get_output_dim(),
                                                  num_highway_layers))
    self._phrase_layer = phrase_layer
    self._matrix_attention = LegacyMatrixAttention(similarity_function)
    self._modeling_layer = modeling_layer
    self._span_end_encoder = span_end_encoder

    # New self-attention layer, plus the matrix attention applied to its output.
    self._self_attention_layer = multi_headed_attention_layer
    self._sa_matrix_attention = LegacyMatrixAttention(similarity_function)
    selfattent_dim = multi_headed_attention_layer.get_output_dim()

    encoding_dim = phrase_layer.get_output_dim()
    modeling_dim = modeling_layer.get_output_dim()
    # Original BiDAF uses encoding_dim * 4 + modeling_dim here; the self-attention
    # output adds another 2 * selfattent_dim to the span start predictor input.
    span_start_input_dim = encoding_dim * 4 + modeling_dim + 2 * selfattent_dim
    self._span_start_predictor = TimeDistributed(torch.nn.Linear(span_start_input_dim, 1))

    span_end_encoding_dim = span_end_encoder.get_output_dim()
    # Likewise, the span end predictor input grows by 2 * selfattent_dim.
    span_end_input_dim = encoding_dim * 4 + span_end_encoding_dim + 2 * selfattent_dim
    self._span_end_predictor = TimeDistributed(torch.nn.Linear(span_end_input_dim, 1))

    # Bidaf has lots of layer dimensions which need to match up - these aren't necessarily
    # obvious from the configuration files, so we check here.
    check_dimensions_match(modeling_layer.get_input_dim(),
                           4 * encoding_dim + 2 * selfattent_dim,
                           "modeling layer input dim",
                           "4 * encoding dim + 2 * self attention dim")
    check_dimensions_match(text_field_embedder.get_output_dim(),
                           phrase_layer.get_input_dim(),
                           "text field embedder output dim",
                           "phrase layer input dim")
    check_dimensions_match(span_end_encoder.get_input_dim(),
                           4 * encoding_dim + 3 * modeling_dim + 2 * selfattent_dim,
                           "span end encoder input dim",
                           "4 * encoding dim + 3 * modeling dim + 2 * self attention dim")

    self._na_accuracy = CategoricalAccuracy()
    self._span_start_accuracy = CategoricalAccuracy()
    self._span_end_accuracy = CategoricalAccuracy()
    self._span_accuracy = BooleanAccuracy()
    self._squad_metrics = SquadEmAndF1()
    # Two-way (answerable / no-answer) classifier; note this builds a fresh
    # Linear layer each time it is called and hard-codes `.cuda()`.
    self._na_dense = lambda in_dim: torch.nn.Linear(in_dim, 2).cuda()

    if dropout > 0:
        self._dropout = torch.nn.Dropout(p=dropout)
    else:
        self._dropout = lambda x: x
    self._mask_lstms = mask_lstms

    initializer(self)
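# A small worked sketch (illustrative assumption, not values from the original model
# or its configuration) of how the dimension checks above constrain the components.
# With bidirectional phrase/modeling encoders of hidden size 100 (encoding_dim == 200,
# modeling_dim == 200) and a self-attention layer whose get_output_dim() is 200
# (selfattent_dim == 200):
#
#     modeling_layer.get_input_dim()   must equal 4 * 200 + 2 * 200             = 1200
#     span_start_input_dim             ==         4 * 200 + 200 + 2 * 200       = 1400
#     span_end_encoder.get_input_dim() must equal 4 * 200 + 3 * 200 + 2 * 200   = 1800
#     span_end_input_dim               ==         4 * 200 + span_end_encoding_dim + 2 * 200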