def __init__(
    self,
    input_dim: int,
    hidden_dim: int,
    attention_projection_dim: int,
    feedforward_hidden_dim: int,
    num_convs: int,
    conv_kernel_size: int,
    num_attention_heads: int,
    use_positional_encoding: bool = True,
    dropout_prob: float = 0.1,
    layer_dropout_undecayed_prob: float = 0.1,
    attention_dropout_prob: float = 0,
) -> None:
    super().__init__()

    check_dimensions_match(input_dim, hidden_dim, "input_dim", "hidden_dim")

    self._use_positional_encoding = use_positional_encoding

    # One layer norm per convolutional sub-layer (pre-norm style).
    self._conv_norm_layers = torch.nn.ModuleList(
        [LayerNorm(hidden_dim) for _ in range(num_convs)]
    )

    # Each conv sub-layer is a depthwise-separable convolution: same-length
    # padding, a depthwise conv, then a 1x1 pointwise conv followed by ReLU.
    self._conv_layers = torch.nn.ModuleList()
    for _ in range(num_convs):
        padding = torch.nn.ConstantPad1d(
            (conv_kernel_size // 2, (conv_kernel_size - 1) // 2), 0
        )
        depthwise_conv = torch.nn.Conv1d(
            hidden_dim, hidden_dim, conv_kernel_size, groups=hidden_dim
        )
        pointwise_conv = torch.nn.Conv1d(hidden_dim, hidden_dim, 1)
        self._conv_layers.append(
            torch.nn.Sequential(
                padding, depthwise_conv, pointwise_conv, Activation.by_name("relu")()
            )
        )

    # Multi-head self-attention sub-layer with its own pre-norm.
    self.attention_norm_layer = LayerNorm(hidden_dim)
    self.attention_layer = MultiHeadSelfAttention(
        num_heads=num_attention_heads,
        input_dim=hidden_dim,
        attention_dim=attention_projection_dim,
        values_dim=attention_projection_dim,
        attention_dropout_prob=attention_dropout_prob,
    )

    # Two-layer position-wise feedforward sub-layer with its own pre-norm.
    self.feedforward_norm_layer = LayerNorm(hidden_dim)
    self.feedforward = FeedForward(
        hidden_dim,
        activations=[Activation.by_name("relu")(), Activation.by_name("linear")()],
        hidden_dims=[feedforward_hidden_dim, hidden_dim],
        num_layers=2,
        dropout=dropout_prob,
    )

    self.dropout = Dropout(dropout_prob)
    self.residual_with_layer_dropout = ResidualWithLayerDropout(layer_dropout_undecayed_prob)
    self._input_dim = input_dim
    self._output_dim = hidden_dim
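# Usage sketch (illustrative, not part of the original source). It assumes the
# constructor above belongs to the AllenNLP-style `QaNetEncoderBlock` seq2seq
# encoder; the class name and import path are assumptions, not given in this
# excerpt. Note that check_dimensions_match forces input_dim == hidden_dim.
from allennlp.modules.seq2seq_encoders.qanet_encoder import QaNetEncoderBlock

block = QaNetEncoderBlock(
    input_dim=128,
    hidden_dim=128,
    attention_projection_dim=128,
    feedforward_hidden_dim=128,
    num_convs=4,
    conv_kernel_size=7,
    num_attention_heads=8,
    dropout_prob=0.1,
)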
def __init__(
    self,
    input_dim: int,
    hidden_dim: int,
    attention_projection_dim: int,
    feedforward_hidden_dim: int,
    num_convs: int,
    conv_kernel_size: int,
    num_attention_heads: int,
    num_semantic_labels: int,
    replace_zero_semantic_labels_with_per_head_labels: bool = True,
    use_positional_encoding: bool = True,
    dropout_prob: float = 0.1,
    layer_dropout_undecayed_prob: float = 0.1,
    attention_dropout_prob: float = 0,
    semantic_integration_mode: str = "projection",
    semantic_emb_dim: int = 0,
    use_semantic_views: bool = True,
    multi_head_attention_batch_computation: bool = False,
    use_separate_label_embeddings_for_q_and_k: bool = True,
) -> None:
    super().__init__()

    self.return_output_meta_is_supported = True

    check_dimensions_match(input_dim, hidden_dim, "input_dim", "hidden_dim")

    self._use_positional_encoding = use_positional_encoding
    self._replace_zero_semantic_labels_with_per_head_labels = (
        replace_zero_semantic_labels_with_per_head_labels
    )

    # One layer norm per convolutional sub-layer (pre-norm style).
    self._conv_norm_layers = torch.nn.ModuleList(
        [LayerNorm(hidden_dim) for _ in range(num_convs)]
    )
    self._conv_layers = torch.nn.ModuleList()

    # Validate the requested semantic integration mode against the supported set.
    if semantic_integration_mode not in semantic_integration_mode_supported:
        raise Exception(
            "semantic_integration_mode must be in [{0}] but is `{1}`".format(
                ", ".join(semantic_integration_mode_supported), semantic_integration_mode
            )
        )
    self._semantic_integration_mode = semantic_integration_mode
    self._use_separate_label_embeddings_for_q_and_k = use_separate_label_embeddings_for_q_and_k

    # Each conv sub-layer is a depthwise-separable convolution: same-length
    # padding, a depthwise conv, then a 1x1 pointwise conv followed by ReLU.
    for _ in range(num_convs):
        padding = torch.nn.ConstantPad1d(
            (conv_kernel_size // 2, (conv_kernel_size - 1) // 2), 0
        )
        depthwise_conv = torch.nn.Conv1d(
            hidden_dim, hidden_dim, conv_kernel_size, groups=hidden_dim
        )
        pointwise_conv = torch.nn.Conv1d(hidden_dim, hidden_dim, 1)
        self._conv_layers.append(
            torch.nn.Sequential(
                padding, depthwise_conv, pointwise_conv, Activation.by_name("relu")()
            )
        )

    # Label-aware multi-head self-attention sub-layer with its own pre-norm.
    self.attention_norm_layer = LayerNorm(hidden_dim)
    self.num_semantic_labels = num_semantic_labels
    self.num_attention_heads = num_attention_heads
    self.attention_layer = MultiHeadSemanticFlatConcatSelfAttention(
        num_heads=num_attention_heads,
        num_semantic_labels=num_semantic_labels,
        input_dim=hidden_dim,
        attention_dim=attention_projection_dim,
        values_dim=attention_projection_dim,
        attention_dropout_prob=attention_dropout_prob,
        semantic_integration_mode=semantic_integration_mode,
        semantic_emb_dim=semantic_emb_dim,
        use_semantic_views=use_semantic_views,
        multi_head_attention_batch_computation=multi_head_attention_batch_computation,
        use_separate_label_embeddings_for_q_and_k=use_separate_label_embeddings_for_q_and_k,
    )

    # Two-layer position-wise feedforward sub-layer with its own pre-norm.
    self.feedforward_norm_layer = LayerNorm(hidden_dim)
    self.feedforward = FeedForward(
        hidden_dim,
        activations=[Activation.by_name("relu")(), Activation.by_name("linear")()],
        hidden_dims=[feedforward_hidden_dim, hidden_dim],
        num_layers=2,
        dropout=dropout_prob,
    )

    self.dropout = Dropout(dropout_prob)
    self.residual_with_layer_dropout = ResidualWithLayerDropout(layer_dropout_undecayed_prob)
    self._input_dim = input_dim
    self._output_dim = hidden_dim
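# Usage sketch (illustrative, not part of the original source). The enclosing
# class of the constructor above is not named in this excerpt, so the name
# `SemanticQaNetEncoderBlock` below is a hypothetical placeholder. The valid
# values of semantic_integration_mode come from the (unshown)
# `semantic_integration_mode_supported` constant; "projection" is only the
# declared default, and num_semantic_labels=32 is an arbitrary example value.
semantic_block = SemanticQaNetEncoderBlock(
    input_dim=128,
    hidden_dim=128,
    attention_projection_dim=128,
    feedforward_hidden_dim=128,
    num_convs=4,
    conv_kernel_size=7,
    num_attention_heads=8,
    num_semantic_labels=32,
    semantic_integration_mode="projection",
    use_semantic_views=True,
)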