def __init__(self,
             input_dim: int,
             hidden_dim: int,
             projection_dim: int,
             feedforward_hidden_dim: int,
             num_layers: int,
             num_attention_heads: int,
             use_positional_encoding: bool = True,
             dropout_prob: float = 0.1,
             residual_dropout_prob: float = 0.2,
             attention_dropout_prob: float = 0.1) -> None:
    super(StackedSelfAttentionEncoder, self).__init__()

    self._use_positional_encoding = use_positional_encoding
    self._attention_layers: List[MultiHeadSelfAttention] = []
    self._feedfoward_layers: List[FeedForward] = []
    self._layer_norm_layers: List[LayerNorm] = []
    self._feed_forward_layer_norm_layers: List[LayerNorm] = []

    feedfoward_input_dim = input_dim
    for i in range(num_layers):
        feedfoward = FeedForward(feedfoward_input_dim,
                                 activations=[Activation.by_name('relu')(),
                                              Activation.by_name('linear')()],
                                 hidden_dims=[feedforward_hidden_dim, hidden_dim],
                                 num_layers=2,
                                 dropout=dropout_prob)

        # Note: Please use `ModuleList` in new code. It provides better
        # support for running on multiple GPUs. We've kept `add_module` here
        # solely for backwards compatibility with existing serialized models.
        self.add_module(f"feedforward_{i}", feedfoward)
        self._feedfoward_layers.append(feedfoward)

        feedforward_layer_norm = LayerNorm(feedfoward.get_output_dim())
        self.add_module(f"feedforward_layer_norm_{i}", feedforward_layer_norm)
        self._feed_forward_layer_norm_layers.append(feedforward_layer_norm)

        self_attention = MultiHeadSelfAttention(num_heads=num_attention_heads,
                                                input_dim=hidden_dim,
                                                attention_dim=projection_dim,
                                                values_dim=projection_dim,
                                                attention_dropout_prob=attention_dropout_prob)
        self.add_module(f"self_attention_{i}", self_attention)
        self._attention_layers.append(self_attention)

        layer_norm = LayerNorm(self_attention.get_output_dim())
        self.add_module(f"layer_norm_{i}", layer_norm)
        self._layer_norm_layers.append(layer_norm)

        feedfoward_input_dim = hidden_dim

    self.dropout = Dropout(residual_dropout_prob)
    self._input_dim = input_dim
    self._output_dim = self._attention_layers[-1].get_output_dim()
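# Usage sketch for the constructor above (hypothetical, illustrative values only).
# It assumes the AllenNLP-style classes referenced in the snippet
# (StackedSelfAttentionEncoder, FeedForward, MultiHeadSelfAttention, Activation,
# LayerNorm, Dropout) are importable in the usual way.
encoder = StackedSelfAttentionEncoder(input_dim=300,
                                      hidden_dim=128,
                                      projection_dim=128,
                                      feedforward_hidden_dim=256,
                                      num_layers=2,
                                      num_attention_heads=4)
# Each layer first maps its input through the two-layer FeedForward
# (300 -> 256 -> 128 in the first layer, then 128 -> 256 -> 128), so every
# attention block operates on `hidden_dim`-sized vectors.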
def __init__(self,
             model_dim: int,
             attention_dim: int,
             num_heads: int,
             feedforward_dim: int,
             dropout: float = 0.1) -> None:
    super(RelationTransformerEncoderBlock, self).__init__()

    self.attn = MultiHeadAttentionV2(num_heads=num_heads,
                                     u_input_dim=model_dim,
                                     v_input_dim=model_dim,
                                     attention_dim=attention_dim,
                                     output_projection_dim=model_dim,
                                     attention_dropout_prob=dropout)
    self.attn_dropout = torch.nn.Dropout(dropout)

    self.ffn = torch.nn.Sequential(
        torch.nn.Linear(model_dim, feedforward_dim),
        torch.nn.ReLU(inplace=True),
        torch.nn.Dropout(dropout),
        torch.nn.Linear(feedforward_dim, model_dim),
        torch.nn.Dropout(dropout)
    )

    self.norm1 = LayerNorm(model_dim)
    self.norm2 = LayerNorm(model_dim)
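# Hypothetical forward-pass sketch.  The submodules above (attention + dropout,
# a position-wise FFN, and two LayerNorms) suggest the standard post-norm
# residual pattern, but the block's actual forward() is not part of this
# snippet, and the call signature of MultiHeadAttentionV2 is assumed here.
def _example_forward(block, u, v, mask=None):
    # residual connection and normalization around the attention sub-layer
    attended = block.attn_dropout(block.attn(u, v, mask))
    u = block.norm1(u + attended)
    # residual connection and normalization around the feed-forward sub-layer
    return block.norm2(u + block.ffn(u))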
def __init__(self,
             input_dim,
             hidden_dim,
             projection_dim,
             feedforward_hidden_dim,
             num_layers,
             num_attention_heads,
             use_positional_encoding=True,
             dropout_prob=0.2):
    super(MaskedStackedSelfAttentionEncoder, self).__init__()

    self._use_positional_encoding = use_positional_encoding
    self._attention_layers = []
    self._feedfoward_layers = []
    self._layer_norm_layers = []
    self._feed_forward_layer_norm_layers = []

    feedfoward_input_dim = input_dim
    for i in range(num_layers):
        feedfoward = FeedForward(feedfoward_input_dim,
                                 activations=[Activation.by_name("relu")(),
                                              Activation.by_name("linear")()],
                                 hidden_dims=[feedforward_hidden_dim, hidden_dim],
                                 num_layers=2,
                                 dropout=dropout_prob)

        # Register each sub-module under a per-layer name so its parameters
        # are tracked by the parent module.
        self.add_module("feedforward_{}".format(i), feedfoward)
        self._feedfoward_layers.append(feedfoward)

        feedforward_layer_norm = LayerNorm(feedfoward.get_input_dim())
        self.add_module("feedforward_layer_norm_{}".format(i), feedforward_layer_norm)
        self._feed_forward_layer_norm_layers.append(feedforward_layer_norm)

        self_attention = MaskedMultiHeadSelfAttention(num_heads=num_attention_heads,
                                                      input_dim=hidden_dim,
                                                      attention_dim=projection_dim,
                                                      values_dim=projection_dim)
        self.add_module("self_attention_{}".format(i), self_attention)
        self._attention_layers.append(self_attention)

        layer_norm = LayerNorm(self_attention.get_input_dim())
        self.add_module("layer_norm_{}".format(i), layer_norm)
        self._layer_norm_layers.append(layer_norm)

        feedfoward_input_dim = hidden_dim

    self.dropout = torch.nn.Dropout(dropout_prob)
    self._input_dim = input_dim
    self._output_dim = self._attention_layers[-1].get_output_dim()
    self._output_layer_norm = LayerNorm(self._output_dim)
def __init__(self,
             layer: torch.nn.Module,
             num_layers: int,
             return_all_layers: bool = False) -> None:
    super().__init__()
    self.layers = util.clone(layer, num_layers)
    self.norm = LayerNorm(layer.size)
    self.return_all_layers = return_all_layers
def __init__(self, hdim: int = 768, nlayers: int = 2, dropout_prob: float = 0.1):
    super(GCNNet, self).__init__()
    # self.gcns = nn.ModuleList([GCN(hdim, hdim, F.relu) for i in range(nlayers)])
    self._gcn_layers = []
    self._feedfoward_layers: List[FeedForward] = []
    self._layer_norm_layers: List[LayerNorm] = []
    self._feed_forward_layer_norm_layers: List[LayerNorm] = []

    feedfoward_input_dim, feedforward_hidden_dim, hidden_dim = hdim, hdim, hdim
    for i in range(nlayers):
        feedfoward = FeedForward(feedfoward_input_dim,
                                 activations=[Activation.by_name('relu')(),
                                              Activation.by_name('linear')()],
                                 hidden_dims=[feedforward_hidden_dim, hidden_dim],
                                 num_layers=2,
                                 dropout=dropout_prob)

        # Note: Please use `ModuleList` in new code. It provides better
        # support for running on multiple GPUs. We've kept `add_module` here
        # solely for backwards compatibility with existing serialized models.
        self.add_module(f"feedforward_{i}", feedfoward)
        self._feedfoward_layers.append(feedfoward)

        feedforward_layer_norm = LayerNorm(feedfoward.get_output_dim())
        self.add_module(f"feedforward_layer_norm_{i}", feedforward_layer_norm)
        self._feed_forward_layer_norm_layers.append(feedforward_layer_norm)

        gcn = GCN(hdim, hdim, F.relu)
        self.add_module(f"gcn_{i}", gcn)
        self._gcn_layers.append(gcn)

        layer_norm = LayerNorm(hdim)
        self.add_module(f"layer_norm_{i}", layer_norm)
        self._layer_norm_layers.append(layer_norm)

        feedfoward_input_dim = hidden_dim

    self.dropout = Dropout(dropout_prob)
    self._input_dim = hdim
    self._output_dim = hdim
def __init__(self,
             input_dims: List[int],
             num_layers: int,
             hidden_dims: Union[int, List[int]],
             activations='relu'):
    super(GCN_layers, self).__init__()
    if not isinstance(hidden_dims, list):
        hidden_dims = [hidden_dims] * num_layers
    # TODO: remove this hard-coded activation; it overrides the `activations`
    # argument with tanh for every layer.
    activations = [torch.nn.functional.tanh] * num_layers
    assert len(input_dims) == len(hidden_dims) == len(activations) == num_layers

    gcn_layers = []
    for layer_input_dim, layer_output_dim, activate in zip(input_dims, hidden_dims, activations):
        gcn_layers.append(GCN(layer_input_dim, layer_output_dim, activate))
    self.layers = nn.ModuleList(gcn_layers)

    self._output_dim = hidden_dims[-1]
    self.input_dim = input_dims[0]
    self.ln = LayerNorm(hidden_dims[0])
    self._mlp = FeedForward(hidden_dims[0], 1, hidden_dims[0], torch.nn.functional.sigmoid)
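# Illustrative construction of the GCN stack above (dimensions are hypothetical).
# The assert requires one entry per layer in both `input_dims` and `hidden_dims`,
# and consecutive entries should chain: the output size of layer i must match
# the declared input size of layer i + 1.
gcn_stack = GCN_layers(input_dims=[256, 128],
                       num_layers=2,
                       hidden_dims=[128, 128])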
def __init__(self,
             embedding_dim: int,
             filters: Sequence[Sequence[int]],
             num_highway: int,
             projection_dim: int,
             activation: str = "relu",
             projection_location: str = "after_highway",
             do_layer_norm: bool = False) -> None:
    super().__init__()

    if projection_location not in _VALID_PROJECTION_LOCATIONS:
        raise ConfigurationError(f"unknown projection location: {projection_location}")

    self.input_dim = embedding_dim
    self.output_dim = projection_dim
    self._projection_location = projection_location

    if activation == "tanh":
        self._activation = torch.nn.functional.tanh
    elif activation == "relu":
        self._activation = torch.nn.functional.relu
    else:
        raise ConfigurationError(f"unknown activation {activation}")

    # Create the convolutions
    self._convolutions: List[torch.nn.Module] = []
    for i, (width, num) in enumerate(filters):
        conv = torch.nn.Conv1d(in_channels=embedding_dim,
                               out_channels=num,
                               kernel_size=width,
                               bias=True)
        conv.weight.data.uniform_(-0.05, 0.05)
        conv.bias.data.fill_(0.0)
        self.add_module(f"char_conv_{i}", conv)  # needs to match the old ELMo name
        self._convolutions.append(conv)

    # Create the highway layers
    num_filters = sum(num for _, num in filters)
    if projection_location == "after_cnn":
        highway_dim = projection_dim
    else:
        # highway_dim is the number of cnn filters
        highway_dim = num_filters
    self._highways = Highway(highway_dim, num_highway, activation=torch.nn.functional.relu)
    for highway_layer in self._highways._layers:
        # highway is a linear layer for each highway layer
        # with fused W and b weights
        highway_layer.weight.data.normal_(mean=0.0, std=np.sqrt(1.0 / highway_dim))
        highway_layer.bias[:highway_dim].data.fill_(0.0)
        highway_layer.bias[highway_dim:].data.fill_(2.0)

    # Projection layer: always num_filters -> projection_dim
    self._projection = torch.nn.Linear(num_filters, projection_dim, bias=True)
    self._projection.weight.data.normal_(mean=0.0, std=np.sqrt(1.0 / num_filters))
    self._projection.bias.data.fill_(0.0)

    # And add a layer norm
    if do_layer_norm:
        self._layer_norm: Callable = LayerNorm(self.output_dim)
    else:
        self._layer_norm = lambda tensor: tensor
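# Illustrative shape of the `filters` argument expected by the encoder above:
# each inner pair is (kernel width, number of output channels), ELMo-style.
# These particular values are hypothetical, not taken from the snippet.
example_filters = [(1, 32), (2, 32), (3, 64), (4, 128), (5, 256), (6, 512), (7, 1024)]
# With this setting, num_filters = 32 + 32 + 64 + 128 + 256 + 512 + 1024 = 2048,
# which becomes the input size of the final projection layer.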
def __init__(self, size: int, dropout: float) -> None:
    super().__init__()
    self.norm = LayerNorm(size)
    self.dropout = torch.nn.Dropout(dropout)
def __init__(self, layer: nn.Module, num_layers: int) -> None:
    super().__init__()
    self.layers = _clones(layer, num_layers)
    self.norm = LayerNorm(layer.size)
def __init__(self, layer: torch.nn.Module, num_layers: int) -> None:
    super().__init__()
    self._layers = util.clone(layer, num_layers)
    self._norm = LayerNorm(layer._size)
def __init__(self, input_size, output_size, type_='lstm', num_layers=1, bias=True,
             batch_first=True, bidirectional=True, stateful=False,
             dropout_input=0.0, dropout_rnn=0.0, dropout_output=0.0, layer_norm=True):
    super(RNN, self).__init__()
    # device = torch.device("cpu")

    self.input_size = int(input_size)
    self.output_size = int(output_size)
    self.type_ = str(type_)
    self.num_layers = int(num_layers)
    self.bias = bool(bias)
    self.batch_first = bool(batch_first)
    self.bidirectional = bool(bidirectional)
    self.stateful = bool(stateful)
    self.dropout_input = float(dropout_input)
    self.dropout_rnn = float(dropout_rnn)
    self.dropout_output = float(dropout_output)
    self.layer_norm = bool(layer_norm)

    # PyTorch only applies inter-layer RNN dropout when num_layers > 1
    if self.num_layers == 1:
        assert dropout_rnn == 0

    # Input dropout
    self.drop_layer_input = nn.Dropout(p=dropout_input)

    # Define encoder type
    if type_ == 'lstm':
        encoder = torch.nn.LSTM(input_size=input_size,
                                hidden_size=output_size,
                                num_layers=num_layers,
                                bias=bias,
                                batch_first=batch_first,
                                dropout=dropout_rnn,
                                bidirectional=bidirectional)
    elif type_ == 'gru':
        encoder = torch.nn.GRU(input_size=input_size,
                               hidden_size=output_size,
                               num_layers=num_layers,
                               bias=bias,
                               batch_first=batch_first,
                               dropout=dropout_rnn,
                               bidirectional=bidirectional)
    else:
        raise ValueError("incorrect RNN type: {}".format(type_))

    # Create encoder
    self.encoder = PytorchSeq2SeqWrapper(module=encoder, stateful=stateful)

    # Output size doubles when the encoder is bidirectional
    self.output_size = int(output_size * (1 + int(bidirectional)))

    # Layer normalization
    if self.layer_norm:
        self.normalization = LayerNorm(dimension=self.output_size)

    # Output dropout
    self.drop_layer_output = nn.Dropout(p=dropout_output)
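# Hypothetical usage of the RNN wrapper above; parameter values are illustrative.
# With bidirectional=True the effective output size is 2 * output_size (400 here),
# which is also the dimension the LayerNorm is built over.
rnn = RNN(input_size=300, output_size=200, type_='lstm',
          num_layers=1, bidirectional=True, layer_norm=True)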