Example #1
0
    def __init__(self,
                 input_dim: int,
                 hidden_dim: int,
                 projection_dim: int,
                 feedforward_hidden_dim: int,
                 num_layers: int,
                 num_attention_heads: int,
                 use_positional_encoding: bool = True,
                 dropout_prob: float = 0.1,
                 residual_dropout_prob: float = 0.2,
                 attention_dropout_prob: float = 0.1) -> None:
        super(StackedSelfAttentionEncoder, self).__init__()

        self._use_positional_encoding = use_positional_encoding
        self._attention_layers: List[MultiHeadSelfAttention] = []
        self._feedfoward_layers: List[FeedForward] = []
        self._layer_norm_layers: List[LayerNorm] = []
        self._feed_forward_layer_norm_layers: List[LayerNorm] = []

        feedfoward_input_dim = input_dim
        for i in range(num_layers):
            feedfoward = FeedForward(
                feedfoward_input_dim,
                activations=[
                    Activation.by_name('relu')(),
                    Activation.by_name('linear')()
                ],
                hidden_dims=[feedforward_hidden_dim, hidden_dim],
                num_layers=2,
                dropout=dropout_prob)

            # Note: Please use `ModuleList` in new code. It provides better
            # support for running on multiple GPUs. We've kept `add_module` here
            # solely for backwards compatibility with existing serialized models.
            self.add_module(f"feedforward_{i}", feedfoward)
            self._feedfoward_layers.append(feedfoward)

            feedforward_layer_norm = LayerNorm(feedfoward.get_output_dim())
            self.add_module(f"feedforward_layer_norm_{i}",
                            feedforward_layer_norm)
            self._feed_forward_layer_norm_layers.append(feedforward_layer_norm)

            self_attention = MultiHeadSelfAttention(
                num_heads=num_attention_heads,
                input_dim=hidden_dim,
                attention_dim=projection_dim,
                values_dim=projection_dim,
                attention_dropout_prob=attention_dropout_prob)
            self.add_module(f"self_attention_{i}", self_attention)
            self._attention_layers.append(self_attention)

            layer_norm = LayerNorm(self_attention.get_output_dim())
            self.add_module(f"layer_norm_{i}", layer_norm)
            self._layer_norm_layers.append(layer_norm)

            feedfoward_input_dim = hidden_dim

        self.dropout = Dropout(residual_dropout_prob)
        self._input_dim = input_dim
        self._output_dim = self._attention_layers[-1].get_output_dim()
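The note in Example #1 recommends `ModuleList` for new code. Below is a minimal, hypothetical sketch of that registration pattern (the class name, the stand-in `Linear` sublayers, and the dimensions are assumptions for illustration, not part of the example above): appending to a `torch.nn.ModuleList` registers each sublayer under an indexed name, so no manual `add_module` bookkeeping is needed.

import torch

class StackedEncoderSketch(torch.nn.Module):
    # Hypothetical sketch of the ModuleList registration pattern only; a real
    # encoder would build FeedForward / MultiHeadSelfAttention sublayers as in
    # Example #1 instead of the plain Linear layers used here.
    def __init__(self, input_dim: int, hidden_dim: int, num_layers: int) -> None:
        super().__init__()
        self.feedforward_layers = torch.nn.ModuleList()
        self.feedforward_layer_norms = torch.nn.ModuleList()
        feedforward_input_dim = input_dim
        for _ in range(num_layers):
            # append() registers the submodule (e.g. as "feedforward_layers.0"),
            # so its parameters are tracked without an explicit add_module call.
            self.feedforward_layers.append(
                torch.nn.Linear(feedforward_input_dim, hidden_dim))
            self.feedforward_layer_norms.append(torch.nn.LayerNorm(hidden_dim))
            feedforward_input_dim = hidden_dim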
Example #2
0
    def __init__(self,
                 model_dim: int,
                 attention_dim: int,
                 num_heads: int,
                 feedforward_dim: int,
                 dropout: float = 0.1
                 ) -> None:
        super(RelationTransformerEncoderBlock, self).__init__()

        self.attn = MultiHeadAttentionV2(num_heads=num_heads,
                                         u_input_dim=model_dim,
                                         v_input_dim=model_dim,
                                         attention_dim=attention_dim,
                                         output_projection_dim=model_dim,
                                         attention_dropout_prob=dropout)
        self.attn_dropout = torch.nn.Dropout(dropout)

        self.ffn = torch.nn.Sequential(
            torch.nn.Linear(model_dim, feedforward_dim),
            torch.nn.ReLU(inplace=True),
            torch.nn.Dropout(dropout),
            torch.nn.Linear(feedforward_dim, model_dim),
            torch.nn.Dropout(dropout)
        )

        self.norm1 = LayerNorm(model_dim)
        self.norm2 = LayerNorm(model_dim)
Example #3
0
    def __init__(
        self,
        input_dim,
        hidden_dim,
        projection_dim,
        feedforward_hidden_dim,
        num_layers,
        num_attention_heads,
        use_positional_encoding=True,
        dropout_prob=0.2,
    ):
        super(MaskedStackedSelfAttentionEncoder, self).__init__()

        self._use_positional_encoding = use_positional_encoding
        self._attention_layers = []
        self._feedfoward_layers = []
        self._layer_norm_layers = []
        self._feed_forward_layer_norm_layers = []

        feedfoward_input_dim = input_dim
        for i in range(num_layers):
            feedfoward = FeedForward(
                feedfoward_input_dim,
                activations=[
                    Activation.by_name("relu")(),
                    Activation.by_name("linear")()
                ],
                hidden_dims=[feedforward_hidden_dim, hidden_dim],
                num_layers=2,
                dropout=dropout_prob,
            )

            self.add_module("feedforward_{i}".format(feedfoward))
            self._feedfoward_layers.append(feedfoward)

            feedforward_layer_norm = LayerNorm(feedfoward.get_input_dim())
            self.add_module(
                "feedforward_layer_norm_{}".format(i), feedforward_layer_norm)
            self._feed_forward_layer_norm_layers.append(feedforward_layer_norm)

            self_attention = MaskedMultiHeadSelfAttention(
                num_heads=num_attention_heads,
                input_dim=hidden_dim,
                attention_dim=projection_dim,
                values_dim=projection_dim,
            )
            self.add_module("self_attention_{i}".format(self_attention))
            self._attention_layers.append(self_attention)

            layer_norm = LayerNorm(self_attention.get_input_dim())
            self.add_module("layer_norm_{i}".format(layer_norm))
            self._layer_norm_layers.append(layer_norm)

            feedfoward_input_dim = hidden_dim

        self.dropout = torch.nn.Dropout(dropout_prob)
        self._input_dim = input_dim
        self._output_dim = self._attention_layers[-1].get_output_dim()
        self._output_layer_norm = LayerNorm(self._output_dim)
Example #5
0
 def __init__(
     self, layer: torch.nn.Module, num_layers: int, return_all_layers: bool = False
 ) -> None:
     super().__init__()
     self.layers = util.clone(layer, num_layers)
     self.norm = LayerNorm(layer.size)
     self.return_all_layers = return_all_layers
Example #6
0
    def __init__(self, hdim: int = 768, nlayers: int = 2, dropout_prob: float = 0.1):
        super(GCNNet, self).__init__()
        # self.gcns = nn.ModuleList([GCN(hdim, hdim, F.relu) for i in range(nlayers)])
        self._gcn_layers: List[GCN] = []
        self._feedfoward_layers: List[FeedForward] = []
        self._layer_norm_layers: List[LayerNorm] = []
        self._feed_forward_layer_norm_layers: List[LayerNorm] = []
        feedfoward_input_dim, feedforward_hidden_dim, hidden_dim = hdim, hdim, hdim
        for i in range(nlayers):
            feedfoward = FeedForward(feedfoward_input_dim,
                                     activations=[Activation.by_name('relu')(),
                                                  Activation.by_name('linear')()],
                                     hidden_dims=[feedforward_hidden_dim, hidden_dim],
                                     num_layers=2,
                                     dropout=dropout_prob)

            # Note: Please use `ModuleList` in new code. It provides better
            # support for running on multiple GPUs. We've kept `add_module` here
            # solely for backwards compatibility with existing serialized models.
            self.add_module(f"feedforward_{i}", feedfoward)
            self._feedfoward_layers.append(feedfoward)

            feedforward_layer_norm = LayerNorm(feedfoward.get_output_dim())
            self.add_module(f"feedforward_layer_norm_{i}", feedforward_layer_norm)
            self._feed_forward_layer_norm_layers.append(feedforward_layer_norm)

            gcn = GCN(hdim, hdim, F.relu)
            self.add_module(f"gcn_{i}", gcn)
            self._gcn_layers.append(gcn)

            layer_norm = LayerNorm(hdim)
            self.add_module(f"layer_norm_{i}", layer_norm)
            self._layer_norm_layers.append(layer_norm)

            feedfoward_input_dim = hidden_dim

        self.dropout = Dropout(dropout_prob)
        self._input_dim = hdim
        self._output_dim = hdim
Example #7
0
 def __init__(self, input_dims: List[int],
              num_layers: int,
              hidden_dims: Union[int, List[int]],
              activations='relu'):
     super(GCN_layers, self).__init__()
     if not isinstance(hidden_dims, list):
         hidden_dims = [hidden_dims] * num_layers
      # TODO: stop hard-coding the activation here; the `activations` argument
      # is currently ignored and tanh is applied in every layer.
      activations = [torch.nn.functional.tanh] * num_layers
     assert len(input_dims) == len(hidden_dims) == len(activations) == num_layers
     gcn_layers = []
     for layer_input_dim, layer_output_dim, activate in zip(input_dims, hidden_dims, activations):
         gcn_layers.append(GCN(layer_input_dim, layer_output_dim, activate))
     self.layers = nn.ModuleList(gcn_layers)
     self._output_dim = hidden_dims[-1]
     self.input_dim = input_dims[0]
     self.ln = LayerNorm(hidden_dims[0])
     self._mlp = FeedForward(hidden_dims[0], 1, hidden_dims[0], torch.nn.functional.sigmoid)
Example #8
0
    def __init__(
        self,
        embedding_dim: int,
        filters: Sequence[Sequence[int]],
        num_highway: int,
        projection_dim: int,
        activation: str = "relu",
        projection_location: str = "after_highway",
        do_layer_norm: bool = False,
    ) -> None:
        super().__init__()

        if projection_location not in _VALID_PROJECTION_LOCATIONS:
            raise ConfigurationError(
                f"unknown projection location: {projection_location}")

        self.input_dim = embedding_dim
        self.output_dim = projection_dim
        self._projection_location = projection_location

        if activation == "tanh":
            self._activation = torch.nn.functional.tanh
        elif activation == "relu":
            self._activation = torch.nn.functional.relu
        else:
            raise ConfigurationError(f"unknown activation {activation}")

        # Create the convolutions
        self._convolutions: List[torch.nn.Module] = []
        for i, (width, num) in enumerate(filters):
            conv = torch.nn.Conv1d(in_channels=embedding_dim,
                                   out_channels=num,
                                   kernel_size=width,
                                   bias=True)
            conv.weight.data.uniform_(-0.05, 0.05)
            conv.bias.data.fill_(0.0)
            self.add_module(f"char_conv_{i}",
                            conv)  # needs to match the old ELMo name
            self._convolutions.append(conv)

        # Create the highway layers
        num_filters = sum(num for _, num in filters)
        if projection_location == "after_cnn":
            highway_dim = projection_dim
        else:
            # highway_dim is the number of cnn filters
            highway_dim = num_filters
        self._highways = Highway(highway_dim,
                                 num_highway,
                                 activation=torch.nn.functional.relu)
        for highway_layer in self._highways._layers:
            # highway is a linear layer for each highway layer
            # with fused W and b weights
            highway_layer.weight.data.normal_(mean=0.0,
                                              std=np.sqrt(1.0 / highway_dim))
            highway_layer.bias[:highway_dim].data.fill_(0.0)
            highway_layer.bias[highway_dim:].data.fill_(2.0)

        # Projection layer: always num_filters -> projection_dim
        self._projection = torch.nn.Linear(num_filters,
                                           projection_dim,
                                           bias=True)
        self._projection.weight.data.normal_(mean=0.0,
                                             std=np.sqrt(1.0 / num_filters))
        self._projection.bias.data.fill_(0.0)

        # And add a layer norm
        if do_layer_norm:
            self._layer_norm: Callable = LayerNorm(self.output_dim)
        else:
            self._layer_norm = lambda tensor: tensor
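The highway initialization in Example #8 treats each highway layer as a single Linear with fused transform and gate outputs, and biases the gate half to 2.0 so the layer mostly carries its input at the start of training. The fragment below is a small, hypothetical sketch of that convention (it assumes the gate occupies the second half of the fused projection, as the bias initialization above suggests; it is not the library's Highway implementation).

import torch

# Sketch of the fused highway layer assumed by Example #8's initialization.
dim = 8
fused = torch.nn.Linear(dim, 2 * dim)  # first `dim` outputs: transform, last `dim`: gate
torch.nn.init.normal_(fused.weight, mean=0.0, std=(1.0 / dim) ** 0.5)
fused.bias.data[:dim].fill_(0.0)   # transform bias
fused.bias.data[dim:].fill_(2.0)   # gate bias: sigmoid(2.0) ~= 0.88

x = torch.randn(4, dim)
projected = fused(x)
transform = torch.relu(projected[:, :dim])
gate = torch.sigmoid(projected[:, dim:])
output = gate * x + (1.0 - gate) * transform  # gate near 1 => the input is mostly carried through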
Example #9
0
 def __init__(self, size: int, dropout: float) -> None:
     super().__init__()
     self.norm = LayerNorm(size)
     self.dropout = torch.nn.Dropout(dropout)
Example #10
0
 def __init__(self, layer: nn.Module, num_layers: int) -> None:
     super().__init__()
     self.layers = _clones(layer, num_layers)
     self.norm = LayerNorm(layer.size)
Example #11
0
 def __init__(self, layer: torch.nn.Module, num_layers: int) -> None:
     super().__init__()
     self._layers = util.clone(layer, num_layers)
     self._norm = LayerNorm(layer._size)
Example #12
0
    def __init__(self,
                 input_size,
                 output_size,
                 type_='lstm',
                 num_layers=1,
                 bias=True,
                 batch_first=True,
                 bidirectional=True,
                 stateful=False,
                 dropout_input=0.0,
                 dropout_rnn=0.0,
                 dropout_output=0.0,
                 layer_norm=True):
        super(RNN, self).__init__()

        #device = torch.device("cpu")

        self.input_size = int(input_size)
        self.output_size = int(output_size)
        self.type_ = str(type_)
        self.num_layers = int(num_layers)
        self.bias = bool(bias)
        self.batch_first = bool(batch_first)
        self.bidirectional = bool(bidirectional)
        self.stateful = bool(stateful)
        self.dropout_input = float(dropout_input)
        self.dropout_rnn = float(dropout_rnn)
        self.dropout_output = float(dropout_output)
        self.layer_norm = bool(layer_norm)

        if self.num_layers == 1:
            assert dropout_rnn == 0

        # Input dropout
        self.drop_layer_input = nn.Dropout(p=dropout_input)

        # Define encoder type
        if type_ == 'lstm':
            encoder = torch.nn.LSTM(
                input_size=input_size,
                hidden_size=output_size,
                num_layers=num_layers,
                bias=bias,
                batch_first=batch_first,
                dropout=dropout_rnn,
                bidirectional=bidirectional)
        elif type_ == 'gru':
            encoder = torch.nn.GRU(
                input_size=input_size,
                hidden_size=output_size,
                num_layers=num_layers,
                bias=bias,
                batch_first=batch_first,
                dropout=dropout_rnn,
                bidirectional=bidirectional)
        else:
            raise ValueError("incorrect RNN type: {}".format(type_))

        # Wrap the encoder so padded/masked sequences are handled consistently
        self.encoder = PytorchSeq2SeqWrapper(
            module=encoder,
            stateful=stateful)

        # Output size
        self.output_size = int(output_size * (1 + int(bidirectional)))

        # Layer normalization
        if self.layer_norm:
            self.normalization = LayerNorm(dimension=self.output_size)

        # Input dropout
        self.drop_layer_output = nn.Dropout(p=dropout_output)