Example #1
 def __init__(self,
              num_chars,
              embedding_dim,
              h_size=512,
              num_layers=2,
              sequence_length=128,
              stateful=True,
              dropout_rate=0.2,
              bidirectional=False,
              use_attention=False,
              attention_size=16,
              teacher_forcing_ratio=1):
     super().__init__()
     self.teacher_forcing_ratio = teacher_forcing_ratio
     self.num_chars = num_chars
     self.embedding_dim = embedding_dim
     self.h_size = h_size
     self.num_layers = num_layers
     self.sequence_length = sequence_length
     self.embedding = Embedding(embedding_dim=256,
                                num_embeddings=num_chars,
                                sparse=False,
                                norm_type=2,
                                add_noise=True,
                                noise_intensity=0.12)
     self.lstm = LSTM(hidden_size=h_size,
                      num_layers=num_layers,
                      stateful=stateful,
                      batch_first=False,
                      dropout_rate=dropout_rate,
                      bidirectional=bidirectional,
                      use_attention=use_attention,
                      attention_size=attention_size)
     self.fc_out = Dense(num_chars, use_bias=False, activation=leaky_relu)
     self.softmax = SoftMax(axis=-1)
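The layer classes above (Embedding with noise, stateful LSTM, Dense, SoftMax) come from the snippet's own framework. As an illustration only, a plain-PyTorch sketch of how such a character-level stack could compose in a forward pass might look like this (names, defaults, and shapes are assumptions, not the original model's code):

import torch
import torch.nn as nn

class CharLSTMSketch(nn.Module):
    def __init__(self, num_chars, embedding_dim=256, h_size=512, num_layers=2, dropout_rate=0.2):
        super().__init__()
        self.embedding = nn.Embedding(num_chars, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, h_size, num_layers,
                            batch_first=True, dropout=dropout_rate)
        self.fc_out = nn.Linear(h_size, num_chars, bias=False)

    def forward(self, tokens):                  # tokens: (batch, seq_len) int64
        x = self.embedding(tokens)              # (batch, seq_len, embedding_dim)
        x, _ = self.lstm(x)                     # (batch, seq_len, h_size)
        return torch.softmax(self.fc_out(x), dim=-1)   # (batch, seq_len, num_chars)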
Example #2
    def __init__(self,
                 d_model,
                 nhead,
                 dim_feedforward=256,
                 dropout=0,
                 activation="relu"):

        from torch.nn.modules.activation import MultiheadAttention
        from torch.nn.modules.normalization import LayerNorm
        from torch.nn.modules.dropout import Dropout
        from torch.nn.modules.rnn import LSTM
        from torch.nn.modules.linear import Linear

        super(DPTNetBlock, self).__init__()
        self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
        # Implementation of Feedforward model
        # self.linear1 = Linear(d_model, dim_feedforward)
        self.rnn = LSTM(d_model, d_model * 2, 1, bidirectional=True)
        self.dropout = Dropout(dropout)
        # self.linear2 = Linear(dim_feedforward, d_model)
        self.linear2 = Linear(d_model * 2 * 2, d_model)

        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)
        self.dropout1 = Dropout(dropout)
        self.dropout2 = Dropout(dropout)

        self.activation = _get_activation_fn(activation)
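A quick shape check (with a made-up d_model) for why linear2 maps d_model * 2 * 2 back to d_model in the block above: a bidirectional LSTM with hidden_size = d_model * 2 emits 2 * (d_model * 2) features per time step.

import torch
from torch.nn import LSTM, Linear

d_model = 64                                  # assumed value, for illustration only
rnn = LSTM(d_model, d_model * 2, 1, bidirectional=True)
proj = Linear(d_model * 2 * 2, d_model)

src = torch.randn(10, 8, d_model)             # (seq_len, batch, d_model); batch_first=False
out, _ = rnn(src)                             # (10, 8, 4 * d_model)
print(proj(out).shape)                        # torch.Size([10, 8, 64])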
Example #3
    def test_augmented_lstm_computes_same_function_as_pytorch_lstm(self):
        augmented_lstm = AugmentedLstm(10, 11)
        pytorch_lstm = LSTM(10, 11, num_layers=1, batch_first=True)
        # Initialize all weights to be == 1.
        initializer = InitializerApplicator([(".*", lambda tensor: torch.nn.init.constant_(tensor, 1.))])
        initializer(augmented_lstm)
        initializer(pytorch_lstm)

        initial_state = torch.zeros([1, 5, 11])
        initial_memory = torch.zeros([1, 5, 11])

        # Use bigger numbers to avoid floating point instability.
        sorted_tensor, sorted_sequence, _, _ = sort_batch_by_length(self.random_tensor * 5., self.sequence_lengths)
        lstm_input = pack_padded_sequence(sorted_tensor, sorted_sequence.data.tolist(), batch_first=True)

        augmented_output, augmented_state = augmented_lstm(lstm_input, (initial_state, initial_memory))
        pytorch_output, pytorch_state = pytorch_lstm(lstm_input, (initial_state, initial_memory))
        pytorch_output_sequence, _ = pad_packed_sequence(pytorch_output, batch_first=True)
        augmented_output_sequence, _ = pad_packed_sequence(augmented_output, batch_first=True)

        numpy.testing.assert_array_almost_equal(pytorch_output_sequence.data.numpy(),
                                                augmented_output_sequence.data.numpy(), decimal=4)
        numpy.testing.assert_array_almost_equal(pytorch_state[0].data.numpy(),
                                                augmented_state[0].data.numpy(), decimal=4)
        numpy.testing.assert_array_almost_equal(pytorch_state[1].data.numpy(),
                                                augmented_state[1].data.numpy(), decimal=4)
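For reference, the pack/pad round-trip used by this test, shown in isolation with toy shapes (the AllenNLP sorting helper is replaced here by a pre-sorted length tensor):

import torch
from torch.nn import LSTM
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

lstm = LSTM(10, 11, num_layers=1, batch_first=True)
batch = torch.randn(5, 7, 10)                     # (batch, max_len, input_size)
lengths = torch.tensor([7, 6, 4, 3, 2])           # sorted descending, as enforce_sorted=True requires
packed = pack_padded_sequence(batch, lengths, batch_first=True)
output, (h, c) = lstm(packed)
unpacked, _ = pad_packed_sequence(output, batch_first=True)
print(unpacked.shape)                             # torch.Size([5, 7, 11])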
Example #4
    def __init__(self,
                 encoder_output_dim: int,
                 decoder_input_dim: int,
                 action_embedding_dim: int,
                 input_attention: Attention,
                 sql_attention: Attention = None,
                 sql_output_dim: int = 100,
                 activation: Activation = Activation.by_name('relu')(),
                 predict_start_type_separately: bool = True,
                 num_start_types: int = None,
                 add_action_bias: bool = True,
                 dropout: float = 0.0,
                 num_layers: int = 1) -> None:
        super().__init__()
        self._input_attention = input_attention
        if sql_attention:
            self._sql_attention = sql_attention
            self._hidden_to_sql = Linear(encoder_output_dim, sql_output_dim)
        self._add_action_bias = add_action_bias
        self._activation = activation
        self._num_layers = num_layers

        self._predict_start_type_separately = predict_start_type_separately
        if predict_start_type_separately:
            self._start_type_predictor = Linear(encoder_output_dim,
                                                num_start_types)
            self._num_start_types = num_start_types
        else:
            self._start_type_predictor = None
            self._num_start_types = None

        # Decoder output dim needs to be the same as the encoder output dim since we initialize the
        # hidden state of the decoder with the final hidden state of the encoder.
        output_dim = encoder_output_dim
        input_dim = output_dim
        # Our decoder input will be the concatenation of the decoder hidden state and the previous
        # action embedding, and we'll project that down to the decoder's `input_dim`

        # [attention-based utterance; attention-based sql query; hidden state]
        self._input_projection_layer = Linear(
            decoder_input_dim + action_embedding_dim, input_dim)
        # Before making a prediction, we'll compute an attention over the input given our updated
        # hidden state. Then we concatenate those with the decoder state and project to
        # `action_embedding_dim` to make a prediction.

        self._output_projection_layer = Linear(output_dim + decoder_input_dim,
                                               action_embedding_dim)
        if self._num_layers > 1:
            self._decoder_cell = LSTM(input_dim, output_dim, self._num_layers)
        else:
            # We use a ``LSTMCell`` if we just have one layer because it is slightly faster since we are
            # just running the LSTM for one step each time.
            self._decoder_cell = LSTMCell(input_dim, output_dim)

        if dropout > 0:
            self._dropout = torch.nn.Dropout(p=dropout)
        else:
            self._dropout = lambda x: x
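A minimal sketch of the LSTMCell-vs-LSTM distinction the comment above refers to, with made-up dimensions: the cell consumes a single time step with 2-D state tensors, while the multi-layer LSTM expects a length-1 sequence and per-layer 3-D states.

import torch
from torch.nn import LSTM, LSTMCell

input_dim, output_dim, batch, num_layers = 32, 32, 4, 2   # made-up dimensions
cell = LSTMCell(input_dim, output_dim)
lstm = LSTM(input_dim, output_dim, num_layers)

x = torch.randn(batch, input_dim)
h0 = torch.zeros(batch, output_dim)
c0 = torch.zeros(batch, output_dim)
h1, c1 = cell(x, (h0, c0))                                # one step, 2-D states

h0n = torch.zeros(num_layers, batch, output_dim)
c0n = torch.zeros(num_layers, batch, output_dim)
out, (hn, cn) = lstm(x.unsqueeze(0), (h0n, c0n))          # length-1 sequence, 3-D states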
Example #5
    def __init__(self,
                 d_model,
                 nhead,
                 dim_feedforward,
                 dropout=0,
                 activation="relu"):
        super(TransformerEncoderLayer, self).__init__()
        self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)

        # Feedforward sub-layer replaced by a bidirectional LSTM plus a linear projection
        self.linear1 = LSTM(d_model, d_model * 2, 1, bidirectional=True)
        self.linear2 = Linear(d_model * 2 * 2, d_model)
        self.dropout = nn.Dropout(dropout)

        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)

        self.activation = _get_activation_fn(activation)

        self.dropout1 = Dropout(dropout)
        self.dropout2 = Dropout(dropout)
Example #6
 def __init__(self,
              in_feats,
              out_feats,
              agg_type,
              edge_feats,
              time_enc="cosine") -> None:
     super(GTCLayer, self).__init__()
     self._in_src_feats, self._in_dst_feats = in_feats, in_feats
     self._out_feats = out_feats
     self._agg_type = agg_type
     self.encode_time = TimeEncodingLayer(in_feats,
                                          in_feats,
                                          time_encoding=time_enc)
     self.fc_edge = nn.Linear(edge_feats, in_feats)
     if agg_type == "pool":
         self.fc_pool = nn.Linear(in_feats, in_feats)
     if agg_type == "lstm":
         self.lstm = LSTM(in_feats, in_feats, batch_first=True)
     if agg_type != "gcn":
         self.fc_self = nn.Linear(in_feats, out_feats)
     self.fc_neigh = nn.Linear(in_feats, out_feats)
     self.reset_parameters()
Example #7
    def __init__(self, num_embeddings, num_labels):
        super(Net, self).__init__()
        self.emb = torch.nn.Embedding(num_embeddings,
                                      Config.embedding_dim,
                                      padding_idx=0)
        self.lstm1 = LSTM(Config.embedding_dim,
                          Config.hidden_size,
                          num_layers=1,
                          batch_first=True,
                          bias=True,
                          dropout=Config.dropout,
                          bidirectional=True)
        self.lstm2 = LSTM(Config.embedding_dim,
                          Config.hidden_size,
                          num_layers=1,
                          batch_first=True,
                          bias=True,
                          dropout=Config.dropout,
                          bidirectional=True)
        self.linear = torch.nn.Linear(Config.hidden_size * 4, num_labels)
        self.loss = torch.nn.CrossEntropyLoss()
        self.pred = torch.nn.Softmax(dim=-1)  # explicit dim; the implicit choice is deprecated

        self.h0 = Variable(
            torch.zeros(2, Config.batch_size, Config.hidden_size))
        self.c0 = Variable(
            torch.zeros(2, Config.batch_size, Config.hidden_size))
        self.h1 = Variable(
            torch.zeros(2, Config.batch_size, Config.hidden_size))
        self.c1 = Variable(
            torch.zeros(2, Config.batch_size, Config.hidden_size))

        if Config.cuda:
            self.h0 = self.h0.cuda()
            self.c0 = self.c0.cuda()
            self.h1 = self.h1.cuda()
            self.c1 = self.c1.cuda()
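The Linear(Config.hidden_size * 4, num_labels) width presumably comes from concatenating the outputs of the two bidirectional LSTMs (the forward pass is not shown, so this is an assumption); a toy shape check:

import torch

hidden_size = 8                                    # toy value
out1 = torch.randn(3, 5, 2 * hidden_size)          # bidirectional lstm1 output: (batch, seq, 2 * hidden)
out2 = torch.randn(3, 5, 2 * hidden_size)          # bidirectional lstm2 output
features = torch.cat([out1, out2], dim=-1)         # (batch, seq, 4 * hidden) matches the classifier input
print(features.shape)                              # torch.Size([3, 5, 32])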
Example #8
    def __init__(
        self,
        encoder_output_dim: int,
        action_embedding_dim: int,
        input_attention: Attention,
        activation: Activation = Activation.by_name("relu")(),
        add_action_bias: bool = True,
        dropout: float = 0.0,
        num_layers: int = 1,
    ) -> None:
        super().__init__()
        self._input_attention = input_attention
        self._add_action_bias = add_action_bias
        self._activation = activation
        self._num_layers = num_layers

        # Decoder output dim needs to be the same as the encoder output dim since we initialize the
        # hidden state of the decoder with the final hidden state of the encoder.
        output_dim = encoder_output_dim
        input_dim = output_dim
        # Our decoder input will be the concatenation of the attended encoder hidden state (i.e.,
        # the attended question encoding) and the previous action embedding, and we'll project that
        # down to the decoder's `input_dim`, which we arbitrarily set to be the same as
        # `output_dim`.
        self._input_projection_layer = Linear(
            encoder_output_dim + action_embedding_dim, input_dim)
        # Before making a prediction, we'll compute an attention over the input given our updated
        # hidden state. Then we concatenate those with the decoder state and project to
        # `action_embedding_dim` to make a prediction.
        self._output_projection_layer = Linear(output_dim + encoder_output_dim,
                                               action_embedding_dim)
        if self._num_layers > 1:
            self._decoder_cell = LSTM(input_dim, output_dim, self._num_layers)
        else:
            # We use a ``LSTMCell`` if we just have one layer because it is slightly faster since we are
            # just running the LSTM for one step each time.
            self._decoder_cell = LSTMCell(input_dim, output_dim)

        if dropout > 0:
            self._dropout = torch.nn.Dropout(p=dropout)
        else:
            self._dropout = lambda x: x
Example #9
 def __init__(self,
              input_size: int,
              hidden_size: int = 200,
              num_layers: int = 3,
              bias: bool = True,
              batch_first: bool = True,
              dropout: float = 0,
              bidirectional: bool = False,
              pgn: Dict = None,
              **kwargs):
     super().__init__()
     if pgn is None:
         self.lstm = LSTM(input_size, hidden_size, num_layers, bias,
                          batch_first, dropout, bidirectional)
     else:
         domains, dim = pgn.pop('num_domains'), pgn.pop('domain_dim')
         self.lstm = PGLSTM(domains, dim, input_size, hidden_size,
                            num_layers, bias, batch_first, dropout,
                            bidirectional, **pgn)
     self.output_dim = hidden_size * 2 if bidirectional else hidden_size
Example #10
    def __init__(self,
                 d_model,
                 nhead,
                 hidden_size,
                 dim_feedforward,
                 dropout,
                 activation="relu"):
        super(TransformerEncoderLayer, self).__init__()
        self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)

        # Implementation of improved part
        self.lstm = LSTM(d_model, hidden_size, 1, bidirectional=True)
        self.dropout = Dropout(dropout)
        self.linear = Linear(hidden_size * 2, d_model)

        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)
        self.dropout1 = Dropout(dropout)
        self.dropout2 = Dropout(dropout)

        self.activation = _get_activation_fn(activation)
Example #11
    def __init__(self, grammar, db_dict, hidden_size, word_dim, rnn_hid,
                 start):  #, max_sentence_len):

        x_dim = 2

        super(RecursiveDecoder, self).__init__()
        assert x_dim == 2
        self.word_dim = word_dim

        _, self.word2index = get_vocabulary_word2index(grammar, db_dict)
        self.index2word = dict([[str(i), k]
                                for k, i in self.word2index.items()])
        self.term_toks = grammar.terminal_toks
        # a dictionary
        self.db_dict = db_dict
        self.x_dim = x_dim
        self.start = start
        self.possibles = get_massaged_possibles(self.word2index, grammar,
                                                self.start)
        self.longest_path = longest_num(self.possibles)
        self.rnn_eng = RNN(word_dim, rnn_hid, 1)
        self.rnn_hist = RNN(hidden_size, rnn_hid, 1, batch_first=True)
        self.rnn_add = LSTM(rnn_hid, rnn_hid, 1)
        self.hidden_size = hidden_size
        self.rnn_hid = rnn_hid
        self.embed_size = len(self.word2index)
        self.embed = nn.Embedding(self.embed_size, self.hidden_size)
        # self.V = nn.ModuleList([nn.Linear(hidden_size*2,hidden_size*2) for _ in range(hidden_size)])
        # self.W = nn.Linear(hidden_size*2,hidden_size)
        # self.V = nn.ParameterList(
        #     [nn.Parameter(torch.randn(hidden_size * 2, hidden_size * 2)) for _ in range(hidden_size)])  # Tensor
        self.in_node = nn.Linear(
            self.rnn_hid * 2, self.rnn_hid
        )  #nn.Parameter(torch.randn(hidden_size * 2, hidden_size))
        self.in_or = nn.Linear(
            self.rnn_hid * 2,
            self.rnn_hid)  #nn.Parameter(torch.randn(hidden_size * 2, 1))
        self.for_or_choice = nn.Linear(self.rnn_hid, 1)
        self.sigmoid = nn.Sigmoid()
Example #12
    def __init__(self,
                 vocab: Vocabulary,
                 source_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 beam_search: Lazy[BeamSearch] = Lazy(BeamSearch),
                 attention: Attention = None,
                 target_namespace: str = "tokens",
                 target_embedding_dim: int = None,
                 scheduled_sampling_ratio: float = 0.0,
                 use_bleu: bool = True,
                 bleu_ngram_weights: Iterable[float] = (0.25, 0.25, 0.25,
                                                        0.25),
                 target_pretrain_file: str = None,
                 target_decoder_layers: int = 1,
                 **kwargs) -> None:
        super().__init__(vocab)
        self._target_namespace = target_namespace
        self._target_decoder_layers = target_decoder_layers
        self._scheduled_sampling_ratio = scheduled_sampling_ratio

        # We need the start symbol to provide as the input at the first timestep of decoding, and
        # end symbol as a way to indicate the end of the decoded sequence.
        self._start_index = self.vocab.get_token_index(START_SYMBOL,
                                                       self._target_namespace)
        self._end_index = self.vocab.get_token_index(END_SYMBOL,
                                                     self._target_namespace)

        if use_bleu:
            pad_index = self.vocab.get_token_index(self.vocab._padding_token,
                                                   self._target_namespace)
            self._bleu = BLEU(
                bleu_ngram_weights,
                exclude_indices={
                    pad_index, self._end_index, self._start_index
                },
            )
        else:
            self._bleu = None

        # At prediction time, we'll use a beam search to find the best target sequence.
        # For backwards compatibility, check if beam_size or max_decoding_steps were passed in as
        # kwargs. If so, update the BeamSearch object before constructing and raise a DeprecationWarning
        deprecation_warning = (
            "The parameter {} has been deprecated."
            " Provide this parameter as argument to beam_search instead.")
        beam_search_extras = {}
        if "beam_size" in kwargs:
            beam_search_extras["beam_size"] = kwargs["beam_size"]
            warnings.warn(deprecation_warning.format("beam_size"),
                          DeprecationWarning)
        if "max_decoding_steps" in kwargs:
            beam_search_extras["max_steps"] = kwargs["max_decoding_steps"]
            warnings.warn(deprecation_warning.format("max_decoding_steps"),
                          DeprecationWarning)
        self._beam_search = beam_search.construct(end_index=self._end_index,
                                                  vocab=self.vocab,
                                                  **beam_search_extras)

        # Dense embedding of source vocab tokens.
        self._source_embedder = source_embedder

        # Encodes the sequence of source embeddings into a sequence of hidden states.
        self._encoder = encoder

        num_classes = self.vocab.get_vocab_size(self._target_namespace)

        # Attention mechanism applied to the encoder output for each step.
        self._attention = attention

        # Dense embedding of vocab words in the target space.
        target_embedding_dim = target_embedding_dim or source_embedder.get_output_dim()
        if not target_pretrain_file:
            self._target_embedder = Embedding(
                num_embeddings=num_classes, embedding_dim=target_embedding_dim)
        else:
            self._target_embedder = Embedding(
                embedding_dim=target_embedding_dim,
                pretrained_file=target_pretrain_file,
                vocab_namespace=self._target_namespace,
                vocab=self.vocab,
            )

        # Decoder output dim needs to be the same as the encoder output dim since we initialize the
        # hidden state of the decoder with the final hidden state of the encoder.
        self._encoder_output_dim = self._encoder.get_output_dim()
        self._decoder_output_dim = self._encoder_output_dim

        if self._attention:
            # If using attention, a weighted average over encoder outputs will be concatenated
            # to the previous target embedding to form the input to the decoder at each
            # time step.
            self._decoder_input_dim = self._decoder_output_dim + target_embedding_dim
        else:
            # Otherwise, the input to the decoder is just the previous target embedding.
            self._decoder_input_dim = target_embedding_dim

        # We'll use an LSTM cell as the recurrent cell that produces a hidden state
        # for the decoder at each time step.
        # TODO (pradeep): Do not hardcode decoder cell type.
        if self._target_decoder_layers > 1:
            self._decoder_cell = LSTM(
                self._decoder_input_dim,
                self._decoder_output_dim,
                self._target_decoder_layers,
            )
        else:
            self._decoder_cell = LSTMCell(self._decoder_input_dim,
                                          self._decoder_output_dim)

        # We project the hidden state from the decoder into the output vocabulary space
        # in order to get log probabilities of each target token, at each time step.
        self._output_projection_layer = Linear(self._decoder_output_dim,
                                               num_classes)
Example #13
    def __init__(
        self,
        vocab: Vocabulary,
        source_embedder: TextFieldEmbedder,
        encoder: Seq2SeqEncoder,
        attention: Attention,
        beam_size: int,
        max_decoding_steps: int,
        target_embedding_dim: int = 30,
        copy_token: str = "@COPY@",
        target_namespace: str = "target_tokens",
        tensor_based_metric: Metric = None,
        token_based_metric: Metric = None,
        initializer: InitializerApplicator = InitializerApplicator(),
        num_decoder_layers: int = 1,
    ) -> None:
        super().__init__(vocab)
        self._target_namespace = target_namespace
        self._start_index = self.vocab.get_token_index(START_SYMBOL,
                                                       self._target_namespace)
        self._end_index = self.vocab.get_token_index(END_SYMBOL,
                                                     self._target_namespace)
        self._oov_index = self.vocab.get_token_index(self.vocab._oov_token,
                                                     self._target_namespace)
        self._pad_index = self.vocab.get_token_index(self.vocab._padding_token,
                                                     self._target_namespace)
        self._copy_index = self.vocab.add_token_to_namespace(
            copy_token, self._target_namespace)

        self._tensor_based_metric = tensor_based_metric or BLEU(
            exclude_indices={
                self._pad_index, self._end_index, self._start_index
            })
        self._token_based_metric = token_based_metric

        self._target_vocab_size = self.vocab.get_vocab_size(
            self._target_namespace)

        self._num_decoder_layers = num_decoder_layers
        if self._num_decoder_layers > 1:
            self._has_multiple_decoder_layers = True
        else:
            self._has_multiple_decoder_layers = False

        # Encoding modules.
        self._source_embedder = source_embedder
        self._encoder = encoder

        # Decoder output dim needs to be the same as the encoder output dim since we initialize the
        # hidden state of the decoder with the final hidden state of the encoder.
        # We arbitrarily set the decoder's input dimension to be the same as the output dimension.
        self.encoder_output_dim = self._encoder.get_output_dim()
        self.decoder_output_dim = self.encoder_output_dim
        self.decoder_input_dim = self.decoder_output_dim

        # The decoder input will be a function of the embedding of the previous predicted token,
        # an attended encoder hidden state called the "attentive read", and another
        # weighted sum of the encoder hidden state called the "selective read".
        # While the weights for the attentive read are calculated by an `Attention` module,
        # the weights for the selective read are simply the predicted probabilities
        # corresponding to each token in the source sentence that matches the target
        # token from the previous timestep.
        self._target_embedder = Embedding(
            num_embeddings=self._target_vocab_size,
            embedding_dim=target_embedding_dim)
        self._attention = attention
        self._input_projection_layer = Linear(
            target_embedding_dim + self.encoder_output_dim * 2,
            self.decoder_input_dim)

        # We then run the projected decoder input through an LSTM cell or LSTM based on target decoder layer
        # to produce the next hidden state.
        if self._has_multiple_decoder_layers:
            # Use LSTM for multi layer
            self._decoder_cell = LSTM(self.decoder_input_dim,
                                      self.decoder_output_dim,
                                      self._num_decoder_layers)
        else:
            # Use LSTMCell for one layer because it is slightly faster
            self._decoder_cell = LSTMCell(self.decoder_input_dim,
                                          self.decoder_output_dim)

        # We create a "generation" score for each token in the target vocab
        # with a linear projection of the decoder hidden state.
        self._output_generation_layer = Linear(self.decoder_output_dim,
                                               self._target_vocab_size)

        # We create a "copying" score for each source token by applying a non-linearity
        # (tanh) to a linear projection of the encoded hidden state for that token,
        # and then taking the dot product of the result with the decoder hidden state.
        self._output_copying_layer = Linear(self.encoder_output_dim,
                                            self.decoder_output_dim)

        # At prediction time, we'll use a beam search to find the best target sequence.
        self._beam_search = BeamSearch(self._end_index,
                                       max_steps=max_decoding_steps,
                                       beam_size=beam_size)

        initializer(self)
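A shape check (made-up dimensions) for the CopyNet-style input projection above: each decoder step concatenates the previous target embedding with two weighted sums over encoder states, the attentive read and the selective read, hence target_embedding_dim + encoder_output_dim * 2 input features.

import torch
from torch.nn import Linear

target_embedding_dim, encoder_output_dim = 30, 100         # made-up dimensions
decoder_input_dim = encoder_output_dim
proj = Linear(target_embedding_dim + encoder_output_dim * 2, decoder_input_dim)

emb = torch.randn(4, target_embedding_dim)                  # previous target embedding
attentive_read = torch.randn(4, encoder_output_dim)         # attention-weighted encoder states
selective_read = torch.randn(4, encoder_output_dim)         # copy-score-weighted encoder states
step_input = proj(torch.cat((emb, attentive_read, selective_read), dim=-1))
print(step_input.shape)                                     # torch.Size([4, 100])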
Example #14
class GTCLayer(nn.Module):
    def __init__(self,
                 in_feats,
                 out_feats,
                 agg_type,
                 edge_feats,
                 time_enc="cosine") -> None:
        super(GTCLayer, self).__init__()
        self._in_src_feats, self._in_dst_feats = in_feats, in_feats
        self._out_feats = out_feats
        self._agg_type = agg_type
        self.encode_time = TimeEncodingLayer(in_feats,
                                             in_feats,
                                             time_encoding=time_enc)
        self.fc_edge = nn.Linear(edge_feats, in_feats)
        if agg_type == "pool":
            self.fc_pool = nn.Linear(in_feats, in_feats)
        if agg_type == "lstm":
            self.lstm = LSTM(in_feats, in_feats, batch_first=True)
        if agg_type != "gcn":
            self.fc_self = nn.Linear(in_feats, out_feats)
        self.fc_neigh = nn.Linear(in_feats, out_feats)
        self.reset_parameters()

    def reset_parameters(self):
        """Reinitialize learnable parameters."""
        gain = nn.init.calculate_gain('relu')
        if self._agg_type == 'pool':
            nn.init.xavier_uniform_(self.fc_pool.weight, gain=gain)
        if self._agg_type == 'lstm':
            self.lstm.reset_parameters()
        if self._agg_type != 'gcn':
            nn.init.xavier_uniform_(self.fc_self.weight, gain=gain)
        nn.init.xavier_uniform_(self.fc_neigh.weight, gain=gain)

    def _lstm_reducer(self, edge_feat):
        """LSTM processing for temporal edges.
        """
        # (seq_len, batch_size, dim) <= (bucket_size, deg, dim)
        edge_feat = edge_feat.permute(1, 0, 2)
        batch_size = edge_feat.shape[1]
        h = (edge_feat.new_zeros((1, batch_size, self._in_src_feats)),
             edge_feat.new_zeros((1, batch_size, self._in_src_feats)))
        rst, (h_, c_) = self.lstm(edge_feat, h)
        return rst

    def forward(self, agg_graph: dgl.DGLGraph, prop_graph: dgl.DGLGraph,
                traversal_order, new_node_ids) -> torch.Tensor:
        tg = agg_graph.local_var()
        pg = prop_graph.local_var()

        nfeat = tg.ndata["nfeat"]
        # h_self = nfeat
        h_self = self.encode_time(nfeat, tg.ndata["timestamp"])
        tg.ndata["nfeat"] = h_self
        tg.edata["efeat"] = self.fc_edge(tg.edata["efeat"])
        # efeat = tg.edata["efeat"]
        # tg.apply_edges(lambda edges: {
        #     "efeat":
        #     torch.cat((edges.src["nfeat"], edges.data["efeat"]), dim=1)
        # })
        # tg.edata["efeat"] = self.encode_time(tg.edata["efeat"], tg.edata["timestamp"])
        degs = tg.ndata["degree"]

        # agg_graph aggregation
        if self._agg_type == "pool":
            tg.edata["efeat"] = F.relu(self.fc_pool(tg.edata["efeat"]))
            tg.update_all(fn.u_add_e("nfeat", "efeat", "m"),
                          fn.max("m", "neigh"))
            h_neigh = tg.ndata["neigh"]
        elif self._agg_type in ["mean", "gcn", "lstm"]:
            tg.update_all(fn.u_add_e("nfeat", "efeat", "m"),
                          fn.sum("m", "neigh"))
            h_neigh = tg.ndata["neigh"]
        else:
            raise KeyError("Aggregator type {} not recognized.".format(
                self._agg_type))

        pg.ndata["neigh"] = h_neigh
        # prop_graph propagation
        if False:
            if self._agg_type == "mean":
                pg.prop_nodes(traversal_order,
                              message_func=fn.copy_src("neigh", "tmp"),
                              reduce_func=fn.sum("tmp", "acc"))
                h_neigh = h_neigh + pg.ndata["acc"]
                h_neigh = h_neigh / degs.unsqueeze(-1)
            elif self._agg_type == "gcn":
                pg.prop_nodes(traversal_order,
                              message_func=fn.copy_src("neigh", "tmp"),
                              reduce_func=fn.sum("tmp", "acc"))
                h_neigh = h_neigh + pg.ndata["acc"]
                h_neigh = (h_self + h_neigh) / (degs.unsqueeze(-1) + 1)
            elif self._agg_type == "pool":
                pg.prop_nodes(traversal_order,
                              message_func=fn.copy_src("neigh", "tmp"),
                              reduce_func=fn.max("tmp", "acc"))
                h_neigh = torch.max(h_neigh, pg.ndata["acc"])
            elif self._agg_type == "lstm":
                h_neighs = [
                    self._lstm_reducer(h_neigh[ids]) for ids in new_node_ids
                ]
                h_neighs = torch.cat(h_neighs, dim=0)
                ridx = torch.arange(h_neighs.shape[0])
                ridx[np.concatenate(new_node_ids)] = torch.arange(
                    h_neighs.shape[0])
                h_neigh = h_neighs[ridx]
        else:
            if self._agg_type == "mean":
                h_neighs = [
                    torch.cumsum(h_neigh[ids], dim=0) for ids in new_node_ids
                ]
                h_neighs = torch.cat(h_neighs, dim=0)
                ridx = torch.arange(h_neighs.shape[0])
                ridx[np.concatenate(new_node_ids)] = torch.arange(
                    h_neighs.shape[0])
                h_neigh = h_neighs[ridx]
                h_neigh = h_neigh / degs.unsqueeze(-1)
            elif self._agg_type == "gcn":
                h_neighs = [
                    torch.cumsum(h_neigh[ids], dim=0) for ids in new_node_ids
                ]
                h_neighs = torch.cat(h_neighs, dim=0)
                ridx = torch.arange(h_neighs.shape[0])
                ridx[np.concatenate(new_node_ids)] = torch.arange(
                    h_neighs.shape[0])
                h_neigh = h_neighs[ridx]
                h_neigh = (h_self + h_neigh) / (degs.unsqueeze(-1) + 1)
            elif self._agg_type == "pool":
                h_neighs = [
                    # torch.cummax returns (values, indices); keep the running-max values
                    torch.cummax(h_neigh[ids], dim=0).values
                    for ids in new_node_ids
                ]
                h_neighs = torch.cat(h_neighs, dim=0)
                ridx = torch.arange(h_neighs.shape[0])
                ridx[np.concatenate(new_node_ids)] = torch.arange(
                    h_neighs.shape[0])
                h_neigh = h_neighs[ridx]
            elif self._agg_type == "lstm":
                h_neighs = [
                    self._lstm_reducer(h_neigh[ids]) for ids in new_node_ids
                ]
                h_neighs = torch.cat(h_neighs, dim=0)
                ridx = torch.arange(h_neighs.shape[0])
                ridx[np.concatenate(new_node_ids)] = torch.arange(
                    h_neighs.shape[0])
                h_neigh = h_neighs[ridx]

        if self._agg_type == "gcn":
            rst = self.fc_neigh(h_neigh)
        else:
            rst = self.fc_self(h_self) + self.fc_neigh(h_neigh)
        return rst
Example #15
class TransformerEncoderLayer(Module):
    r"""TransformerEncoderLayer is made up of self-attn and feedforward network.
    This standard encoder layer is based on the paper "Attention Is All You Need".
    Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez,
    Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. In Advances in
    Neural Information Processing Systems, pages 6000-6010. Users may modify or implement
    in a different way during application.
    Args:
        d_model: the number of expected features in the input (required).
        nhead: the number of heads in the multiheadattention models (required).
        dim_feedforward: kept for interface compatibility; unused here, since the feedforward
            sub-layer is replaced by a bidirectional LSTM (required).
        dropout: the dropout value (default=0).
        activation: the activation function of the intermediate layer, relu or gelu (default=relu).
    Examples::
        >>> encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8)
        >>> src = torch.rand(10, 32, 512)
        >>> out = encoder_layer(src)
    """
    def __init__(self,
                 d_model,
                 nhead,
                 dim_feedforward,
                 dropout=0,
                 activation="relu"):
        super(TransformerEncoderLayer, self).__init__()
        self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)

        # Feedforward sub-layer replaced by a bidirectional LSTM plus a linear projection
        self.linear1 = LSTM(d_model, d_model * 2, 1, bidirectional=True)
        self.linear2 = Linear(d_model * 2 * 2, d_model)
        self.dropout = nn.Dropout(dropout)

        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)

        self.activation = _get_activation_fn(activation)

        self.dropout1 = Dropout(dropout)
        self.dropout2 = Dropout(dropout)

    def __setstate__(self, state):
        if 'activation' not in state:
            state['activation'] = F.relu
        super(TransformerEncoderLayer, self).__setstate__(state)

    def forward(self, src, src_mask=None, src_key_padding_mask=None):
        # type: (Tensor, Optional[Tensor], Optional[Tensor]) -> Tensor
        r"""Pass the input through the encoder layer.
        Args:
            src: the sequence to the encoder layer (required).
            src_mask: the mask for the src sequence (optional).
            src_key_padding_mask: the mask for the src keys per batch (optional).
        Shape:
            see the docs in Transformer class.
        """
        src2 = self.self_attn(src,
                              src,
                              src,
                              attn_mask=src_mask,
                              key_padding_mask=src_key_padding_mask)[0]
        src = src + self.dropout1(src2)
        src = self.norm1(src)
        self.linear1.flatten_parameters()
        src2 = self.linear2(self.dropout(self.activation(
            self.linear1(src)[0])))
        src = src + self.dropout2(src2)
        src = self.norm2(src)
        return src
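Hypothetical usage of the modified layer above, assuming its module-level dependencies (MultiheadAttention, LayerNorm, Dropout, LSTM, Linear, _get_activation_fn) are in scope; the output shape matches the input, as in the stock encoder layer.

import torch

layer = TransformerEncoderLayer(d_model=64, nhead=4, dim_feedforward=256, dropout=0.1)
src = torch.rand(10, 32, 64)           # (seq_len, batch, d_model)
out = layer(src)
print(out.shape)                       # torch.Size([10, 32, 64])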
Example #16
    def __init__(
        self,
        vocab: Vocabulary,
        source_embedder: TextFieldEmbedder,
        encoder: Seq2SeqEncoder,
        max_decoding_steps: int,
        attention: Attention = None,
        beam_size: int = None,
        target_namespace: str = "tokens",
        target_embedding_dim: int = None,
        scheduled_sampling_ratio: float = 0.0,
        use_bleu: bool = True,
        bleu_ngram_weights: Iterable[float] = (0.25, 0.25, 0.25, 0.25),
        target_pretrain_file: str = None,
        target_decoder_layers: int = 1,
    ) -> None:
        super().__init__(vocab)
        self._target_namespace = target_namespace
        self._target_decoder_layers = target_decoder_layers
        self._scheduled_sampling_ratio = scheduled_sampling_ratio

        # We need the start symbol to provide as the input at the first timestep of decoding, and
        # end symbol as a way to indicate the end of the decoded sequence.
        self._start_index = self.vocab.get_token_index(START_SYMBOL,
                                                       self._target_namespace)
        self._end_index = self.vocab.get_token_index(END_SYMBOL,
                                                     self._target_namespace)

        if use_bleu:
            pad_index = self.vocab.get_token_index(self.vocab._padding_token,
                                                   self._target_namespace)
            self._bleu = BLEU(
                bleu_ngram_weights,
                exclude_indices={
                    pad_index, self._end_index, self._start_index
                },
            )
        else:
            self._bleu = None

        # At prediction time, we use a beam search to find the most likely sequence of target tokens.
        beam_size = beam_size or 1
        self._max_decoding_steps = max_decoding_steps
        self._beam_search = BeamSearch(self._end_index,
                                       max_steps=max_decoding_steps,
                                       beam_size=beam_size)

        # Dense embedding of source vocab tokens.
        self._source_embedder = source_embedder

        # Encodes the sequence of source embeddings into a sequence of hidden states.
        self._encoder = encoder

        num_classes = self.vocab.get_vocab_size(self._target_namespace)

        # Attention mechanism applied to the encoder output for each step.
        self._attention = attention

        # Dense embedding of vocab words in the target space.
        target_embedding_dim = target_embedding_dim or source_embedder.get_output_dim()
        if not target_pretrain_file:
            self._target_embedder = Embedding(
                num_embeddings=num_classes, embedding_dim=target_embedding_dim)
        else:
            self._target_embedder = Embedding(
                embedding_dim=target_embedding_dim,
                pretrained_file=target_pretrain_file,
                vocab_namespace=self._target_namespace,
                vocab=self.vocab,
            )

        # Decoder output dim needs to be the same as the encoder output dim since we initialize the
        # hidden state of the decoder with the final hidden state of the encoder.
        self._encoder_output_dim = self._encoder.get_output_dim()
        self._decoder_output_dim = self._encoder_output_dim

        if self._attention:
            # If using attention, a weighted average over encoder outputs will be concatenated
            # to the previous target embedding to form the input to the decoder at each
            # time step.
            self._decoder_input_dim = self._decoder_output_dim + target_embedding_dim
        else:
            # Otherwise, the input to the decoder is just the previous target embedding.
            self._decoder_input_dim = target_embedding_dim

        # We'll use an LSTM cell as the recurrent cell that produces a hidden state
        # for the decoder at each time step.
        # TODO (pradeep): Do not hardcode decoder cell type.
        if self._target_decoder_layers > 1:
            self._decoder_cell = LSTM(
                self._decoder_input_dim,
                self._decoder_output_dim,
                self._target_decoder_layers,
            )
        else:
            self._decoder_cell = LSTMCell(self._decoder_input_dim,
                                          self._decoder_output_dim)

        # We project the hidden state from the decoder into the output vocabulary space
        # in order to get log probabilities of each target token, at each time step.
        self._output_projection_layer = Linear(self._decoder_output_dim,
                                               num_classes)
Example #17
    def __init__(
        self,
        task: str,
        vocab: Vocabulary,
        input_dim: int,
        max_decoding_steps: int,
        loss_weight: float = 1.0,
        attention: Attention = None,
        beam_size: int = None,
        target_namespace: str = "target_tokens",
        target_embedding_dim: int = None,
        scheduled_sampling_ratio: float = 0.0,
        use_bleu: bool = True,
        bleu_ngram_weights: Iterable[float] = (0.25, 0.25, 0.25, 0.25),
        target_decoder_layers: int = 1,
        **kwargs,
    ) -> None:

        super().__init__(vocab, **kwargs)

        self.task = task
        self.vocab = vocab
        self.loss_weight = loss_weight
        self._target_namespace = task + '_target_words'
        self._target_decoder_layers = target_decoder_layers
        self._scheduled_sampling_ratio = scheduled_sampling_ratio

        # We need the start symbol to provide as the input at the first timestep of decoding, and
        # end symbol as a way to indicate the end of the decoded sequence.
        self._start_index = self.vocab.get_token_index(START_SYMBOL,
                                                       self._target_namespace)
        self._end_index = self.vocab.get_token_index(END_SYMBOL,
                                                     self._target_namespace)

        if use_bleu:
            pad_index = self.vocab.get_token_index(self.vocab._padding_token,
                                                   self._target_namespace)
            self._bleu = BLEU(bleu_ngram_weights,
                              exclude_indices={
                                  pad_index, self._end_index, self._start_index
                              })
        else:
            self._bleu = None
        self.metrics = {"bleu": self._bleu}

        # At prediction time, we use a beam search to find the most likely sequence of target tokens.
        beam_size = beam_size or 1
        self._max_decoding_steps = max_decoding_steps
        self._beam_search = BeamSearch(self._end_index,
                                       max_steps=max_decoding_steps,
                                       beam_size=beam_size)

        num_classes = self.vocab.get_vocab_size(
            namespace=self._target_namespace)

        # Attention mechanism applied to the encoder output for each step.
        self._attention = attention

        # The input to the decoder is just the previous target embedding.
        target_embedding_dim = target_embedding_dim or input_dim  # self._encoder_output_dim is only assigned below
        self._decoder_input_dim = target_embedding_dim

        # Dense embedding of vocab words in the target space.
        self._target_embedder = Embedding(num_embeddings=num_classes,
                                          embedding_dim=target_embedding_dim)

        # Decoder output dim needs to be the same as the encoder output dim since we initialize the
        # hidden state of the decoder with the final hidden state of the encoder.
        self._encoder_output_dim = input_dim
        self._decoder_output_dim = self._encoder_output_dim

        if self._attention:
            # If using attention, a weighted average over encoder outputs will be concatenated
            # to the previous target embedding to form the input to the decoder at each
            # time step.
            self._decoder_input_dim = self._decoder_output_dim + target_embedding_dim
        else:
            # Otherwise, the input to the decoder is just the previous target embedding.
            self._decoder_input_dim = target_embedding_dim

        # We'll use an LSTM cell as the recurrent cell that produces a hidden state
        # for the decoder at each time step.
        if self._target_decoder_layers > 1:
            self._decoder_cell = LSTM(
                self._decoder_input_dim,
                self._decoder_output_dim,
                self._target_decoder_layers,
            )
        else:
            self._decoder_cell = LSTMCell(self._decoder_input_dim,
                                          self._decoder_output_dim)
        # We project the hidden state from the decoder into the output vocabulary space
        # in order to get log probabilities of each target token, at each time step.
        self._output_projection_layer = Linear(self._decoder_output_dim,
                                               num_classes)
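As a sketch only (not the model's actual decoding code), a single decoding step could wire the modules constructed above together roughly like this, assuming a single decoder layer (LSTMCell) and an AllenNLP-style Attention that returns normalized weights over encoder steps:

import torch

def decode_step(model, last_predictions, decoder_hidden, decoder_context,
                encoder_outputs, source_mask):
    # Embed the previous prediction: (batch,) -> (batch, target_embedding_dim).
    embedded_input = model._target_embedder(last_predictions)
    if model._attention:
        # Attention weights over encoder steps: (batch, seq_len).
        attn_weights = model._attention(decoder_hidden, encoder_outputs, source_mask)
        # Weighted sum of encoder states: (batch, encoder_output_dim).
        attended_input = torch.bmm(attn_weights.unsqueeze(1), encoder_outputs).squeeze(1)
        decoder_input = torch.cat((attended_input, embedded_input), dim=-1)
    else:
        decoder_input = embedded_input
    decoder_hidden, decoder_context = model._decoder_cell(
        decoder_input, (decoder_hidden, decoder_context))
    # Logits over the target vocabulary: (batch, num_classes).
    logits = model._output_projection_layer(decoder_hidden)
    return logits, decoder_hidden, decoder_context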