def __init__(self, num_chars, embedding_dim, h_size=512, num_layers=2,
             sequence_length=128, stateful=True, dropout_rate=0.2,
             bidirectional=False, use_attention=False, attention_size=16,
             teacher_forcing_ratio=1):
    super().__init__()
    self.teacher_forcing_ratio = teacher_forcing_ratio
    self.num_chars = num_chars
    self.embedding_dim = embedding_dim
    self.h_size = h_size
    self.num_layers = num_layers
    self.sequence_length = sequence_length
    # NOTE: `embedding_dim` is stored on the instance, but the embedding layer's
    # width is hard-coded to 256 here.
    self.embedding = Embedding(embedding_dim=256, num_embeddings=num_chars,
                               sparse=False, norm_type=2, add_noise=True,
                               noise_intensity=0.12)
    self.lstm = LSTM(hidden_size=h_size, num_layers=num_layers, stateful=stateful,
                     batch_first=False, dropout_rate=dropout_rate,
                     bidirectional=bidirectional, use_attention=use_attention,
                     attention_size=attention_size)
    self.fc_out = Dense(num_chars, use_bias=False, activation=leaky_relu)
    self.softmax = SoftMax(axis=-1)
def __init__(self, d_model, nhead, dim_feedforward=256, dropout=0, activation="relu"):
    from torch.nn.modules.activation import MultiheadAttention
    from torch.nn.modules.normalization import LayerNorm
    from torch.nn.modules.dropout import Dropout
    from torch.nn.modules.rnn import LSTM
    from torch.nn.modules.linear import Linear
    super(DPTNetBlock, self).__init__()
    self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
    # Implementation of Feedforward model
    # self.linear1 = Linear(d_model, dim_feedforward)
    self.rnn = LSTM(d_model, d_model * 2, 1, bidirectional=True)
    self.dropout = Dropout(dropout)
    # self.linear2 = Linear(dim_feedforward, d_model)
    self.linear2 = Linear(d_model * 2 * 2, d_model)
    self.norm1 = LayerNorm(d_model)
    self.norm2 = LayerNorm(d_model)
    self.dropout1 = Dropout(dropout)
    self.dropout2 = Dropout(dropout)
    self.activation = _get_activation_fn(activation)
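A minimal shape sketch for the LSTM-based feedforward replacement above (the sizes are illustrative and not part of the original module): a bidirectional LSTM with hidden_size = d_model * 2 emits 2 * (d_model * 2) = d_model * 2 * 2 features per step, which is why linear2 maps d_model * 2 * 2 back down to d_model.

# Illustrative check of the dimension arithmetic used by the block above.
import torch
from torch.nn import LSTM, Linear

d_model = 64
rnn = LSTM(d_model, d_model * 2, 1, bidirectional=True)   # hidden = 2 * d_model, two directions
proj = Linear(d_model * 2 * 2, d_model)                    # projects 4 * d_model back to d_model
x = torch.rand(100, 8, d_model)                            # (seq_len, batch, d_model)
out, _ = rnn(x)                                            # (100, 8, d_model * 4)
assert proj(out).shape == (100, 8, d_model)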
def test_augmented_lstm_computes_same_function_as_pytorch_lstm(self):
    augmented_lstm = AugmentedLstm(10, 11)
    pytorch_lstm = LSTM(10, 11, num_layers=1, batch_first=True)
    # Initialize all weights to be == 1.
    initializer = InitializerApplicator(
        [(".*", lambda tensor: torch.nn.init.constant_(tensor, 1.))])
    initializer(augmented_lstm)
    initializer(pytorch_lstm)

    initial_state = torch.zeros([1, 5, 11])
    initial_memory = torch.zeros([1, 5, 11])

    # Use bigger numbers to avoid floating point instability.
    sorted_tensor, sorted_sequence, _, _ = sort_batch_by_length(
        self.random_tensor * 5., self.sequence_lengths)
    lstm_input = pack_padded_sequence(
        sorted_tensor, sorted_sequence.data.tolist(), batch_first=True)

    augmented_output, augmented_state = augmented_lstm(
        lstm_input, (initial_state, initial_memory))
    pytorch_output, pytorch_state = pytorch_lstm(
        lstm_input, (initial_state, initial_memory))
    pytorch_output_sequence, _ = pad_packed_sequence(pytorch_output, batch_first=True)
    augmented_output_sequence, _ = pad_packed_sequence(augmented_output, batch_first=True)

    numpy.testing.assert_array_almost_equal(
        pytorch_output_sequence.data.numpy(),
        augmented_output_sequence.data.numpy(),
        decimal=4)
    numpy.testing.assert_array_almost_equal(
        pytorch_state[0].data.numpy(), augmented_state[0].data.numpy(), decimal=4)
    numpy.testing.assert_array_almost_equal(
        pytorch_state[1].data.numpy(), augmented_state[1].data.numpy(), decimal=4)
def __init__(self,
             encoder_output_dim: int,
             decoder_input_dim: int,
             action_embedding_dim: int,
             input_attention: Attention,
             sql_attention: Attention = None,
             sql_output_dim: int = 100,
             activation: Activation = Activation.by_name('relu')(),
             predict_start_type_separately: bool = True,
             num_start_types: int = None,
             add_action_bias: bool = True,
             dropout: float = 0.0,
             num_layers: int = 1) -> None:
    super().__init__()
    self._input_attention = input_attention
    if sql_attention:
        self._sql_attention = sql_attention
        self._hidden_to_sql = Linear(encoder_output_dim, sql_output_dim)
    self._add_action_bias = add_action_bias
    self._activation = activation
    self._num_layers = num_layers

    self._predict_start_type_separately = predict_start_type_separately
    if predict_start_type_separately:
        self._start_type_predictor = Linear(encoder_output_dim, num_start_types)
        self._num_start_types = num_start_types
    else:
        self._start_type_predictor = None
        self._num_start_types = None

    # Decoder output dim needs to be the same as the encoder output dim since we initialize the
    # hidden state of the decoder with the final hidden state of the encoder.
    output_dim = encoder_output_dim
    input_dim = output_dim
    # Our decoder input will be the concatenation of the decoder hidden state and the previous
    # action embedding, and we'll project that down to the decoder's `input_dim`.
    # [attention-based utterance; attention-based sql query; hidden state]
    self._input_projection_layer = Linear(decoder_input_dim + action_embedding_dim, input_dim)
    # Before making a prediction, we'll compute an attention over the input given our updated
    # hidden state. Then we concatenate those with the decoder state and project to
    # `action_embedding_dim` to make a prediction.
    self._output_projection_layer = Linear(output_dim + decoder_input_dim, action_embedding_dim)
    if self._num_layers > 1:
        self._decoder_cell = LSTM(input_dim, output_dim, self._num_layers)
    else:
        # We use a ``LSTMCell`` if we just have one layer because it is slightly faster since we
        # are just running the LSTM for one step each time.
        self._decoder_cell = LSTMCell(input_dim, output_dim)

    if dropout > 0:
        self._dropout = torch.nn.Dropout(p=dropout)
    else:
        self._dropout = lambda x: x
def __init__(self, d_model, nhead, dim_feedforward, dropout=0, activation="relu"):
    super(TransformerEncoderLayer, self).__init__()
    self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
    # Implementation of Feedforward model
    self.linear1 = LSTM(d_model, d_model * 2, 1, bidirectional=True)
    self.linear2 = Linear(d_model * 2 * 2, d_model)
    self.dropout = nn.Dropout(dropout)
    self.norm1 = LayerNorm(d_model)
    self.norm2 = LayerNorm(d_model)
    self.activation = _get_activation_fn(activation)
    self.dropout1 = Dropout(dropout)
    self.dropout2 = Dropout(dropout)
def __init__(self, in_feats, out_feats, agg_type, edge_feats, time_enc="cosine") -> None:
    super(GTCLayer, self).__init__()
    self._in_src_feats, self._in_dst_feats = in_feats, in_feats
    self._out_feats = out_feats
    self._agg_type = agg_type
    self.encode_time = TimeEncodingLayer(in_feats, in_feats, time_encoding=time_enc)
    self.fc_edge = nn.Linear(edge_feats, in_feats)
    if agg_type == "pool":
        self.fc_pool = nn.Linear(in_feats, in_feats)
    if agg_type == "lstm":
        self.lstm = LSTM(in_feats, in_feats, batch_first=True)
    if agg_type != "gcn":
        self.fc_self = nn.Linear(in_feats, out_feats)
    self.fc_neigh = nn.Linear(in_feats, out_feats)
    self.reset_parameters()
def __init__(self, num_embeddings, num_labels):
    super(Net, self).__init__()
    self.emb = torch.nn.Embedding(num_embeddings, Config.embedding_dim, padding_idx=0)
    self.lstm1 = LSTM(Config.embedding_dim, Config.hidden_size, num_layers=1,
                      batch_first=True, bias=True, dropout=Config.dropout,
                      bidirectional=True)
    self.lstm2 = LSTM(Config.embedding_dim, Config.hidden_size, num_layers=1,
                      batch_first=True, bias=True, dropout=Config.dropout,
                      bidirectional=True)
    self.linear = torch.nn.Linear(Config.hidden_size * 4, num_labels)
    self.loss = torch.nn.CrossEntropyLoss()
    self.pred = torch.nn.Softmax()
    # Initial hidden and cell states for the two bidirectional LSTMs,
    # shaped (num_directions, batch_size, hidden_size).
    self.h0 = Variable(torch.zeros(2, Config.batch_size, Config.hidden_size))
    self.c0 = Variable(torch.zeros(2, Config.batch_size, Config.hidden_size))
    self.h1 = Variable(torch.zeros(2, Config.batch_size, Config.hidden_size))
    self.c1 = Variable(torch.zeros(2, Config.batch_size, Config.hidden_size))
    if Config.cuda:
        self.h0 = self.h0.cuda()
        self.c0 = self.c0.cuda()
        self.h1 = self.h1.cuda()
        self.c1 = self.c1.cuda()
def __init__(
    self,
    encoder_output_dim: int,
    action_embedding_dim: int,
    input_attention: Attention,
    activation: Activation = Activation.by_name("relu")(),
    add_action_bias: bool = True,
    dropout: float = 0.0,
    num_layers: int = 1,
) -> None:
    super().__init__()
    self._input_attention = input_attention
    self._add_action_bias = add_action_bias
    self._activation = activation
    self._num_layers = num_layers

    # Decoder output dim needs to be the same as the encoder output dim since we initialize the
    # hidden state of the decoder with the final hidden state of the encoder.
    output_dim = encoder_output_dim
    input_dim = output_dim
    # Our decoder input will be the concatenation of the attended encoder hidden state (i.e.,
    # the attended question encoding) and the previous action embedding, and we'll project that
    # down to the decoder's `input_dim`, which we arbitrarily set to be the same as
    # `output_dim`.
    self._input_projection_layer = Linear(encoder_output_dim + action_embedding_dim, input_dim)
    # Before making a prediction, we'll compute an attention over the input given our updated
    # hidden state. Then we concatenate those with the decoder state and project to
    # `action_embedding_dim` to make a prediction.
    self._output_projection_layer = Linear(output_dim + encoder_output_dim, action_embedding_dim)
    if self._num_layers > 1:
        self._decoder_cell = LSTM(input_dim, output_dim, self._num_layers)
    else:
        # We use a ``LSTMCell`` if we just have one layer because it is slightly faster since we
        # are just running the LSTM for one step each time.
        self._decoder_cell = LSTMCell(input_dim, output_dim)

    if dropout > 0:
        self._dropout = torch.nn.Dropout(p=dropout)
    else:
        self._dropout = lambda x: x
def __init__(self,
             input_size: int,
             hidden_size: int = 200,
             num_layers: int = 3,
             bias: bool = True,
             batch_first: bool = True,
             dropout: float = 0,
             bidirectional: bool = False,
             pgn: Dict = None,
             **kwargs):
    super().__init__()
    if pgn is None:
        self.lstm = LSTM(input_size, hidden_size, num_layers, bias, batch_first,
                         dropout, bidirectional)
    else:
        domains, dim = pgn.pop('num_domains'), pgn.pop('domain_dim')
        self.lstm = PGLSTM(domains, dim, input_size, hidden_size, num_layers, bias,
                           batch_first, dropout, bidirectional, **pgn)
    self.output_dim = hidden_size * 2 if bidirectional else hidden_size
def __init__(self, d_model, nhead, hidden_size, dim_feedforward, dropout, activation="relu"):
    super(TransformerEncoderLayer, self).__init__()
    self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
    # Implementation of improved part
    self.lstm = LSTM(d_model, hidden_size, 1, bidirectional=True)
    self.dropout = Dropout(dropout)
    self.linear = Linear(hidden_size * 2, d_model)
    self.norm1 = LayerNorm(d_model)
    self.norm2 = LayerNorm(d_model)
    self.dropout1 = Dropout(dropout)
    self.dropout2 = Dropout(dropout)
    self.activation = _get_activation_fn(activation)
def __init__(self, grammar, db_dict, hidden_size, word_dim, rnn_hid, start):  # , max_sentence_len):
    x_dim = 2
    super(RecursiveDecoder, self).__init__()
    assert x_dim == 2
    self.word_dim = word_dim
    _, self.word2index = get_vocabulary_word2index(grammar, db_dict)
    self.index2word = dict([[str(i), k] for k, i in self.word2index.items()])
    self.term_toks = grammar.terminal_toks  # a dictionary
    self.db_dict = db_dict
    self.x_dim = x_dim
    self.start = start
    self.possibles = get_massaged_possibles(self.word2index, grammar, self.start)
    self.longest_path = longest_num(self.possibles)
    self.rnn_eng = RNN(word_dim, rnn_hid, 1)
    self.rnn_hist = RNN(hidden_size, rnn_hid, 1, batch_first=True)
    self.rnn_add = LSTM(rnn_hid, rnn_hid, 1)
    self.hidden_size = hidden_size
    self.rnn_hid = rnn_hid
    self.embed_size = len(self.word2index)
    self.embed = nn.Embedding(self.embed_size, self.hidden_size)
    # self.V = nn.ModuleList([nn.Linear(hidden_size*2, hidden_size*2) for _ in range(hidden_size)])
    # self.W = nn.Linear(hidden_size*2, hidden_size)
    # self.V = nn.ParameterList(
    #     [nn.Parameter(torch.randn(hidden_size * 2, hidden_size * 2)) for _ in range(hidden_size)])  # Tensor
    self.in_node = nn.Linear(self.rnn_hid * 2, self.rnn_hid)  # nn.Parameter(torch.randn(hidden_size * 2, hidden_size))
    self.in_or = nn.Linear(self.rnn_hid * 2, self.rnn_hid)  # nn.Parameter(torch.randn(hidden_size * 2, 1))
    self.for_or_choice = nn.Linear(self.rnn_hid, 1)
    self.sigmoid = nn.Sigmoid()
def __init__(self,
             vocab: Vocabulary,
             source_embedder: TextFieldEmbedder,
             encoder: Seq2SeqEncoder,
             beam_search: Lazy[BeamSearch] = Lazy(BeamSearch),
             attention: Attention = None,
             target_namespace: str = "tokens",
             target_embedding_dim: int = None,
             scheduled_sampling_ratio: float = 0.0,
             use_bleu: bool = True,
             bleu_ngram_weights: Iterable[float] = (0.25, 0.25, 0.25, 0.25),
             target_pretrain_file: str = None,
             target_decoder_layers: int = 1,
             **kwargs) -> None:
    super().__init__(vocab)
    self._target_namespace = target_namespace
    self._target_decoder_layers = target_decoder_layers
    self._scheduled_sampling_ratio = scheduled_sampling_ratio

    # We need the start symbol to provide as the input at the first timestep of decoding, and
    # end symbol as a way to indicate the end of the decoded sequence.
    self._start_index = self.vocab.get_token_index(START_SYMBOL, self._target_namespace)
    self._end_index = self.vocab.get_token_index(END_SYMBOL, self._target_namespace)

    if use_bleu:
        pad_index = self.vocab.get_token_index(self.vocab._padding_token,
                                               self._target_namespace)
        self._bleu = BLEU(bleu_ngram_weights,
                          exclude_indices={pad_index, self._end_index, self._start_index})
    else:
        self._bleu = None

    # At prediction time, we'll use a beam search to find the best target sequence.
    # For backwards compatibility, check if beam_size or max_decoding_steps were passed in as
    # kwargs. If so, update the BeamSearch object before constructing and raise a DeprecationWarning.
    deprecation_warning = ("The parameter {} has been deprecated."
                           " Provide this parameter as argument to beam_search instead.")
    beam_search_extras = {}
    if "beam_size" in kwargs:
        beam_search_extras["beam_size"] = kwargs["beam_size"]
        warnings.warn(deprecation_warning.format("beam_size"), DeprecationWarning)
    if "max_decoding_steps" in kwargs:
        beam_search_extras["max_steps"] = kwargs["max_decoding_steps"]
        warnings.warn(deprecation_warning.format("max_decoding_steps"), DeprecationWarning)
    self._beam_search = beam_search.construct(end_index=self._end_index,
                                              vocab=self.vocab,
                                              **beam_search_extras)

    # Dense embedding of source vocab tokens.
    self._source_embedder = source_embedder

    # Encodes the sequence of source embeddings into a sequence of hidden states.
    self._encoder = encoder

    num_classes = self.vocab.get_vocab_size(self._target_namespace)

    # Attention mechanism applied to the encoder output for each step.
    self._attention = attention

    # Dense embedding of vocab words in the target space.
    target_embedding_dim = target_embedding_dim or source_embedder.get_output_dim()
    if not target_pretrain_file:
        self._target_embedder = Embedding(num_embeddings=num_classes,
                                          embedding_dim=target_embedding_dim)
    else:
        self._target_embedder = Embedding(embedding_dim=target_embedding_dim,
                                          pretrained_file=target_pretrain_file,
                                          vocab_namespace=self._target_namespace,
                                          vocab=self.vocab)

    # Decoder output dim needs to be the same as the encoder output dim since we initialize the
    # hidden state of the decoder with the final hidden state of the encoder.
    self._encoder_output_dim = self._encoder.get_output_dim()
    self._decoder_output_dim = self._encoder_output_dim

    if self._attention:
        # If using attention, a weighted average over encoder outputs will be concatenated
        # to the previous target embedding to form the input to the decoder at each
        # time step.
        self._decoder_input_dim = self._decoder_output_dim + target_embedding_dim
    else:
        # Otherwise, the input to the decoder is just the previous target embedding.
        self._decoder_input_dim = target_embedding_dim

    # We'll use an LSTM cell as the recurrent cell that produces a hidden state
    # for the decoder at each time step.
    # TODO (pradeep): Do not hardcode decoder cell type.
    if self._target_decoder_layers > 1:
        self._decoder_cell = LSTM(self._decoder_input_dim,
                                  self._decoder_output_dim,
                                  self._target_decoder_layers)
    else:
        self._decoder_cell = LSTMCell(self._decoder_input_dim, self._decoder_output_dim)

    # We project the hidden state from the decoder into the output vocabulary space
    # in order to get log probabilities of each target token, at each time step.
    self._output_projection_layer = Linear(self._decoder_output_dim, num_classes)
def __init__(
    self,
    vocab: Vocabulary,
    source_embedder: TextFieldEmbedder,
    encoder: Seq2SeqEncoder,
    attention: Attention,
    beam_size: int,
    max_decoding_steps: int,
    target_embedding_dim: int = 30,
    copy_token: str = "@COPY@",
    target_namespace: str = "target_tokens",
    tensor_based_metric: Metric = None,
    token_based_metric: Metric = None,
    initializer: InitializerApplicator = InitializerApplicator(),
    num_decoder_layers: int = 1,
) -> None:
    super().__init__(vocab)
    self._target_namespace = target_namespace
    self._start_index = self.vocab.get_token_index(START_SYMBOL, self._target_namespace)
    self._end_index = self.vocab.get_token_index(END_SYMBOL, self._target_namespace)
    self._oov_index = self.vocab.get_token_index(self.vocab._oov_token, self._target_namespace)
    self._pad_index = self.vocab.get_token_index(self.vocab._padding_token,
                                                 self._target_namespace)
    self._copy_index = self.vocab.add_token_to_namespace(copy_token, self._target_namespace)

    self._tensor_based_metric = tensor_based_metric or BLEU(
        exclude_indices={self._pad_index, self._end_index, self._start_index})
    self._token_based_metric = token_based_metric

    self._target_vocab_size = self.vocab.get_vocab_size(self._target_namespace)

    self._num_decoder_layers = num_decoder_layers
    if self._num_decoder_layers > 1:
        self._has_multiple_decoder_layers = True
    else:
        self._has_multiple_decoder_layers = False

    # Encoding modules.
    self._source_embedder = source_embedder
    self._encoder = encoder

    # Decoder output dim needs to be the same as the encoder output dim since we initialize the
    # hidden state of the decoder with the final hidden state of the encoder.
    # We arbitrarily set the decoder's input dimension to be the same as the output dimension.
    self.encoder_output_dim = self._encoder.get_output_dim()
    self.decoder_output_dim = self.encoder_output_dim
    self.decoder_input_dim = self.decoder_output_dim

    # The decoder input will be a function of the embedding of the previous predicted token,
    # an attended encoder hidden state called the "attentive read", and another
    # weighted sum of the encoder hidden state called the "selective read".
    # While the weights for the attentive read are calculated by an `Attention` module,
    # the weights for the selective read are simply the predicted probabilities
    # corresponding to each token in the source sentence that matches the target
    # token from the previous timestep.
    self._target_embedder = Embedding(num_embeddings=self._target_vocab_size,
                                      embedding_dim=target_embedding_dim)
    self._attention = attention
    self._input_projection_layer = Linear(
        target_embedding_dim + self.encoder_output_dim * 2, self.decoder_input_dim)

    # We then run the projected decoder input through an LSTM cell or a multi-layer LSTM,
    # depending on the number of decoder layers, to produce the next hidden state.
    if self._has_multiple_decoder_layers:
        # Use LSTM for multiple layers.
        self._decoder_cell = LSTM(self.decoder_input_dim, self.decoder_output_dim,
                                  self._num_decoder_layers)
    else:
        # Use LSTMCell for one layer because it is slightly faster.
        self._decoder_cell = LSTMCell(self.decoder_input_dim, self.decoder_output_dim)

    # We create a "generation" score for each token in the target vocab
    # with a linear projection of the decoder hidden state.
    self._output_generation_layer = Linear(self.decoder_output_dim, self._target_vocab_size)

    # We create a "copying" score for each source token by applying a non-linearity
    # (tanh) to a linear projection of the encoded hidden state for that token,
    # and then taking the dot product of the result with the decoder hidden state.
    self._output_copying_layer = Linear(self.encoder_output_dim, self.decoder_output_dim)

    # At prediction time, we'll use a beam search to find the best target sequence.
    self._beam_search = BeamSearch(self._end_index, max_steps=max_decoding_steps,
                                   beam_size=beam_size)

    initializer(self)
class GTCLayer(nn.Module):
    def __init__(self, in_feats, out_feats, agg_type, edge_feats, time_enc="cosine") -> None:
        super(GTCLayer, self).__init__()
        self._in_src_feats, self._in_dst_feats = in_feats, in_feats
        self._out_feats = out_feats
        self._agg_type = agg_type
        self.encode_time = TimeEncodingLayer(in_feats, in_feats, time_encoding=time_enc)
        self.fc_edge = nn.Linear(edge_feats, in_feats)
        if agg_type == "pool":
            self.fc_pool = nn.Linear(in_feats, in_feats)
        if agg_type == "lstm":
            self.lstm = LSTM(in_feats, in_feats, batch_first=True)
        if agg_type != "gcn":
            self.fc_self = nn.Linear(in_feats, out_feats)
        self.fc_neigh = nn.Linear(in_feats, out_feats)
        self.reset_parameters()

    def reset_parameters(self):
        """Reinitialize learnable parameters."""
        gain = nn.init.calculate_gain('relu')
        if self._agg_type == 'pool':
            nn.init.xavier_uniform_(self.fc_pool.weight, gain=gain)
        if self._agg_type == 'lstm':
            self.lstm.reset_parameters()
        if self._agg_type != 'gcn':
            nn.init.xavier_uniform_(self.fc_self.weight, gain=gain)
        nn.init.xavier_uniform_(self.fc_neigh.weight, gain=gain)

    def _lstm_reducer(self, edge_feat):
        """LSTM processing for temporal edges."""
        # (seq_len, batch_size, dim) <= (bucket_size, deg, dim)
        edge_feat = edge_feat.permute(1, 0, 2)
        batch_size = edge_feat.shape[1]
        h = (edge_feat.new_zeros((1, batch_size, self._in_src_feats)),
             edge_feat.new_zeros((1, batch_size, self._in_src_feats)))
        rst, (h_, c_) = self.lstm(edge_feat, h)
        return rst

    def forward(self, agg_graph: dgl.DGLGraph, prop_graph: dgl.DGLGraph,
                traversal_order, new_node_ids) -> torch.Tensor:
        tg = agg_graph.local_var()
        pg = prop_graph.local_var()
        nfeat = tg.ndata["nfeat"]
        # h_self = nfeat
        h_self = self.encode_time(nfeat, tg.ndata["timestamp"])
        tg.ndata["nfeat"] = h_self
        tg.edata["efeat"] = self.fc_edge(tg.edata["efeat"])
        # efeat = tg.edata["efeat"]
        # tg.apply_edges(lambda edges: {
        #     "efeat": torch.cat((edges.src["nfeat"], edges.data["efeat"]), dim=1)
        # })
        # tg.edata["efeat"] = self.encode_time(tg.edata["efeat"], tg.edata["timestamp"])
        degs = tg.ndata["degree"]

        # agg_graph aggregation
        if self._agg_type == "pool":
            tg.edata["efeat"] = F.relu(self.fc_pool(tg.edata["efeat"]))
            tg.update_all(fn.u_add_e("nfeat", "efeat", "m"), fn.max("m", "neigh"))
            h_neigh = tg.ndata["neigh"]
        elif self._agg_type in ["mean", "gcn", "lstm"]:
            tg.update_all(fn.u_add_e("nfeat", "efeat", "m"), fn.sum("m", "neigh"))
            h_neigh = tg.ndata["neigh"]
        else:
            raise KeyError("Aggregator type {} not recognized.".format(self._agg_type))

        pg.ndata["neigh"] = h_neigh
        # prop_graph propagation
        if False:
            if self._agg_type == "mean":
                pg.prop_nodes(traversal_order,
                              message_func=fn.copy_src("neigh", "tmp"),
                              reduce_func=fn.sum("tmp", "acc"))
                h_neigh = h_neigh + pg.ndata["acc"]
                h_neigh = h_neigh / degs.unsqueeze(-1)
            elif self._agg_type == "gcn":
                pg.prop_nodes(traversal_order,
                              message_func=fn.copy_src("neigh", "tmp"),
                              reduce_func=fn.sum("tmp", "acc"))
                h_neigh = h_neigh + pg.ndata["acc"]
                h_neigh = (h_self + h_neigh) / (degs.unsqueeze(-1) + 1)
            elif self._agg_type == "pool":
                pg.prop_nodes(traversal_order,
                              message_func=fn.copy_src("neigh", "tmp"),
                              reduce_func=fn.max("tmp", "acc"))
                h_neigh = torch.max(h_neigh, pg.ndata["acc"])
            elif self._agg_type == "lstm":
                h_neighs = [self._lstm_reducer(h_neigh[ids]) for ids in new_node_ids]
                h_neighs = torch.cat(h_neighs, dim=0)
                ridx = torch.arange(h_neighs.shape[0])
                ridx[np.concatenate(new_node_ids)] = torch.arange(h_neighs.shape[0])
                h_neigh = h_neighs[ridx]
        else:
            if self._agg_type == "mean":
                h_neighs = [torch.cumsum(h_neigh[ids], dim=0) for ids in new_node_ids]
                h_neighs = torch.cat(h_neighs, dim=0)
                ridx = torch.arange(h_neighs.shape[0])
                ridx[np.concatenate(new_node_ids)] = torch.arange(h_neighs.shape[0])
                h_neigh = h_neighs[ridx]
                h_neigh = h_neigh / degs.unsqueeze(-1)
            elif self._agg_type == "gcn":
                h_neighs = [torch.cumsum(h_neigh[ids], dim=0) for ids in new_node_ids]
                h_neighs = torch.cat(h_neighs, dim=0)
                ridx = torch.arange(h_neighs.shape[0])
                ridx[np.concatenate(new_node_ids)] = torch.arange(h_neighs.shape[0])
                h_neigh = h_neighs[ridx]
                h_neigh = (h_self + h_neigh) / (degs.unsqueeze(-1) + 1)
            elif self._agg_type == "pool":
                # torch.cummax returns a (values, indices) namedtuple; keep only the values.
                h_neighs = [torch.cummax(h_neigh[ids], dim=0).values for ids in new_node_ids]
                h_neighs = torch.cat(h_neighs, dim=0)
                ridx = torch.arange(h_neighs.shape[0])
                ridx[np.concatenate(new_node_ids)] = torch.arange(h_neighs.shape[0])
                h_neigh = h_neighs[ridx]
            elif self._agg_type == "lstm":
                h_neighs = [self._lstm_reducer(h_neigh[ids]) for ids in new_node_ids]
                h_neighs = torch.cat(h_neighs, dim=0)
                ridx = torch.arange(h_neighs.shape[0])
                ridx[np.concatenate(new_node_ids)] = torch.arange(h_neighs.shape[0])
                h_neigh = h_neighs[ridx]

        if self._agg_type == "gcn":
            rst = self.fc_neigh(h_neigh)
        else:
            rst = self.fc_self(h_self) + self.fc_neigh(h_neigh)
        return rst
class TransformerEncoderLayer(Module):
    r"""TransformerEncoderLayer is made up of self-attn and a feedforward network.

    This standard encoder layer is based on the paper "Attention Is All You Need".
    Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez,
    Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. In Advances in
    Neural Information Processing Systems, pages 6000-6010. Users may modify or implement
    in a different way during application. In this variant, the position-wise feedforward
    sublayer is replaced by a bidirectional LSTM followed by a linear projection.

    Args:
        d_model: the number of expected features in the input (required).
        nhead: the number of heads in the multiheadattention models (required).
        dim_feedforward: the dimension of the feedforward network model (required; unused by
            this variant, whose LSTM is sized from ``d_model``).
        dropout: the dropout value (default=0).
        activation: the activation function of intermediate layer, relu or gelu (default=relu).

    Examples::
        >>> encoder_layer = TransformerEncoderLayer(d_model=512, nhead=8, dim_feedforward=2048)
        >>> src = torch.rand(10, 32, 512)
        >>> out = encoder_layer(src)
    """

    def __init__(self, d_model, nhead, dim_feedforward, dropout=0, activation="relu"):
        super(TransformerEncoderLayer, self).__init__()
        self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
        # Implementation of Feedforward model
        self.linear1 = LSTM(d_model, d_model * 2, 1, bidirectional=True)
        self.linear2 = Linear(d_model * 2 * 2, d_model)
        self.dropout = nn.Dropout(dropout)
        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)
        self.activation = _get_activation_fn(activation)
        self.dropout1 = Dropout(dropout)
        self.dropout2 = Dropout(dropout)

    def __setstate__(self, state):
        if 'activation' not in state:
            state['activation'] = F.relu
        super(TransformerEncoderLayer, self).__setstate__(state)

    def forward(self, src, src_mask=None, src_key_padding_mask=None):
        ## type: (Tensor, Optional[Tensor], Optional[Tensor]) -> Tensor
        r"""Pass the input through the encoder layer.

        Args:
            src: the sequence to the encoder layer (required).
            src_mask: the mask for the src sequence (optional).
            src_key_padding_mask: the mask for the src keys per batch (optional).

        Shape:
            see the docs in Transformer class.
        """
        src2 = self.self_attn(src, src, src, attn_mask=src_mask,
                              key_padding_mask=src_key_padding_mask)[0]
        src = src + self.dropout1(src2)
        src = self.norm1(src)
        self.linear1.flatten_parameters()
        src2 = self.linear2(self.dropout(self.activation(self.linear1(src)[0])))
        src = src + self.dropout2(src2)
        src = self.norm2(src)
        return src
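A minimal usage sketch for the LSTM-feedforward encoder layer above. It assumes the module-level names the class relies on (torch.nn's Module, MultiheadAttention, LayerNorm, Dropout, Linear, LSTM, plus the _get_activation_fn helper) are imported in that module; the tensor sizes are illustrative only.

# Usage sketch (illustrative sizes; not part of the original module).
import torch

layer = TransformerEncoderLayer(d_model=64, nhead=4, dim_feedforward=256, dropout=0.1)
src = torch.rand(10, 32, 64)   # (seq_len, batch, d_model); the layer is not batch_first
out = layer(src)               # same shape as src: (10, 32, 64)
print(out.shape)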
def __init__(
    self,
    vocab: Vocabulary,
    source_embedder: TextFieldEmbedder,
    encoder: Seq2SeqEncoder,
    max_decoding_steps: int,
    attention: Attention = None,
    beam_size: int = None,
    target_namespace: str = "tokens",
    target_embedding_dim: int = None,
    scheduled_sampling_ratio: float = 0.0,
    use_bleu: bool = True,
    bleu_ngram_weights: Iterable[float] = (0.25, 0.25, 0.25, 0.25),
    target_pretrain_file: str = None,
    target_decoder_layers: int = 1,
) -> None:
    super().__init__(vocab)
    self._target_namespace = target_namespace
    self._target_decoder_layers = target_decoder_layers
    self._scheduled_sampling_ratio = scheduled_sampling_ratio

    # We need the start symbol to provide as the input at the first timestep of decoding, and
    # end symbol as a way to indicate the end of the decoded sequence.
    self._start_index = self.vocab.get_token_index(START_SYMBOL, self._target_namespace)
    self._end_index = self.vocab.get_token_index(END_SYMBOL, self._target_namespace)

    if use_bleu:
        pad_index = self.vocab.get_token_index(self.vocab._padding_token,
                                               self._target_namespace)
        self._bleu = BLEU(bleu_ngram_weights,
                          exclude_indices={pad_index, self._end_index, self._start_index})
    else:
        self._bleu = None

    # At prediction time, we use a beam search to find the most likely sequence of target tokens.
    beam_size = beam_size or 1
    self._max_decoding_steps = max_decoding_steps
    self._beam_search = BeamSearch(self._end_index, max_steps=max_decoding_steps,
                                   beam_size=beam_size)

    # Dense embedding of source vocab tokens.
    self._source_embedder = source_embedder

    # Encodes the sequence of source embeddings into a sequence of hidden states.
    self._encoder = encoder

    num_classes = self.vocab.get_vocab_size(self._target_namespace)

    # Attention mechanism applied to the encoder output for each step.
    self._attention = attention

    # Dense embedding of vocab words in the target space.
    target_embedding_dim = target_embedding_dim or source_embedder.get_output_dim()
    if not target_pretrain_file:
        self._target_embedder = Embedding(num_embeddings=num_classes,
                                          embedding_dim=target_embedding_dim)
    else:
        self._target_embedder = Embedding(embedding_dim=target_embedding_dim,
                                          pretrained_file=target_pretrain_file,
                                          vocab_namespace=self._target_namespace,
                                          vocab=self.vocab)

    # Decoder output dim needs to be the same as the encoder output dim since we initialize the
    # hidden state of the decoder with the final hidden state of the encoder.
    self._encoder_output_dim = self._encoder.get_output_dim()
    self._decoder_output_dim = self._encoder_output_dim

    if self._attention:
        # If using attention, a weighted average over encoder outputs will be concatenated
        # to the previous target embedding to form the input to the decoder at each
        # time step.
        self._decoder_input_dim = self._decoder_output_dim + target_embedding_dim
    else:
        # Otherwise, the input to the decoder is just the previous target embedding.
        self._decoder_input_dim = target_embedding_dim

    # We'll use an LSTM cell as the recurrent cell that produces a hidden state
    # for the decoder at each time step.
    # TODO (pradeep): Do not hardcode decoder cell type.
    if self._target_decoder_layers > 1:
        self._decoder_cell = LSTM(self._decoder_input_dim,
                                  self._decoder_output_dim,
                                  self._target_decoder_layers)
    else:
        self._decoder_cell = LSTMCell(self._decoder_input_dim, self._decoder_output_dim)

    # We project the hidden state from the decoder into the output vocabulary space
    # in order to get log probabilities of each target token, at each time step.
    self._output_projection_layer = Linear(self._decoder_output_dim, num_classes)
def __init__(
    self,
    task: str,
    vocab: Vocabulary,
    input_dim: int,
    max_decoding_steps: int,
    loss_weight: float = 1.0,
    attention: Attention = None,
    beam_size: int = None,
    target_namespace: str = "target_tokens",
    target_embedding_dim: int = None,
    scheduled_sampling_ratio: float = 0.0,
    use_bleu: bool = True,
    bleu_ngram_weights: Iterable[float] = (0.25, 0.25, 0.25, 0.25),
    target_decoder_layers: int = 1,
    **kwargs,
) -> None:
    super().__init__(vocab, **kwargs)
    self.task = task
    self.vocab = vocab
    self.loss_weight = loss_weight
    self._target_namespace = task + '_target_words'
    self._target_decoder_layers = target_decoder_layers
    self._scheduled_sampling_ratio = scheduled_sampling_ratio

    # We need the start symbol to provide as the input at the first timestep of decoding, and
    # end symbol as a way to indicate the end of the decoded sequence.
    self._start_index = self.vocab.get_token_index(START_SYMBOL, self._target_namespace)
    self._end_index = self.vocab.get_token_index(END_SYMBOL, self._target_namespace)

    if use_bleu:
        pad_index = self.vocab.get_token_index(self.vocab._padding_token,
                                               self._target_namespace)
        self._bleu = BLEU(bleu_ngram_weights,
                          exclude_indices={pad_index, self._end_index, self._start_index})
    else:
        self._bleu = None
    self.metrics = {"bleu": self._bleu}

    # At prediction time, we use a beam search to find the most likely sequence of target tokens.
    beam_size = beam_size or 1
    self._max_decoding_steps = max_decoding_steps
    self._beam_search = BeamSearch(self._end_index, max_steps=max_decoding_steps,
                                   beam_size=beam_size)

    num_classes = self.vocab.get_vocab_size(namespace=self._target_namespace)

    # Attention mechanism applied to the encoder output for each step.
    self._attention = attention

    # Decoder output dim needs to be the same as the encoder output dim since we initialize the
    # hidden state of the decoder with the final hidden state of the encoder.
    self._encoder_output_dim = input_dim
    self._decoder_output_dim = self._encoder_output_dim

    # By default, the target embedding dimension matches the encoder output dimension.
    target_embedding_dim = target_embedding_dim or self._encoder_output_dim

    # Dense embedding of vocab words in the target space.
    self._target_embedder = Embedding(num_embeddings=num_classes,
                                      embedding_dim=target_embedding_dim)

    if self._attention:
        # If using attention, a weighted average over encoder outputs will be concatenated
        # to the previous target embedding to form the input to the decoder at each
        # time step.
        self._decoder_input_dim = self._decoder_output_dim + target_embedding_dim
    else:
        # Otherwise, the input to the decoder is just the previous target embedding.
        self._decoder_input_dim = target_embedding_dim

    # We'll use an LSTM cell as the recurrent cell that produces a hidden state
    # for the decoder at each time step.
    if self._target_decoder_layers > 1:
        self._decoder_cell = LSTM(self._decoder_input_dim,
                                  self._decoder_output_dim,
                                  self._target_decoder_layers)
    else:
        self._decoder_cell = LSTMCell(self._decoder_input_dim, self._decoder_output_dim)

    # We project the hidden state from the decoder into the output vocabulary space
    # in order to get log probabilities of each target token, at each time step.
    self._output_projection_layer = Linear(self._decoder_output_dim, num_classes)