import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor

# Repo-local helpers referenced below (Attention, SqueezeEmbedding,
# PositionwiseFeedForward, DynamicLSTM, Embed, coin_flip, id2dist,
# relax_softmax, lm_fusion, length_countdown) are assumed importable
# from this codebase.


class AEN(nn.Module):
    def __init__(self, embedding_matrix, opt):
        super(AEN, self).__init__()
        self.opt = opt
        self.embed = nn.Embedding.from_pretrained(
            torch.tensor(embedding_matrix, dtype=torch.float))
        self.squeeze_embedding = SqueezeEmbedding()

        self.attn_k = Attention(opt.embed_dim, out_dim=opt.hidden_dim, n_head=8,
                                score_function='mlp', dropout=opt.dropout)
        self.attn_q = Attention(opt.embed_dim, out_dim=opt.hidden_dim, n_head=8,
                                score_function='mlp', dropout=opt.dropout)
        self.ffn_c = PositionwiseFeedForward(opt.hidden_dim, dropout=opt.dropout)
        self.ffn_t = PositionwiseFeedForward(opt.hidden_dim, dropout=opt.dropout)

        self.attn_s1 = Attention(opt.hidden_dim, n_head=8,
                                 score_function='mlp', dropout=opt.dropout)

        # the classifier consumes three concatenated hidden_dim-sized features
        self.dense = nn.Linear(opt.hidden_dim * 3, opt.polarities_dim)
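
# Usage sketch (illustrative addition, not from the original repo): the
# opt-based models in this listing only read a few fields from `opt`, so a
# SimpleNamespace is enough to instantiate them. All values below are
# placeholder assumptions.
import numpy as np
from types import SimpleNamespace

opt = SimpleNamespace(embed_dim=300, hidden_dim=300,
                      polarities_dim=3, dropout=0.1)
embedding_matrix = np.random.uniform(-0.25, 0.25, (5000, 300))
model = AEN(embedding_matrix, opt)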

class AEN_BERT(nn.Module):
    def __init__(self, bert, opt):
        super(AEN_BERT, self).__init__()
        self.opt = opt
        self.bert = bert
        self.squeeze_embedding = SqueezeEmbedding()
        self.dropout = nn.Dropout(opt.dropout)

        self.attn_k = Attention(opt.bert_dim, out_dim=opt.hidden_dim, n_head=8,
                                score_function='mlp', dropout=opt.dropout)
        self.attn_q = Attention(opt.bert_dim, out_dim=opt.hidden_dim, n_head=8,
                                score_function='mlp', dropout=opt.dropout)
        self.ffn_c = PositionwiseFeedForward(opt.hidden_dim, dropout=opt.dropout)
        self.ffn_t = PositionwiseFeedForward(opt.hidden_dim, dropout=opt.dropout)

        self.attn_s1 = Attention(opt.hidden_dim, n_head=8,
                                 score_function='mlp', dropout=opt.dropout)

        self.dense = nn.Linear(opt.hidden_dim * 3, opt.polarities_dim)

class IAN(nn.Module):
    def __init__(self, embedding_matrix, opt):
        super(IAN, self).__init__()
        self.opt = opt
        self.embed = nn.Embedding.from_pretrained(
            torch.tensor(embedding_matrix, dtype=torch.float))
        self.lstm_context = DynamicLSTM(opt.embed_dim, opt.hidden_dim,
                                        num_layers=1, batch_first=True)
        self.lstm_aspect = DynamicLSTM(opt.embed_dim, opt.hidden_dim,
                                       num_layers=1, batch_first=True)
        self.attention_aspect = Attention(opt.hidden_dim,
                                          score_function='bi_linear')
        self.attention_context = Attention(opt.hidden_dim,
                                           score_function='bi_linear')
        # the classifier consumes concatenated context and aspect features
        self.dense = nn.Linear(opt.hidden_dim * 2, opt.polarities_dim)

class AttentionalLSTM(nn.Module):
    def __init__(self, embeddings, hidden_dim, output_size,
                 dropout_emb, dropout_lstm):
        """
        Define the layers and initialize them.

        PyTorch initializes layers by default with random weights sampled
        from a certain distribution. However, in some cases you might want
        to initialize some layers explicitly, either by sampling from a
        different distribution, or by using pretrained weights
        (word embeddings / transfer learning).

        Args:
            embeddings: pretrained embedding matrix of shape
                (vocab_size, emb_dim)
            hidden_dim: size of the LSTM hidden state
            output_size: number of output classes
            dropout_emb: dropout applied to the embedding layer
            dropout_lstm: dropout applied to the LSTM outputs
        """
        super(AttentionalLSTM, self).__init__()

        # 1) embedding layer:
        trainable_emb = False
        self.word_embeddings = nn.Embedding(num_embeddings=embeddings.shape[0],
                                            embedding_dim=embeddings.shape[1])
        self.init_embeddings(embeddings, trainable_emb)
        self.drop_emb = nn.Dropout(dropout_emb)

        # 2) LSTM layer
        self.hidden_dim = hidden_dim
        # note: nn.LSTM's `dropout` only applies between stacked layers, so
        # it is a no-op for this single-layer LSTM; drop_lstm below is what
        # actually regularizes the outputs
        self.lstm = nn.LSTM(input_size=embeddings.shape[1],
                            hidden_size=hidden_dim,
                            batch_first=True,
                            dropout=dropout_lstm)
        self.drop_lstm = nn.Dropout(dropout_lstm)

        self.attention = Attention(attention_size=hidden_dim, batch_first=True)

        # 3) linear layer -> outputs
        self.hidden2output = nn.Linear(hidden_dim, output_size)
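
# A minimal sketch of what `init_embeddings` above presumably does; the
# helper is not shown in this listing, so this is an assumption based on
# the call site, not the repo's actual implementation: copy the pretrained
# matrix into the embedding layer and optionally freeze it.
def init_embeddings_sketch(layer: nn.Embedding, weights,
                           trainable: bool) -> None:
    # load the pretrained weights in-place
    layer.weight.data.copy_(torch.tensor(weights, dtype=torch.float))
    # keep the weights fixed unless they should be fine-tuned
    layer.weight.requires_grad = trainable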

class MemNet(nn.Module):
    def __init__(self, embedding_matrix, opt):
        super(MemNet, self).__init__()
        self.opt = opt
        # from_pretrained freezes the embedding weights by default
        self.embed = nn.Embedding.from_pretrained(
            torch.tensor(embedding_matrix, dtype=torch.float))
        self.squeeze_embedding = SqueezeEmbedding(batch_first=True)
        self.attention = Attention(opt.embed_dim, score_function='mlp')
        self.x_linear = nn.Linear(opt.embed_dim, opt.embed_dim)
        self.dense = nn.Linear(opt.embed_dim, opt.polarities_dim)

class AttSeqDecoder(nn.Module):
    def __init__(self,
                 trg_n_tokens: int,
                 embed: nn.Module,
                 enc_size: int,
                 rnn_size: int,
                 rnn_layers: int,
                 rnn_dropout: float,
                 out_dropout: float,
                 rnn_type: str,
                 outputs_fn: str,
                 tie_projections: bool,
                 attention_fn: str,
                 input_feeding: bool,
                 out_non_linearity: str,
                 out_layer_norm: bool,
                 tau_0: float = 1,
                 learn_tau: bool = False,
                 length_control: bool = False,
                 input_shortcut: bool = False,
                 latent_dim: int = 0,
                 **kwargs):
        super(AttSeqDecoder, self).__init__()

        assert outputs_fn in {"sum", "concat"}

        # ----------------------------------------------------
        # Attributes
        # ----------------------------------------------------
        self.trg_n_tokens = trg_n_tokens
        self.input_feeding = input_feeding
        self.input_shortcut = input_shortcut
        self.learn_tau = learn_tau
        self.length_control = length_control
        self.out_non_linearity = out_non_linearity
        self.out_layer_norm = out_layer_norm
        self.tie_projections = tie_projections
        self.rnn_type = rnn_type
        self.outputs_fn = outputs_fn
        self.gs_beta = 1
        self.latent_dim = latent_dim

        # ----------------------------------------------------
        # Layers
        # ----------------------------------------------------
        self.embed = embed

        # the output size of the ho token: ho = [ h || c]
        self.o_size = self.embed.embedding_dim if tie_projections else rnn_size

        dec_inp_dim = self.embed.embedding_dim
        if self.input_feeding:
            dec_inp_dim += self.o_size

        rnn = nn.GRU if rnn_type == "GRU" else nn.LSTM
        self.rnn = rnn(input_size=dec_inp_dim,
                       hidden_size=rnn_size,
                       num_layers=rnn_layers,
                       dropout=rnn_dropout if rnn_layers > 1 else 0.,
                       batch_first=True)

        self.out_dropout = nn.Dropout(out_dropout)

        self.attention = Attention(enc_size, rnn_size + latent_dim,
                                   method=attention_fn)

        if self.length_control:
            # length scaling parameter; assumed here (matching the
            # kwargs-based variant of this decoder later in this listing),
            # since forward() references self.W_tick when length_control
            # is enabled
            self.W_tick = nn.Parameter(torch.rand(1))

        # learnt temperature parameter
        if self.learn_tau:
            linear_tau = nn.Linear(rnn_size, 1, bias=False)
            self.softplus = nn.Sequential(linear_tau, nn.Softplus())
            self.tau_0 = tau_0

        # -------------------------------------------------------------
        # projection layers
        # -------------------------------------------------------------
        if self.outputs_fn == "sum":
            self.W_c = nn.Linear(enc_size, self.o_size)
            self.W_h = nn.Linear(rnn_size, self.o_size)

            if latent_dim > 0:
                self.W_z = nn.Linear(latent_dim, self.o_size)

            if self.input_shortcut:
                self.W_e = nn.Linear(dec_inp_dim, self.o_size)

        elif self.outputs_fn == "concat":
            _concat_dim = enc_size + rnn_size + latent_dim
            if self.input_shortcut:
                _concat_dim += dec_inp_dim
            self.W_h = nn.Linear(_concat_dim, self.o_size)

        self.logits = nn.Linear(self.o_size, trg_n_tokens)

        if self.out_layer_norm:
            self.norm_outs = nn.LayerNorm(self.o_size, eps=1e-6)

        self.init_weights()
        self.tie_weights()

    def init_weights(self):
        pass

    def tie_weights(self):
        if self.tie_projections:
            self.logits.weight = self.embed.embedding.weight
            # self.embed.embedding.weight = self.logits.weight

    def _step_emb(self, step, trg, logits, sampling_prob, sampling_mode, tau):
        """
        Get the token embedding for the current timestep. Possible options:
        - select the embedding by a given index
        - sample a token from a probability distribution and embed
        - construct a "fuzzy" embedding, by taking a convex combination of
          all the token embeddings, parameterized by a probability
          distribution

        Note: At the last timestep (step == max_length), when by definition
            there is no given target word, we generate a distribution from
            the logits regardless of whether the model is trained with
            teacher-forcing, scheduled-sampling and/or gumbel-softmax.
        """
""" batch, max_length = trg.size() if sampling_prob == 1 or coin_flip(sampling_prob) or step == max_length: assert sampling_mode in ["argmax", "gs", "st", "gs-st", "softmax"] # get the argmax if sampling_mode == "argmax": maxv, maxi = logits.max(dim=2) e_i = self.embed(maxi) return e_i, id2dist(maxi, self.logits.out_features).unsqueeze(1) # get the expected embedding, parameterized by the posterior elif sampling_mode in ["gs", "st", "gs-st", "softmax"]: # add gumbel noise only during training _add_gumbel = self.training and sampling_mode in ["gs", "gs-st"] # discretize the distributions if Straight-Trough is used hard = sampling_mode in ["st", "gs-st"] # make sure not to generate <pad>, # because it's a zero embedding and we'll get no gradients. pad_mask = torch.zeros_like(logits) pad_mask[:, :, 0] = torch.min(logits) logits = logits + pad_mask dist = relax_softmax(logits, tau, gumbel=_add_gumbel, hard=hard, beta=self.gs_beta) e_i = self.embed.expectation(dist) return e_i, dist else: raise NotImplementedError else: w_i = trg[:, step].unsqueeze(1) e_i = self.embed(w_i) return e_i, id2dist(w_i, self.logits.out_features).unsqueeze(1) def _step_input(self, embeddings, input_feed=None, tick=None): """ Create the input to the decoder for a given step """ batch = embeddings.size(0) _input = embeddings if self.input_feeding: if input_feed is None: with torch.no_grad(): input_feed = torch.zeros(batch, 1, self.o_size, dtype=_input.dtype, device=_input.device) _input = torch.cat((embeddings, input_feed), -1) if self.length_control: _input = torch.cat((_input, tick), -1) return _input def _step(self, inp, enc_outputs, state, src_mask, latent=None): """ Perform one decoding step. 1. Feed the input to the decoder and obtain the contextualized token representations. 2. Generate a context vector. It is a convex combination of the states of the encoder, the weights of which are a function of each state of the encoder and the current state of the decoder. 3. Re-weight the decoder's state with the context vector. 4. Project the context-aware vector to the vocabulary. """ # 1. Feed the input to the decoder outputs, state = self.rnn(inp, state) # 2. Generate the context vector query = outputs.squeeze(1) if latent is not None: query = torch.cat([query, latent], 1) contexts, att_scores = self.attention(enc_outputs, query, src_mask) # apply dropout before combining the features outputs = self.out_dropout(outputs) contexts = self.out_dropout(contexts) # 3. Re-weight the decoder's state with the context vector. if self.outputs_fn == "sum": o = self.W_h(outputs) + self.W_c(contexts) if self.input_shortcut: o = o + self.W_e(inp) if self.latent_dim > 0: o = o + self.W_z(latent) elif self.outputs_fn == "concat": o = torch.cat([outputs, contexts], 2) if self.input_shortcut: o = torch.cat([o, inp], 2) if self.latent_dim > 0: o = torch.cat([o, latent.unsqueeze(1)], 2) o = self.W_h(o) else: raise ValueError if self.out_layer_norm: o = self.norm_outs(o) if self.out_non_linearity == "relu": o = torch.relu(o) elif self.out_non_linearity == "tanh": o = torch.tanh(o) # 4. Project the context-aware vector to the vocabulary. 
        logits = self.logits(o)

        return logits, outputs, state, o, att_scores

    def forward(self, trg: Tensor, enc_outputs: Tensor, init_hidden,
                enc_lengths: Tensor, src_mask: Tensor,
                trg_lengths: Tensor = None, word_dropout=0,
                sampling=0.0, sampling_mode="argmax", tau=1.0,
                lm: nn.Module = None, fusion=None, fusion_a=None,
                lm_state=None, latent=None, **kwargs):
        """
        Returns:

        Note:
            dists contain one less element than logits, because we do not
            care about sampling from the last timestep, as it will not be
            used for sampling another token. The last timestep should
            correspond to the EOS token, and the corresponding logit will
            be used only for computing the NLL loss of the EOS token.

            When the decoder is used for supervised learning you can use
            the given x_i tokens (teacher-forcing). However, you can also
            sample the next token (scheduled-sampling), or even approximate
            it with the gumbel-relaxation and back-propagate through the
            sample. For unsupervised learning you may need the
            distributions p_i that generated the samples.

        dists      p_0 ~─┐ p_1 ~── ...  p_n
        (sampling)  ~    │  ~     ...    ~
        logits     u_0   │ u_1    ...   u_n
                    ↑    │  ↑     ...    ↑
                   c_0   │ c_1    ...   c_n
                    ↑    │  ↑     ...    ↑
        outputs    h_0   │ h_1    ...   h_n
                    ↑    │  ↑     ...    ↑
                   e_0   └>e_1    ... ─>e_n
                    ↑       ↑            ↑
                   x_0     x_1          x_n
                  (<s>)
        """
        results = {"logits": [], "outputs": [], "attention": [], "dists": [],
                   "gate": [], "tau": [], "samples": [], "dec": [], "lm": []}

        batch, max_length = trg.size()

        # ------------------------------------------------------------------
        # Prepare Decoding
        # ------------------------------------------------------------------
        # initial hidden state of the decoder, and initial context
        state_i = init_hidden
        context_i, o_i, tick = None, None, None

        # if we are doing inference, then simply take the argmax
        if self.training is False:
            sampling_mode = "argmax"

        # pre-compute source state projections for efficiency
        if self.attention.method in ["general", "additive"]:
            self.attention.precompute_enc_projections(enc_outputs)

        if self.length_control:
            countdown = length_countdown(trg_lengths).float() * self.W_tick
            ratio = trg_lengths.float() / enc_lengths.float()

        # At the first step (step==0) select the embedding of the given
        # target word (usually the <sos> token).
        e_i, d_i = self._step_emb(step=0, trg=trg, logits=None,
                                  sampling_prob=0, sampling_mode="argmax",
                                  tau=1)

        # ------------------------------------------------------------------
        # Decoding Loop
        # ------------------------------------------------------------------
        for i in range(max_length):

            # ---------------------------------------------------------
            # 1. construct time-step input
            # ---------------------------------------------------------
            if i > 0:
                e_i = F.dropout2d(e_i, word_dropout, self.training)

            # the number of remaining tokens
            if self.length_control:
                tick = torch.stack((countdown[:, i], ratio), -1).unsqueeze(1)

            input_i = self._step_input(e_i, o_i, tick)

            # ---------------------------------------------------------
            # 2. perform one decoding step
            # ---------------------------------------------------------
            step_i = self._step(input_i, enc_outputs, state_i, src_mask,
                                latent)
            logits_i, outs_i, state_i, o_i, att_i = step_i
            # ---------------------------------------------------------
            # 3. obtain the NEXT input word embedding
            # ---------------------------------------------------------
            # feed the input to the prior and interpolate with the decoder
            if fusion is not None:
                assert lm is not None
                _len = torch.ones(batch, device=e_i.device, dtype=torch.long)
                lm_outs = lm(d_i.max(dim=2)[1], _len, lm_state)
                lm_logits, lm_state = lm_outs["logits"], lm_outs["hidden"]
                results["lm"].append(lm_logits)
                results["dec"].append(logits_i)
                logits_i = lm_fusion(logits_i, lm_logits, fusion, fusion_a)

            # generate the temperature value for the next sampling step
            if self.learn_tau and self.training:
                tau = 1 / (self.softplus(outs_i.squeeze()) + self.tau_0)
                results["tau"].append(tau)

            # select or sample the next input
            e_i, d_i = self._step_emb(step=i + 1, trg=trg, logits=logits_i,
                                      sampling_prob=sampling,
                                      sampling_mode=sampling_mode, tau=tau)

            # ---------------------------------------------------------
            results["logits"].append(logits_i)
            results["outputs"].append(outs_i)
            results["attention"].append(att_i.unsqueeze(1))
            results["dists"].append(d_i)
            results["samples"].append(e_i)

        results = {k: torch.cat(v, dim=1) if len(v) > 0 else None
                   for k, v in results.items()}

        return results
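
# The "fuzzy" embedding path in _step_emb relies on two repo helpers not
# shown here: relax_softmax (a relaxed / straight-through sampler) and
# Embed.expectation (the expected embedding under a distribution). The
# sketch below shows the standard equivalents with F.gumbel_softmax; it is
# illustrative only and may differ from the repo's actual implementations.
vocab, dim, tau = 10, 4, 0.5
embedding = nn.Embedding(vocab, dim)
step_logits = torch.randn(2, 1, vocab)        # (batch, 1, vocab)

# relaxed one-hot sample; hard=True would give a straight-through sample,
# discrete in the forward pass but differentiable in the backward pass
dist = F.gumbel_softmax(step_logits, tau=tau, hard=False)

# expected ("fuzzy") embedding: a convex combination of all token
# embeddings, weighted by the relaxed distribution over the vocabulary
e_i = dist @ embedding.weight                 # (batch, 1, dim)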

class AttSeqDecoder(nn.Module):
    def __init__(self, trg_ntokens, enc_size, **kwargs):
        super(AttSeqDecoder, self).__init__()

        ############################################
        # Attributes
        ############################################
        self.trg_ntokens = trg_ntokens
        emb_size = kwargs.get("emb_size", 100)
        embed_noise = kwargs.get("embed_noise", .0)
        embed_dropout = kwargs.get("embed_dropout", .0)
        rnn_size = kwargs.get("rnn_size", 100)
        rnn_layers = kwargs.get("rnn_layers", 1)
        rnn_dropout = kwargs.get("rnn_dropout", .0)
        tie_weights = kwargs.get("tie_weights", False)
        attention_fn = kwargs.get("attention_fn", "general")
        self.input_feeding = kwargs.get("input_feeding", False)
        self.learn_tau = kwargs.get("learn_tau", False)
        self.length_control = kwargs.get("length_control", False)
        self.gumbel = kwargs.get("gumbel", False)
        self.out_non_linearity = kwargs.get("out_non_linearity", None)
        self.layer_norm = kwargs.get("layer_norm", None)
        self.input_feeding_learnt = kwargs.get("input_feeding_learnt", False)

        ############################################
        # Layers
        ############################################
        self.embed = Embed(trg_ntokens, emb_size,
                           noise=embed_noise,
                           dropout=embed_dropout)

        # the output size of the ho token: ho = [ h || c]
        if tie_weights:
            self.ho_size = emb_size
        else:
            self.ho_size = rnn_size

        dec_input_size = emb_size
        if self.input_feeding:
            dec_input_size += self.ho_size
        if self.length_control:
            dec_input_size += 2
            # length scaling parameter
            self.W_tick = nn.Parameter(torch.rand(1))

        self.rnn = nn.LSTM(input_size=dec_input_size,
                           hidden_size=rnn_size,
                           num_layers=rnn_layers,
                           batch_first=True)

        self.rnn_dropout = nn.Dropout(rnn_dropout)

        self.attention = Attention(enc_size, rnn_size, method=attention_fn)

        # learnt temperature parameter
        if self.learn_tau:
            self.softplus = nn.Sequential(
                nn.Linear(self.ho_size, 1, bias=False),
                nn.Softplus())
            self.tau_0 = kwargs.get("tau_0", 1)

        # initial input feeding
        if self.input_feeding_learnt:
            self.Wi = nn.Linear(enc_size, self.ho_size)

        # source context-aware output projection
        self.Wc = nn.Linear(rnn_size + enc_size, self.ho_size)

        # projection layer to the vocabulary
        self.Wo = nn.Linear(self.ho_size, trg_ntokens)

        if self.layer_norm:
            self.norm_ctx = nn.LayerNorm(self.ho_size)
            if self.input_feeding_learnt:
                self.norm_input_feed = nn.LayerNorm(self.ho_size)

        if tie_weights:
            # if rnn_size != emb_size:
            #     raise ValueError("if `tie_weights` is True,"
            #                      "emb_size has to be equal to rnn_size")
            self.Wo.weight = self.embed.embedding.weight
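
# Weight-tying note: `self.Wo.weight = self.embed.embedding.weight` makes
# the output projection and the embedding table share a single Parameter,
# which requires ho_size == emb_size (guaranteed above, since ho_size is
# set to emb_size when tie_weights is True). A standalone illustration:
tied_emb = nn.Embedding(1000, 128)            # weight: (1000, 128)
tied_proj = nn.Linear(128, 1000)              # weight: (1000, 128)
tied_proj.weight = tied_emb.weight            # now one shared Parameter
assert tied_proj.weight.data_ptr() == tied_emb.weight.data_ptr()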