import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor

# Repo-local helpers referenced below (Attention, SqueezeEmbedding,
# PositionwiseFeedForward, DynamicLSTM, Embed, coin_flip, id2dist,
# relax_softmax, lm_fusion, length_countdown) are assumed importable
# from this codebase.


class AEN(nn.Module):
    def __init__(self, embedding_matrix, opt):
        super(AEN, self).__init__()
        self.opt = opt
        self.embed = nn.Embedding.from_pretrained(
            torch.tensor(embedding_matrix, dtype=torch.float))
        self.squeeze_embedding = SqueezeEmbedding()

        self.attn_k = Attention(opt.embed_dim, out_dim=opt.hidden_dim, n_head=8,
                                score_function='mlp', dropout=opt.dropout)
        self.attn_q = Attention(opt.embed_dim, out_dim=opt.hidden_dim, n_head=8,
                                score_function='mlp', dropout=opt.dropout)
        self.ffn_c = PositionwiseFeedForward(opt.hidden_dim, dropout=opt.dropout)
        self.ffn_t = PositionwiseFeedForward(opt.hidden_dim, dropout=opt.dropout)

        self.attn_s1 = Attention(opt.hidden_dim, n_head=8,
                                 score_function='mlp', dropout=opt.dropout)

        # the classifier consumes three concatenated hidden_dim-sized features
        self.dense = nn.Linear(opt.hidden_dim * 3, opt.polarities_dim)
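
# Usage sketch (illustrative addition, not from the original repo): the
# opt-based models in this listing only read a few fields from `opt`, so a
# SimpleNamespace is enough to instantiate them. All values below are
# placeholder assumptions.
import numpy as np
from types import SimpleNamespace

opt = SimpleNamespace(embed_dim=300, hidden_dim=300,
                      polarities_dim=3, dropout=0.1)
embedding_matrix = np.random.uniform(-0.25, 0.25, (5000, 300))
model = AEN(embedding_matrix, opt)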

class AEN_BERT(nn.Module):
    def __init__(self, bert, opt):
        super(AEN_BERT, self).__init__()
        self.opt = opt
        self.bert = bert
        self.squeeze_embedding = SqueezeEmbedding()
        self.dropout = nn.Dropout(opt.dropout)

        self.attn_k = Attention(opt.bert_dim, out_dim=opt.hidden_dim, n_head=8,
                                score_function='mlp', dropout=opt.dropout)
        self.attn_q = Attention(opt.bert_dim, out_dim=opt.hidden_dim, n_head=8,
                                score_function='mlp', dropout=opt.dropout)
        self.ffn_c = PositionwiseFeedForward(opt.hidden_dim, dropout=opt.dropout)
        self.ffn_t = PositionwiseFeedForward(opt.hidden_dim, dropout=opt.dropout)

        self.attn_s1 = Attention(opt.hidden_dim, n_head=8,
                                 score_function='mlp', dropout=opt.dropout)

        self.dense = nn.Linear(opt.hidden_dim * 3, opt.polarities_dim)

class IAN(nn.Module):
    def __init__(self, embedding_matrix, opt):
        super(IAN, self).__init__()
        self.opt = opt
        self.embed = nn.Embedding.from_pretrained(
            torch.tensor(embedding_matrix, dtype=torch.float))
        self.lstm_context = DynamicLSTM(opt.embed_dim, opt.hidden_dim,
                                        num_layers=1, batch_first=True)
        self.lstm_aspect = DynamicLSTM(opt.embed_dim, opt.hidden_dim,
                                       num_layers=1, batch_first=True)
        self.attention_aspect = Attention(opt.hidden_dim,
                                          score_function='bi_linear')
        self.attention_context = Attention(opt.hidden_dim,
                                           score_function='bi_linear')
        # the classifier consumes concatenated context and aspect features
        self.dense = nn.Linear(opt.hidden_dim * 2, opt.polarities_dim)

class AttentionalLSTM(nn.Module):
    def __init__(self, embeddings, hidden_dim, output_size,
                 dropout_emb, dropout_lstm):
        """
        Define the layers and initialize them.

        PyTorch initializes layers by default with random weights sampled
        from a certain distribution. However, in some cases you might want
        to initialize some layers explicitly, either by sampling from a
        different distribution, or by using pretrained weights
        (word embeddings / transfer learning).

        Args:
            embeddings: pretrained embedding matrix of shape
                (vocab_size, emb_dim)
            hidden_dim: size of the LSTM hidden state
            output_size: number of output classes
            dropout_emb: dropout applied to the embedding layer
            dropout_lstm: dropout applied to the LSTM outputs
        """
        super(AttentionalLSTM, self).__init__()

        # 1) embedding layer:
        trainable_emb = False
        self.word_embeddings = nn.Embedding(num_embeddings=embeddings.shape[0],
                                            embedding_dim=embeddings.shape[1])
        self.init_embeddings(embeddings, trainable_emb)
        self.drop_emb = nn.Dropout(dropout_emb)

        # 2) LSTM layer
        self.hidden_dim = hidden_dim
        # note: nn.LSTM's `dropout` only applies between stacked layers, so
        # it is a no-op for this single-layer LSTM; drop_lstm below is what
        # actually regularizes the outputs
        self.lstm = nn.LSTM(input_size=embeddings.shape[1],
                            hidden_size=hidden_dim,
                            batch_first=True,
                            dropout=dropout_lstm)
        self.drop_lstm = nn.Dropout(dropout_lstm)

        self.attention = Attention(attention_size=hidden_dim, batch_first=True)

        # 3) linear layer -> outputs
        self.hidden2output = nn.Linear(hidden_dim, output_size)
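
# A minimal sketch of what `init_embeddings` above presumably does; the
# helper is not shown in this listing, so this is an assumption based on
# the call site, not the repo's actual implementation: copy the pretrained
# matrix into the embedding layer and optionally freeze it.
def init_embeddings_sketch(layer: nn.Embedding, weights,
                           trainable: bool) -> None:
    # load the pretrained weights in-place
    layer.weight.data.copy_(torch.tensor(weights, dtype=torch.float))
    # keep the weights fixed unless they should be fine-tuned
    layer.weight.requires_grad = trainable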

class MemNet(nn.Module):
    def __init__(self, embedding_matrix, opt):
        super(MemNet, self).__init__()
        self.opt = opt
        # from_pretrained freezes the embedding weights by default
        self.embed = nn.Embedding.from_pretrained(
            torch.tensor(embedding_matrix, dtype=torch.float))
        self.squeeze_embedding = SqueezeEmbedding(batch_first=True)
        self.attention = Attention(opt.embed_dim, score_function='mlp')
        self.x_linear = nn.Linear(opt.embed_dim, opt.embed_dim)
        self.dense = nn.Linear(opt.embed_dim, opt.polarities_dim)

class AttSeqDecoder(nn.Module):
    def __init__(self,
                 trg_n_tokens: int,
                 embed: nn.Module,
                 enc_size: int,
                 rnn_size: int,
                 rnn_layers: int,
                 rnn_dropout: float,
                 out_dropout: float,
                 rnn_type: str,
                 outputs_fn: str,
                 tie_projections: bool,
                 attention_fn: str,
                 input_feeding: bool,
                 out_non_linearity: str,
                 out_layer_norm: bool,
                 tau_0: float = 1,
                 learn_tau: bool = False,
                 length_control: bool = False,
                 input_shortcut: bool = False,
                 latent_dim: int = 0,
                 **kwargs):
        super(AttSeqDecoder, self).__init__()

        assert outputs_fn in {"sum", "concat"}

        # ----------------------------------------------------
        # Attributes
        # ----------------------------------------------------
        self.trg_n_tokens = trg_n_tokens
        self.input_feeding = input_feeding
        self.input_shortcut = input_shortcut
        self.learn_tau = learn_tau
        self.length_control = length_control
        self.out_non_linearity = out_non_linearity
        self.out_layer_norm = out_layer_norm
        self.tie_projections = tie_projections
        self.rnn_type = rnn_type
        self.outputs_fn = outputs_fn
        self.gs_beta = 1
        self.latent_dim = latent_dim

        # ----------------------------------------------------
        # Layers
        # ----------------------------------------------------
        self.embed = embed

        # the output size of the ho token: ho = [ h || c]
        self.o_size = self.embed.embedding_dim if tie_projections else rnn_size

        dec_inp_dim = self.embed.embedding_dim
        if self.input_feeding:
            dec_inp_dim += self.o_size

        rnn = nn.GRU if rnn_type == "GRU" else nn.LSTM
        self.rnn = rnn(input_size=dec_inp_dim,
                       hidden_size=rnn_size,
                       num_layers=rnn_layers,
                       dropout=rnn_dropout if rnn_layers > 1 else 0.,
                       batch_first=True)

        self.out_dropout = nn.Dropout(out_dropout)

        self.attention = Attention(enc_size, rnn_size + latent_dim,
                                   method=attention_fn)

        if self.length_control:
            # length scaling parameter; assumed here (matching the
            # kwargs-based variant of this decoder later in this listing),
            # since forward() references self.W_tick when length_control
            # is enabled
            self.W_tick = nn.Parameter(torch.rand(1))

        # learnt temperature parameter
        if self.learn_tau:
            linear_tau = nn.Linear(rnn_size, 1, bias=False)
            self.softplus = nn.Sequential(linear_tau, nn.Softplus())
            self.tau_0 = tau_0

        # -------------------------------------------------------------
        # projection layers
        # -------------------------------------------------------------
        if self.outputs_fn == "sum":
            self.W_c = nn.Linear(enc_size, self.o_size)
            self.W_h = nn.Linear(rnn_size, self.o_size)

            if latent_dim > 0:
                self.W_z = nn.Linear(latent_dim, self.o_size)

            if self.input_shortcut:
                self.W_e = nn.Linear(dec_inp_dim, self.o_size)

        elif self.outputs_fn == "concat":
            _concat_dim = enc_size + rnn_size + latent_dim
            if self.input_shortcut:
                _concat_dim += dec_inp_dim
            self.W_h = nn.Linear(_concat_dim, self.o_size)

        self.logits = nn.Linear(self.o_size, trg_n_tokens)

        if self.out_layer_norm:
            self.norm_outs = nn.LayerNorm(self.o_size, eps=1e-6)

        self.init_weights()
        self.tie_weights()

    def init_weights(self):
        pass

    def tie_weights(self):
        if self.tie_projections:
            self.logits.weight = self.embed.embedding.weight
            # self.embed.embedding.weight = self.logits.weight

    def _step_emb(self, step, trg, logits, sampling_prob, sampling_mode, tau):
        """
        Get the token embedding for the current timestep. Possible options:
        - select the embedding by a given index
        - sample a token from a probability distribution and embed
        - construct a "fuzzy" embedding, by taking a convex combination of
          all the token embeddings, parameterized by a probability
          distribution

        Note: At the last timestep (step == max_length), when by definition
            there is no given target word, we generate a distribution from
            the logits regardless of whether the model is trained with
            teacher-forcing, scheduled-sampling and/or gumbel-softmax.
        """
""" batch, max_length = trg.size() if sampling_prob == 1 or coin_flip(sampling_prob) or step == max_length: assert sampling_mode in ["argmax", "gs", "st", "gs-st", "softmax"] # get the argmax if sampling_mode == "argmax": maxv, maxi = logits.max(dim=2) e_i = self.embed(maxi) return e_i, id2dist(maxi, self.logits.out_features).unsqueeze(1) # get the expected embedding, parameterized by the posterior elif sampling_mode in ["gs", "st", "gs-st", "softmax"]: # add gumbel noise only during training _add_gumbel = self.training and sampling_mode in ["gs", "gs-st"] # discretize the distributions if Straight-Trough is used hard = sampling_mode in ["st", "gs-st"] # make sure not to generate <pad>, # because it's a zero embedding and we'll get no gradients. pad_mask = torch.zeros_like(logits) pad_mask[:, :, 0] = torch.min(logits) logits = logits + pad_mask dist = relax_softmax(logits, tau, gumbel=_add_gumbel, hard=hard, beta=self.gs_beta) e_i = self.embed.expectation(dist) return e_i, dist else: raise NotImplementedError else: w_i = trg[:, step].unsqueeze(1) e_i = self.embed(w_i) return e_i, id2dist(w_i, self.logits.out_features).unsqueeze(1) def _step_input(self, embeddings, input_feed=None, tick=None): """ Create the input to the decoder for a given step """ batch = embeddings.size(0) _input = embeddings if self.input_feeding: if input_feed is None: with torch.no_grad(): input_feed = torch.zeros(batch, 1, self.o_size, dtype=_input.dtype, device=_input.device) _input = torch.cat((embeddings, input_feed), -1) if self.length_control: _input = torch.cat((_input, tick), -1) return _input def _step(self, inp, enc_outputs, state, src_mask, latent=None): """ Perform one decoding step. 1. Feed the input to the decoder and obtain the contextualized token representations. 2. Generate a context vector. It is a convex combination of the states of the encoder, the weights of which are a function of each state of the encoder and the current state of the decoder. 3. Re-weight the decoder's state with the context vector. 4. Project the context-aware vector to the vocabulary. """ # 1. Feed the input to the decoder outputs, state = self.rnn(inp, state) # 2. Generate the context vector query = outputs.squeeze(1) if latent is not None: query = torch.cat([query, latent], 1) contexts, att_scores = self.attention(enc_outputs, query, src_mask) # apply dropout before combining the features outputs = self.out_dropout(outputs) contexts = self.out_dropout(contexts) # 3. Re-weight the decoder's state with the context vector. if self.outputs_fn == "sum": o = self.W_h(outputs) + self.W_c(contexts) if self.input_shortcut: o = o + self.W_e(inp) if self.latent_dim > 0: o = o + self.W_z(latent) elif self.outputs_fn == "concat": o = torch.cat([outputs, contexts], 2) if self.input_shortcut: o = torch.cat([o, inp], 2) if self.latent_dim > 0: o = torch.cat([o, latent.unsqueeze(1)], 2) o = self.W_h(o) else: raise ValueError if self.out_layer_norm: o = self.norm_outs(o) if self.out_non_linearity == "relu": o = torch.relu(o) elif self.out_non_linearity == "tanh": o = torch.tanh(o) # 4. Project the context-aware vector to the vocabulary. 
        logits = self.logits(o)

        return logits, outputs, state, o, att_scores

    def forward(self, trg: Tensor, enc_outputs: Tensor, init_hidden,
                enc_lengths: Tensor, src_mask: Tensor,
                trg_lengths: Tensor = None, word_dropout=0,
                sampling=0.0, sampling_mode="argmax", tau=1.0,
                lm: nn.Module = None, fusion=None, fusion_a=None,
                lm_state=None, latent=None, **kwargs):
        """
        Returns:

        Note:
            dists contain one less element than logits, because we do not
            care about sampling from the last timestep, as it will not be
            used for sampling another token. The last timestep should
            correspond to the EOS token, and the corresponding logit will
            be used only for computing the NLL loss of the EOS token.

            When the decoder is used for supervised learning you can use
            the given x_i tokens (teacher-forcing). However, you can also
            sample the next token (scheduled-sampling), or even approximate
            it with the gumbel-relaxation and back-propagate through the
            sample. For unsupervised learning you may need the
            distributions p_i that generated the samples.

        dists      p_0 ~─┐ p_1 ~── ...  p_n
        (sampling)  ~    │  ~     ...    ~
        logits     u_0   │ u_1    ...   u_n
                    ↑    │  ↑     ...    ↑
                   c_0   │ c_1    ...   c_n
                    ↑    │  ↑     ...    ↑
        outputs    h_0   │ h_1    ...   h_n
                    ↑    │  ↑     ...    ↑
                   e_0   └>e_1    ... ─>e_n
                    ↑       ↑            ↑
                   x_0     x_1          x_n
                  (<s>)
        """
        results = {"logits": [], "outputs": [], "attention": [], "dists": [],
                   "gate": [], "tau": [], "samples": [], "dec": [], "lm": []}

        batch, max_length = trg.size()

        # ------------------------------------------------------------------
        # Prepare Decoding
        # ------------------------------------------------------------------
        # initial hidden state of the decoder, and initial context
        state_i = init_hidden
        context_i, o_i, tick = None, None, None

        # if we are doing inference, then simply take the argmax
        if self.training is False:
            sampling_mode = "argmax"

        # pre-compute source state projections for efficiency
        if self.attention.method in ["general", "additive"]:
            self.attention.precompute_enc_projections(enc_outputs)

        if self.length_control:
            countdown = length_countdown(trg_lengths).float() * self.W_tick
            ratio = trg_lengths.float() / enc_lengths.float()

        # At the first step (step==0) select the embedding of the given
        # target word (usually the <sos> token).
        e_i, d_i = self._step_emb(step=0, trg=trg, logits=None,
                                  sampling_prob=0, sampling_mode="argmax",
                                  tau=1)

        # ------------------------------------------------------------------
        # Decoding Loop
        # ------------------------------------------------------------------
        for i in range(max_length):

            # ---------------------------------------------------------
            # 1. construct time-step input
            # ---------------------------------------------------------
            if i > 0:
                e_i = F.dropout2d(e_i, word_dropout, self.training)

            # the number of remaining tokens
            if self.length_control:
                tick = torch.stack((countdown[:, i], ratio), -1).unsqueeze(1)

            input_i = self._step_input(e_i, o_i, tick)

            # ---------------------------------------------------------
            # 2. perform one decoding step
            # ---------------------------------------------------------
            step_i = self._step(input_i, enc_outputs, state_i, src_mask,
                                latent)
            logits_i, outs_i, state_i, o_i, att_i = step_i
            # ---------------------------------------------------------
            # 3. obtain the NEXT input word embedding
            # ---------------------------------------------------------
            # feed the input to the prior and interpolate with the decoder
            if fusion is not None:
                assert lm is not None
                _len = torch.ones(batch, device=e_i.device, dtype=torch.long)
                lm_outs = lm(d_i.max(dim=2)[1], _len, lm_state)
                lm_logits, lm_state = lm_outs["logits"], lm_outs["hidden"]
                results["lm"].append(lm_logits)
                results["dec"].append(logits_i)
                logits_i = lm_fusion(logits_i, lm_logits, fusion, fusion_a)

            # generate the temperature value for the next sampling step
            if self.learn_tau and self.training:
                tau = 1 / (self.softplus(outs_i.squeeze()) + self.tau_0)
                results["tau"].append(tau)

            # select or sample the next input
            e_i, d_i = self._step_emb(step=i + 1, trg=trg, logits=logits_i,
                                      sampling_prob=sampling,
                                      sampling_mode=sampling_mode, tau=tau)

            # ---------------------------------------------------------
            results["logits"].append(logits_i)
            results["outputs"].append(outs_i)
            results["attention"].append(att_i.unsqueeze(1))
            results["dists"].append(d_i)
            results["samples"].append(e_i)

        results = {k: torch.cat(v, dim=1) if len(v) > 0 else None
                   for k, v in results.items()}

        return results
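
# The "fuzzy" embedding path in _step_emb relies on two repo helpers not
# shown here: relax_softmax (a relaxed / straight-through sampler) and
# Embed.expectation (the expected embedding under a distribution). The
# sketch below shows the standard equivalents with F.gumbel_softmax; it is
# illustrative only and may differ from the repo's actual implementations.
vocab, dim, tau = 10, 4, 0.5
embedding = nn.Embedding(vocab, dim)
step_logits = torch.randn(2, 1, vocab)        # (batch, 1, vocab)

# relaxed one-hot sample; hard=True would give a straight-through sample,
# discrete in the forward pass but differentiable in the backward pass
dist = F.gumbel_softmax(step_logits, tau=tau, hard=False)

# expected ("fuzzy") embedding: a convex combination of all token
# embeddings, weighted by the relaxed distribution over the vocabulary
e_i = dist @ embedding.weight                 # (batch, 1, dim)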

class AttSeqDecoder(nn.Module):
    def __init__(self, trg_ntokens, enc_size, **kwargs):
        super(AttSeqDecoder, self).__init__()

        ############################################
        # Attributes
        ############################################
        self.trg_ntokens = trg_ntokens
        emb_size = kwargs.get("emb_size", 100)
        embed_noise = kwargs.get("embed_noise", .0)
        embed_dropout = kwargs.get("embed_dropout", .0)
        rnn_size = kwargs.get("rnn_size", 100)
        rnn_layers = kwargs.get("rnn_layers", 1)
        rnn_dropout = kwargs.get("rnn_dropout", .0)
        tie_weights = kwargs.get("tie_weights", False)
        attention_fn = kwargs.get("attention_fn", "general")
        self.input_feeding = kwargs.get("input_feeding", False)
        self.learn_tau = kwargs.get("learn_tau", False)
        self.length_control = kwargs.get("length_control", False)
        self.gumbel = kwargs.get("gumbel", False)
        self.out_non_linearity = kwargs.get("out_non_linearity", None)
        self.layer_norm = kwargs.get("layer_norm", None)
        self.input_feeding_learnt = kwargs.get("input_feeding_learnt", False)

        ############################################
        # Layers
        ############################################
        self.embed = Embed(trg_ntokens, emb_size,
                           noise=embed_noise,
                           dropout=embed_dropout)

        # the output size of the ho token: ho = [ h || c]
        if tie_weights:
            self.ho_size = emb_size
        else:
            self.ho_size = rnn_size

        dec_input_size = emb_size
        if self.input_feeding:
            dec_input_size += self.ho_size
        if self.length_control:
            dec_input_size += 2
            # length scaling parameter
            self.W_tick = nn.Parameter(torch.rand(1))

        self.rnn = nn.LSTM(input_size=dec_input_size,
                           hidden_size=rnn_size,
                           num_layers=rnn_layers,
                           batch_first=True)

        self.rnn_dropout = nn.Dropout(rnn_dropout)

        self.attention = Attention(enc_size, rnn_size, method=attention_fn)

        # learnt temperature parameter
        if self.learn_tau:
            self.softplus = nn.Sequential(
                nn.Linear(self.ho_size, 1, bias=False),
                nn.Softplus())
            self.tau_0 = kwargs.get("tau_0", 1)

        # initial input feeding
        if self.input_feeding_learnt:
            self.Wi = nn.Linear(enc_size, self.ho_size)

        # source context-aware output projection
        self.Wc = nn.Linear(rnn_size + enc_size, self.ho_size)

        # projection layer to the vocabulary
        self.Wo = nn.Linear(self.ho_size, trg_ntokens)

        if self.layer_norm:
            self.norm_ctx = nn.LayerNorm(self.ho_size)
            if self.input_feeding_learnt:
                self.norm_input_feed = nn.LayerNorm(self.ho_size)

        if tie_weights:
            # if rnn_size != emb_size:
            #     raise ValueError("if `tie_weights` is True,"
            #                      "emb_size has to be equal to rnn_size")
            self.Wo.weight = self.embed.embedding.weight
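
# Weight-tying note: `self.Wo.weight = self.embed.embedding.weight` makes
# the output projection and the embedding table share a single Parameter,
# which requires ho_size == emb_size (guaranteed above, since ho_size is
# set to emb_size when tie_weights is True). A standalone illustration:
tied_emb = nn.Embedding(1000, 128)            # weight: (1000, 128)
tied_proj = nn.Linear(128, 1000)              # weight: (1000, 128)
tied_proj.weight = tied_emb.weight            # now one shared Parameter
assert tied_proj.weight.data_ptr() == tied_emb.weight.data_ptr()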