def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None):
        discriminator_hidden_states = self.electra(input_ids, attention_mask, token_type_ids)

        #print(discriminator_hidden_states.shape) 

        # (batch_size, max_length, hidden_size)
        discriminator_hidden_states = discriminator_hidden_states[0]  # last hidden state (top layer)

        # (batch_size, max_length, hidden_size) -> (batch_size, max_length, 2 * lstm_hidden_size) for the bidirectional LSTM
        lstm_output, (hidden, cell) = self.biLSTM(discriminator_hidden_states)
        #cls_output = lstm_output[:, 0, :] #[batch, length, hidden]
        cls_output = self.dropout(lstm_output)
  
        # add a channel dimension for Conv2d: (batch_size, 1, max_length, lstm_output_size)
        cls_output.unsqueeze_(1)
        # each conv -> (batch_size, n_filters, max_length - kernel_size + 1)
        conved = [conv(cls_output).squeeze(3) for conv in self.CNN]
        # max-pool over the remaining sequence dimension -> (batch_size, n_filters)
        pooled = [F.max_pool1d(conv, conv.size(2)).squeeze(2) for conv in conved]
        # (batch_size, n_filters * number of kernel sizes)
        concated = torch.cat(pooled, dim=1)
        cls_output = self.linear_cnn(concated)

        # (batch_size, hidden_size) -> (batch_size, hidden_size)
        cls_output = self.linear_1(cls_output)
        cls_output = get_activation("gelu")(cls_output)
        cls_output = self.dropout(cls_output)

        # (batch_size, hidden_size) -> (batch_size, num_labels)
        cls_output = self.linear_2(cls_output)

        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(cls_output, labels)  # loss_fct(predictions: 2-D logits (batch, num_labels), targets: 1-D class indices (batch,))

            return loss, self.softmax(cls_output)
        else:
            return self.softmax(cls_output)
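This forward pass only runs if the constructor defines the modules it references (self.electra, self.biLSTM, self.CNN, self.linear_cnn, self.linear_1, self.linear_2, self.dropout, self.softmax). Below is a minimal sketch of such a constructor; the class name, checkpoint, and all sizes (lstm_hidden, n_filters, kernel_sizes) are assumptions inferred from the forward pass, not taken from the original code base.

import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import ElectraModel
from transformers.activations import get_activation  # assumed source of get_activation("gelu")

class ElectraBiLSTMCNNClassifier(nn.Module):  # hypothetical name
    def __init__(self, model_name="google/electra-base-discriminator",
                 lstm_hidden=256, n_filters=100, kernel_sizes=(3, 4, 5),
                 num_labels=2, dropout=0.1):
        super().__init__()
        self.electra = ElectraModel.from_pretrained(model_name)
        hidden_size = self.electra.config.hidden_size
        # bidirectional LSTM: output feature size is 2 * lstm_hidden
        self.biLSTM = nn.LSTM(hidden_size, lstm_hidden,
                              batch_first=True, bidirectional=True)
        # one Conv2d per kernel size, sliding over the sequence dimension
        self.CNN = nn.ModuleList(
            [nn.Conv2d(1, n_filters, (k, 2 * lstm_hidden)) for k in kernel_sizes])
        self.linear_cnn = nn.Linear(n_filters * len(kernel_sizes), hidden_size)
        self.linear_1 = nn.Linear(hidden_size, hidden_size)
        self.linear_2 = nn.Linear(hidden_size, num_labels)
        self.dropout = nn.Dropout(dropout)
        self.softmax = nn.Softmax(dim=-1)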
Example #2
def activate(var, method):
    """
    An activation function.
    :param var: input var
    :param method: type of activation, such as `relu`, `tanh`, `sigmoid`
    :return: the result of applying the chosen activation to `var`
    """
    from activations import get_activation
    return get_activation(method)(var)
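A hypothetical call site, assuming the project-local activations module resolves these names to element-wise callables that accept array-like input (this is not a specific library API):

import numpy as np

x = np.array([-2.0, 0.0, 3.0])
relu_out = activate(x, "relu")   # expected: array([0., 0., 3.]) under the assumption above
tanh_out = activate(x, "tanh")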
Example #3
    def forward(self, features, **kwargs):
        x = features[:, 0, :]  # take <s> token (equiv. to [CLS])
        x = self.dropout(x)
        x = self.dense(x)
        x = get_activation("gelu")(x)  # although BERT uses tanh here, it seems Electra authors used gelu here
        x = self.dropout(x)
        x = self.out_proj(x)
        return x
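The attributes used here (self.dense, self.dropout, self.out_proj) imply a sentence-classification head; a minimal sketch of the matching constructor, where the config field names are assumed to follow the usual transformers conventions:

import torch.nn as nn

class ClassificationHead(nn.Module):  # hypothetical name
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)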
Example #4
    def get_train_output(self, input):
        scores = T.dot(input, self.W)
        if self.use_bias:
            scores += self.b
        output = activations.get_activation(
            activ_type=self.activation, x=scores,
            leak_slope=self.leak_slope, clip_threshold=self.clip_threshold)
        return output
Example #5
    def __init__(self, rng, input, n_in, n_hidden, n_out, activations="tanh"):
        """Initialize the parameters for the multilayer perceptron

        :type rng: numpy.random.RandomState
        :param rng: a random number generator used to initialize weights

        :type input: theano.tensor.TensorType
        :param input: symbolic variable that describes the input of the
        architecture (one minibatch)

        :type n_in: int
        :param n_in: number of input units, the dimension of the space in
        which the datapoints lie

        :type n_hidden: int
        :param n_hidden: number of hidden units

        :type n_out: int
        :param n_out: number of output units, the dimension of the space in
        which the labels lie

        """

        # Since we are dealing with a one hidden layer MLP, this will translate
        # into a HiddenLayer with a tanh activation function connected to the
        # LogisticRegression layer; the activation function can be replaced by
        # sigmoid or any other nonlinear function
        activation = act.get_activation(activations)
        self.hiddenLayer = HiddenLayer(rng=rng, input=input, n_in=n_in,
                                       n_out=n_hidden, activation=activation)

        # The logistic regression layer gets as input the hidden units
        # of the hidden layer
        self.logRegressionLayer = LogisticRegression(
            input=self.hiddenLayer.output, n_in=n_hidden, n_out=n_out)

        self.L1 = (abs(self.hiddenLayer.W).sum()
                   + abs(self.logRegressionLayer.W).sum())

        self.L2_sqr = ((self.hiddenLayer.W ** 2).sum()
                       + (self.logRegressionLayer.W ** 2).sum())

        # negative log likelihood of the MLP
        self.negative_log_likelihood = (
            self.logRegressionLayer.negative_log_likelihood
        )
        # same holds for the function computing the number of errors
        self.errors = self.logRegressionLayer.errors

        # the parameters of the model are the parameters of the two layers
        # it is made out of
        self.params = self.hiddenLayer.params + self.logRegressionLayer.params
        # end-snippet-3

        # keep track of model input
        self.input = input
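Assuming the surrounding class follows the classic Theano-tutorial MLP (called MLP here), and that HiddenLayer and LogisticRegression come from the same code base, a hedged sketch of how the attributes defined above are typically combined into a regularized training objective:

import numpy as np
import theano.tensor as T

x = T.matrix("x")    # minibatch of inputs
y = T.ivector("y")   # integer class labels
rng = np.random.RandomState(1234)

# "MLP" is the assumed name of the class whose __init__ is shown above
classifier = MLP(rng=rng, input=x, n_in=28 * 28, n_hidden=500, n_out=10,
                 activations="tanh")

# L1/L2-regularized negative log likelihood, built from the exposed attributes
L1_reg, L2_reg = 0.0, 1e-4
cost = (classifier.negative_log_likelihood(y)
        + L1_reg * classifier.L1
        + L2_reg * classifier.L2_sqr)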
Example #6
    def forward(self, generator_hidden_states):
        hidden_states = self.dense(generator_hidden_states)
        hidden_states = get_activation("gelu")(hidden_states)
        hidden_states = self.LayerNorm(hidden_states)

        return hidden_states
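For reference, a minimal sketch of the constructor this generator-predictions forward() implies; the attribute names come from the forward pass, while the hidden_size-to-embedding_size projection mirrors the ELECTRA design and is an assumption here:

import torch.nn as nn

class GeneratorPredictions(nn.Module):  # hypothetical name
    def __init__(self, config):
        super().__init__()
        # ELECTRA projects hidden states back down to the embedding size
        self.dense = nn.Linear(config.hidden_size, config.embedding_size)
        self.LayerNorm = nn.LayerNorm(config.embedding_size)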
Example #7
    def forward(self, discriminator_hidden_states):
        hidden_states = self.dense(discriminator_hidden_states)
        hidden_states = get_activation(self.config.hidden_act)(hidden_states)
        logits = self.dense_prediction(hidden_states).squeeze()

        return logits
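And the analogous assumed constructor for the discriminator head, which scores every token with a single replaced-token logit, so dense_prediction maps hidden_size to 1 and the squeeze() in the forward drops that trailing dimension (note that a bare squeeze() would also drop a batch dimension of size 1):

import torch.nn as nn

class DiscriminatorPredictions(nn.Module):  # hypothetical name
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dense_prediction = nn.Linear(config.hidden_size, 1)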
Example #8
    def __init__(self, activation, **params):
        self.activation_name = activation
        self.activation = get_activation(self.activation_name)
        self._last_input = None
        super(Activation, self).__init__(**params)
Example #9
    def __init__(self, rng, input, n_in, n_hidden, n_out, activations="tanh",
                 use_bias=True, dropout=False, dropout_rate=0):
        """Initialize the parameters for the multilayer perceptron

        :type rng: numpy.random.RandomState
        :param rng: a random number generator used to initialize weights

        :type input: theano.tensor.TensorType
        :param input: symbolic variable that describes the input of the
        architecture (one minibatch)

        :type n_in: int
        :param n_in: number of input units, the dimension of the space in
        which the datapoints lie

        :type n_hidden: int
        :param n_hidden: number of hidden units

        :type n_out: int
        :param n_out: number of output units, the dimension of the space in
        which the labels lie

        """

        # Since we are dealing with a one hidden layer MLP, this will translate
        # into a HiddenLayer connected to the
        # LogisticRegression layer; the activation function can be replaced by
        # sigmoid or any other nonlinear function
        #
        # For Dropout, we basically need to set up two different MLPs
        # - one with dropout layers (used for training) and one for
        # prediction. [Question -- can we get error bounds if we run
        # forward propagation on the random dropout network a bunch of
        # times?  Might these be calibrated probabilities? Probably not...]

        # Not sure if this is necessary -- but just in case for now.
        if not dropout:
            dropout_rate = 0

        activation = act.get_activation(activations)

        next_layer_input = input
        next_dropout_layer_input = _dropout_from_layer(
            rng, input, dropout_rate=dropout_rate)

        next_dropout_layer = DropoutHiddenLayer(
            rng=rng, input=next_dropout_layer_input,
            n_in=n_in, n_out=n_hidden, activation=activation,
            use_bias=use_bias, dropout_rate=dropout_rate)

        next_dropout_layer_input = next_dropout_layer.output

        # Reuse the parameters from the dropout layer here, in a different
        # path through the graph.
        # [Could be a constructor that takes a dropout hidden layer.]
        next_layer = HiddenLayer(
            rng=rng, input=next_layer_input,
            activation=activation,
            # scale the weight matrix W with probability of keeping
            W=next_dropout_layer.W * (1 - dropout_rate),
            b=next_dropout_layer.b,
            n_in=n_in, n_out=n_hidden,
            use_bias=use_bias)

        next_layer_input = next_layer.output

        # Now we set up the logistic regression (i.e. softmax) output
        # layers for the dropout network and the regular network
        self.dropout_output_layer = LogisticRegression(
            input=next_dropout_layer_input, n_in=n_hidden, n_out=n_out)

        self.output_layer = LogisticRegression(
            input=next_layer_input, n_in=n_hidden, n_out=n_out,
            W=self.dropout_output_layer.W * (1-dropout_rate),
            b=self.dropout_output_layer.b)

        # self.L1 = (abs(self.hiddenLayer.W).sum()
        #            + abs(self.logRegressionLayer.W).sum())
        # self.L2_sqr = ((self.hiddenLayer.W ** 2).sum()
        #                + (self.logRegressionLayer.W ** 2).sum())

        self.dropout_nll = self.dropout_output_layer.negative_log_likelihood
        self.dropout_errors = self.dropout_output_layer.errors
        self.nll = self.output_layer.negative_log_likelihood
        self.errors = self.output_layer.errors

        # The parameters for dropout and non-dropout are the same, but
        # we need to add the ones in the dropout layers, because those
        # are the shared variables... the ones in next_layer are
        # derived versions.
        self.params = self.dropout_output_layer.params + next_dropout_layer.params

        # keep track of model input
        self.input = input
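As with Example #5, a hedged usage sketch, assuming the surrounding class is a dropout MLP (called DropoutMLP here) and using the separate dropout and non-dropout objectives it exposes: train on dropout_nll through the dropout path, evaluate with errors from the rescaled network.

import numpy as np
import theano
import theano.tensor as T

x = T.matrix("x")
y = T.ivector("y")
rng = np.random.RandomState(1234)

# "DropoutMLP" is the assumed name of the class whose __init__ is shown above
classifier = DropoutMLP(rng=rng, input=x, n_in=28 * 28, n_hidden=500,
                        n_out=10, activations="tanh",
                        use_bias=True, dropout=True, dropout_rate=0.5)

learning_rate = 0.01
cost = classifier.dropout_nll(y)  # train the dropout path
gparams = [T.grad(cost, p) for p in classifier.params]
updates = [(p, p - learning_rate * g) for p, g in zip(classifier.params, gparams)]

train_model = theano.function(inputs=[x, y], outputs=cost, updates=updates)
test_errors = theano.function(inputs=[x, y], outputs=classifier.errors(y))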