def forward(self, features, **kwargs):
        """Project encoder features to vocabulary logits.

        Applies the dense transform, gelu activation, and layer norm,
        then decodes back to vocabulary size and adds the output bias.
        """
        hidden = self.layer_norm(gelu(self.dense(features)))
        # Project back to size of vocabulary, with bias.
        return self.decoder(hidden) + self.bias
 def forward(self, sequence_output):
     """Score each sequence position with its own per-position head.

     Applies dense -> gelu -> layer norm, then feeds position ``i`` of
     the normalized hidden states through head ``self.op[i]`` and
     concatenates the per-position scores along dim 1.

     Returns ``None`` when the sequence dimension is empty, matching
     the original loop's behavior.
     """
     x = self.dense(sequence_output)
     x = gelu(x)
     hidden_states = self.layer_norm(x)
     # Collect per-position scores and concatenate once at the end,
     # instead of calling torch.cat inside the loop (which copies the
     # accumulated tensor every iteration — quadratic in seq length).
     per_position = [
         self.op[i](hidden_states[:, i, :].unsqueeze(1))
         for i in range(hidden_states.size(1))
     ]
     if not per_position:
         # Original code left prediction_scores as None for an empty
         # sequence dimension; preserve that contract.
         return None
     return torch.cat(per_position, 1)
 def forward(self, sequence_output):
     """Project sequence output to vocabulary logits.

     Runs dense -> gelu -> layer norm, then the decoder projection
     plus its bias term.
     """
     transformed = gelu(self.dense(sequence_output))
     normalized = self.layer_norm(transformed)
     return self.decoder(normalized) + self.bias