Example #1
    def __init__(self,
                 ntoken,
                 h_dim,
                 emb_dim,
                 nlayers,
                 chunk_size,
                 wdrop=0,
                 dropouth=0.5):
        super(sentence_encoder, self).__init__()
        self.lockdrop = LockedDropout()
        self.hdrop = nn.Dropout(dropouth)
        self.encoder = nn.Embedding(ntoken, emb_dim)
        self.rnn = ONLSTMStack([emb_dim] + [h_dim] * nlayers,
                               chunk_size=chunk_size,
                               dropconnect=wdrop,
                               dropout=dropouth)
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.h_dim = h_dim
        self.emb_dim = emb_dim
        self.nlayers = nlayers
        self.ntoken = ntoken
        self.chunk_size = chunk_size
        self.wdrop = wdrop
        self.dropouth = dropouth
Example #2
    def __init__(self, config):
        super(Bert_withLSTM, self).__init__(config)
        self.num_labels = config.num_labels
        self.bert = BertModel(config)
        self.lstm = ONLSTMStack([config.hidden_size, config.hidden_size],
                                chunk_size=8)
        self.qa_outputs = torch.nn.Linear(config.hidden_size,
                                          config.num_labels)

        self.init_weights()
Example #3
    def __init__(self,
                 rnn_type,
                 ntoken,
                 ninp,
                 nhid,
                 chunk_size,
                 nlayers,
                 dropout=0.5,
                 dropouth=0.5,
                 dropouti=0.5,
                 dropoute=0.1,
                 wdrop=0,
                 tie_weights=False,
                 args=None):
        super(GPTRNNModel, self).__init__()
        self.transformer = OpenAIGPTModel.from_pretrained('openai-gpt')
        config = OpenAIGPTConfig()
        self.lm_head = OpenAIGPTLMHead(self.transformer.tokens_embed.weight,
                                       config)
        self.lockdrop = LockedDropout()
        self.idrop = nn.Dropout(dropouti)
        self.hdrop = nn.Dropout(dropouth)
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Linear(768, ninp)
        self.args = args

        assert rnn_type in ['LSTM'], 'RNN type is not supported'
        self.rnn = ONLSTMStack([ninp] + [nhid] * (nlayers - 1) + [ninp],
                               chunk_size=chunk_size,
                               dropconnect=wdrop,
                               dropout=dropouth)
        self.decoder = nn.Linear(ninp, ntoken)

        # Optionally tie weights as in:
        # "Using the Output Embedding to Improve Language Models" (Press & Wolf 2016)
        # https://arxiv.org/abs/1608.05859
        # and
        # "Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling" (Inan et al. 2016)
        # https://arxiv.org/abs/1611.01462
        # if tie_weights:
        #     #if nhid != ninp:
        #     #    raise ValueError('When using the tied flag, nhid must be equal to emsize')
        #     self.decoder.weight = self.encoder.weight
        self.rnn_type = rnn_type
        self.ninp = ninp
        self.nhid = nhid
        self.nlayers = nlayers
        self.dropout = dropout
        self.dropouti = dropouti
        self.dropouth = dropouth
        self.dropoute = dropoute
        self.distance = None
        self.tie_weights = tie_weights
Example #4
    def __init__(self,
                 rnn_type,
                 ntoken,
                 ninp,
                 nhid,
                 chunk_size,
                 nlayers,
                 wds='no',
                 dropout=0.5,
                 dropouth=0.5,
                 dropouti=0.5,
                 dropoute=0.1,
                 wdrop=0,
                 tie_weights=False,
                 l4d=0):
        super(RNNModel, self).__init__()
        self.lockdrop = LockedDropout()
        self.idrop = nn.Dropout(dropouti)
        self.hdrop = nn.Dropout(dropouth)
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp)
        assert rnn_type in ['LSTM'], 'RNN type is not supported'
        self.rnn = ONLSTMStack([ninp] + [nhid] * (nlayers - 1) + [ninp],
                               l4d=l4d,
                               chunk_size=chunk_size,
                               wds=wds,
                               dropconnect=wdrop,
                               dropout=dropouth)
        self.decoder = nn.Linear(ninp, ntoken)

        # Optionally tie weights as in:
        # "Using the Output Embedding to Improve Language Models" (Press & Wolf 2016)
        # https://arxiv.org/abs/1608.05859
        # and
        # "Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling" (Inan et al. 2016)
        # https://arxiv.org/abs/1611.01462
        if tie_weights:
            # if nhid != ninp:
            #    raise ValueError('When using the tied flag, nhid must be equal to emsize')
            self.decoder.weight = self.encoder.weight

        self.init_weights()

        self.rnn_type = rnn_type
        self.ninp = ninp
        self.nhid = nhid
        self.nlayers = nlayers
        self.dropout = dropout
        self.dropouti = dropouti
        self.dropouth = dropouth
        self.dropoute = dropoute
        self.tie_weights = tie_weights
Example #5
    def __init__(self, word2vec, embedding_size, hidden_state_size, layer_size, sentence_embedding_size,
                 chunk_size, batch_size, gcn_hidden_size, gcn_output_size, dropout=0.5):
        super(ModelEncoder, self).__init__()

        self.word2vec = word2vec
        self.batch_size = batch_size

        self.gcn_input_size = embedding_size
        self.gcn_hidden_size = gcn_hidden_size
        self.gcn_output_size = gcn_output_size
        self.dropout = dropout

        self.encoder = ONLSTMStack([embedding_size] + [hidden_state_size] * (layer_size - 1) + [sentence_embedding_size], chunk_size)
Example #6
class ModelEncoder(nn.Module):
    def __init__(self, word2vec, embedding_size, hidden_state_size, layer_size, sentence_embedding_size,
                 chunk_size, batch_size, gcn_hidden_size, gcn_output_size, dropout=0.5):
        super(ModelEncoder, self).__init__()
        
        self.word2vec = word2vec
        self.batch_size = batch_size
        
        self.gcn_input_size = embedding_size
        self.gcn_hidden_size = gcn_hidden_size
        self.gcn_output_size = gcn_output_size
        self.dropout = dropout
        
        self.encoder = ONLSTMStack([embedding_size] + [hidden_state_size] * (layer_size - 1) + [sentence_embedding_size], chunk_size)
    
    def forward(self, sentences, hidden):
        sentences = get_word_embedding(self.word2vec, sentences)
        # sentences size: length * batch_size * word_embedding_size
        raw_output, hidden_cell, raw_outputs, outputs, distance = self.encoder(sentences, hidden)
        # hidden layer_size * batch_size * hidden_state_size
        
        distances = distance[0]
        layer_size, length, batch_size = distances.size()
        distances = distances[-1] # we use the gates of the last layer as the weights of the tree.
        distances = distances.transpose(1, 0)
        
        word_hidden_state = []
        for i in range(batch_size):
            gcn = ONLSTMGraph(distances[i], self.gcn_input_size, self.gcn_hidden_size,
                              self.gcn_output_size)
            word_hidden_state.append(gcn(sentences[i]).tolist())
        return raw_output[-1], word_hidden_state
    def init_hidden(self, batch_size):
        return self.encoder.init_hidden(batch_size)
Example #7
class sentence_encoder(nn.Module):
    ### Take in a sentence; return its encoded embedding and hidden states (for attention).
    def __init__(self,
                 ntoken,
                 h_dim,
                 emb_dim,
                 nlayers,
                 chunk_size,
                 wdrop=0,
                 dropouth=0.5):
        super(sentence_encoder, self).__init__()
        self.lockdrop = LockedDropout()
        self.hdrop = nn.Dropout(dropouth)
        self.encoder = nn.Embedding(ntoken, emb_dim)
        self.rnn = ONLSTMStack([emb_dim] + [h_dim] * nlayers,
                               chunk_size=chunk_size,
                               dropconnect=wdrop,
                               dropout=dropouth)
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.h_dim = h_dim
        self.emb_dim = emb_dim
        self.nlayers = nlayers
        self.ntoken = ntoken
        self.chunk_size = chunk_size
        self.wdrop = wdrop
        self.dropouth = dropouth

    def forward(self, inp_sentence, hidden):
        emb = self.encoder(inp_sentence)
        print('inp sen: ', inp_sentence)
        print('emb: ', emb)
        output, hidden, raw_outputs, outputs, distances = self.rnn(emb, hidden)
        self.distance = distances
        result = output.view(output.size(0) * output.size(1), output.size(2))
        '''
        It seems that 'hidden' holds the final hidden and cell states of the layers,
        'result' is the (2-d) flattened hidden output of the last layer, and
        'outputs' is the per-layer stack of those outputs.
        '''

        return result.permute(0, 1), hidden, raw_outputs, outputs

    def init_hidden(self, bsz):
        return self.rnn.init_hidden(bsz)
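
For orientation, a minimal usage sketch of the encoder above follows. The sizes are illustrative, and it assumes sentence_encoder and its dependencies (ONLSTMStack, LockedDropout) are importable as defined in this example; the ON-LSTM hidden size is expected to be divisible by chunk_size.

import torch

enc = sentence_encoder(ntoken=5000, h_dim=300, emb_dim=300,
                       nlayers=2, chunk_size=10)           # 300 % 10 == 0
sentence = torch.randint(0, 5000, (12, 4))                 # (seq_len, batch) token ids
hidden = enc.init_hidden(4)
result, hidden, raw_outputs, outputs = enc(sentence, hidden)
# result: (seq_len * batch, h_dim) top-layer hidden states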
Example #8
class GPTRNNModel(nn.Module):
    """Container module with an encoder, a recurrent module, and a decoder."""
    def __init__(self,
                 rnn_type,
                 ntoken,
                 ninp,
                 nhid,
                 chunk_size,
                 nlayers,
                 dropout=0.5,
                 dropouth=0.5,
                 dropouti=0.5,
                 dropoute=0.1,
                 wdrop=0,
                 tie_weights=False,
                 args=None):
        super(GPTRNNModel, self).__init__()
        self.transformer = OpenAIGPTModel.from_pretrained('openai-gpt')
        config = OpenAIGPTConfig()
        self.lm_head = OpenAIGPTLMHead(self.transformer.tokens_embed.weight,
                                       config)
        self.lockdrop = LockedDropout()
        self.idrop = nn.Dropout(dropouti)
        self.hdrop = nn.Dropout(dropouth)
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Linear(768, ninp)
        self.args = args

        assert rnn_type in ['LSTM'], 'RNN type is not supported'
        self.rnn = ONLSTMStack([ninp] + [nhid] * (nlayers - 1) + [ninp],
                               chunk_size=chunk_size,
                               dropconnect=wdrop,
                               dropout=dropouth)
        self.decoder = nn.Linear(ninp, ntoken)

        # Optionally tie weights as in:
        # "Using the Output Embedding to Improve Language Models" (Press & Wolf 2016)
        # https://arxiv.org/abs/1608.05859
        # and
        # "Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling" (Inan et al. 2016)
        # https://arxiv.org/abs/1611.01462
        # if tie_weights:
        #     #if nhid != ninp:
        #     #    raise ValueError('When using the tied flag, nhid must be equal to emsize')
        #     self.decoder.weight = self.encoder.weight
        self.rnn_type = rnn_type
        self.ninp = ninp
        self.nhid = nhid
        self.nlayers = nlayers
        self.dropout = dropout
        self.dropouti = dropouti
        self.dropouth = dropouth
        self.dropoute = dropoute
        self.distance = None
        self.tie_weights = tie_weights

    def reset(self):
        if self.rnn_type == 'QRNN': [r.reset() for r in self.rnns]

    def init_weights(self, pre_emb):
        initrange = 0.1
        # self.encoder.weight.data.uniform_(-initrange, initrange)
        # if pre_emb is not None:
        #     self.encoder.weight.data[:pre_emb.size(0), :pre_emb.size(1)] = torch.FloatTensor(pre_emb)

        self.decoder.bias.data.fill_(0)
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, input, hidden, gpt_ids, fl_ids, return_h=False):
        if self.args.feature is not None and 'fixGPT' in self.args.feature.split('_'):
            with torch.no_grad():
                emb = self.transformer(gpt_ids)
        else:
            emb = self.transformer(gpt_ids)  # BS * GPT_SL * GPT_EMS

        lm_logits = self.lm_head(emb)
        # shift_logits = lm_logits[..., :-1, :].contiguous()
        emb = torch.cat(
            [emb[r:r + 1, fl_ids[r], :] for r in range(len(fl_ids))],
            dim=0)  # BS * (2*SL) * GPT_ES
        emb = torch.nn.functional.avg_pool1d(emb.permute(0, 2, 1),
                                             2) * 2  # BS * GPT_EMS * SL
        emb = emb.permute(2, 0, 1)  # BS * SL * GPT_EMS -> SL * BS * ES
        self.encoder = embedded_dropout_gpt(
            self.encoder, dropout=self.dropoute if self.training else 0)
        emb = nn.functional.relu(self.encoder(emb))
        emb = self.lockdrop(emb, self.dropouti)

        raw_output, hidden, raw_outputs, outputs, self.distance = self.rnn(
            emb, hidden)

        output = self.lockdrop(raw_output, self.dropout)

        result = output.view(output.size(0) * output.size(1), output.size(2))
        if return_h:
            return result, hidden, raw_outputs, outputs, lm_logits.view(
                -1, lm_logits.size(-1))
        else:
            return result, hidden

    def init_hidden(self, bsz):
        return self.rnn.init_hidden(bsz)
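
As a hedged reference, an instantiation-only sketch follows (values are illustrative). It assumes the same OpenAIGPTModel/OpenAIGPTLMHead imports this snippet already relies on, and it skips the forward pass, which additionally needs GPT token ids (gpt_ids), first/last-subword indices (fl_ids), and an args object with a feature attribute from the surrounding pipeline.

# Downloads the pretrained GPT weights on first use.
model = GPTRNNModel('LSTM', ntoken=10000, ninp=400, nhid=1150,
                    chunk_size=10, nlayers=3, wdrop=0.5)
hidden = model.init_hidden(20)   # one (h, c) state per ON-LSTM layer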
Example #9
class RNNModel(nn.Module):
    """Container module with an encoder, a recurrent module, and a decoder."""

    def __init__(self, rnn_type, ntoken, ninp, nhid, chunk_size, nlayers, dropout=0.5, dropouth=0.5, dropouti=0.5, dropoute=0.1, wdrop=0, tie_weights=False):
        super(RNNModel, self).__init__()
        self.lockdrop = LockedDropout()
        self.idrop = nn.Dropout(dropouti)
        self.hdrop = nn.Dropout(dropouth)
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp)
        assert rnn_type in ['LSTM'], 'RNN type is not supported'
        self.rnn = ONLSTMStack(
            [ninp] + [nhid] * (nlayers - 1) + [ninp],
            chunk_size=chunk_size,
            dropconnect=wdrop,
            dropout=dropouth
        )

        # self.decoder = nn.Linear(ninp, ntoken)

        self.prob = nn.Linear(1, 15)

        # Optionally tie weights as in:
        # "Using the Output Embedding to Improve Language Models" (Press & Wolf 2016)
        # https://arxiv.org/abs/1608.05859
        # and
        # "Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling" (Inan et al. 2016)
        # https://arxiv.org/abs/1611.01462
        # if tie_weights:
        #     #if nhid != ninp:
        #     #    raise ValueError('When using the tied flag, nhid must be equal to emsize')
        #     self.decoder.weight = self.encoder.weight

        self.init_weights()

        self.rnn_type = rnn_type
        self.ninp = ninp
        self.nhid = nhid
        self.nlayers = nlayers
        self.dropout = dropout
        self.dropouti = dropouti
        self.dropouth = dropouth
        self.dropoute = dropoute
        self.tie_weights = tie_weights

        self.embedding = nn.Linear(768, 400)
        self.drop_out = nn.Dropout(p=dropoute)
        # self.linear = nn.Linear(400, ntoken)
        
        self.sen_out = nn.Sequential(
            nn.Conv1d(10, 5, 3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv1d(5, 1, 3, stride=1, padding=1),
        )
        self.result = nn.Sequential(
            nn.Conv1d(19, 8, 3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv1d(8, 1, 3, stride=1, padding=1),
        )
        self.word_rnn = ONLSTMStack(
            [ninp] + [nhid] * (nlayers - 1) + [ninp],
            chunk_size=chunk_size,
            dropconnect=wdrop,
            dropout=dropouth
        )


    def reset(self):
        if self.rnn_type == 'QRNN': [r.reset() for r in self.rnns]

    def init_weights(self):
        initrange = 0.1
        # self.encoder.weight.data.uniform_(-initrange, initrange)
        # self.decoder.bias.data.fill_(0)
        # self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, inputs, cand_ids, hidden, hidd, hidd_cand):
        ## suppose batch_size = 80
        inputs_ = inputs.view(inputs.size(0)*inputs.size(1), inputs.size(2)).transpose(0, 1)  #[80, 15, 10] -> [10, 1200]
        cand_ids_ = cand_ids.view(cand_ids.size(0)*cand_ids.size(1), cand_ids.size(2)).transpose(0, 1)   #[80, 4, 10] -> [10, 320]

        emb = embedded_dropout(
            self.encoder, inputs_,
            dropout=self.dropoute if self.training else 0
        )
        emb = self.lockdrop(emb, self.dropouti)   #[10, 1200, 400]
        emb_out, _, _, _, _ = self.word_rnn(emb, hidd)     #[10, 1200, 400]
        sen_emb = self.sen_out(emb_out.permute(1, 0, 2))  #[1200, 1, 400]
        sen_emb = sen_emb.view(inputs.size(0), inputs.size(1), -1).transpose(0, 1) #[80, 15, 400] -> [15, 80, 400]

        cand_emb = embedded_dropout(
            self.encoder, cand_ids_,
            dropout=self.dropoute if self.training else 0
        )
        cand_emb = self.lockdrop(cand_emb, self.dropouti)   #[10, 320, 400]
        cand_emb_out, _, _, _, _ = self.word_rnn(cand_emb, hidd_cand)     #[10, 320, 400]
        cand_sen_emb = self.sen_out(cand_emb_out.permute(1, 0, 2))   #[320, 1, 400]
        cand_sen_emb = cand_sen_emb.view(cand_ids.size(0), cand_ids.size(1), -1).transpose(0, 1)  #[4, 80, 400]       

        ##1. language modeling

        # raw_output, hidden, raw_outputs, outputs, distances = self.rnn(emb, hidden)
        # self.distance = distances

        # output = self.lockdrop(raw_output, self.dropout)
        # result = output.view(output.size(0)*output.size(1), output.size(2))
        # result_prob = self.decoder(result)

        ##2. classification
        raw_output, hidden, raw_outputs, outputs, distances = self.rnn(sen_emb, hidden)
        self.distance = distances

        output = self.lockdrop(raw_output, self.dropout)

        output = output.permute(1, 0, 2)
        result = self.result(output)
        cand_scores = torch.matmul(result, cand_sen_emb.permute(1, 2, 0)).squeeze(1)

        return result, cand_scores, hidden, raw_outputs, outputs, cand_emb

    def init_hidden(self, bsz):
        return self.rnn.init_hidden(bsz)
Example #10
    def __init__(self, rnn_type, ntoken, ninp, nhid, chunk_size, nlayers, dropout=0.5, dropouth=0.5, dropouti=0.5, dropoute=0.1, wdrop=0, tie_weights=False):
        super(RNNModel, self).__init__()
        self.lockdrop = LockedDropout()
        self.idrop = nn.Dropout(dropouti)
        self.hdrop = nn.Dropout(dropouth)
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp)
        assert rnn_type in ['LSTM'], 'RNN type is not supported'
        self.rnn = ONLSTMStack(
            [ninp] + [nhid] * (nlayers - 1) + [ninp],
            chunk_size=chunk_size,
            dropconnect=wdrop,
            dropout=dropouth
        )

        # self.decoder = nn.Linear(ninp, ntoken)

        self.prob = nn.Linear(1, 15)

        # Optionally tie weights as in:
        # "Using the Output Embedding to Improve Language Models" (Press & Wolf 2016)
        # https://arxiv.org/abs/1608.05859
        # and
        # "Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling" (Inan et al. 2016)
        # https://arxiv.org/abs/1611.01462
        # if tie_weights:
        #     #if nhid != ninp:
        #     #    raise ValueError('When using the tied flag, nhid must be equal to emsize')
        #     self.decoder.weight = self.encoder.weight

        self.init_weights()

        self.rnn_type = rnn_type
        self.ninp = ninp
        self.nhid = nhid
        self.nlayers = nlayers
        self.dropout = dropout
        self.dropouti = dropouti
        self.dropouth = dropouth
        self.dropoute = dropoute
        self.tie_weights = tie_weights

        self.embedding = nn.Linear(768, 400)
        self.drop_out = nn.Dropout(p=dropoute)
        # self.linear = nn.Linear(400, ntoken)
        
        self.sen_out = nn.Sequential(
            nn.Conv1d(10, 5, 3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv1d(5, 1, 3, stride=1, padding=1),
        )
        self.result = nn.Sequential(
            nn.Conv1d(19, 8, 3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv1d(8, 1, 3, stride=1, padding=1),
        )
        self.word_rnn = ONLSTMStack(
            [ninp] + [nhid] * (nlayers - 1) + [ninp],
            chunk_size=chunk_size,
            dropconnect=wdrop,
            dropout=dropouth
        )
Example #11
class RNNModel(nn.Module):
    """Container module with an encoder, a recurrent module, and a decoder."""

    def __init__(self, rnn_type, ntoken, ninp, nhid, chunk_size, nlayers, dropout=0.5, dropouth=0.5, dropouti=0.5, dropoute=0.1, wdrop=0, tie_weights=False):
        super(RNNModel, self).__init__()
        self.lockdrop = LockedDropout()
        self.idrop = nn.Dropout(dropouti)
        self.hdrop = nn.Dropout(dropouth)
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp)
        assert rnn_type in ['LSTM'], 'RNN type is not supported'
        self.rnn = ONLSTMStack(
            [ninp] + [nhid] * (nlayers - 1) + [ninp],
            chunk_size=chunk_size,
            dropconnect=wdrop,
            dropout=dropouth
        )
        self.decoder = nn.Linear(ninp, ntoken)

        # Optionally tie weights as in:
        # "Using the Output Embedding to Improve Language Models" (Press & Wolf 2016)
        # https://arxiv.org/abs/1608.05859
        # and
        # "Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling" (Inan et al. 2016)
        # https://arxiv.org/abs/1611.01462
        if tie_weights:
            #if nhid != ninp:
            #    raise ValueError('When using the tied flag, nhid must be equal to emsize')
            self.decoder.weight = self.encoder.weight

        self.init_weights()

        self.rnn_type = rnn_type
        self.ninp = ninp
        self.nhid = nhid
        self.nlayers = nlayers
        self.dropout = dropout
        self.dropouti = dropouti
        self.dropouth = dropouth
        self.dropoute = dropoute
        self.tie_weights = tie_weights

    def reset(self):
        if self.rnn_type == 'QRNN': [r.reset() for r in self.rnns]

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.fill_(0)
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, input, hidden, return_h=False):
        emb = embedded_dropout(
            self.encoder, input,
            dropout=self.dropoute if self.training else 0
        )

        emb = self.lockdrop(emb, self.dropouti)

        raw_output, hidden, raw_outputs, outputs, distances = self.rnn(emb, hidden)
        self.distance = distances

        output = self.lockdrop(raw_output, self.dropout)

        result = output.view(output.size(0)*output.size(1), output.size(2))
        if return_h:
            return result, hidden, raw_outputs, outputs
        else:
            return result, hidden

    def init_hidden(self, bsz):
        return self.rnn.init_hidden(bsz)
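
A minimal usage sketch for this language-model variant, with illustrative hyperparameters; it assumes ONLSTMStack, LockedDropout, and embedded_dropout are importable as in the ON-LSTM codebase, and that nhid and ninp are divisible by chunk_size.

import torch

ntoken = 10000
model = RNNModel('LSTM', ntoken, ninp=400, nhid=1150,
                 chunk_size=10, nlayers=3, wdrop=0.5, tie_weights=True)

seq_len, bsz = 70, 20
data = torch.randint(0, ntoken, (seq_len, bsz))   # time-major token ids
hidden = model.init_hidden(bsz)
result, hidden = model(data, hidden)              # result: (seq_len * bsz, ninp)
logits = model.decoder(result)                    # (seq_len * bsz, ntoken) vocabulary scores

After each forward call, the layer-wise gate distances from the ON-LSTM stack are also available on model.distance.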
Example #12
class Bert_withLSTM(BertPreTrainedModel):
    def __init__(self, config):
        super(Bert_withLSTM, self).__init__(config)
        self.num_labels = config.num_labels
        self.bert = BertModel(config)
        self.lstm = ONLSTMStack([config.hidden_size, config.hidden_size],
                                chunk_size=8)
        self.qa_outputs = torch.nn.Linear(config.hidden_size,
                                          config.num_labels)

        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        start_positions=None,
        end_positions=None,
    ):

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )

        sequence_output = outputs[0]
        lstm_in = sequence_output.permute(1, 0, 2)
        _, bsz, _ = lstm_in.size()
        lstm_in = self.lstm(lstm_in, self.lstm.init_hidden(bsz))

        lstm_in = lstm_in[0].permute(1, 0, 2)

        logits = self.qa_outputs(lstm_in)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        outputs = (
            start_logits,
            end_logits,
        ) + outputs[2:]
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, the positions may carry an extra dimension; squeeze it
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # Sometimes the start/end positions are outside our model inputs; we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions.clamp_(0, ignored_index)
            end_positions.clamp_(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2
            outputs = (total_loss, ) + outputs

        return outputs  # (loss), start_logits, end_logits, (hidden_states), (attentions)
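
A hedged usage sketch for the span-prediction head above, with illustrative shapes. It assumes a transformers version compatible with the snippet (which was written against the older tuple-style BertModel outputs) and builds the model from a fresh config rather than pretrained weights; hidden_size defaults to 768, which is divisible by the chunk_size of 8 used by the ON-LSTM stack.

import torch
from transformers import BertConfig

config = BertConfig(num_labels=2)                        # start/end logits
model = Bert_withLSTM(config)                            # random weights here; from_pretrained would load BERT
input_ids = torch.randint(0, config.vocab_size, (2, 32))
attention_mask = torch.ones_like(input_ids)
start_logits, end_logits = model(input_ids=input_ids,
                                 attention_mask=attention_mask)[:2]
# start_logits, end_logits: (batch, seq_len) span scores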