Example #1
0
class FlowQA(nn.Module):
    """Network for the FlowQA Module."""
    def __init__(self, opt, embedding=None, padding_idx=0):
        super(FlowQA, self).__init__()

        # Input size to RNN: word emb + char emb + question emb + manual features
        doc_input_size = 0
        que_input_size = 0

        layers.set_my_dropout_prob(opt['my_dropout_p'])
        layers.set_seq_dropout(opt['do_seq_dropout'])

        if opt['use_wemb']:
            # Word embeddings
            self.embedding = nn.Embedding(opt['vocab_size'],
                                          opt['embedding_dim'],
                                          padding_idx=padding_idx)
            if embedding is not None:
                self.embedding.weight.data = embedding
                if opt['fix_embeddings'] or opt['tune_partial'] == 0:
                    opt['fix_embeddings'] = True
                    opt['tune_partial'] = 0
                    for p in self.embedding.parameters():
                        p.requires_grad = False
                else:
                    assert opt['tune_partial'] < embedding.size(0)
                    fixed_embedding = embedding[opt['tune_partial']:]
                    # a persistent buffer for the nn.Module
                    self.register_buffer('fixed_embedding', fixed_embedding)
                    self.fixed_embedding = fixed_embedding
            embedding_dim = opt['embedding_dim']
            doc_input_size += embedding_dim
            que_input_size += embedding_dim
        else:
            opt['fix_embeddings'] = True
            opt['tune_partial'] = 0

        if opt['CoVe_opt'] > 0:
            self.CoVe = layers.MTLSTM(opt, embedding)
            CoVe_size = self.CoVe.output_size
            doc_input_size += CoVe_size
            que_input_size += CoVe_size
        # NOTE: CoVe_size (and embedding_dim above) are used unconditionally further
        # down, so this code effectively assumes opt['CoVe_opt'] > 0 and opt['use_wemb'].

        if opt['use_elmo']:
            options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json"
            weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5"
            self.elmo = Elmo(options_file, weight_file, 1, dropout=0)
            doc_input_size += 1024
            que_input_size += 1024
        if opt['use_pos']:
            self.pos_embedding = nn.Embedding(opt['pos_size'], opt['pos_dim'])
            doc_input_size += opt['pos_dim']
        if opt['use_ner']:
            self.ner_embedding = nn.Embedding(opt['ner_size'], opt['ner_dim'])
            doc_input_size += opt['ner_dim']

        if opt['do_prealign']:
            self.pre_align = layers.GetAttentionHiddens(
                embedding_dim,
                opt['prealign_hidden'],
                similarity_attention=True)
            doc_input_size += embedding_dim
        if opt['no_em']:
            doc_input_size += opt['num_features'] - 3
        else:
            doc_input_size += opt['num_features']

        # Setup the vector size for [doc, question]
        # they will be modified in the following code
        doc_hidden_size, que_hidden_size = doc_input_size, que_input_size
        print('Initially, the vector_sizes [doc, query] are', doc_hidden_size,
              que_hidden_size)

        flow_size = opt['hidden_size']
        if opt['residual_step']:
            flow_size = flow_size * 2

        # RNN document encoder
        self.doc_rnn1 = layers.StackedBRNN(doc_hidden_size,
                                           opt['hidden_size'],
                                           num_layers=1)
        if opt['flow_attention']:
            self.dialog_flow1 = layers.FlowRNN(
                opt['hidden_size'] * 2,
                opt['hidden_size'],
                num_layers=1,
                rnn_type=nn.GRU,
                bidir=False,
                residual_step=opt['residual_step'],
                cof=opt['cof'])
        else:
            self.dialog_flow1 = layers.StackedBRNN(opt['hidden_size'] * 2,
                                                   opt['hidden_size'],
                                                   num_layers=1,
                                                   rnn_type=nn.GRU,
                                                   bidir=False)

        self.doc_rnn2 = layers.StackedBRNN(opt['hidden_size'] * 2 + flow_size +
                                           CoVe_size,
                                           opt['hidden_size'],
                                           num_layers=1)
        if opt['flow_attention']:
            self.dialog_flow2 = layers.FlowRNN(
                opt['hidden_size'] * 2,
                opt['hidden_size'],
                num_layers=1,
                rnn_type=nn.GRU,
                bidir=False,
                residual_step=opt['residual_step'],
                cof=opt['cof'])
        else:
            self.dialog_flow2 = layers.StackedBRNN(opt['hidden_size'] * 2,
                                                   opt['hidden_size'],
                                                   num_layers=1,
                                                   rnn_type=nn.GRU,
                                                   bidir=False)
        doc_hidden_size = opt['hidden_size'] * 2

        # RNN question encoder
        self.question_rnn, que_hidden_size = layers.RNN_from_opt(
            que_hidden_size,
            opt['hidden_size'],
            opt,
            num_layers=2,
            concat_rnn=opt['concat_rnn'],
            add_feat=CoVe_size)

        # Output sizes of rnn encoders
        print('After Input LSTM, the vector_sizes [doc, query] are [',
              doc_hidden_size, que_hidden_size, '] * 2')

        # Deep inter-attention
        self.deep_attn = layers.DeepAttention(
            opt,
            abstr_list_cnt=2,
            deep_att_hidden_size_per_abstr=opt[
                'deep_att_hidden_size_per_abstr'],
            do_similarity=opt['deep_inter_att_do_similar'],
            word_hidden_size=embedding_dim + CoVe_size,
            no_rnn=True)

        self.deep_attn_rnn, doc_hidden_size = layers.RNN_from_opt(
            self.deep_attn.att_final_size + flow_size,
            opt['hidden_size'],
            opt,
            num_layers=1)
        if opt['flow_attention']:
            self.dialog_flow3 = layers.FlowRNN(
                doc_hidden_size,
                opt['hidden_size'],
                num_layers=1,
                rnn_type=nn.GRU,
                bidir=False,
                residual_step=opt['residual_step'],
                cof=opt['cof'])
        else:
            self.dialog_flow3 = layers.StackedBRNN(doc_hidden_size,
                                                   opt['hidden_size'],
                                                   num_layers=1,
                                                   rnn_type=nn.GRU,
                                                   bidir=False)

        # Question understanding and compression
        self.high_lvl_qrnn, que_hidden_size = layers.RNN_from_opt(
            que_hidden_size * 2,
            opt['hidden_size'],
            opt,
            num_layers=1,
            concat_rnn=True)

        # Self attention on context
        if opt['use_hoc']:
            att_size = doc_hidden_size + 2 * opt['hidden_size'] * 2 + 2 * opt[
                'hidden_size']
        elif opt['residual_step']:
            att_size = doc_hidden_size + 2 * opt['hidden_size'] * 2
        else:
            att_size = doc_hidden_size + 2 * opt['hidden_size'] * 2

        if opt['self_attention_opt'] > 0:
            self.highlvl_self_att = layers.GetAttentionHiddens(
                att_size, opt['deep_att_hidden_size_per_abstr'])
            self.high_lvl_crnn, doc_hidden_size = layers.RNN_from_opt(
                doc_hidden_size * 2 + flow_size,
                opt['hidden_size'],
                opt,
                num_layers=1,
                concat_rnn=False)
            print('Self deep-attention {} rays in {}-dim space'.format(
                opt['deep_att_hidden_size_per_abstr'], att_size))
        elif opt['self_attention_opt'] == 0:
            self.high_lvl_crnn, doc_hidden_size = layers.RNN_from_opt(
                doc_hidden_size + flow_size,
                opt['hidden_size'],
                opt,
                num_layers=1,
                concat_rnn=False)

        print('Before answer span finding, hidden sizes are', doc_hidden_size,
              que_hidden_size)

        # Question merging
        self.self_attn = layers.LinearSelfAttn(que_hidden_size)
        if opt['do_hierarchical_query']:
            self.hier_query_rnn = layers.StackedBRNN(que_hidden_size,
                                                     opt['hidden_size'],
                                                     num_layers=1,
                                                     rnn_type=nn.GRU,
                                                     bidir=False)
            que_hidden_size = opt['hidden_size']

        # Attention for span start/end
        self.get_answer = layers.GetSpanStartEnd(doc_hidden_size,
                                                 que_hidden_size, opt,
                                                 opt['ptr_net_indep_attn'],
                                                 opt["ptr_net_attn_type"],
                                                 opt['do_ptr_update'])

        self.ans_type_prediction = layers.BilinearLayer(
            doc_hidden_size * 2, que_hidden_size, opt['answer_type_num'])

        # Store config
        self.opt = opt

    def forward(self,
                x1,
                x1_c,
                x1_f,
                x1_pos,
                x1_ner,
                x1_mask,
                x2_full,
                x2_c,
                x2_full_mask,
                x3=None):
        """Inputs:
        x1 = document word indices             [batch * len_d]
        x1_c = document char indices           [batch * len_d * len_w] or [1]
        x1_f = document word features indices  [batch * q_num * len_d * nfeat]
        x1_pos = document POS tags             [batch * len_d]
        x1_ner = document entity tags          [batch * len_d]
        x1_mask = document padding mask        [batch * len_d]
        x2_full = question word indices        [batch * q_num * len_q]
        x2_c = question char indices           [(batch * q_num) * len_q * len_w]
        x2_full_mask = question padding mask   [batch * q_num * len_q]
        x3 = answer word indices [batch * q_num * len_a]
        """

        # precomputing ELMo is only done for the context (to speed up computation)
        if self.opt['use_elmo'] and self.opt['elmo_batch_size'] > self.opt[
                'batch_size']:  # precomputing ELMo is used
            if x1_c.dim() != 1:  # precomputation is needed
                precomputed_bilm_output = self.elmo._elmo_lstm(x1_c)
                self.precomputed_layer_activations = [
                    t.detach().cpu()
                    for t in precomputed_bilm_output['activations']
                ]
                self.precomputed_mask_with_bos_eos = precomputed_bilm_output[
                    'mask'].detach().cpu()
                self.precomputed_cnt = 0

            # get precomputed ELMo
            layer_activations = [
                t[x1.size(0) * self.precomputed_cnt:x1.size(0) *
                  (self.precomputed_cnt + 1), :, :]
                for t in self.precomputed_layer_activations
            ]
            mask_with_bos_eos = self.precomputed_mask_with_bos_eos[
                x1.size(0) * self.precomputed_cnt:x1.size(0) *
                (self.precomputed_cnt + 1), :]
            if x1.is_cuda:
                layer_activations = [t.cuda() for t in layer_activations]
                mask_with_bos_eos = mask_with_bos_eos.cuda()

            representations = []
            for i in range(len(self.elmo._scalar_mixes)):
                scalar_mix = getattr(self.elmo, 'scalar_mix_{}'.format(i))
                representation_with_bos_eos = scalar_mix(
                    layer_activations, mask_with_bos_eos)
                representation_without_bos_eos, mask_without_bos_eos = remove_sentence_boundaries(
                    representation_with_bos_eos, mask_with_bos_eos)
                representations.append(
                    self.elmo._dropout(representation_without_bos_eos))

            x1_elmo = representations[0][:, :x1.size(1), :]
            self.precomputed_cnt += 1

            precomputed_elmo = True
        else:
            precomputed_elmo = False
        """
        x1_full = document word indices        [batch * q_num * len_d]
        x1_full_mask = document padding mask   [batch * q_num * len_d]
        """
        x1_full = x1.unsqueeze(1).expand(x2_full.size(0), x2_full.size(1),
                                         x1.size(1)).contiguous()
        x1_full_mask = x1_mask.unsqueeze(1).expand(x2_full.size(0),
                                                   x2_full.size(1),
                                                   x1.size(1)).contiguous()

        drnn_input_list, qrnn_input_list = [], []

        x2 = x2_full.view(-1, x2_full.size(-1))
        x2_mask = x2_full_mask.view(-1, x2_full.size(-1))

        if self.opt['use_wemb']:
            # Word embedding for both document and question
            emb = self.embedding if self.training else self.eval_embed
            x1_emb = emb(x1)
            x2_emb = emb(x2)
            # Dropout on embeddings
            if self.opt['dropout_emb'] > 0:
                x1_emb = layers.dropout(x1_emb,
                                        p=self.opt['dropout_emb'],
                                        training=self.training)
                x2_emb = layers.dropout(x2_emb,
                                        p=self.opt['dropout_emb'],
                                        training=self.training)

            drnn_input_list.append(x1_emb)
            qrnn_input_list.append(x2_emb)

        if self.opt['CoVe_opt'] > 0:
            x1_cove_mid, x1_cove_high = self.CoVe(x1, x1_mask)
            x2_cove_mid, x2_cove_high = self.CoVe(x2, x2_mask)
            # Dropout on contextualized embeddings
            if self.opt['dropout_emb'] > 0:
                x1_cove_mid = layers.dropout(x1_cove_mid,
                                             p=self.opt['dropout_emb'],
                                             training=self.training)
                x1_cove_high = layers.dropout(x1_cove_high,
                                              p=self.opt['dropout_emb'],
                                              training=self.training)
                x2_cove_mid = layers.dropout(x2_cove_mid,
                                             p=self.opt['dropout_emb'],
                                             training=self.training)
                x2_cove_high = layers.dropout(x2_cove_high,
                                              p=self.opt['dropout_emb'],
                                              training=self.training)

            drnn_input_list.append(x1_cove_mid)
            qrnn_input_list.append(x2_cove_mid)

        if self.opt['use_elmo']:
            if not precomputed_elmo:
                x1_elmo = self.elmo(x1_c)['elmo_representations'][
                    0]  #torch.zeros(x1_emb.size(0), x1_emb.size(1), 1024, dtype=x1_emb.dtype, layout=x1_emb.layout, device=x1_emb.device)
            x2_elmo = self.elmo(x2_c)['elmo_representations'][
                0]  #torch.zeros(x2_emb.size(0), x2_emb.size(1), 1024, dtype=x2_emb.dtype, layout=x2_emb.layout, device=x2_emb.device)
            # Dropout on contextualized embeddings
            if self.opt['dropout_emb'] > 0:
                x1_elmo = layers.dropout(x1_elmo,
                                         p=self.opt['dropout_emb'],
                                         training=self.training)
                x2_elmo = layers.dropout(x2_elmo,
                                         p=self.opt['dropout_emb'],
                                         training=self.training)

            drnn_input_list.append(x1_elmo)
            qrnn_input_list.append(x2_elmo)

        if self.opt['use_pos']:
            x1_pos_emb = self.pos_embedding(x1_pos)
            drnn_input_list.append(x1_pos_emb)

        if self.opt['use_ner']:
            x1_ner_emb = self.ner_embedding(x1_ner)
            drnn_input_list.append(x1_ner_emb)

        x1_input = torch.cat(drnn_input_list, dim=2)
        x2_input = torch.cat(qrnn_input_list, dim=2)

        def expansion_for_doc(z):
            return z.unsqueeze(1).expand(z.size(0), x2_full.size(1), z.size(1),
                                         z.size(2)).contiguous().view(
                                             -1, z.size(1), z.size(2))

        x1_emb_expand = expansion_for_doc(x1_emb)
        x1_cove_high_expand = expansion_for_doc(x1_cove_high)
        #x1_elmo_expand = expansion_for_doc(x1_elmo)
        if self.opt['no_em']:
            x1_f = x1_f[:, :, :, 3:]

        x1_input = torch.cat([
            expansion_for_doc(x1_input),
            x1_f.view(-1, x1_f.size(-2), x1_f.size(-1))
        ],
                             dim=2)
        x1_mask = x1_full_mask.view(-1, x1_full_mask.size(-1))

        if self.opt['do_prealign']:
            x1_atten = self.pre_align(x1_emb_expand, x2_emb, x2_mask)
            x1_input = torch.cat([x1_input, x1_atten], dim=2)

        # === Start processing the dialog ===
        # cur_h: [batch_size * max_qa_pair, context_length, hidden_state]
        # flow : the dialog-flow RNN applied across QA turns
        #        (a standalone sketch of this reshape follows this example)
        def flow_operation(cur_h, flow):
            flow_in = cur_h.transpose(0, 1).view(x1_full.size(2),
                                                 x1_full.size(0),
                                                 x1_full.size(1), -1)
            flow_in = flow_in.transpose(0, 2).contiguous().view(
                x1_full.size(1),
                x1_full.size(0) * x1_full.size(2), -1).transpose(0, 1)
            # [bsz * context_length, max_qa_pair, hidden_state]
            if self.opt['residual_step']:
                flow_out, residual_out = flow(flow_in)
            else:
                flow_out = flow(flow_in)
            # [bsz * context_length, max_qa_pair, flow_hidden_state_dim (hidden_state/2)]
            if self.opt['no_dialog_flow']:
                flow_out = flow_out * 0

            flow_out = flow_out.transpose(0, 1).view(x1_full.size(1),
                                                     x1_full.size(0),
                                                     x1_full.size(2),
                                                     -1).transpose(
                                                         0, 2).contiguous()
            # [bsz * max_qa_pair, context_length, flow_hidden_state_dim]
            flow_out = flow_out.view(x1_full.size(2),
                                     x1_full.size(0) * x1_full.size(1),
                                     -1).transpose(0, 1)
            if self.opt['residual_step']:
                residual_out = residual_out.transpose(0, 1).view(
                    x1_full.size(1), x1_full.size(0), x1_full.size(2),
                    -1).transpose(0, 2).contiguous()
                residual_out = residual_out.view(
                    x1_full.size(2),
                    x1_full.size(0) * x1_full.size(1), -1).transpose(0, 1)
                return flow_out, residual_out
            else:
                return flow_out, None

        # Encode document with RNN
        doc_abstr_ls = []

        doc_hiddens = self.doc_rnn1(x1_input, x1_mask)
        doc_hiddens_flow, residual_flow = flow_operation(
            doc_hiddens, self.dialog_flow1)

        doc_abstr_ls.append(doc_hiddens)
        #doc_hiddens_flow = torch.cat((doc_hiddens_flow, residual_flow), dim=2)

        doc_hiddens = self.doc_rnn2(
            torch.cat((doc_hiddens, doc_hiddens_flow, x1_cove_high_expand),
                      dim=2), x1_mask)
        doc_hiddens_flow, residual_flow = flow_operation(
            doc_hiddens, self.dialog_flow2)

        doc_abstr_ls.append(doc_hiddens)
        #doc_hiddens_flow = torch.cat((doc_hiddens_flow, residual_flow), dim=2)

        #with open('flow_bef_att.pkl', 'wb') as output:
        #    pickle.dump(doc_hiddens_flow, output, pickle.HIGHEST_PROTOCOL)
        #while(1):
        #    pass

        # Encode question with RNN
        _, que_abstr_ls = self.question_rnn(x2_input,
                                            x2_mask,
                                            return_list=True,
                                            additional_x=x2_cove_high)

        # Final question layer
        question_hiddens = self.high_lvl_qrnn(torch.cat(que_abstr_ls, 2),
                                              x2_mask)
        que_abstr_ls += [question_hiddens]

        # Main Attention Fusion Layer
        doc_info = self.deep_attn(
            [torch.cat([x1_emb_expand, x1_cove_high_expand], 2)], doc_abstr_ls,
            [torch.cat([x2_emb, x2_cove_high], 2)], que_abstr_ls, x1_mask,
            x2_mask)

        doc_hiddens = self.deep_attn_rnn(
            torch.cat((doc_info, doc_hiddens_flow), dim=2), x1_mask)
        doc_hiddens_flow, residual_flow = flow_operation(
            doc_hiddens, self.dialog_flow3)

        doc_abstr_ls += [doc_hiddens]
        #doc_hiddens_flow = torch.cat((doc_hiddens_flow, residual_flow), dim=2)
        #if self.opt['residual_step']:
        #doc_abstr_ls.append(residual_flow)

        # Self Attention Fusion Layer
        if self.opt['use_hoc']:
            # handle history of context, considering batch=1
            x1_att = torch.cat(doc_abstr_ls, 2)
            hoc = torch.cat(
                (doc_hiddens[0, :, :].unsqueeze(0), doc_hiddens[:-1, :, :]),
                dim=0)
            x1_att = torch.cat((x1_att, hoc), dim=2)
        else:
            x1_att = torch.cat(doc_abstr_ls, 2)

        if self.opt['self_attention_opt'] > 0:
            highlvl_self_attn_hiddens = self.highlvl_self_att(
                x1_att, x1_att, x1_mask, x3=doc_hiddens, drop_diagonal=True)
            doc_hiddens = self.high_lvl_crnn(
                torch.cat(
                    [doc_hiddens, highlvl_self_attn_hiddens, doc_hiddens_flow],
                    dim=2), x1_mask)
        elif self.opt['self_attention_opt'] == 0:
            doc_hiddens = self.high_lvl_crnn(
                torch.cat([doc_hiddens, doc_hiddens_flow], dim=2), x1_mask)

        doc_abstr_ls += [doc_hiddens]

        # Merge the question hidden vectors
        q_merge_weights = self.self_attn(question_hiddens, x2_mask)
        question_avg_hidden = layers.weighted_avg(question_hiddens,
                                                  q_merge_weights)
        if self.opt['do_hierarchical_query']:
            question_avg_hidden = self.hier_query_rnn(
                question_avg_hidden.view(x1_full.size(0), x1_full.size(1), -1))
            question_avg_hidden = question_avg_hidden.contiguous().view(
                -1, question_avg_hidden.size(-1))

        # Get Start, End span
        start_scores, end_scores = self.get_answer(doc_hiddens,
                                                   question_avg_hidden,
                                                   x1_mask)
        all_start_scores = start_scores.view_as(
            x1_full)  # batch x q_num x len_d
        all_end_scores = end_scores.view_as(x1_full)  # batch x q_num x len_d

        # Get whether there is an answer
        # doc_hiddens: [bsz * max_qa_pair, context_length, hidden_size]
        doc_avg_hidden = torch.cat(
            (torch.max(doc_hiddens, dim=1)[0], torch.mean(doc_hiddens, dim=1)),
            dim=1)
        class_scores = self.ans_type_prediction(doc_avg_hidden,
                                                question_avg_hidden)
        all_class_scores = class_scores.view(x1_full.size(0), x1_full.size(1),
                                             -1)  # batch x q_num x class_num
        all_class_scores = all_class_scores.squeeze(-1)  # when class_num = 1

        return all_start_scores, all_end_scores, all_class_scores
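
A minimal, self-contained sketch of the reshape performed by flow_operation above: context hidden states laid out as [batch * q_num, len_d, hidden] are rearranged so that a recurrent layer can run along the QA-turn (q_num) axis for every context position, then rearranged back. The plain nn.GRU and the toy sizes below are illustrative assumptions; the model itself uses layers.StackedBRNN / layers.FlowRNN for this step.

import torch
import torch.nn as nn

batch, q_num, len_d, hidden = 2, 3, 5, 8             # hypothetical sizes
cur_h = torch.randn(batch * q_num, len_d, hidden)    # [batch*q_num, len_d, hidden]

# [batch*q_num, len_d, hidden] -> [len_d, batch, q_num, hidden]
flow_in = cur_h.transpose(0, 1).view(len_d, batch, q_num, hidden)
# -> [q_num, batch, len_d, hidden] -> [q_num, batch*len_d, hidden] -> [batch*len_d, q_num, hidden]
flow_in = flow_in.transpose(0, 2).contiguous().view(q_num, batch * len_d, hidden)
flow_in = flow_in.transpose(0, 1).contiguous()

# Run a unidirectional GRU along the dialog-turn axis for every context token.
gru = nn.GRU(hidden, hidden, batch_first=True)
flow_out, _ = gru(flow_in)                            # [batch*len_d, q_num, hidden]

# Undo the rearrangement: back to [batch*q_num, len_d, hidden]
flow_out = flow_out.transpose(0, 1).view(q_num, batch, len_d, hidden)
flow_out = flow_out.transpose(0, 2).contiguous().view(len_d, batch * q_num, hidden)
flow_out = flow_out.transpose(0, 1)
print(flow_out.shape)                                 # torch.Size([6, 5, 8])

The round trip returns the tensor to the layout the document RNNs expect, with the GRU state carrying information forward across QA turns.
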
Example #2
0
class FlowQA(nn.Module):
    """Network for the FlowQA Module."""
    def __init__(self, opt, embedding=None, padding_idx=0):
        super(FlowQA, self).__init__()

        # Input size to RNN: word emb + char emb + question emb + manual features
        doc_input_size = 0
        que_input_size = 0

        layers.set_my_dropout_prob(opt['my_dropout_p'])
        layers.set_seq_dropout(opt['do_seq_dropout'])

        if opt['use_wemb']:
            # Word embeddings
            self.embedding = nn.Embedding(opt['vocab_size'],
                                          opt['embedding_dim'],
                                          padding_idx=padding_idx)
            # print('embeddingggg{}'.format(embedding))
            # print('embeddingweight{}'.format(self.embedding.weight))
            if embedding is not None:
                # self.embedding.weight.data = embedding
                self.embedding.weight.data.copy_(embedding)
                # print('embeddingweight{}'.format(self.embedding.weight))
                if opt['fix_embeddings'] or opt['tune_partial'] == 0:
                    opt['fix_embeddings'] = True
                    opt['tune_partial'] = 0
                    for p in self.embedding.parameters():
                        p.requires_grad = False
                else:
                    assert opt['tune_partial'] < embedding.size(0)
                    fixed_embedding = embedding[opt['tune_partial']:]
                    # a persistent buffer for the nn.Module
                    self.register_buffer('fixed_embedding', fixed_embedding)
                    self.fixed_embedding = fixed_embedding
            embedding_dim = opt['embedding_dim']
            doc_input_size += embedding_dim
            que_input_size += embedding_dim
        else:
            opt['fix_embeddings'] = True
            opt['tune_partial'] = 0

        if opt['CoVe_opt'] > 0:
            self.CoVe = layers.MTLSTM(opt, embedding)
            CoVe_size = self.CoVe.output_size
            doc_input_size += CoVe_size
            que_input_size += CoVe_size

        if opt['use_elmo']:
            options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json"
            weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5"
            self.elmo = Elmo(options_file, weight_file, 1, dropout=0)
            doc_input_size += 1024
            que_input_size += 1024
        if opt['use_pos']:
            self.pos_embedding = nn.Embedding(opt['pos_size'], opt['pos_dim'])
            doc_input_size += opt['pos_dim']
        if opt['use_ner']:
            self.ner_embedding = nn.Embedding(opt['ner_size'], opt['ner_dim'])
            doc_input_size += opt['ner_dim']

        if opt['do_prealign']:
            self.pre_align = layers.GetAttentionHiddens(
                embedding_dim,
                opt['prealign_hidden'],
                similarity_attention=True)
            doc_input_size += embedding_dim
        if opt['no_em']:
            doc_input_size += opt['num_features'] - 3
        else:
            doc_input_size += opt['num_features']

        # Setup the vector size for [doc, question]
        # they will be modified in the following code
        doc_hidden_size, que_hidden_size = doc_input_size, que_input_size
        print('Initially, the vector_sizes [doc, query] are', doc_hidden_size,
              que_hidden_size)

        flow_size = opt['hidden_size']

        # RNN document encoder
        self.doc_rnn1 = layers.StackedBRNN(doc_hidden_size,
                                           opt['hidden_size'],
                                           num_layers=1)
        self.dialog_flow1 = layers.StackedBRNN(opt['hidden_size'] * 2,
                                               opt['hidden_size'],
                                               num_layers=1,
                                               rnn_type=nn.GRU,
                                               bidir=False)
        self.doc_rnn2 = layers.StackedBRNN(opt['hidden_size'] * 2 + flow_size +
                                           CoVe_size,
                                           opt['hidden_size'],
                                           num_layers=1)
        self.dialog_flow2 = layers.StackedBRNN(opt['hidden_size'] * 2,
                                               opt['hidden_size'],
                                               num_layers=1,
                                               rnn_type=nn.GRU,
                                               bidir=False)
        doc_hidden_size = opt['hidden_size'] * 2

        # RNN question encoder
        self.question_rnn, que_hidden_size = layers.RNN_from_opt(
            que_hidden_size,
            opt['hidden_size'],
            opt,
            num_layers=2,
            concat_rnn=opt['concat_rnn'],
            add_feat=CoVe_size)

        # Output sizes of rnn encoders
        print('After Input LSTM, the vector_sizes [doc, query] are [',
              doc_hidden_size, que_hidden_size, '] * 2')

        # Graph encoder
        self.graph_encoder = layers.GraphEncoder(embedding_dim,
                                                 opt['hidden_size'])

        # Deep inter-attention
        self.deep_attn = layers.DeepAttention(
            opt,
            abstr_list_cnt=2,
            deep_att_hidden_size_per_abstr=opt[
                'deep_att_hidden_size_per_abstr'],
            do_similarity=opt['deep_inter_att_do_similar'],
            word_hidden_size=embedding_dim + CoVe_size,
            no_rnn=True)

        self.deep_attn_rnn, doc_hidden_size = layers.RNN_from_opt(
            self.deep_attn.att_final_size + flow_size,
            opt['hidden_size'],
            opt,
            num_layers=1)
        self.dialog_flow3 = layers.StackedBRNN(doc_hidden_size,
                                               opt['hidden_size'],
                                               num_layers=1,
                                               rnn_type=nn.GRU,
                                               bidir=False)

        # Question understanding and compression
        self.high_lvl_qrnn, que_hidden_size = layers.RNN_from_opt(
            que_hidden_size * 2,
            opt['hidden_size'],
            opt,
            num_layers=1,
            concat_rnn=True)

        # Self attention on context
        att_size = doc_hidden_size + 2 * opt['hidden_size'] * 2

        if opt['self_attention_opt'] > 0:
            self.highlvl_self_att = layers.GetAttentionHiddens(
                att_size, opt['deep_att_hidden_size_per_abstr'])
            self.high_lvl_crnn, doc_hidden_size = layers.RNN_from_opt(
                doc_hidden_size * 2 + flow_size,
                opt['hidden_size'],
                opt,
                num_layers=1,
                concat_rnn=False)
            print('Self deep-attention {} rays in {}-dim space'.format(
                opt['deep_att_hidden_size_per_abstr'], att_size))
        elif opt['self_attention_opt'] == 0:
            self.high_lvl_crnn, doc_hidden_size = layers.RNN_from_opt(
                doc_hidden_size + flow_size,
                opt['hidden_size'],
                opt,
                num_layers=1,
                concat_rnn=False)

        print('Before answer span finding, hidden sizes are', doc_hidden_size,
              que_hidden_size)

        # Question merging
        self.self_attn = layers.LinearSelfAttn(que_hidden_size)
        if opt['do_hierarchical_query']:
            self.hier_query_rnn = layers.StackedBRNN(que_hidden_size,
                                                     opt['hidden_size'],
                                                     num_layers=1,
                                                     rnn_type=nn.GRU,
                                                     bidir=False)
            que_hidden_size = opt['hidden_size']

        # Attention for span start/end
        self.get_answer = layers.GetSpanStartEnd(doc_hidden_size,
                                                 que_hidden_size, opt,
                                                 opt['ptr_net_indep_attn'],
                                                 opt["ptr_net_attn_type"],
                                                 opt['do_ptr_update'])

        self.ans_type_prediction = layers.BilinearLayer(
            doc_hidden_size * 2, que_hidden_size, opt['answer_type_num'])

        # Store config
        self.opt = opt

    def forward(self, x1, x1_c, x1_f, x1_pos, x1_ner, x1_mask, x2_full, x2_c,
                x2_full_mask, node_id, node_mask, edge_id):
        """Inputs:
        x1 = document word indices             [batch * len_d]
        x1_c = document char indices           [batch * len_d * len_w] or [1]
        x1_f = document word features indices  [batch * q_num * len_d * nfeat]
        x1_pos = document POS tags             [batch * len_d]
        x1_ner = document entity tags          [batch * len_d]
        x1_mask = document padding mask        [batch * len_d]
        x2_full = question word indices        [batch * q_num * len_q]
        x2_c = question char indices           [(batch * q_num) * len_q * len_w]
        x2_full_mask = question padding mask   [batch * q_num * len_q]
        node_id      [batch * max_node_num * max_node_length]
        node_mask    [batch * max_node_num * max_node_length]
        edge_id      [batch * max_node_num * max_node_num]
        """
        # print('node_id{}'.format(node_id))
        # print('x1{}'.format(x1))
        # precomputing ELMo is only done for the context (to speed up computation)
        # print('startembeddingweight{}'.format(self.embedding.weight))
        if self.opt['use_elmo'] and self.opt['elmo_batch_size'] > self.opt[
                'batch_size']:  # precomputing ELMo is used
            if x1_c.dim() != 1:  # precomputation is needed
                precomputed_bilm_output = self.elmo._elmo_lstm(x1_c)
                self.precomputed_layer_activations = [
                    t.detach().cpu()
                    for t in precomputed_bilm_output['activations']
                ]
                self.precomputed_mask_with_bos_eos = precomputed_bilm_output[
                    'mask'].detach().cpu()
                self.precomputed_cnt = 0

            # get precomputed ELMo
            layer_activations = [
                t[x1.size(0) * self.precomputed_cnt:x1.size(0) *
                  (self.precomputed_cnt + 1), :, :]
                for t in self.precomputed_layer_activations
            ]
            mask_with_bos_eos = self.precomputed_mask_with_bos_eos[
                x1.size(0) * self.precomputed_cnt:x1.size(0) *
                (self.precomputed_cnt + 1), :]
            if x1.is_cuda:
                layer_activations = [t.cuda() for t in layer_activations]
                mask_with_bos_eos = mask_with_bos_eos.cuda()

            representations = []
            for i in range(len(self.elmo._scalar_mixes)):
                scalar_mix = getattr(self.elmo, 'scalar_mix_{}'.format(i))
                representation_with_bos_eos = scalar_mix(
                    layer_activations, mask_with_bos_eos)
                representation_without_bos_eos, mask_without_bos_eos = remove_sentence_boundaries(
                    representation_with_bos_eos, mask_with_bos_eos)
                representations.append(
                    self.elmo._dropout(representation_without_bos_eos))

            x1_elmo = representations[0][:, :x1.size(1), :]
            self.precomputed_cnt += 1

            precomputed_elmo = True
        else:
            precomputed_elmo = False
        """
        x1 = document word indices             [batch * len_d]
        x1_c = document char indices           [batch * len_d * len_w] or [1]
        x1_f = document word features indices  [batch * q_num * len_d * nfeat]
        x1_pos = document POS tags             [batch * len_d]
        x1_ner = document entity tags          [batch * len_d]
        x1_mask = document padding mask        [batch * len_d]
        x2_full = question word indices        [batch * q_num * len_q]
        x2_c = question char indices           [(batch * q_num) * len_q * len_w]
        x2_full_mask = question padding mask   [batch * q_num * len_q]
        
        x1_full = document word indices        [batch * q_num * len_d]
        x1_full_mask = document padding mask   [batch * q_num * len_d]
        x2_full = question word indices        [batch * q_num * len_q]
        x2_full_mask = question padding mask   [batch * q_num * len_q]
        node_id     [batch * max_node_num * max_node_length]
        node__mask   [batch * max_node_num * max_node_length]
        edge_id     [batch * max_node_num * max_node_num ]
        """
        # x1_full [batch * 1 * len_d] -> batch, q_num, len_d
        # x1_full_mask batch, q_num, len_d
        x1_full = x1.unsqueeze(1).expand(x2_full.size(0), x2_full.size(1),
                                         x1.size(1)).contiguous()
        x1_full_mask = x1_mask.unsqueeze(1).expand(x2_full.size(0),
                                                   x2_full.size(1),
                                                   x1.size(1)).contiguous()

        #[batch * max_node_num * max_node_length]-> batch, 1, max_node_num, max_node_length -> batch, q_num , max_node_num, max_node_length
        # node=node_id.unsqueeze(1).expand(x2_full.size(0), x2_full.size(1), node_id.size(1), node_id.size(2)).contiguous()
        # node_full_mask=node_mask.unsqueeze(1).expand(x2_full.size(0), x2_full.size(1), node_mask.size(1), node_mask.size(2)).contiguous()
        # edge=edge_id.unsqueeze(1).expand(x2_full.size(0), x2_full.size(1), edge_id.size(1), edge_id.size(2)).contiguous()
        node = node_id.view(-1, node_id.size(
            -1)).contiguous()  #(batch*max_node_num), max_node_length
        # print('node{}'.format(node))
        node_full_mask = node_mask.view(
            -1, node_mask.size(-1))  ##(batch*max_node_num), max_node_length

        drnn_input_list, qrnn_input_list, grnn_input_list = [], [], []
        # x2  [(batch * q_num) * len_q]
        # x2_mask [(batch * q_num) * len_q]
        x2 = x2_full.view(-1, x2_full.size(-1))  #((batch*q_num),len_q)
        x2_mask = x2_full_mask.view(-1,
                                    x2_full.size(-1))  #((batch*q_num),len_q)
        # print('embeddingweight{}'.format(self.embedding.weight))

        if self.opt['use_wemb']:
            # Word embedding for both document and question
            emb = self.embedding if self.training else self.eval_embed
            x1_emb = emb(x1)  #batch, len_d, emb_size
            x2_emb = emb(x2)  #(batch * q_num), q_length, emb_size
            node_emb = emb(
                node)  #(batch*max_node_num), max_node_length, emb_size

            # print('node_emb{}'.format(node_emb[0, 0, :]))
            # Dropout on embeddings
            if self.opt['dropout_emb'] > 0:
                x1_emb = layers.dropout(x1_emb,
                                        p=self.opt['dropout_emb'],
                                        training=self.training)
                x2_emb = layers.dropout(x2_emb,
                                        p=self.opt['dropout_emb'],
                                        training=self.training)
                node_emb = layers.dropout(node_emb,
                                          p=self.opt['dropout_emb'],
                                          training=self.training)
            drnn_input_list.append(x1_emb)
            qrnn_input_list.append(x2_emb)
            grnn_input_list.append(node_emb)

        if self.opt['CoVe_opt'] > 0:
            x1_cove_mid, x1_cove_high = self.CoVe(x1, x1_mask)  #MTLSTM
            x2_cove_mid, x2_cove_high = self.CoVe(x2, x2_mask)
            # node_cove_mid, node_cove_high = self.CoVe(node, node_full_mask)
            # Dropout on contextualized embeddings
            if self.opt['dropout_emb'] > 0:
                x1_cove_mid = layers.dropout(x1_cove_mid,
                                             p=self.opt['dropout_emb'],
                                             training=self.training)
                x1_cove_high = layers.dropout(x1_cove_high,
                                              p=self.opt['dropout_emb'],
                                              training=self.training)
                x2_cove_mid = layers.dropout(x2_cove_mid,
                                             p=self.opt['dropout_emb'],
                                             training=self.training)
                x2_cove_high = layers.dropout(x2_cove_high,
                                              p=self.opt['dropout_emb'],
                                              training=self.training)
                # node_cove_mid = layers.dropout(node_cove_mid, p=self.opt['dropout_emb'], training=self.training)
                # node_cove_high = layers.dropout(node_cove_high, p=self.opt['dropout_emb'], training=self.training)

            drnn_input_list.append(x1_cove_mid)
            qrnn_input_list.append(x2_cove_mid)

        if self.opt['use_elmo']:
            if not precomputed_elmo:
                x1_elmo = self.elmo(x1_c)['elmo_representations'][
                    0]  #torch.zeros(x1_emb.size(0), x1_emb.size(1), 1024, dtype=x1_emb.dtype, layout=x1_emb.layout, device=x1_emb.device)
            x2_elmo = self.elmo(x2_c)['elmo_representations'][
                0]  #torch.zeros(x2_emb.size(0), x2_emb.size(1), 1024, dtype=x2_emb.dtype, layout=x2_emb.layout, device=x2_emb.device)
            # Dropout on contextualized embeddings
            if self.opt['dropout_emb'] > 0:
                x1_elmo = layers.dropout(x1_elmo,
                                         p=self.opt['dropout_emb'],
                                         training=self.training)
                x2_elmo = layers.dropout(x2_elmo,
                                         p=self.opt['dropout_emb'],
                                         training=self.training)

            drnn_input_list.append(x1_elmo)
            qrnn_input_list.append(x2_elmo)

        if self.opt['use_pos']:
            x1_pos_emb = self.pos_embedding(x1_pos)
            drnn_input_list.append(x1_pos_emb)

        if self.opt['use_ner']:
            x1_ner_emb = self.ner_embedding(x1_ner)
            drnn_input_list.append(x1_ner_emb)

        x1_input = torch.cat(drnn_input_list, dim=2)  # batch, len_d, ?
        x2_input = torch.cat(qrnn_input_list, dim=2)  #(batch*q_num),len_q,??
        node_input = torch.cat(
            grnn_input_list,
            dim=2)  #(batch*max_node_num), max_node_length, emb_size

        def expansion_for_doc(z):
            #x2_full = question word indices        [batch * q_num * len_q]
            return z.unsqueeze(1).expand(z.size(0), x2_full.size(1), z.size(1),
                                         z.size(2)).contiguous().view(
                                             -1, z.size(1), z.size(2))

        # x1_emb batch, len_d, emb_size  x1_emb_expand
        x1_emb_expand = expansion_for_doc(
            x1_emb)  #(batch*q_num),len_d,emb_size
        x1_cove_high_expand = expansion_for_doc(x1_cove_high)

        #node_id     [batch * max_node_num * max_node_length]
        # node_emb  (batch*max_node_num), max_node_length, emb_size ->batch, max_node_num, max_node_length, emb_size
        #batch, 1,  max_node_num, max_node_length, emb_size->batch, q_num,  max_node_num, max_node_length, emb_size
        pre_node_emb = node_emb.view(node_id.size(0), node_id.size(1),
                                     node_emb.size(1),
                                     node_emb.size(2)).contiguous()
        pre_node_emb_expand = pre_node_emb.unsqueeze(1).expand(
            pre_node_emb.size(0), x2_full.size(1), pre_node_emb.size(1),
            pre_node_emb.size(2), pre_node_emb.size(3)).contiguous()
        # (batch*q_num), max_node_num, max_node_length, emb_size
        # used below (a standalone sketch of this expansion follows this example)
        node_emb_expand = pre_node_emb_expand.view(-1,
                                                   pre_node_emb_expand.size(2),
                                                   pre_node_emb_expand.size(3),
                                                   pre_node_emb_expand.size(4))

        #node_mask [batch * max_node_num * max_node_length]->[batch * q_num * max_node_num * max_node_length]
        pre_node_emb_expand_mask = node_mask.unsqueeze(1).expand(
            node_mask.size(0), x2_full.size(1), node_mask.size(1),
            node_mask.size(2)).contiguous()
        # (batch*q_num), max_node_num, max_node_length
        # used below
        node_emb_expand_mask = pre_node_emb_expand_mask.view(
            -1, node_mask.size(1), node_mask.size(2))

        #edge_id     [batch * max_node_num * max_node_num ]  [batch *q_num * max_node_num * max_node_num ]
        pre_edge_expand = edge_id.unsqueeze(1).expand(
            edge_id.size(0), x2_full.size(1), edge_id.size(1),
            edge_id.size(2)).contiguous()
        edge_expand = pre_edge_expand.view(-1, edge_id.size(1),
                                           edge_id.size(2))

        #x1_elmo_expand = expansion_for_doc(x1_elmo)
        if self.opt['no_em']:
            x1_f = x1_f[:, :, :, 3:]

        x1_input = torch.cat([
            expansion_for_doc(x1_input),
            x1_f.view(-1, x1_f.size(-2), x1_f.size(-1))
        ],
                             dim=2)
        x1_mask = x1_full_mask.view(
            -1, x1_full_mask.size(-1))  # (batch*q_num, len_d)

        if self.opt['do_prealign']:
            x1_atten = self.pre_align(
                x1_emb_expand, x2_emb,
                x2_mask)  # # batch*q_num* lend * xq_input_size
            x1_input = torch.cat([x1_input, x1_atten], dim=2)

        # === Start processing the dialog ===
        # cur_h: [batch_size * max_qa_pair, context_length, hidden_state]
        # flow : fn (rnn)
        # x1_full: [batch_size, max_qa_pair, context_length]   x1_full = document word indices [batch * q_num * len_d]
        def flow_operation(cur_h, flow):
            # (len_d, batch*q_num, hidden_size) -> (len_d, batch, q_num, hidden_size)
            # Note: view() does not allocate new memory; the result shares storage with the
            # original tensor. contiguous() allocates a new block and physically rearranges
            # the data into the post-transpose order, which is why it is needed before the
            # view() below.
            flow_in = cur_h.transpose(0, 1).view(x1_full.size(2),
                                                 x1_full.size(0),
                                                 x1_full.size(1), -1)
            # (q_num, batch, len_d, hidden_size) -> (q_num, batch*len_d, hidden)
            flow_in = flow_in.transpose(0, 2).contiguous().view(
                x1_full.size(1),
                x1_full.size(0) * x1_full.size(2), -1).transpose(0, 1)
            # [bsz * context_length, max_qa_pair, hidden_state]
            flow_out = flow(flow_in)
            # [bsz * context_length, max_qa_pair, flow_hidden_state_dim (hidden_state/2)]
            if self.opt['no_dialog_flow']:
                flow_out = flow_out * 0

            flow_out = flow_out.transpose(0, 1).view(x1_full.size(1),
                                                     x1_full.size(0),
                                                     x1_full.size(2),
                                                     -1).transpose(
                                                         0, 2).contiguous()
            flow_out = flow_out.view(x1_full.size(2),
                                     x1_full.size(0) * x1_full.size(1),
                                     -1).transpose(0, 1)
            # [bsz * max_qa_pair, context_length, flow_hidden_state_dim]
            return flow_out

        # Encode document with RNN
        doc_abstr_ls = []

        doc_hiddens = self.doc_rnn1(
            x1_input, x1_mask)  # (batch*q_num, len_d, hidden_size)
        graph_output = self.graph_encoder(
            doc_hiddens, x1_mask, node_emb_expand, node_emb_expand_mask,
            edge_expand)  # bsz', max_node_num,  hidden
        # doc_hiddens=graph_output
        doc_hiddens_flow = flow_operation(
            doc_hiddens,
            self.dialog_flow1)  # [bsz * q_num, len_d, flow_hidden_state_dim]

        doc_abstr_ls.append(graph_output)

        doc_hiddens = self.doc_rnn2(
            torch.cat((doc_hiddens, doc_hiddens_flow, x1_cove_high_expand),
                      dim=2),
            x1_mask)  #opt['hidden_size'] * 2 + flow_size + CoVe_size
        doc_hiddens_flow = flow_operation(
            doc_hiddens,
            self.dialog_flow2)  # [bsz * q_num, len_d, flow_hidden_state_dim]
        doc_abstr_ls.append(doc_hiddens)

        #with open('flow_bef_att.pkl', 'wb') as output:
        #    pickle.dump(doc_hiddens_flow, output, pickle.HIGHEST_PROTOCOL)
        #while(1):
        #    pass

        # Encode question with RNN  x2_input (batch*q_num),len_q,x2_input_size
        _, que_abstr_ls = self.question_rnn(
            x2_input, x2_mask, return_list=True,
            additional_x=x2_cove_high)  # [((batch*q_num), len_q, hidden_size)]

        # Final question layer
        question_hiddens = self.high_lvl_qrnn(torch.cat(que_abstr_ls, 2),
                                              x2_mask)
        que_abstr_ls += [question_hiddens]

        # Main Attention Fusion Layer
        # x1_emb_expand x1_cove_high_expand (batch*q_num),len_d,emb_size   doc_abstr_ls  [(batch*q_num, len_d, hidden_size),(batch*q_num, len_d, hidden_size)]
        # x2_em (batch*q_num),len_q,embsize)  que_abstr_ls [(batch*q_num), len_q, hidden_size),(batch*q_num), len_q, hidden_size)]
        doc_info = self.deep_attn(
            [torch.cat([x1_emb_expand, x1_cove_high_expand], 2)], doc_abstr_ls,
            [torch.cat([x2_emb, x2_cove_high], 2)], que_abstr_ls, x1_mask,
            x2_mask)  # # batch*q_num * len1 * x2_input_size
        #doc_hiddens_flow [bsz * q_num, len_d, flow_hidden_state_dim]
        doc_hiddens = self.deep_attn_rnn(
            torch.cat((doc_info, doc_hiddens_flow), dim=2), x1_mask)
        doc_hiddens_flow = flow_operation(doc_hiddens, self.dialog_flow3)

        doc_abstr_ls += [
            doc_hiddens
        ]  #[(batch*q_num, len_d, hidden_size),(batch*q_num, len_d, hidden_size),(batch*q_num, len_d, hidden_size)]

        # Self Attention Fusion Layer
        x1_att = torch.cat(doc_abstr_ls, 2)

        if self.opt['self_attention_opt'] > 0:
            #x1_att  c1,c2,c3
            highlvl_self_attn_hiddens = self.highlvl_self_att(
                x1_att, x1_att, x1_mask, x3=doc_hiddens, drop_diagonal=True)
            ##  highlvl_self_attn_hiddens batch * len1 * x2_input_size fully aware context on c3
            doc_hiddens = self.high_lvl_crnn(
                torch.cat(
                    [doc_hiddens, highlvl_self_attn_hiddens, doc_hiddens_flow],
                    dim=2), x1_mask)
        elif self.opt['self_attention_opt'] == 0:
            doc_hiddens = self.high_lvl_crnn(
                torch.cat([doc_hiddens, doc_hiddens_flow], dim=2),
                x1_mask)  # (batch*q_num, seq_len, hidden_size)

        doc_abstr_ls += [doc_hiddens]

        # Merge the question hidden vectors
        q_merge_weights = self.self_attn(question_hiddens, x2_mask)
        question_avg_hidden = layers.weighted_avg(
            question_hiddens, q_merge_weights)  #(batch*q_num )* hidden
        if self.opt['do_hierarchical_query']:
            #x1_full: [batch_size, q_num context_length]
            #question_avg_hidden  (bsz, q_num, hidden)
            question_avg_hidden = self.hier_query_rnn(
                question_avg_hidden.view(x1_full.size(0), x1_full.size(1), -1))
            question_avg_hidden = question_avg_hidden.contiguous().view(
                -1, question_avg_hidden.size(-1))  # (batch*q_num ), hidden

        # Get Start, End span
        # question_avg_hidden (batch*q_num ), hidden   doc_hiddens doc_hiddens_flow [bsz * q_num, len_d, flow_hidden_state_dim]
        start_scores, end_scores = self.get_answer(doc_hiddens,
                                                   question_avg_hidden,
                                                   x1_mask)
        all_start_scores = start_scores.view_as(
            x1_full)  # batch x q_num x len_d
        all_end_scores = end_scores.view_as(x1_full)  # batch x q_num x len_d

        # Get whether there is an answer
        #doc_hiddens doc_hiddens_flow [bsz * q_num, len_d, flow_hidden_state_dim]
        # torch.max along a dim returns two tensors: the max values and the indices of those values
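        # e.g. torch.max(torch.tensor([[1., 3.], [2., 0.]]), dim=1) returns values [3., 2.] and indices [1, 0]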
        doc_avg_hidden = torch.cat(
            (torch.max(doc_hiddens, dim=1)[0], torch.mean(doc_hiddens, dim=1)),
            dim=1)  # (batch*q_num), 2*hidden_size
        class_scores = self.ans_type_prediction(doc_avg_hidden,
                                                question_avg_hidden)
        all_class_scores = class_scores.view(x1_full.size(0), x1_full.size(1),
                                             -1)  # batch x q_num x class_num
        all_class_scores = all_class_scores.squeeze(-1)  # when class_num = 1

        return all_start_scores, all_end_scores, all_class_scores
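
A minimal, self-contained sketch of the node/edge expansion in the forward pass above: per-batch graph inputs are replicated once per QA pair and folded back into the batch dimension, mirroring node_emb_expand, node_emb_expand_mask and edge_expand. All tensors and sizes here are illustrative assumptions, not the repository's data.

import torch

batch, q_num = 2, 3
max_node_num, max_node_len, emb_size = 4, 6, 8        # hypothetical sizes

node_emb = torch.randn(batch * max_node_num, max_node_len, emb_size)
node_mask = torch.ones(batch, max_node_num, max_node_len, dtype=torch.bool)
edge_id = torch.zeros(batch, max_node_num, max_node_num, dtype=torch.long)

# [batch*max_node_num, len, emb] -> [batch, max_node_num, len, emb]
pre = node_emb.view(batch, max_node_num, max_node_len, emb_size)
# insert a q_num axis, broadcast, then fold q_num back into the batch dimension
pre = pre.unsqueeze(1).expand(batch, q_num, max_node_num, max_node_len, emb_size).contiguous()
node_emb_expand = pre.view(-1, max_node_num, max_node_len, emb_size)

node_mask_expand = node_mask.unsqueeze(1).expand(batch, q_num, max_node_num, max_node_len)
node_mask_expand = node_mask_expand.contiguous().view(-1, max_node_num, max_node_len)

edge_expand = edge_id.unsqueeze(1).expand(batch, q_num, max_node_num, max_node_num)
edge_expand = edge_expand.contiguous().view(-1, max_node_num, max_node_num)

print(node_emb_expand.shape)    # torch.Size([6, 4, 6, 8]), i.e. [(batch*q_num), ...]
print(node_mask_expand.shape)   # torch.Size([6, 4, 6])
print(edge_expand.shape)        # torch.Size([6, 4, 4])

The same unsqueeze / expand over q_num / fold-into-batch pattern is what expansion_for_doc does for the document tensors.
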
Example #3
0
class FlowQA(nn.Module):
    """Network for the FlowQA Module."""
    def __init__(self, opt, embedding=None, padding_idx=0):
        super(FlowQA, self).__init__()

        # Input size to RNN: word emb + char emb + question emb + manual features
        # As I see it, the input size is word embedding + CoVe embedding + ELMo embedding
        # -- what is the "+ question emb" above supposed to mean?
        doc_input_size = 0
        que_input_size = 0

        layers.set_my_dropout_prob(opt['my_dropout_p'])
        layers.set_seq_dropout(opt['do_seq_dropout'])

        if opt['use_wemb']:
            # Word embeddings
            self.embedding = nn.Embedding(opt['vocab_size'],
                                          opt['embedding_dim'],
                                          padding_idx=padding_idx)
            if embedding is not None:
                self.embedding.weight.data = embedding
                if opt['fix_embeddings'] or opt['tune_partial'] == 0:
                    opt['fix_embeddings'] = True
                    opt['tune_partial'] = 0
                    for p in self.embedding.parameters():
                        p.requires_grad = False
                else:
                    # e.g. tune_partial == 1000 fine-tunes only the embeddings of the 1000 most frequent words
                    assert opt['tune_partial'] < embedding.size(0)
                    fixed_embedding = embedding[opt['tune_partial']:]
                    # a persistent buffer for the nn.Module
                    self.register_buffer('fixed_embedding', fixed_embedding)
                    '''
                    register_buffer(name, tensor)
                    Adds a persistent buffer to the module.

                    A persistent buffer is used when we need to keep some state that should
                    not be treated as a model parameter. For example, BatchNorm's running_mean
                    is not a parameter, but it is state that has to be saved.

                    Buffers can be retrieved later via the name given at registration.

                    NOTE: a buffer can be used to store a moving average.
                    '''
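                    # Illustrative (hypothetical, not from this repo) buffer usage:
                    #   self.register_buffer('running_mean', torch.zeros(feature_dim))
                    #   # later available as self.running_mean and saved in state_dict()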
                    self.fixed_embedding = fixed_embedding
            embedding_dim = opt['embedding_dim']
            doc_input_size += embedding_dim
            que_input_size += embedding_dim
        else:
            opt['fix_embeddings'] = True
            opt['tune_partial'] = 0

        if opt['CoVe_opt'] > 0:
            self.CoVe = layers.MTLSTM(opt, embedding)
            CoVe_size = self.CoVe.output_size
            doc_input_size += CoVe_size
            que_input_size += CoVe_size

        if opt['use_elmo']:
            options_file = "ElmoWeight/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json"
            weight_file = "ElmoWeight/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5"
            self.elmo = Elmo(options_file, weight_file, 1, dropout=0)
            doc_input_size += 1024
            que_input_size += 1024
        if opt['use_pos']:
            self.pos_embedding = nn.Embedding(opt['pos_size'], opt['pos_dim'])
            doc_input_size += opt['pos_dim']

        if opt['use_ner']:
            self.ner_embedding = nn.Embedding(opt['ner_size'], opt['ner_dim'])
            doc_input_size += opt['ner_dim']

        if opt['do_prealign']:
            self.pre_align = layers.GetAttentionHiddens(
                embedding_dim,
                opt['prealign_hidden'],
                similarity_attention=True)
            doc_input_size += embedding_dim
        if opt['no_em']:
            doc_input_size += opt['num_features'] - 3
        else:
            doc_input_size += opt['num_features']

        # Setup the vector size for [doc, question]
        # they will be modified in the following code
        doc_hidden_size, que_hidden_size = doc_input_size, que_input_size
        print('Initially, the vector_sizes [doc, query] are', doc_hidden_size,
              que_hidden_size)

        flow_size = opt['hidden_size']  #default=125

        # RNN document encoder
        self.doc_rnn1 = layers.StackedBRNN(doc_hidden_size,
                                           opt['hidden_size'],
                                           num_layers=1)
        self.dialog_flow1 = layers.StackedBRNN(opt['hidden_size'] * 2,
                                               opt['hidden_size'],
                                               num_layers=1,
                                               rnn_type=nn.GRU,
                                               bidir=False)
        self.doc_rnn2 = layers.StackedBRNN(opt['hidden_size'] * 2 + flow_size +
                                           CoVe_size,
                                           opt['hidden_size'],
                                           num_layers=1)
        self.dialog_flow2 = layers.StackedBRNN(opt['hidden_size'] * 2,
                                               opt['hidden_size'],
                                               num_layers=1,
                                               rnn_type=nn.GRU,
                                               bidir=False)
        doc_hidden_size = opt['hidden_size'] * 2

        # RNN question encoder
        self.question_rnn, que_hidden_size = layers.RNN_from_opt(
            que_hidden_size,
            opt['hidden_size'],
            opt,
            num_layers=2,
            concat_rnn=opt['concat_rnn'],
            add_feat=CoVe_size)

        # Output sizes of rnn encoders
        print('After Input LSTM, the vector_sizes [doc, query] are [',
              doc_hidden_size, que_hidden_size, '] * 2')  # why the '* 2' at the end?

        # Deep inter-attention
        self.deep_attn = layers.DeepAttention(
            opt,
            abstr_list_cnt=2,
            deep_att_hidden_size_per_abstr=opt[
                'deep_att_hidden_size_per_abstr'],
            do_similarity=opt['deep_inter_att_do_similar'],
            word_hidden_size=embedding_dim + CoVe_size,
            no_rnn=True)

        self.deep_attn_rnn, doc_hidden_size = layers.RNN_from_opt(
            self.deep_attn.att_final_size + flow_size,
            opt['hidden_size'],
            opt,
            num_layers=1)
        self.dialog_flow3 = layers.StackedBRNN(doc_hidden_size,
                                               opt['hidden_size'],
                                               num_layers=1,
                                               rnn_type=nn.GRU,
                                               bidir=False)

        # Question understanding and compression
        self.high_lvl_qrnn, que_hidden_size = layers.RNN_from_opt(
            que_hidden_size * 2,
            opt['hidden_size'],
            opt,
            num_layers=1,
            concat_rnn=True)

        # Self attention on context
        att_size = doc_hidden_size + 2 * opt[
            'hidden_size'] * 2  # doc_hidden_size also equals opt['hidden_size'] * 2

        if opt['self_attention_opt'] > 0:
            self.highlvl_self_att = layers.GetAttentionHiddens(
                att_size, opt['deep_att_hidden_size_per_abstr'])
            self.high_lvl_crnn, doc_hidden_size = layers.RNN_from_opt(
                doc_hidden_size * 2 + flow_size,
                opt['hidden_size'],
                opt,
                num_layers=1,
                concat_rnn=False)
            print('Self deep-attention {} rays in {}-dim space'.format(
                opt['deep_att_hidden_size_per_abstr'], att_size))
        elif opt['self_attention_opt'] == 0:
            self.high_lvl_crnn, doc_hidden_size = layers.RNN_from_opt(
                doc_hidden_size + flow_size,
                opt['hidden_size'],
                opt,
                num_layers=1,
                concat_rnn=False)

        print('Before answer span finding, hidden size are', doc_hidden_size,
              que_hidden_size)

        # Question merging
        self.self_attn = layers.LinearSelfAttn(que_hidden_size)
        if opt['do_hierarchical_query']:
            self.hier_query_rnn = layers.StackedBRNN(que_hidden_size,
                                                     opt['hidden_size'],
                                                     num_layers=1,
                                                     rnn_type=nn.GRU,
                                                     bidir=False)
            que_hidden_size = opt['hidden_size']

        # Attention for span start/end
        self.get_answer = layers.GetSpanStartEnd(doc_hidden_size,
                                                 que_hidden_size, opt,
                                                 opt['ptr_net_indep_attn'],
                                                 opt["ptr_net_attn_type"],
                                                 opt['do_ptr_update'])

        self.ans_type_prediction = layers.BilinearLayer(
            doc_hidden_size * 2, que_hidden_size,
            opt['answer_type_num'])  #default=4

        # Store config
        self.opt = opt

    def forward(self, x1, x1_c, x1_f, x1_pos, x1_ner, x1_mask, x2_full, x2_c,
                x2_full_mask):
        # Called from QA_model in model_CoQA.py.
        # The 9 inputs passed in are:
        # context_id, context_cid, context_feature, context_tag, context_ent, context_mask,
        #           question_id, question_cid, question_mask,
        """Inputs:
        x1 = document word indices             [batch * len_d]  (len_d = document length)
        x1_c = document char indices           [batch * len_d * len_w] or [1]
               (when ELMo is precomputed, x1_c holds several batches' worth of examples at once,
                which I believe is why its leading dimension can exceed batch)
        x1_f = document word features indices  [batch * q_num * len_d * nfeat]
        x1_pos = document POS tags             [batch * len_d]
        x1_ner = document entity tags          [batch * len_d]
        x1_mask = document padding mask        [batch * len_d]
        x2_full = question word indices        [batch * q_num * len_q]
        x2_c = question char indices           [(batch * q_num) * len_q * len_w]
        x2_full_mask = question padding mask   [batch * q_num * len_q]
        """
        '''
        Caller-side argument names (from model_CoQA.py):
        x1 .. x1_mask = context_id, context_cid, context_feature, context_tag, context_ent, context_mask
        x2_full       = question_id
        x2_c          = question_cid
        x2_full_mask  = question_mask
        '''

        # precomputing ELMo is only for context (to speedup computation)
        if self.opt['use_elmo'] and self.opt['elmo_batch_size'] > self.opt[
                'batch_size']:  # precomputing ELMo is used
            if x1_c.dim() != 1:  # precomputation is needed
                precomputed_bilm_output = self.elmo._elmo_lstm(
                    x1_c
                )  # _elmo_lstm() adds <s> and </s> around each sentence, so the sentence_len dimension is 2 longer than what batch_to_ids produces
                self.precomputed_layer_activations = [
                    t.detach().cpu()
                    for t in precomputed_bilm_output['activations']
                ]
                #detach() detaches from the current graph; .cpu() moves the tensor to the CPU
                self.precomputed_mask_with_bos_eos = precomputed_bilm_output[
                    'mask'].detach().cpu()
                #Pull out ELMo vectors for many multiples of batch_size in one go
                self.precomputed_cnt = 0
                #precomputed_cnt is incremented below; the context ELMo embeddings are extracted in advance and stored in self.precomputed_layer_activations and self.precomputed_mask_with_bos_eos
                #Each forward call still consumes one normal batch; precomputed_cnt cycles between 0 and elmo_batch_size // batch_size
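                '''
                A small sketch (illustrative numbers, not values from this code) of the
                slicing performed below: with elmo_batch_size = 12 and batch_size = 4,
                precomputation stores activations for 12 contexts at once, and
                precomputed_cnt = 0, 1, 2 selects rows [0:4], [4:8], [8:12] on three
                consecutive forward calls.

                    start = x1.size(0) * self.precomputed_cnt        # e.g. 4 * 1 = 4
                    end = x1.size(0) * (self.precomputed_cnt + 1)    # e.g. 4 * 2 = 8
                    chunk = t[start:end]                             # one normal batch
                '''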

            # get precomputed ELMo
            layer_activations = [
                t[x1.size(0) * self.precomputed_cnt:x1.size(0) *
                  (self.precomputed_cnt + 1), :, :]
                for t in self.precomputed_layer_activations
            ]
            mask_with_bos_eos = self.precomputed_mask_with_bos_eos[
                x1.size(0) * self.precomputed_cnt:x1.size(0) *
                (self.precomputed_cnt + 1), :]
            # Index with precomputed_cnt * x1.size(0): each training batch takes exactly that slice of the precomputed data
            if x1.is_cuda:
                layer_activations = [t.cuda() for t in layer_activations]
                mask_with_bos_eos = mask_with_bos_eos.cuda()

            representations = []
            for i in range(len(
                    self.elmo._scalar_mixes)):  # len(elmo._scalar_mixes) equals 2 here
                '''
                elmo._scalar_mixes =  [ScalarMix(
                  (scalar_parameters): ParameterList(
                      (0): Parameter containing: [torch.FloatTensor of size 1]
                      (1): Parameter containing: [torch.FloatTensor of size 1]
                      (2): Parameter containing: [torch.FloatTensor of size 1]
                  )
                ), ScalarMix(
                  (scalar_parameters): ParameterList(
                      (0): Parameter containing: [torch.FloatTensor of size 1]
                      (1): Parameter containing: [torch.FloatTensor of size 1]
                      (2): Parameter containing: [torch.FloatTensor of size 1]
                  )
                )]
                '''
                scalar_mix = getattr(self.elmo, 'scalar_mix_{}'.format(i))
                representation_with_bos_eos = scalar_mix(
                    layer_activations, mask_with_bos_eos)
                representation_without_bos_eos, mask_without_bos_eos = remove_sentence_boundaries(
                    representation_with_bos_eos, mask_with_bos_eos)
                representations.append(
                    self.elmo._dropout(representation_without_bos_eos))
                #The loop runs twice, so there are two elements; each has shape [num_sentences, sentence_len, 1024], where sentence_len excludes the boundary tokens
                #In my example the two were numerically identical, so why loop twice?
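                '''
                For reference, a sketch of the idea behind AllenNLP's ScalarMix (not the
                exact library code): a softmax-normalized weighted sum over the biLM
                layers, scaled by a learned gamma. Each ScalarMix has its own weights,
                which is why Elmo keeps one per output representation.

                    s = softmax(scalar_parameters)       # one weight per biLM layer
                    mix = gamma * sum(s[k] * activations[k] for k in range(len(s)))
                '''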

            x1_elmo = representations[0][:, :x1.size(
                1), :]  # x1.size(1) truncates to the document max length
            self.precomputed_cnt += 1

            precomputed_elmo = True
        else:
            precomputed_elmo = False
        """
        x1_full = document word indices        [batch * q_num * len_d]
        x1_full_mask = document padding mask   [batch * q_num * len_d]
        x2_full question word indices          [batch * q_num * len_q]
        x2_full_mask = question padding mask   [batch * q_num * len_q]
        """
        # x1 [batch , len_d]-->unsqueeze(1)-->[batch , 1 , len_d] -->expand-->[batch , num_q , len_d]
        x1_full = x1.unsqueeze(1).expand(
            x2_full.size(0), x2_full.size(1),
            x1.size(1)).contiguous()  # expand the second dimension to the number of questions
        # x1_mask [batch , len_d] --> [batch ,1 , len_d] -->[batch , num_q , len_d]
        x1_full_mask = x1_mask.unsqueeze(1).expand(x2_full.size(0),
                                                   x2_full.size(1),
                                                   x1.size(1)).contiguous()
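        '''
        A tiny shape sketch (made-up sizes) of the expansion above: the same document
        row is repeated once per question, and .contiguous() materializes the copies
        so that a later .view() on the expanded tensor is legal.

            x1_toy = torch.arange(6).view(2, 3)              # [batch=2, len_d=3]
            x1_toy.unsqueeze(1).shape                        # [2, 1, 3]
            x1_toy.unsqueeze(1).expand(2, 4, 3).shape        # [2, q_num=4, 3] (no copy yet)
        '''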

        drnn_input_list, qrnn_input_list = [], [
        ]  # inputs for the document RNN and the question RNN

        x2 = x2_full.view(-1, x2_full.size(
            -1))  #[batch , q_num , len_q] -> [batch * q_num , len_q]
        x2_mask = x2_full_mask.view(-1, x2_full.size(-1))

        if self.opt['use_wemb']:
            # Word embedding for both document and question
            emb = self.embedding if self.training else self.eval_embed
            x1_emb = emb(x1)
            x2_emb = emb(x2)
            # Dropout on embeddings
            if self.opt['dropout_emb'] > 0:
                x1_emb = layers.dropout(x1_emb,
                                        p=self.opt['dropout_emb'],
                                        training=self.training)
                x2_emb = layers.dropout(x2_emb,
                                        p=self.opt['dropout_emb'],
                                        training=self.training)

            drnn_input_list.append(x1_emb)
            qrnn_input_list.append(x2_emb)

        if self.opt['CoVe_opt'] > 0:
            x1_cove_mid, x1_cove_high = self.CoVe(x1, x1_mask)
            x2_cove_mid, x2_cove_high = self.CoVe(x2, x2_mask)
            # Dropout on contextualized embeddings
            if self.opt['dropout_emb'] > 0:
                x1_cove_mid = layers.dropout(x1_cove_mid,
                                             p=self.opt['dropout_emb'],
                                             training=self.training)
                x1_cove_high = layers.dropout(x1_cove_high,
                                              p=self.opt['dropout_emb'],
                                              training=self.training)
                x2_cove_mid = layers.dropout(x2_cove_mid,
                                             p=self.opt['dropout_emb'],
                                             training=self.training)
                x2_cove_high = layers.dropout(x2_cove_high,
                                              p=self.opt['dropout_emb'],
                                              training=self.training)

            drnn_input_list.append(x1_cove_mid)
            qrnn_input_list.append(x2_cove_mid)

        if self.opt['use_elmo']:
            if not precomputed_elmo:
                x1_elmo = self.elmo(x1_c)['elmo_representations'][
                    0]  #torch.zeros(x1_emb.size(0), x1_emb.size(1), 1024, dtype=x1_emb.dtype, layout=x1_emb.layout, device=x1_emb.device)
            x2_elmo = self.elmo(x2_c)['elmo_representations'][
                0]  #torch.zeros(x2_emb.size(0), x2_emb.size(1), 1024, dtype=x2_emb.dtype, layout=x2_emb.layout, device=x2_emb.device)
            # Dropout on contextualized embeddings
            if self.opt['dropout_emb'] > 0:
                x1_elmo = layers.dropout(x1_elmo,
                                         p=self.opt['dropout_emb'],
                                         training=self.training)
                x2_elmo = layers.dropout(x2_elmo,
                                         p=self.opt['dropout_emb'],
                                         training=self.training)

            drnn_input_list.append(x1_elmo)
            qrnn_input_list.append(x2_elmo)

        if self.opt['use_pos']:
            x1_pos_emb = self.pos_embedding(x1_pos)
            drnn_input_list.append(x1_pos_emb)

        if self.opt['use_ner']:
            x1_ner_emb = self.ner_embedding(x1_ner)
            drnn_input_list.append(x1_ner_emb)

        x1_input = torch.cat(drnn_input_list, dim=2)
        x2_input = torch.cat(qrnn_input_list, dim=2)

        def expansion_for_doc(z):
            return z.unsqueeze(1).expand(z.size(0), x2_full.size(1), z.size(1),
                                         z.size(2)).contiguous().view(
                                             -1, z.size(1), z.size(2))
            #[batch * num_q , len_d , emb_dim]

        x1_emb_expand = expansion_for_doc(x1_emb)
        x1_cove_high_expand = expansion_for_doc(x1_cove_high)
        #x1_elmo_expand = expansion_for_doc(x1_elmo)
        if self.opt[
                'no_em']:  #x1_f = document word features indices  [batch * q_num * len_d * nfeat]
            x1_f = x1_f[:, :, :, 3:]

        x1_input = torch.cat([
            expansion_for_doc(x1_input),
            x1_f.view(-1, x1_f.size(-2), x1_f.size(-1))
        ],
                             dim=2)
        x1_mask = x1_full_mask.view(-1, x1_full_mask.size(-1))

        # Interaction Layer (1. flow  2. integration; the two interact)
        if self.opt[
                'do_prealign']:  # x1_emb_expand [batch * num_q, len_d, emb_dim]; emb_dim here is the plain word embedding, not ELMo or CoVe
            # x2_emb [batch * num_q , len_q , emb_dim]
            x1_atten = self.pre_align(
                x1_emb_expand, x2_emb, x2_mask
            )  #self.pre_align = layers.GetAttentionHiddens(embedding_dim, opt['prealign_hidden'], similarity_attention=True)

            x1_input = torch.cat([x1_input, x1_atten], dim=2)  # passage representation weighted by question information

        # === Start processing the dialog ===
        # cur_h: [batch_size * max_qa_pair, context_length, hidden_state]
        # flow : fn (rnn)
        # x1_full: [batch_size, max_qa_pair, context_length]
        def flow_operation(cur_h, flow):  # the flow step: reshape so the RNN runs along the qa_pairs dimension
            # cur_h [batch * max_qa_pair, len_d , hidden * 2] --> [len_d , batch * num_q , hidden * 2] -> [len_d , batch , num_q , hidden * 2]
            flow_in = cur_h.transpose(0, 1).view(x1_full.size(2),
                                                 x1_full.size(0),
                                                 x1_full.size(1), -1)
            #         [len_d , batch , num_q , hidden * 2] -> [num_q ,batch * len_d , hidden * 2] ->[batch * len_d , num_q , hidden * 2]
            flow_in = flow_in.transpose(0, 2).contiguous().view(
                x1_full.size(1),
                x1_full.size(0) * x1_full.size(2), -1).transpose(0, 1)
            # [bsz * context_length, max_qa_pair, hidden_state]
            flow_out = flow(flow_in)
            # [bsz * context_length, max_qa_pair, flow_hidden_state_dim (hidden_state/2)]
            if self.opt['no_dialog_flow']:
                flow_out = flow_out * 0

            flow_out = flow_out.transpose(0, 1).view(x1_full.size(1),
                                                     x1_full.size(0),
                                                     x1_full.size(2),
                                                     -1).transpose(
                                                         0, 2).contiguous()
            flow_out = flow_out.view(x1_full.size(2),
                                     x1_full.size(0) * x1_full.size(1),
                                     -1).transpose(0, 1)
            # [bsz * max_qa_pair, context_length, flow_hidden_state_dim]
            return flow_out
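        '''
        A shape walk-through of flow_operation with illustrative sizes (batch = 2,
        q_num = 3, len_d = 5, hidden * 2 = 8):

            cur_h:    [2 * 3, 5, 8]       one row per (dialog, question) pair
            flow_in:  [2 * 5, 3, 8]       the GRU now runs along the question-turn axis
            flow_out: [2 * 3, 5, hidden]  unidirectional GRU output, reshaped back so it
                                          can be concatenated with the token-level hiddens

        Because the GRU is unidirectional, information only flows from earlier turns
        to later ones.
        '''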

        # Encode document with RNN; Passage and Question Interaction
        doc_abstr_ls = []

        doc_hiddens = self.doc_rnn1(x1_input,
                                    x1_mask)  # [batch * num_q, len_d, hidden * 2]
        doc_hiddens_flow = flow_operation(doc_hiddens, self.dialog_flow1)
        doc_abstr_ls.append(doc_hiddens)

        doc_hiddens = self.doc_rnn2(
            torch.cat((doc_hiddens, doc_hiddens_flow, x1_cove_high_expand),
                      dim=2), x1_mask)
        doc_hiddens_flow = flow_operation(doc_hiddens, self.dialog_flow2)
        doc_abstr_ls.append(doc_hiddens)
        '''
        #with open('flow_bef_att.pkl', 'wb') as output:
        #    pickle.dump(doc_hiddens_flow, output, pickle.HIGHEST_PROTOCOL)
        #while(1):
        #    pass
        '''

        # Encode question with RNN
        _, que_abstr_ls = self.question_rnn(x2_input,
                                            x2_mask,
                                            return_list=True,
                                            additional_x=x2_cove_high)
        # que_abstr_ls contains both question RNN layers; each is [batch * q_num, len_q, hidden * 2]

        # Final question layer
        question_hiddens = self.high_lvl_qrnn(torch.cat(que_abstr_ls, 2),
                                              x2_mask)
        #[batch * num_q , len_q , hidden * 2]
        que_abstr_ls += [question_hiddens]

        # Main Attention Fusion Layer
        doc_info = self.deep_attn(
            [torch.cat([x1_emb_expand, x1_cove_high_expand], 2)], doc_abstr_ls,
            [torch.cat([x2_emb, x2_cove_high], 2)], que_abstr_ls, x1_mask,
            x2_mask)
        # history-aware attention (when updating a given layer of the question representation, all passage and question layers are concatenated to form the query and key)
        # query: all passage layers concatenated, key: all question layers concatenated, value: question_layer[i] when computing the i-th question_layer embedding
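        '''
        A generic sketch of the attention step described above (hypothetical names
        q_proj / k_proj / Q / K / V / key_mask; not the actual layers.DeepAttention
        code): project query and key, build a similarity matrix, mask padding, and
        take a weighted average of the values. F is torch.nn.functional.

            scores = torch.bmm(q_proj(Q), k_proj(K).transpose(1, 2))
            scores.masked_fill_(key_mask.unsqueeze(1), -float('inf'))
            attended = torch.bmm(F.softmax(scores, dim=-1), V)
        '''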

        # After updating the question, attention-weighted averaging yields a tensor aligned with the doc along len_d, which is concatenated onto the doc representation from the second flow layer
        doc_hiddens = self.deep_attn_rnn(
            torch.cat((doc_info, doc_hiddens_flow), dim=2), x1_mask)  # result after the RNN
        doc_hiddens_flow = flow_operation(doc_hiddens, self.dialog_flow3)
        doc_abstr_ls += [doc_hiddens]

        # Self Attention Fusion Layer
        # For Passage do self attention
        x1_att = torch.cat(
            doc_abstr_ls,
            2)  # x1_att concatenates (along hid_dim) all previous passage layers fused with question information
        if self.opt['self_attention_opt'] > 0:
            highlvl_self_attn_hiddens = self.highlvl_self_att(
                x1_att, x1_att, x1_mask, x3=doc_hiddens, drop_diagonal=True)
            # At the third flow: doc_hiddens is the passage run through an RNN along len_d; doc_hiddens_flow is the passage run through an RNN along max_qa_pairs, i.e. the output of the third (and last) flow
            # After concatenation, run an RNN along len_d
            doc_hiddens = self.high_lvl_crnn(
                torch.cat(
                    [doc_hiddens, highlvl_self_attn_hiddens, doc_hiddens_flow],
                    dim=2), x1_mask)
        elif self.opt['self_attention_opt'] == 0:
            doc_hiddens = self.high_lvl_crnn(
                torch.cat([doc_hiddens, doc_hiddens_flow], dim=2), x1_mask)
        doc_abstr_ls += [doc_hiddens]

        # Merge the question hidden vectors
        q_merge_weights = self.self_attn(
            question_hiddens, x2_mask
        )  #question_hiddens is the final question hidden layer [batch * num_q , len_q , hidden * 2]
        # Computes the self-attention weights; this is not true self-attention, but uses an extra vector z whose dot product with each hidden state gives the attention score
        question_avg_hidden = layers.weighted_avg(
            question_hiddens, q_merge_weights)  # weighted average using the self-attention weights
        #[batch , hid]
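        '''
        A minimal sketch of the merging described above (hypothetical names z /
        hiddens / mask, mirroring the comment rather than layers.LinearSelfAttn
        itself): a single learned vector z scores every question position, and the
        hidden states are averaged with the softmaxed scores. F is torch.nn.functional.

            scores = hiddens.matmul(z)                        # [batch * q_num, len_q]
            scores = scores.masked_fill(mask, -float('inf'))
            alpha = F.softmax(scores, dim=-1)
            merged = (alpha.unsqueeze(-1) * hiddens).sum(1)   # [batch * q_num, hid]
        '''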
        if self.opt['do_hierarchical_query']:  # default True
            # hier_query_rnn maps [batch, max_qa_pair, hid] -> [batch, max_qa_pair, hid]; the GRU is unidirectional, so the hidden size stays hid.
            # Does it take the last sentence-level hidden state at the end, or apply attention/pooling?
            question_avg_hidden = self.hier_query_rnn(
                question_avg_hidden.view(x1_full.size(0), x1_full.size(1), -1))
            question_avg_hidden = question_avg_hidden.contiguous().view(
                -1, question_avg_hidden.size(-1))  #[batch * max_qa_pair , hid]

        # Prediction Layer
        # Get Start, End span
        start_scores, end_scores = self.get_answer(doc_hiddens,
                                                   question_avg_hidden,
                                                   x1_mask)
        # both are [batch * q_num, len_d]
        all_start_scores = start_scores.view_as(
            x1_full)  # batch x q_num x len_d
        all_end_scores = end_scores.view_as(x1_full)  # batch x q_num x len_d

        # Get whether there is an answer
        #                           torch.cat( [batch , hidden] ,[batch , hidden]  , dim = 1) -> [batch , 2 * hidden]
        doc_avg_hidden = torch.cat(
            (torch.max(doc_hiddens, dim=1)[0], torch.mean(doc_hiddens, dim=1)),
            dim=1)
        # Predict the answer type
        class_scores = self.ans_type_prediction(doc_avg_hidden,
                                                question_avg_hidden)
        all_class_scores = class_scores.view(x1_full.size(0), x1_full.size(1),
                                             -1)  # batch x q_num x class_num
        all_class_scores = all_class_scores.squeeze(-1)  # when class_num = 1
        # all_class_scores is not softmax-normalized over the class_num dimension, presumably to handle class_num = 1: with a single class, softmax would always give 1 regardless of the raw score
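        '''
        Quick check of the point above (F is torch.nn.functional): softmax over a
        size-1 class dimension is a constant, so the raw score has to be kept.

            F.softmax(torch.tensor([[3.7]]), dim=-1)   # tensor([[1.]]) whatever the input
        '''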

        return all_start_scores, all_end_scores, all_class_scores
Example #4
0
class FlowQA(nn.Module):
    """Network for the FlowQA Module."""
    def __init__(self, opt, embedding=None, padding_idx=0):
        super(FlowQA, self).__init__()

        # Input size to RNN: word emb + char emb + question emb + manual features
        doc_input_size = 0
        que_input_size = 0

        layers.set_my_dropout_prob(opt['my_dropout_p'])
        layers.set_seq_dropout(opt['do_seq_dropout'])

        if opt['use_wemb']:
            # Word embeddings
            self.embedding = nn.Embedding(opt['vocab_size'],
                                          opt['embedding_dim'],
                                          padding_idx=padding_idx)
            if embedding is not None:
                self.embedding.weight.data = embedding
                # default: fix_embeddings=False; tune_partial=1000; embedding.size(0)=89349
                if opt['fix_embeddings'] or opt['tune_partial'] == 0:
                    opt['fix_embeddings'] = True
                    opt['tune_partial'] = 0
                    for p in self.embedding.parameters():
                        p.requires_grad = False
                else:
                    assert opt['tune_partial'] < embedding.size(0)
                    fixed_embedding = embedding[opt['tune_partial']:]
                    # a persistent buffer for the nn.Module
                    # Buffers are updated in forward; optim.step only updates parameters of type nn.Parameter.
                    self.register_buffer('fixed_embedding', fixed_embedding)
                    self.fixed_embedding = fixed_embedding
            embedding_dim = opt['embedding_dim']
            doc_input_size += embedding_dim
            que_input_size += embedding_dim
        else:
            opt['fix_embeddings'] = True
            opt['tune_partial'] = 0
        # default:CoVe_opt=1
        if opt['CoVe_opt'] > 0:
            self.CoVe = layers.MTLSTM(opt, embedding)
            CoVe_size = self.CoVe.output_size
            # CoVe size = 600
            print('CoVe size:', CoVe_size)
            doc_input_size += CoVe_size
            que_input_size += CoVe_size
        if opt['use_elmo']:
            options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json"
            weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5"
            options_file = '../MyIdea_v1/elmo/options.json'
            weight_file = '../MyIdea_v1/elmo/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5'
            self.elmo = Elmo(options_file, weight_file, 1, dropout=0)
            doc_input_size += 1024
            que_input_size += 1024
        if opt['use_pos']:
            self.pos_embedding = nn.Embedding(opt['pos_size'], opt['pos_dim'])
            print('pos_dim:', opt['pos_dim'])
            doc_input_size += opt['pos_dim']
        if opt['use_ner']:
            self.ner_embedding = nn.Embedding(opt['ner_size'], opt['ner_dim'])
            print('ner_dim:', opt['ner_dim'])
            doc_input_size += opt['ner_dim']
        # # default: True
        # if opt['do_prealign']:
        #     # default:embedding_dim=300,hidden=300
        #     self.pre_align = layers.GetAttentionHiddens(embedding_dim, opt['prealign_hidden'],
        #                                                 similarity_attention=True)
        #     print('pre_align:', embedding_dim, opt['prealign_hidden'])
        #     doc_input_size += embedding_dim
        if opt['no_em']:
            print('no_em_num_features:', opt['num_features'] - 3)
            doc_input_size += opt['num_features'] - 3
        else:
            print('num_features:', opt['num_features'])
            doc_input_size += opt['num_features']

        # Setup the vector size for [doc, question]
        # they will be modified in the following code
        doc_hidden_size, que_hidden_size = doc_input_size, que_input_size
        # Initially, the vector_sizes [doc, query] are 2252 1924
        print('Initially, the vector_sizes [doc, query] are', doc_hidden_size,
              que_hidden_size)
        self.Wqac = nn.Linear(que_hidden_size * 2, 250)
        if opt['cuda']:
            self.Wqac = self.Wqac.cuda()
        self.CselfAttn = myModel.SelfAttLayer(que_hidden_size + 20, opt)
        self.QselfAttn = myModel.SelfAttLayer(que_hidden_size, opt)
        x1_d = que_hidden_size + 20
        x2_d = que_hidden_size * 2
        attention_hidden_size = que_hidden_size
        self.QACAttn = myModel.QuestionAwareContextLayer(
            myModel.AwareIntegration(
                myModel.AttentionScore(x1_d, x2_d, attention_hidden_size, opt),
                opt), opt)
        # default = 125
        flow_size = opt['hidden_size']

        # RNN question encoder
        self.question_rnn, que_hidden_size = layers.RNN_from_opt(
            que_hidden_size,
            opt['hidden_size'],
            opt,
            num_layers=2,
            concat_rnn=opt['concat_rnn'],
            add_feat=CoVe_size)

        # Question understanding and compression
        self.high_lvl_qrnn, que_hidden_size = layers.RNN_from_opt(
            que_hidden_size * 2,
            opt['hidden_size'],
            opt,
            num_layers=1,
            concat_rnn=True)

        # Self attention on context
        att_size = doc_hidden_size + 2 * opt['hidden_size'] * 2
        # default=1
        if opt['self_attention_opt'] > 0:
            self.highlvl_self_att = layers.GetAttentionHiddens(
                att_size, opt['deep_att_hidden_size_per_abstr'])
            self.high_lvl_crnn, doc_hidden_size = layers.RNN_from_opt(
                doc_hidden_size * 2 + flow_size,
                opt['hidden_size'],
                opt,
                num_layers=1,
                concat_rnn=False)
            print('Self deep-attention {} rays in {}-dim space'.format(
                opt['deep_att_hidden_size_per_abstr'], att_size))
        elif opt['self_attention_opt'] == 0:
            self.high_lvl_crnn, doc_hidden_size = layers.RNN_from_opt(
                doc_hidden_size + flow_size,
                opt['hidden_size'],
                opt,
                num_layers=1,
                concat_rnn=False)

        print('Before answer span finding, hidden size are', doc_hidden_size,
              que_hidden_size)

        # Question merging
        self.self_attn = layers.LinearSelfAttn(que_hidden_size)
        # default = true
        if opt['do_hierarchical_query']:
            self.hier_query_rnn = layers.StackedBRNN(que_hidden_size,
                                                     opt['hidden_size'],
                                                     num_layers=1,
                                                     rnn_type=nn.GRU,
                                                     bidir=False)
            que_hidden_size = opt['hidden_size']

        # Attention for span start/end
        print('doc_hidden_size gsse', doc_hidden_size, que_hidden_size, opt,
              opt['ptr_net_indep_attn'], opt["ptr_net_attn_type"],
              opt['do_ptr_update'])
        self.get_answer = layers.GetSpanStartEnd(doc_hidden_size,
                                                 que_hidden_size, opt,
                                                 opt['ptr_net_indep_attn'],
                                                 opt["ptr_net_attn_type"],
                                                 opt['do_ptr_update'])

        self.ans_type_prediction = layers.BilinearLayer(
            doc_hidden_size * 2, que_hidden_size, opt['answer_type_num'])

        # Store config
        self.opt = opt

    # context_id, context_cid, context_feature, context_tag, context_ent, context_mask,
    # question_id, question_cid, question_mask
    # @pysnooper.snoop(watch=('doc_hiddens.size()', 'x1_input.size()', 'x2_input.size()'))
    def forward(self, x1, x1_c, x1_f, x1_pos, x1_ner, x1_mask, x2_full, x2_c,
                x2_full_mask, overall_mask):
        """Inputs:
        x1 = document word indices                                          [batch * len_d]
        x1_c = document char indices (generated for ELMo, each word padded to length 50)   [batch * len_d * len_w] or [1]
        x1_f = document word features indices                               [batch * q_num * len_d * nfeat]
        x1_pos = document POS tags                                          [batch * len_d]
        x1_ner = document entity tags                                       [batch * len_d]
        x1_mask = document padding mask                                     [batch * len_d]   (0 = real token, 1 = padding)
        x2_full = question word indices                                     [batch * q_num * len_q]
        x2_c = question char indices (generated for ELMo, each word padded to length 50)   [(batch * q_num) * len_q * len_w]
        x2_full_mask = question padding mask                                [batch * q_num * len_q]  (0 = real token, 1 = padding)
        overall_mask = overall_mask[i][j] = 0 if the i-th context's j-th question exists
        """
        # print('in forward:x1_c size:',x1_c.size(),x1_c)
        # precomputing ELMo is only for context (to speedup computation)
        if self.opt['use_elmo'] and self.opt['elmo_batch_size'] > self.opt[
                'batch_size']:  # precomputing ELMo is used
            if x1_c.dim() != 1:  # precomputation is needed
                precomputed_bilm_output = self.elmo._elmo_lstm(x1_c)
                self.precomputed_layer_activations = [
                    t.detach().cpu()
                    for t in precomputed_bilm_output['activations']
                ]
                self.precomputed_mask_with_bos_eos = precomputed_bilm_output[
                    'mask'].detach().cpu()
                self.precomputed_cnt = 0

            # get precomputed ELMo
            layer_activations = [
                t[x1.size(0) * self.precomputed_cnt:x1.size(0) *
                  (self.precomputed_cnt + 1), :, :]
                for t in self.precomputed_layer_activations
            ]
            mask_with_bos_eos = self.precomputed_mask_with_bos_eos[
                x1.size(0) * self.precomputed_cnt:x1.size(0) *
                (self.precomputed_cnt + 1), :]
            if x1.is_cuda:
                layer_activations = [t.cuda() for t in layer_activations]
                mask_with_bos_eos = mask_with_bos_eos.cuda()

            representations = []
            for i in range(len(self.elmo._scalar_mixes)):
                scalar_mix = getattr(self.elmo, 'scalar_mix_{}'.format(i))
                representation_with_bos_eos = scalar_mix(
                    layer_activations, mask_with_bos_eos)
                representation_without_bos_eos, mask_without_bos_eos = remove_sentence_boundaries(
                    representation_with_bos_eos, mask_with_bos_eos)
                representations.append(
                    self.elmo._dropout(representation_without_bos_eos))

            x1_elmo = representations[0][:, :x1.size(1), :]
            self.precomputed_cnt += 1

            precomputed_elmo = True
        else:
            precomputed_elmo = False
        """
        x1_full = document word indices        [batch * q_num * len_d]
        x1_full_mask = document padding mask   [batch * q_num * len_d]
        """
        x1_full = x1.unsqueeze(1).expand(x2_full.size(0), x2_full.size(1),
                                         x1.size(1)).contiguous()
        x1_full_mask = x1_mask.unsqueeze(1).expand(x2_full.size(0),
                                                   x2_full.size(1),
                                                   x1.size(1)).contiguous()

        drnn_input_list, qrnn_input_list = [], []

        x2 = x2_full.view(-1, x2_full.size(-1))
        x2_mask = x2_full_mask.view(-1, x2_full.size(-1))

        if self.opt['use_wemb']:
            # Word embedding for both document and question
            emb = self.embedding if self.training else self.eval_embed
            x1_emb = emb(x1)
            x2_emb = emb(x2)
            # Dropout on embeddings default:0.4
            if self.opt['dropout_emb'] > 0:
                x1_emb = layers.dropout(x1_emb,
                                        p=self.opt['dropout_emb'],
                                        training=self.training)
                x2_emb = layers.dropout(x2_emb,
                                        p=self.opt['dropout_emb'],
                                        training=self.training)

            drnn_input_list.append(x1_emb)
            qrnn_input_list.append(x2_emb)

        if self.opt['CoVe_opt'] > 0:
            x1_cove_mid, x1_cove_high = self.CoVe(x1, x1_mask)
            x2_cove_mid, x2_cove_high = self.CoVe(x2, x2_mask)
            # Dropout on contextualized embeddings
            if self.opt['dropout_emb'] > 0:
                x1_cove_mid = layers.dropout(x1_cove_mid,
                                             p=self.opt['dropout_emb'],
                                             training=self.training)
                x1_cove_high = layers.dropout(x1_cove_high,
                                              p=self.opt['dropout_emb'],
                                              training=self.training)
                x2_cove_mid = layers.dropout(x2_cove_mid,
                                             p=self.opt['dropout_emb'],
                                             training=self.training)
                x2_cove_high = layers.dropout(x2_cove_high,
                                              p=self.opt['dropout_emb'],
                                              training=self.training)

            drnn_input_list.append(x1_cove_mid)
            qrnn_input_list.append(x2_cove_mid)

        if self.opt['use_elmo']:
            if not precomputed_elmo:
                x1_elmo = self.elmo(x1_c)['elmo_representations'][
                    0]  # torch.zeros(x1_emb.size(0), x1_emb.size(1), 1024, dtype=x1_emb.dtype, layout=x1_emb.layout, device=x1_emb.device)
            x2_elmo = self.elmo(x2_c)['elmo_representations'][
                0]  # torch.zeros(x2_emb.size(0), x2_emb.size(1), 1024, dtype=x2_emb.dtype, layout=x2_emb.layout, device=x2_emb.device)
            # Dropout on contextualized embeddings
            if self.opt['dropout_emb'] > 0:
                x1_elmo = layers.dropout(x1_elmo,
                                         p=self.opt['dropout_emb'],
                                         training=self.training)
                x2_elmo = layers.dropout(x2_elmo,
                                         p=self.opt['dropout_emb'],
                                         training=self.training)

            drnn_input_list.append(x1_elmo)
            qrnn_input_list.append(x2_elmo)

        if self.opt['use_pos']:
            x1_pos_emb = self.pos_embedding(x1_pos)
            drnn_input_list.append(x1_pos_emb)

        if self.opt['use_ner']:
            x1_ner_emb = self.ner_embedding(x1_ner)
            drnn_input_list.append(x1_ner_emb)

        x1_input = torch.cat(drnn_input_list, dim=2)
        x2_input = torch.cat(qrnn_input_list, dim=2)
        x1_input = self.CselfAttn(x1_input)
        x2_input = self.QselfAttn(x2_input)

        def expansion_for_doc(z):
            return z.unsqueeze(1).expand(z.size(0), x2_full.size(1), z.size(1),
                                         z.size(2)).contiguous().view(
                                             -1, z.size(1), z.size(2))

        def genTagsfromoverallmask(ovm):
            QCTags = torch.LongTensor(
                overall_mask.size(0) * overall_mask.size(-1)).fill_(0)
            s = 0
            for i, row in enumerate(ovm):
                QCTags[s:s + ovm[i].sum().item()] = i
                s += ovm[i].sum().item()
            return QCTags
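        '''
        A toy trace (made-up mask values) of genTagsfromoverallmask: for each row i,
        the next overall_mask[i].sum() slots of QCTags are set to i, so every question
        slot carries the index of the dialog it belongs to.

            overall_mask = torch.LongTensor([[1, 1, 0],
                                             [1, 0, 0]])
            # -> QCTags = tensor([0, 0, 1, 0, 0, 0])
        '''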

        def test(c, ques, tags):
            QAC = myModel.QuestionAwareContextLayer(contexts=c,
                                                    questions=ques,
                                                    tags=tags,
                                                    opt=self.opt).forward()
            QAC = self.Wqac(QAC)
            return QAC

        # Encode question with RNN
        _, que_abstr_ls = self.question_rnn(x2_input,
                                            x2_mask,
                                            return_list=True,
                                            additional_x=x2_cove_high)

        # Final question layer
        question_hiddens = self.high_lvl_qrnn(torch.cat(que_abstr_ls, 2),
                                              x2_mask)
        que_abstr_ls += [question_hiddens]

        doc_hiddens = self.Wqac(
            self.QACAttn(x1_input, x2_input,
                         genTagsfromoverallmask(overall_mask)))

        # Merge the question hidden vectors
        q_merge_weights = self.self_attn(question_hiddens, x2_mask)
        question_avg_hidden = layers.weighted_avg(question_hiddens,
                                                  q_merge_weights)
        if self.opt['do_hierarchical_query']:
            question_avg_hidden = self.hier_query_rnn(
                question_avg_hidden.view(x1_full.size(0), x1_full.size(1), -1))
            question_avg_hidden = question_avg_hidden.contiguous().view(
                -1, question_avg_hidden.size(-1))
        # Get Start, End span
        x1_mask = x1_full_mask.view(-1, x1_full_mask.size(-1))

        start_scores, end_scores = self.get_answer(doc_hiddens,
                                                   question_avg_hidden,
                                                   x1_mask)
        all_start_scores = start_scores.view_as(
            x1_full)  # batch x q_num x len_d
        all_end_scores = end_scores.view_as(x1_full)  # batch x q_num x len_d

        # Get whether there is an answer
        doc_avg_hidden = torch.cat(
            (torch.max(doc_hiddens, dim=1)[0], torch.mean(doc_hiddens, dim=1)),
            dim=1)
        class_scores = self.ans_type_prediction(doc_avg_hidden,
                                                question_avg_hidden)
        all_class_scores = class_scores.view(x1_full.size(0), x1_full.size(1),
                                             -1)  # batch x q_num x class_num
        all_class_scores = all_class_scores.squeeze(-1)  # when class_num = 1
        return all_start_scores, all_end_scores, all_class_scores