Example #1
    def forward(self, context, question, con_lens, qu_lens, step=0):
        context_t = context.transpose(0, 1).contiguous()  # (batch, T, 2d)
        context_size = context.size()
        question_size = question.size()
        context_len = context_size[0]
        question_len = question_size[0]
        S = Variable(torch.zeros(context_size[1], context_len,
                                 question_len))  # (batch, T, J)
        if config['USE_CUDA']:
            S = S.cuda(context.get_device())
        for t in range(context_len):
            for j in range(question_len):
                c = context[t, :, :]  # (batch, 2d)
                q = question[j, :, :]  # (batch, 2d)
                cat = torch.cat([c, q, c * q], dim=-1)
                att = torch.mm(cat, self.att_w)  # (batch, 1)
                S[:, t, j] = att
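        # exp_mask (defined elsewhere) presumably pushes the scores of padded
        # context/question positions to a large negative value so they vanish
        # after the sigmoid/softmax normalization below.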
        S = exp_mask(S, con_lens, qu_lens)
        if config['sigmoid']:
            c2q_att_w = torch.sigmoid(
                S.view(-1, question_len).contiguous()).view(
                    context_size[1], context_len,
                    question_len).contiguous()  # (batch, T, J)
        else:
            c2q_att_w = self.softmax(
                S.view(-1, question_len).contiguous()).view(
                    context_size[1], context_len,
                    question_len).contiguous()  # (batch, T, J)

        c2q_att = torch.bmm(c2q_att_w,
                            question.transpose(
                                0, 1).contiguous())  # U~: (batch, T, 2d)
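        # Question-to-context attention: for each context step keep its best
        # similarity over all question positions, normalize these scores over the
        # context axis, and pool the context into a single attended vector that is
        # tiled across all T steps (standard BiDAF q2c attention).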
        value, index = torch.max(S, 2)  # value(batch, T)
        # if config['sigmoid']:
        #     value = torch.sigmoid(value)
        # else:
        value = self.softmax(value)
        value = value.unsqueeze(1).expand(
            context_size[1], context_size[0],
            context_size[0])  # value(batch, T, T)
        q2c_att = torch.bmm(value, context_t)  # H~: (batch, T, 2d)
        G = torch.cat(
            [context_t, c2q_att, context_t * c2q_att, context_t * q2c_att],
            dim=2)
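        # get_source_mask (defined elsewhere) presumably builds a sequence-major
        # (max_len, batch, feature) 0/1 mask that zeroes the features of time steps
        # beyond each sequence's true length.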
        mask = Variable(
            get_source_mask(context_size[1], context_size[2] * 4,
                            context_size[0], con_lens))
        mask = mask.transpose(0, 1)
        if config['USE_CUDA']:
            mask = mask.cuda(context.get_device())
        G = G * mask
        # logger.histo_summary('AttentionFlowLayer/output', to_np(G), step)
        if config['gate']:
            gate = torch.matmul(G, self.gate_weight)
            gate = torch.sigmoid(gate)
            G = gate * G
        return G  # (batch, T, 8d)
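The nested loops over t and j in Example #1 fill the similarity matrix S one cell at a time. For reference, here is a minimal vectorized sketch of the same trilinear score S[b, t, j] = [c_t; q_j; c_t * q_j] @ att_w, assuming the shapes used above (context (T, B, 2d), question (J, B, 2d), att_w (6d, 1)); this is an illustrative rewrite, not part of the original code:

import torch

def trilinear_similarity(context, question, att_w):
    # context: (T, B, 2d), question: (J, B, 2d), att_w: (6d, 1) -> S: (B, T, J)
    c = context.transpose(0, 1)                        # (B, T, 2d)
    q = question.transpose(0, 1)                       # (B, J, 2d)
    d2 = c.size(-1)
    w_c, w_q, w_cq = att_w[:d2], att_w[d2:2 * d2], att_w[2 * d2:]
    s_c = c @ w_c                                      # (B, T, 1): c_t . w_c
    s_q = (q @ w_q).transpose(1, 2)                    # (B, 1, J): q_j . w_q
    s_cq = (c * w_cq.squeeze(-1)) @ q.transpose(1, 2)  # (B, T, J): (c_t * q_j) . w_cq
    return s_c + s_q + s_cq                            # broadcasts to (B, T, J)

# Example usage with random tensors (T=5, J=3, B=2, 2d=4):
S = trilinear_similarity(torch.randn(5, 2, 4), torch.randn(3, 2, 4), torch.randn(12, 1))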
Example #2
 def forward(self, h_0, G, con_lens, step=0):
     G_t = G.transpose(0, 1).contiguous()
     M, h_n = self.rnn(G_t, h_0)  # M: (T, batch, 2d)
     # logger.histo_summary('ModelingOutLayer/rnn_output', to_np(M), step)
     size = M.size()
     mask = Variable(get_source_mask(size[1], size[2], size[0], con_lens))
     if config['USE_CUDA']:
         mask = mask.cuda(M.get_device())
     M = M * mask
     M = self.dropout(M)
     return M.transpose(0, 1).contiguous()  # M: (batch, T, 2d)
Example #3
 def forward(self, l_ctx_emb, r_ctx_emb, l_ctx_lens, r_ctx_lens):
     """
     args:
         l_ctx_emb: (B, S, word_emb)
         r_ctx_emb: (B, S, word_emb)
         l_ctx_lens: list
         r_ctx_lens: list
     :return:
     """
     batch_size = l_ctx_emb.size(0)
     h_0 = Variable(
         torch.zeros(fg_config['lstm_layers'] * 2, batch_size,
                     fg_config['hidden_size']))
     c_0 = Variable(
         torch.zeros(fg_config['lstm_layers'] * 2, batch_size,
                     fg_config['hidden_size']))
     if fg_config['USE_CUDA']:
         h_0 = h_0.cuda(fg_config['cuda_num'])
         c_0 = c_0.cuda(fg_config['cuda_num'])
     # (S, B, hidden_size*2)
     l_ctx_lstm, _ = self.l_lstm(l_ctx_emb.transpose(0, 1), (h_0, c_0))
     # (S, B, hidden_size*2)
     r_ctx_lstm, _ = self.r_lstm(r_ctx_emb.transpose(0, 1), (h_0, c_0))
     l_mask = Variable(
         get_source_mask(batch_size, fg_config['hidden_size'] * 2,
                         fg_config['ctx_window_size'], l_ctx_lens))
     r_mask = Variable(
         get_source_mask(batch_size, fg_config['hidden_size'] * 2,
                         fg_config['ctx_window_size'], r_ctx_lens))
     if fg_config['USE_CUDA']:
         l_mask = l_mask.cuda(fg_config['cuda_num'])
         r_mask = r_mask.cuda(fg_config['cuda_num'])
     l_ctx_lstm = l_ctx_lstm * l_mask
     r_ctx_lstm = r_ctx_lstm * r_mask
     l_ctx_lstm = self.dropout(l_ctx_lstm)
     r_ctx_lstm = self.dropout(r_ctx_lstm)
     # (S, B, hidden_size*2)
     return l_ctx_lstm, r_ctx_lstm
Example #4
 def forward(self, input, h_0, seq_length, step=0, name='Q'):
     input_size = input.size()
     word_slice = input[:, :, 0]
     word_emb = self.word_embedding(
         word_slice)  # (batch, seq_length, word_emb)
     trans_emb = nn.functional.relu(self.linear(word_emb))
      out = self.dropout(trans_emb).transpose(0, 1)  # (seq_length, batch, 2*d)
     mask = Variable(
         get_source_mask(input_size[0], config['hidden_size'] * 2,
                         input_size[1], seq_length))
     if config['USE_CUDA']:
         mask = mask.cuda(input.get_device())
     out = out * mask
     return out
Example #5
 def forward(self, M, G, h_0, con_lens):
     """
     :param M: (batch, T, 2d)
     :param G: (batch, T, 8d)
     :param h_0: (num_layers * num_directions, batch, hidden_size)
     :return:  (batch, T)
     """
     M_t = M.transpose(0, 1)
     M_2, h_n = self.rnn(M_t, h_0)  # M_2:(T, batch, 2d)
     size = M_2.size()
     mask = Variable(get_source_mask(size[1], size[2], size[0], con_lens))
     if config['USE_CUDA']:
         mask = mask.cuda(M.get_device())
     M_2 = M_2 * mask
     M_2 = self.dropout(M_2)
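      # Output layer: concatenate the attention output G (8d) with the modeled
      # representation M_2 (2d), score every context position with a linear layer,
      # and mask padded positions before the log-softmax.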
     cat = torch.cat([G, M_2.transpose(0, 1)], 2)  # cat: (batch, T, 10d)
     logits = self.W(cat)  # (batch, T, 1)
     logits = self.dropout(logits)
     logits = logits.squeeze(2)
     logits = exp_mask_2d(logits, con_lens)
     probs = self.logSoftmax(logits)  # (batch, T)
     return probs
Example #6
def crf_eval_one_sen(config, encoder, bidencoder, decoder, this_batch):
    this_batch_num = len(this_batch[3])
    this_batch_max_seq = max(this_batch[3])
    last_hidden = Variable(torch.zeros(1, this_batch_num, config['hidden_size']))
    bid_init_hidden = Variable(torch.zeros(config['decoder_layers'] * 2, this_batch_num, config['hidden_size']))
    word_input = Variable(torch.zeros(this_batch_num, 1).type(torch.LongTensor))

    data = Variable(this_batch[0], volatile=True)
    # target = Variable(this_batch[1], volatile=True)
    length = Variable(torch.LongTensor(this_batch[3]), volatile=True)
    h_0 = Variable(torch.zeros(2, this_batch_num, config['hidden_size'] // 2),
                   volatile=True)  # encoder GRU initial hidden state

    if config['USE_CUDA']:
        last_hidden = last_hidden.cuda(config['cuda_num'])
        word_input = word_input.cuda(config['cuda_num'])
        data = data.cuda(config['cuda_num'])
        # target = target.cuda(config['cuda_num'])
        length = length.cuda(config['cuda_num'])
        h_0 = h_0.cuda(config['cuda_num'])
        bid_init_hidden = bid_init_hidden.cuda(config['cuda_num'])

    encoder_outputs = encoder(0, data, h_0, this_batch[3])
    # encoder_outputs = encoder_outputs.transpose(1, 2)
    # encoder_outputs = encoder_outputs.transpose(0, 1)
    source_mask = Variable(
        get_source_mask(this_batch_num, config['encoder_filter_num'], max(this_batch[3]), this_batch[3]))
    if config['USE_CUDA']:
        source_mask = source_mask.cuda(config['cuda_num'])
    encoder_outputs = encoder_outputs * source_mask
    encoder_outputs = bidencoder(bid_init_hidden, encoder_outputs, this_batch[3])
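    # get_target_mask (defined elsewhere) presumably marks the valid (non-padded)
    # target positions so the CRF decoder ignores padding when decoding.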
    crf_mask = Variable(
        get_target_mask(this_batch_num, max(this_batch[3]), this_batch[3]))
    if config['USE_CUDA']:
        crf_mask = crf_mask.type(torch.cuda.ByteTensor)
    else:
        crf_mask = crf_mask.type(torch.ByteTensor)
    lst_decode = decoder(encoder_outputs, crf_mask)
    return lst_decode
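The Variable(..., volatile=True) pattern used for evaluation above comes from pre-0.4 PyTorch. A minimal sketch of the modern equivalent, using plain tensors created on the target device and torch.no_grad() in place of volatile=True (batch_size, hidden_size, data and the Linear stand-in below are placeholders, not identifiers from this code):

import torch

batch_size, hidden_size = 4, 128          # placeholder sizes
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

h_0 = torch.zeros(2, batch_size, hidden_size // 2, device=device)  # plain tensor, no Variable wrapper
data = torch.randn(batch_size, 10, hidden_size, device=device)     # stand-in for a real batch
encoder = torch.nn.Linear(hidden_size, hidden_size).to(device)     # stand-in for the real encoder

with torch.no_grad():                     # replaces volatile=True: no autograd graph is built
    encoder_outputs = encoder(data)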
Example #7
def train_iteration(logger, config, my_arg, step,
                    encoder, bidencoder, decoder, encoder_optimizer, bidencoder_optimizer, decoder_optimizer, this_batch):
    # encoder_outputs = Variable(torch.randn(config['max_seq_length'], config['batch_size'], config['hidden_size']))
    decoder_optimizer.zero_grad()
    encoder_optimizer.zero_grad()
    this_batch_num = len(this_batch[2])
    this_batch_max_target = max(this_batch[2])
    last_hidden = Variable(torch.zeros(1, this_batch_num, config['hidden_size']))
    bid_init_hidden = Variable(torch.zeros(config['decoder_layers']*2, this_batch_num, config['hidden_size']))
    word_input = Variable(torch.zeros(this_batch_num, 1).type(torch.LongTensor))
    print('seq_length', max(this_batch[3]), 'label_length', this_batch_max_target)  # (output_size, B, 1)

    data = Variable(this_batch[0])
    target = Variable(this_batch[1])
    target_length = Variable(torch.LongTensor(this_batch[2]))
    h_0 = Variable(torch.zeros(2, this_batch_num, config['hidden_size'] // 2))  # encoder GRU initial hidden state

    if config['USE_CUDA']:
        last_hidden = last_hidden.cuda(config['cuda_num'])
        word_input = word_input.cuda(config['cuda_num'])
        data = data.cuda(config['cuda_num'])
        target = target.cuda(config['cuda_num'])
        target_length = target_length.cuda(config['cuda_num'])
        h_0 = h_0.cuda(config['cuda_num'])
        bid_init_hidden = bid_init_hidden.cuda(config['cuda_num'])

    encoder_outputs = encoder(step, data, h_0, this_batch[3])
    source_mask = Variable(get_source_mask(this_batch_num, config['encoder_filter_num'], max(this_batch[3]), this_batch[3]))
    if config['USE_CUDA']:
        source_mask = source_mask.cuda(config['cuda_num'])
    encoder_outputs = encoder_outputs * source_mask
    encoder_outputs = bidencoder(bid_init_hidden, encoder_outputs, this_batch[3])

    seq_label_prob = Variable(torch.zeros(this_batch_max_target, this_batch_num, config['decoder_output_size']))
    if config['USE_CUDA']:
        seq_label_prob = seq_label_prob.cuda(config['cuda_num'])

    rate = schedule_samp_rate(step)
    # rate=0
    for time_step in range(this_batch_max_target):
        label_logits, cur_hidden = decoder(step, word_input, last_hidden, encoder_outputs[time_step])
        last_hidden = cur_hidden
        seq_label_prob[time_step] = label_logits
        # Choose top word from label_prob
        # value, label = label_prob.topk(1)
        # decoder_out_label.append(label)
        # not teacher-forcing
        # word_input = label

        # teacher-forcing
        if my_arg == 0:
            word_input = target[:, time_step]
        else:
            # value, label = label_logits.data.topk(1)
            # decoder_out_label.append(label)
            # word_input = Variable(label)  # Chosen word is next input
            # if config['USE_CUDA']:
            #     word_input = word_input.cuda(config['cuda_num'])
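            # Scheduled sampling: random_pick (defined elsewhere) presumably
            # returns 0 with probability `rate`, in which case the gold label is
            # fed to the decoder (teacher forcing); otherwise the decoder's own
            # top prediction becomes the next input.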
            a = random_pick([0, 1], [rate, 1 - rate])
            if a == 0:
                word_input = target[:, time_step]
            else:
                value, label = label_logits.data.topk(1)
                # decoder_out_label.append(label)
                word_input = Variable(label)  # Chosen word is next input
                if config['USE_CUDA']:
                    word_input = word_input.cuda(config['cuda_num'])

    loss = masked_cross_entropy(seq_label_prob.transpose(0, 1).contiguous(), target, target_length)
    # loss = masked_cross_entropy(F.softmax(decoder_prob.transpose(0,1).contiguous()), target, length)
    print('loss: ', loss.data[0])
    logger.scalar_summary('loss', loss.data[0], step)
    loss.backward()
    e_before_step = [(tag, to_np(value)) for tag, value in encoder.named_parameters()]
    b_before_step = [(tag, to_np(value)) for tag, value in bidencoder.named_parameters()]
    d_before_step = [(tag, to_np(value)) for tag, value in decoder.named_parameters()]

    clip_grad_norm(decoder.parameters(), config['clip_norm'])
    clip_grad_norm(encoder.parameters(), config['clip_norm'])
    decoder_optimizer.step()
    encoder_optimizer.step()
    bidencoder_optimizer.step()
    e_after_step = [(tag, to_np(value)) for tag, value in encoder.named_parameters()]
    b_after_step = [(tag, to_np(value)) for tag, value in bidencoder.named_parameters()]
    d_after_step = [(tag, to_np(value)) for tag, value in decoder.named_parameters()]
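    # Log the relative update magnitude ||w_after - w_before|| / ||w_before|| of
    # every parameter so the effective step size per layer can be monitored.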
    for before, after in zip(e_before_step, e_after_step):
        if before[0] == after[0]:
            tag = before[0]
            value = LA.norm(after[1] - before[1]) / LA.norm(before[1])
            tag = tag.replace('.', '/')
            if value is not None:
                logger.scalar_summary(tag + '/grad_ratio', value, step)
    for before, after in zip(b_before_step, b_after_step):
        if before[0] == after[0]:
            tag = before[0]
            value = LA.norm(after[1] - before[1]) / LA.norm(before[1])
            tag = tag.replace('.', '/')
            if value is not None:
                logger.scalar_summary(tag + '/grad_ratio', value, step)

    for before, after in zip(d_before_step, d_after_step):
        if before[0] == after[0]:
            tag = before[0]
            value = LA.norm(after[1] - before[1]) / LA.norm(before[1])
            tag = tag.replace('.', '/')
            if value is not None:
                logger.scalar_summary(tag + '/grad_ratio', value, step)
Example #8
def train_iteration(step, encoder, decoder, encoder_optimizer,
                    decoder_optimizer, this_batch):
    # encoder_outputs = Variable(torch.randn(config['max_seq_length'], config['batch_size'], config['hidden_size']))
    decoder_optimizer.zero_grad()
    encoder_optimizer.zero_grad()
    this_batch_num = len(this_batch[2])
    this_batch_max_seq = max(this_batch[2])
    last_hidden = Variable(
        torch.zeros(config['decoder_layers'], this_batch_num,
                    config['hidden_size']))
    word_input = Variable(
        torch.zeros(this_batch_num, 1).type(torch.LongTensor))

    data = Variable(this_batch[0])
    target = Variable(this_batch[1])
    length = Variable(torch.LongTensor(this_batch[2]))
    h_0 = Variable(torch.zeros(2, this_batch_num, config['hidden_size'] // 2))  # encoder GRU initial hidden state
    print('seq_length', max(this_batch[3]), 'label_length', this_batch_max_seq)  # (output_size, B, 1)

    if config['USE_CUDA']:
        last_hidden = last_hidden.cuda(config['multi_cuda'][0])
        word_input = word_input.cuda(config['multi_cuda'][0])
        data = data.cuda(config['multi_cuda'][0])
        target = target.cuda(config['multi_cuda'][0])
        length = length.cuda(config['multi_cuda'][0])
        h_0 = h_0.cuda(config['multi_cuda'][0])

    step = Variable(torch.ones(len(config['multi_cuda']), 1) * step)
    h_0 = h_0.transpose(0, 1)
    batch_seq_len = Variable(torch.LongTensor(this_batch[3])).unsqueeze(1)
    if config['use_multi']:
        step = step.cuda(config['multi_cuda'][0])
        batch_seq_len = batch_seq_len.cuda(config['multi_cuda'][0])
    encoder_outputs = encoder(step, data, h_0, batch_seq_len)
    encoder_outputs = encoder_outputs.transpose(0, 1)
    # encoder_outputs = encoder_outputs.transpose(1,2)
    # encoder_outputs = encoder_outputs.transpose(0,1)
    source_mask = Variable(
        get_source_mask(this_batch_num, config['encoder_filter_num'],
                        max(this_batch[3]), this_batch[3]))
    if config['USE_CUDA']:
        source_mask = source_mask.cuda(config['multi_cuda'][0])
    encoder_outputs = encoder_outputs * source_mask

    # decoder = BahdanauAttnDecoderRNN(config, config['encoder_outputs_size'], config['hidden_size'], config['decoder_output_size'], config['decoder_layers'])
    # decoder_optimizer = torch.optim.Adadelta(decoder.parameters())
    # encoder_optimizer = torch.optim.Adadelta(encoder.parameters())
    # word_input = Variable(torch.LongTensor([[0], [1]]))
    # target = Variable(torch.LongTensor([[1,0,1,0,1,0,1,0],[0,1,0,1,0,1,0,1]]))  # (batch, max_label_length)

    # length = Variable(torch.LongTensor([5,7]))

    # decoder.cuda(config['cuda_num'])

    # train
    decoder_out_label = []
    encoder_outputs = encoder_outputs.transpose(0, 1)
    seq_label_prob = Variable(
        torch.zeros(this_batch_max_seq, this_batch_num,
                    config['decoder_output_size']))
    if config['USE_CUDA']:
        seq_label_prob = seq_label_prob.cuda(config['multi_cuda'][0])
    for time_step in range(this_batch_max_seq):
        last_hidden = last_hidden.transpose(0, 1)
        label_prob, cur_hidden, attn_weights = decoder(step, word_input,
                                                       last_hidden,
                                                       encoder_outputs)
        cur_hidden = cur_hidden.transpose(0, 1)
        last_hidden = cur_hidden
        seq_label_prob[time_step] = label_prob
        # Choose top word from label_prob
        # value, label = label_prob.topk(1)
        # decoder_out_label.append(label.data)
        # not teacher-forcing
        # word_input = label

        # teacher-forcing
        word_input = target[:, time_step]
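    # Hard-coded toy probabilities; only referenced by the commented-out
    # debugging loss below.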
    decoder_prob = Variable(
        torch.FloatTensor([[[0, 1], [1, 0]], [[0, 1], [1, 0]],
                           [[1, 0], [0, 1]], [[1, 0], [0, 1]],
                           [[0, 1], [1, 0]], [[0, 1], [1, 0]],
                           [[1, 0], [0, 1]], [[1, 0], [0, 1]]]))

    if config['USE_CUDA']:
        decoder_prob = decoder_prob.cuda(config['multi_cuda'][0])

    loss = masked_cross_entropy(
        seq_label_prob.transpose(0, 1).contiguous(), target, length)
    # loss = masked_cross_entropy(F.softmax(decoder_prob.transpose(0,1).contiguous()), target, length)
    print('loss: ', loss.data[0])
    # logger.scalar_summary('loss', loss.data[0], step.data[0, 0])
    loss.backward()
    clip_grad_norm(decoder.parameters(), config['clip_norm'])
    clip_grad_norm(encoder.parameters(), config['clip_norm'])
    # for tag, value in encoder.named_parameters():
    #     tag = tag.replace('.', '/')
    #     tag = tag.replace('module', 'encoder')
    #     logger.histo_summary(tag, to_np(value), step.data[0, 0])
    #     logger.histo_summary(tag + '/grad', to_np(value.grad), step.data[0, 0])
    # for tag, value in decoder.named_parameters():
    #     tag = tag.replace('.', '/')
    #     tag = tag.replace('module', 'decoder')
    #     logger.histo_summary(tag, to_np(value), step.data[0, 0])
    #     logger.histo_summary(tag + '/grad', to_np(value.grad), step.data[0, 0])
    decoder_optimizer.step()
    encoder_optimizer.step()
Example #9
    def forward(self,
                input,
                h_0,
                seq_length,
                word_embedding,
                gemb,
                char_emb,
                char_conv,
                highway,
                step=0,
                name='Q'):
        input_size = input.size()

        outs = []
        word_slice = input[:, :, 0]
        word_emb = word_embedding(word_slice)  # (batch, seq_length, word_emb)
        outs.append(word_emb)
        curr_end = 1

        if self._use_gaz:
            gazStart = curr_end
            gazEnd = gazStart + self.gazsize
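            # Gazetteer features: slice out the one-hot gazetteer block, embed it
            # with `gemb`, and reshape back to (batch, seq_length, gaz embedding size).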
            if config['USE_CUDA']:
                gaz_slice = input[:, :, gazStart:gazEnd]
                gaz_float = gaz_slice.type(torch.cuda.FloatTensor)
                gaz_flat = gaz_float.view(-1, self.gazsize)
                gaz_embedded = gemb(gaz_flat)
                gaz_reshaped = gaz_embedded.view(input_size[0], input_size[1], -1)
                gaz_out = gaz_reshaped.contiguous()
                outs.append(gaz_out)
                # outs.append(self.gemb(input[:, :, gazStart:gazEnd].type(torch.cuda.FloatTensor).view(-1, self.gazsize)).view(input_size[0], input_size[1], -1).contiguous())
            else:
                outs.append(
                    self.gemb(input[:, :, gazStart:gazEnd].type(
                        torch.FloatTensor).view(-1, self.gazsize)).view(
                            input_size[0], input_size[1], -1).contiguous())
            curr_end = gazEnd

        if self._use_char_conv:
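            # Character-level features: embed each token's characters, zero out
            # character padding, run a 1-D convolution over the characters, and
            # max-pool over the character dimension to get one vector per token.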
            chars = input[:, :, curr_end:curr_end + self.char_len].contiguous()
            chars_mask = input[:, :,
                               (curr_end + self.char_len):(curr_end +
                                                           2 * self.char_len)]
            if config['USE_CUDA']:
                chars_mask = chars_mask.type(torch.cuda.FloatTensor)
            else:
                chars_mask = chars_mask.type(torch.FloatTensor)
            chars_size = chars.size()
            char_view = chars.view(
                -1, self.char_len)  # (B*seq_length, max_char_len)
            char_emb_out = char_emb(
                char_view)  # (B*seq_length, max_char_len, char_emb)
            chars_mask = chars_mask.view(
                -1, self.char_len)  # (B*seq_length, max_char_len)
            char_emb_out = char_emb_out * chars_mask.unsqueeze(2).expand_as(
                char_emb_out)

            # char_shape = char_emb_out.shape
            # char_emb_out = char_emb_out.reshape((char_shape[0] * char_shape[1], char_shape[2], 1, char_shape[3]))
            # char_conv_out = self.char_conv.apply(char_emb_out)
            # char_conv_out = self.conv_active.apply(char_conv_out)
            # char_conv_out = char_conv_out.reshape(char_shape)
            # char_conv_out = char_conv_out * chars_mask.dimshuffle(0, 1, 2, 'x')
            # char_conv_out = tensor.max(char_conv_out, axis=2)

            char_emb_out = char_emb_out.transpose(
                1, 2)  # (B*seq_length, char_emb, char_len)
            char_conv_out = char_conv(
                char_emb_out)  # (B*seq_length, out_channel, char_len)
            char_conv_out = self.conv_active(char_conv_out)
            char_conv_out = char_conv_out.transpose(
                1, 2)  # (B*seq_length, char_len, out_channel)
            char_conv_out = char_conv_out * chars_mask.unsqueeze(2).expand_as(
                char_conv_out)  # (B*seq_length, char_len, out_channel)
            char_conv_out, _ = torch.max(char_conv_out, 1)
            char_conv_out = char_conv_out.view(
                chars_size[0], chars_size[1],
                -1)  # (B, seq_length, out_channel)
            outs.append(char_conv_out)
        output = torch.cat(outs, dim=-1)
        mask = Variable(
            get_source_mask(input_size[0], self.out_dim, input_size[1],
                            seq_length))
        if config['USE_CUDA']:
            mask = mask.cuda(input.get_device())
        mask = mask.transpose(0, 1)
        embedded = output * mask  # embedded: (batch, seq_length, emb_size)
        embedded = embedded.view(-1, self.out_dim).contiguous()
        embedded = highway(embedded)
        embedded = embedded.view(input_size[0], input_size[1], -1).contiguous()
        embedded = embedded * mask
        embedded = embedded.transpose(
            0, 1).contiguous()  # embedded: (seq_length, batch, emb_size)
        embedded = self.question_trans(embedded)
        # rnn_output, h_n = self.encode_rnn(embedded, h_0)
        rnn_mask = Variable(
            get_source_mask(input_size[0], self.out_dim * 2, input_size[1],
                            seq_length))
        if config['USE_CUDA']:
            rnn_mask = rnn_mask.cuda(input.get_device())
        rnn_output = embedded * rnn_mask
        if config['use_dropout']:
            rnn_output = self.dropout(rnn_output)
        # logger.histo_summary('EmbeddingLayer/output', to_np(rnn_output), step)
        return rnn_output  # (seq_len, batch, hidden_size(100+100=d) * num_directions(2))
Example #10
def train_iteration(logger, step, embedding_layer, q_word_embedding,
                    q_emb_layer, att_layer, model_layer, ner_hw_layer,
                    ner_out_layer, crf, emb_opt, q_emb_opt, att_opt, model_opt,
                    ner_hw_opt, ner_out_opt, crf_opt, this_batch):
    if not config['freeze']:
        emb_opt.zero_grad()
        att_opt.zero_grad()
        model_opt.zero_grad()
    if config['question_alone']:
        q_emb_opt.zero_grad()
    ner_out_opt.zero_grad()
    crf_opt.zero_grad()
    ner_hw_opt.zero_grad()

    d = config['hidden_size']
    this_batch_num = len(this_batch[2])

    question = Variable(this_batch[4])
    question_lengths = this_batch[5]
    context = Variable(this_batch[0])  # (batch, T, 51)
    context_lengths = this_batch[3]  # list
    target = Variable(this_batch[1])  # (batch, T)
    emb_h_0 = Variable(torch.zeros(2, this_batch_num, d))
    model_out_h_0 = Variable(
        torch.zeros(2 * model_layer.num_layers, this_batch_num, d))
    con_lens_var = Variable(torch.LongTensor(context_lengths))

    if config['USE_CUDA']:
        question = question.cuda(config['cuda_num'])
        context = context.cuda(config['cuda_num'])
        target = target.cuda(config['cuda_num'])
        emb_h_0 = emb_h_0.cuda(config['cuda_num'])
        model_out_h_0 = model_out_h_0.cuda(config['cuda_num'])
        con_lens_var = con_lens_var.cuda(config['cuda_num'])

    c_emb = embedding_layer(context, emb_h_0, context_lengths, step, name='C')
    if config['question_alone']:
        q_emb = q_emb_layer(question,
                            emb_h_0,
                            question_lengths,
                            step,
                            name='Q')
    else:
        q_emb = embedding_layer(question, emb_h_0, question_lengths, step,
                                q_word_embedding, 'Q')
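    # BiDAF-style pipeline: contextual embeddings -> attention flow layer (G) ->
    # modeling layer (M) -> optional highway transform -> output layer -> CRF loss.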
    G = att_layer(c_emb, q_emb, context_lengths, question_lengths, step)
    M = model_layer(model_out_h_0, G, context_lengths, step)
    if config['not_pretrain']:
        M_trans = M
        G_trans = G
    else:
        M_trans, G_trans = ner_hw_layer(M, G)
    prob = ner_out_layer(M_trans, G_trans, context_lengths)
    prob_size = prob.size()
    mask = Variable(
        get_source_mask(prob_size[0], prob_size[2], prob_size[1],
                        context_lengths))
    mask = mask.transpose(0, 1)
    if config['USE_CUDA']:
        mask = mask.cuda(context.get_device())
    prob = prob * mask
    crf_mask = Variable(
        get_target_mask(this_batch_num, max(context_lengths), context_lengths))
    if config['USE_CUDA']:
        crf_mask = crf_mask.type(torch.cuda.ByteTensor)
        crf_mask = crf_mask.cuda(config['cuda_num'])
    else:
        crf_mask = crf_mask.type(torch.ByteTensor)
    loss = crf.neg_log_likelihood(
        prob.transpose(0, 1).contiguous(), target.transpose(0, 1), crf_mask,
        context_lengths)
    # loss = masked_cross_entropy(prob, target, con_lens_var)
    if step % 100 == 0:
        print('loss: ', loss.data[0])
    logger.scalar_summary('loss', loss.data[0], step)
    loss.backward()
    # e_before_step = [(tag, to_np(value)) for tag, value in embedding_layer.named_parameters()]
    # a_before_step = [(tag, to_np(value)) for tag, value in att_layer.named_parameters()]
    # m_before_step = [(tag, to_np(value)) for tag, value in model_layer.named_parameters()]
    # h_before_step = [(tag, to_np(value)) for tag, value in ner_hw_layer.named_parameters()]
    # n_before_step = [(tag, to_np(value)) for tag, value in ner_out_layer.named_parameters()]
    # c_before_step = [(tag, to_np(value)) for tag, value in crf.named_parameters()]
    # q_before_step = [(tag, to_np(value)) for tag, value in q_emb_layer.named_parameters()]

    clip_grad_norm(embedding_layer.parameters(), config['clip_norm'])
    clip_grad_norm(att_layer.parameters(), config['clip_norm'])
    clip_grad_norm(model_layer.parameters(), config['clip_norm'])
    clip_grad_norm(ner_hw_layer.parameters(), config['clip_norm'])
    clip_grad_norm(ner_out_layer.parameters(), config['clip_norm'])
    clip_grad_norm(crf.parameters(), config['clip_norm'])
    if config['question_alone']:
        clip_grad_norm(q_emb_layer.parameters(), config['clip_norm'])
    # for tag, value in embedding_layer.named_parameters():
    #     tag = tag.replace('.', '/')
    #     if value is not None and value.grad is not None:
    #         logger.histo_summary(tag, to_np(value), step)
    #         logger.histo_summary(tag + '/grad', to_np(value.grad), step)
    # for tag, value in att_layer.named_parameters():
    #     tag = tag.replace('.', '/')
    #     if value is not None and value.grad is not None:
    #         logger.histo_summary(tag, to_np(value), step)
    #         logger.histo_summary(tag + '/grad', to_np(value.grad), step)
    # for tag, value in model_out_layer.named_parameters():
    #     tag = tag.replace('.', '/')
    #     if value is not None and value.grad is not None:
    #         logger.histo_summary(tag, to_np(value), step)
    #         logger.histo_summary(tag + '/grad', to_np(value.grad), step)

    # for tag, value in ner_out_layer.named_parameters():
    #     tag = tag.replace('.', '/')
    #     if value is not None and value.grad is not None:
    #         logger.histo_summary(tag, to_np(value), step)
    #         logger.histo_summary(tag + '/grad', to_np(value.grad), step)
    #
    # for tag, value in crf.named_parameters():
    #     tag = tag.replace('.', '/')
    #     if value is not None and value.grad is not None:
    #         logger.histo_summary(tag, to_np(value), step)
    #         logger.histo_summary(tag + '/grad', to_np(value.grad), step)
    if not config['freeze']:
        emb_opt.step()
        att_opt.step()
        model_opt.step()
    ner_hw_opt.step()
    ner_out_opt.step()
    crf_opt.step()
    if config['question_alone']:
        q_emb_opt.step()

    grad_ratio_lst = []
Example #11
def train_iteration(logger, config, my_arg, step, encoder, decoder,
                    encoder_optimizer, decoder_optimizer, this_batch):
    # encoder_outputs = Variable(torch.randn(config['max_seq_length'], config['batch_size'], config['hidden_size']))
    decoder_optimizer.zero_grad()
    encoder_optimizer.zero_grad()
    this_batch_num = len(this_batch[2])
    this_batch_max_target = max(this_batch[2])
    last_hidden = Variable(
        torch.zeros(config['decoder_layers'], this_batch_num,
                    config['hidden_size']))
    word_input = Variable(
        torch.zeros(this_batch_num, 1).type(torch.LongTensor))
    print('seq_length', max(this_batch[3]), 'label_length', this_batch_max_target)  # (output_size, B, 1)

    data = Variable(this_batch[0])
    target = Variable(this_batch[1])
    target_length = Variable(torch.LongTensor(this_batch[2]))
    h_0 = Variable(torch.zeros(2, this_batch_num, config['hidden_size'] // 2))  # encoder GRU initial hidden state

    if config['USE_CUDA']:
        last_hidden = last_hidden.cuda(config['cuda_num'])
        word_input = word_input.cuda(config['cuda_num'])
        data = data.cuda(config['cuda_num'])
        target = target.cuda(config['cuda_num'])
        target_length = target_length.cuda(config['cuda_num'])
        h_0 = h_0.cuda(config['cuda_num'])

    encoder_outputs = encoder(step, data, h_0, this_batch[3])
    # encoder_outputs = encoder_outputs.transpose(1,2)
    # encoder_outputs = encoder_outputs.transpose(0,1)
    source_mask = Variable(
        get_source_mask(this_batch_num, config['encoder_filter_num'],
                        max(this_batch[3]), this_batch[3]))
    if config['USE_CUDA']:
        source_mask = source_mask.cuda(config['cuda_num'])
    encoder_outputs = encoder_outputs * source_mask

    # decoder = BahdanauAttnDecoderRNN(config, config['encoder_outputs_size'], config['hidden_size'], config['decoder_output_size'], config['decoder_layers'])
    # decoder_optimizer = torch.optim.Adadelta(decoder.parameters())
    # encoder_optimizer = torch.optim.Adadelta(encoder.parameters())
    # word_input = Variable(torch.LongTensor([[0], [1]]))
    # target = Variable(torch.LongTensor([[1,0,1,0,1,0,1,0],[0,1,0,1,0,1,0,1]]))  # (batch, max_label_length)

    # length = Variable(torch.LongTensor([5,7]))

    # decoder.cuda(config['cuda_num'])

    # train
    decoder_out_label = []
    seq_label_prob = Variable(
        torch.zeros(this_batch_max_target, this_batch_num,
                    config['decoder_output_size']))
    if config['USE_CUDA']:
        seq_label_prob = seq_label_prob.cuda(config['cuda_num'])

    rate = schedule_samp_rate(step)
    # rate=0
    for time_step in range(this_batch_max_target):
        label_logits, cur_hidden = decoder(step, word_input, last_hidden,
                                           encoder_outputs[time_step])
        last_hidden = cur_hidden
        seq_label_prob[time_step] = label_logits
        # Choose top word from label_prob
        # value, label = label_prob.topk(1)
        # decoder_out_label.append(label)
        # not teacher-forcing
        # word_input = label

        # teacher-forcing
        if my_arg == 0:
            word_input = target[:, time_step]
        else:
            # value, label = label_logits.data.topk(1)
            # # decoder_out_label.append(label)
            # word_input = Variable(label)  # Chosen word is next input
            # if config['USE_CUDA']:
            #     word_input = word_input.cuda(config['cuda_num'])
            a = random_pick([0, 1], [rate, 1 - rate])
            if a == 0:
                word_input = target[:, time_step]
            else:
                value, label = label_logits.data.topk(1)
                # decoder_out_label.append(label)
                word_input = Variable(label)  # Chosen word is next input
                if config['USE_CUDA']:
                    word_input = word_input.cuda(config['cuda_num'])

    # decoder_prob = Variable(torch.FloatTensor([[[0,1],[1,0]],[[0,1],[1,0]],[[1,0],[0,1]],[[1,0],[0,1]], [[0,1],[1,0]],[[0,1],[1,0]],[[1,0],[0,1]],[[1,0],[0,1]]]))
    #
    # if config['USE_CUDA']:
    #     decoder_prob = decoder_prob.cuda(config['cuda_num'])

    loss = masked_cross_entropy(
        seq_label_prob.transpose(0, 1).contiguous(), target, target_length)
    # loss = masked_cross_entropy(F.softmax(decoder_prob.transpose(0,1).contiguous()), target, length)
    print('loss: ', loss.data[0])
    logger.scalar_summary('loss', loss.data[0], step)
    loss.backward()
    # e_before_step = [(tag, to_np(value)) for tag, value in encoder.named_parameters()]
    # d_before_step = [(tag, to_np(value)) for tag, value in decoder.named_parameters()]

    clip_grad_norm(decoder.parameters(), config['clip_norm'])
    clip_grad_norm(encoder.parameters(), config['clip_norm'])
    # for tag, value in encoder.named_parameters():
    #     tag = tag.replace('.', '/')
    #     if value is not None and value.grad is not None:
    #         logger.histo_summary(tag, to_np(value), step)
    #         logger.histo_summary(tag + '/grad', to_np(value.grad), step)
    # for tag, value in decoder.named_parameters():
    #     tag = tag.replace('.', '/')
    #     if value is not None and value.grad is not None:
    #         logger.histo_summary(tag, to_np(value), step)
    #         logger.histo_summary(tag + '/grad', to_np(value.grad), step)
    decoder_optimizer.step()
    encoder_optimizer.step()
Example #12
    def forward(self, step, input, h_0, seq_length):
        self.encoder_gru.flatten_parameters()

        input_size = input.size()

        outs = []
        for idx in range(self.embeddings):
            # outs.append(emb.apply(input_[:,:,idx:(idx+1)]))
            emb = getattr(self, 'embedding'+str(idx))
            word_slice = input[:, :, idx]
            word_emb = emb(word_slice)
            outs.append(word_emb)

        curr_end = self.embeddings
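        # The input's last dimension packs several feature blocks side by side
        # (word ids, optional gazetteer one-hots, optional char ids plus char
        # mask); curr_end tracks the offset of the next block.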
        if self._use_gaz:
            gazStart = curr_end
            gazEnd = gazStart + self.gazsize
            if config['USE_CUDA']:
                outs.append(self.gemb(input[:, :, gazStart:gazEnd].type(torch.cuda.FloatTensor).view(-1, self.gazsize)).view(input_size[0], input_size[1], -1).contiguous())
            else:
                outs.append(self.gemb(input[:, :, gazStart:gazEnd].type(torch.FloatTensor).view(-1, self.gazsize)).view(input_size[0], input_size[1], -1).contiguous())
            curr_end = gazEnd

        if self._use_char_conv:
            chars = input[:, :, curr_end:curr_end + self.char_len].contiguous()
            chars_mask = input[:, :, (curr_end + self.char_len):(curr_end + 2 * self.char_len)]
            if config['USE_CUDA']:
                chars_mask = chars_mask.type(torch.cuda.FloatTensor)
            else:
                chars_mask = chars_mask.type(torch.FloatTensor)
            chars_size = chars.size()
            char_view = chars.view(-1, self.char_len)
            char_emb_out = self.char_emb(char_view)


            # char_shape = char_emb_out.shape
            # char_emb_out = char_emb_out.reshape((char_shape[0] * char_shape[1], char_shape[2], 1, char_shape[3]))
            # char_conv_out = self.char_conv.apply(char_emb_out)
            # char_conv_out = self.conv_active.apply(char_conv_out)
            # char_conv_out = char_conv_out.reshape(char_shape)
            # char_conv_out = char_conv_out * chars_mask.dimshuffle(0, 1, 2, 'x')
            # char_conv_out = tensor.max(char_conv_out, axis=2)

            char_emb_out = char_emb_out.transpose(1, 2)
            char_conv_out = self.char_conv(char_emb_out)
            char_conv_out = self.conv_active(char_conv_out)
            char_conv_out = char_conv_out.transpose(1, 2)
            chars_mask = chars_mask.view(-1, self.char_len)
            char_conv_out = char_conv_out * chars_mask.unsqueeze(2).expand_as(char_conv_out)
            char_conv_out, _ = torch.max(char_conv_out, 1)
            char_conv_out = char_conv_out.view(chars_size[0], chars_size[1], -1)
            outs.append(char_conv_out)


        output = torch.cat(outs, dim=-1)
        mask = Variable(get_source_mask(input_size[0], self.out_dim, input_size[1], seq_length))
        if config['USE_CUDA']:
            mask = mask.cuda(config['cuda_num'])
        if config['use_multi']:
            mask = mask.cuda(input.get_device())
        mask = mask.transpose(0, 1)
        embedded = output * mask
        embedded = self.dropout(embedded)
        # logger.histo_summary('embedded', to_np(embedded), step)


        # embedded = self.embedding0(input)  # embedded: (batch, seq_length, emb_size)
        output = embedded.transpose(1, 2)  # embedded: (batch, emb_size, seq_length)
        for i in range(3):
            conv = getattr(self, 'conv'+str(i))
            output = self.relu(conv(output))  # output: (batch, encoder_filter_num, seq_length)
            output = self.dropout(output)

        output = output.transpose(1, 2)
        output = output.transpose(0, 1).contiguous()  # output: (seq_length, batch, encoder_filter_num)
        # return self.dropout(output)

        gru_output, h_n = self.encoder_gru(output, h_0)
        return self.dropout(gru_output)  # (seq_len, batch, hidden_size * num_directions)
Example #13
def evaluate_one(step,
                 embedding_layer,
                 q_word_embedding,
                 q_emb_layer,
                 att_layer,
                 model_layer,
                 ner_hw_layer,
                 ner_out_layer,
                 crf,
                 this_batch,
                 summary_emb=False,
                 all_emb=None,
                 all_metadata=None):

    d = config['hidden_size']
    this_batch_num = len(this_batch[2])
    question = Variable(this_batch[4])
    question_lengths = this_batch[5]
    context = Variable(this_batch[0], volatile=True)  # (batch, T, 51)
    context_lengths = this_batch[3]  # list
    target = Variable(this_batch[1], volatile=True)  # (batch, T)
    emb_h_0 = Variable(torch.zeros(2, this_batch_num, d), volatile=True)
    model_out_h_0 = Variable(torch.zeros(2 * model_layer.num_layers,
                                         this_batch_num, d),
                             volatile=True)
    con_lens_var = Variable(torch.LongTensor(context_lengths), volatile=True)

    if config['USE_CUDA']:
        question = question.cuda(config['cuda_num'])
        context = context.cuda(config['cuda_num'])
        target = target.cuda(config['cuda_num'])
        emb_h_0 = emb_h_0.cuda(config['cuda_num'])
        model_out_h_0 = model_out_h_0.cuda(config['cuda_num'])
        con_lens_var = con_lens_var.cuda(config['cuda_num'])

    c_emb = embedding_layer(context, emb_h_0, context_lengths, step, name='C')
    if config['question_alone']:
        q_emb = q_emb_layer(question,
                            emb_h_0,
                            question_lengths,
                            step,
                            name='Q')
    else:
        q_emb = embedding_layer(question, emb_h_0, question_lengths, step,
                                q_word_embedding, 'Q')

    if summary_emb:
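        # Collect the context embeddings and the corresponding tokens (each token
        # tagged with its sentence), presumably for an embedding-projector style
        # visualization.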
        for i in range(this_batch_num):
            sentence = ''
            metadata = []
            for tokenId, token in enumerate(context[i]):
                if tokenId >= context_lengths[i]:
                    break
                word = config['WordId'].getWord(token[0].data.cpu().numpy()[0])
                metadata.append(word)
                if word != '%PADDING%':
                    sentence += ' ' + word
            if step == 0 and i == 0:
                all_emb = c_emb.data.cpu()[:context_lengths[i], i, :]
            else:
                all_emb = torch.cat(
                    [all_emb,
                     c_emb.data.cpu()[:context_lengths[i], i, :]], 0)
            metadata = ['_'.join([word, sentence]) for word in metadata]
            all_metadata.extend(metadata)

    G = att_layer(c_emb, q_emb, context_lengths, question_lengths)
    M = model_layer(model_out_h_0, G, context_lengths, step)
    if config['not_pretrain']:
        M_trans = M
        G_trans = G
    else:
        M_trans, G_trans = ner_hw_layer(M, G)
    # M_trans, G_trans = ner_hw_layer(M, G)
    prob = ner_out_layer(M_trans, G_trans, context_lengths)
    # prob = ner_out_layer(M, G, context_lengths)
    prob_size = prob.size()
    mask = Variable(
        get_source_mask(prob_size[0], prob_size[2], prob_size[1],
                        context_lengths))
    mask = mask.transpose(0, 1)
    if config['USE_CUDA']:
        mask = mask.cuda(context.get_device())
    prob = prob * mask
    crf_mask = Variable(
        get_target_mask(this_batch_num, max(context_lengths), context_lengths))
    if config['USE_CUDA']:
        crf_mask = crf_mask.type(torch.cuda.ByteTensor)
        crf_mask = crf_mask.cuda(config['cuda_num'])
    else:
        crf_mask = crf_mask.type(torch.ByteTensor)
    lst_decode = crf(
        prob.transpose(0, 1).contiguous(), crf_mask, context_lengths)
    # value, rec_label = torch.max(prob.data, 2)
    if summary_emb:
        return lst_decode, all_emb, all_metadata, q_emb[:, 0, :].data.cpu()
    else:
        return lst_decode
Example #14
def eva_one_sentence_vib(encoder, decoder, this_batch):
    this_batch_num = len(this_batch[3])
    this_batch_max_seq = max(this_batch[3])
    last_hidden = Variable(
        torch.zeros(config['decoder_layers'], this_batch_num,
                    config['hidden_size']))
    word_input = Variable(
        torch.zeros(this_batch_num, 1).type(torch.LongTensor))

    data = Variable(this_batch[0], volatile=True)
    target = Variable(this_batch[1], volatile=True)
    length = Variable(torch.LongTensor(this_batch[3]), volatile=True)
    h_0 = Variable(torch.zeros(2, this_batch_num, config['hidden_size'] // 2),
                   volatile=True)  # encoder GRU initial hidden state

    if config['USE_CUDA']:
        last_hidden = last_hidden.cuda(config['cuda_num'])
        word_input = word_input.cuda(config['cuda_num'])
        data = data.cuda(config['cuda_num'])
        target = target.cuda(config['cuda_num'])
        length = length.cuda(config['cuda_num'])
        h_0 = h_0.cuda(config['cuda_num'])

    encoder_outputs = encoder(0, data, h_0, this_batch[3])
    # encoder_outputs = encoder_outputs.transpose(1, 2)
    # encoder_outputs = encoder_outputs.transpose(0, 1)
    source_mask = Variable(
        get_source_mask(this_batch_num, config['encoder_filter_num'],
                        max(this_batch[3]), this_batch[3]))
    if config['USE_CUDA']:
        source_mask = source_mask.cuda(config['cuda_num'])
    encoder_outputs = encoder_outputs * source_mask

    # encoder_outputs = Variable(torch.randn(config['max_seq_length'], config['batch_size'], config['encoder_outputs_size']))
    # last_hidden = Variable(torch.randn(config['decoder_layers'], config['batch_size'], config['hidden_size']))
    # decoder = BahdanauAttnDecoderRNN(config, config['encoder_outputs_size'], config['hidden_size'], config['decoder_output_size'], config['decoder_layers'])
    # decoder.load_state_dict(torch.load('net_params.pkl'))
    # optimizer = torch.optim.Adadelta(decoder.parameters())
    # word_input = Variable(torch.LongTensor([[0], [1]]))
    # if config['USE_CUDA']:
    #     encoder_outputs = encoder_outputs.cuda(config['cuda_num'])
    #     last_hidden = last_hidden.cuda(config['cuda_num'])
    #     word_input = word_input.cuda(config['cuda_num'])
    #     decoder.cuda(config['cuda_num'])

    # evaluate
    beam = [{
        'paths': [[], []],
        'prob': Variable(torch.zeros(this_batch_num, 1)),
        'hidden': Variable(
            torch.randn(config['decoder_layers'], this_batch_num,
                        config['hidden_size']))
    }, {}]  # beam_size*batch_size*([path],hidden)
    beam = []
    tag_size = 18
    for beam_i in range(tag_size):
        prob_init = Variable(torch.zeros(this_batch_num, 1))
        hidden_init = Variable(
            torch.zeros(config['decoder_layers'], this_batch_num,
                        config['hidden_size']))
        if config['USE_CUDA']:
            prob_init = prob_init.cuda(config['cuda_num'])
            hidden_init = hidden_init.cuda(config['cuda_num'])
        one_beam = {'paths': [], 'prob': prob_init, 'hidden': hidden_init}
        for batch_i in range(this_batch_num):
            one_beam['paths'].append([0])
        beam.append(one_beam)

    # beam = [{'paths':[] , 'tails':range(output_size) },{'paths':[] , 'tails':range(output_size)}]

    # print beam
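    # Beam search over tag sequences: each beam entry keeps one partial path per
    # batch element, the accumulated log-probability of that path, and the decoder
    # hidden state that produced it. The loop runs for at most 3 * max_seq_len steps.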
    for time_step in range(this_batch_max_seq * 3):
        # label_prob: (B, output_size)  cur_hidden: (num_layers * num_directions, B, hidden)  att_weights: (B, 1, S)\

        next_prob = []
        cur_hidden_lst = []
        for i, beam_i in enumerate(beam):
            word_input = Variable(torch.LongTensor(this_batch_num, 1).zero_())
            for batch_i in range(len(beam_i['paths'])):
                word_input[batch_i, 0] = beam_i['paths'][batch_i][-1]
            last_hidden = beam_i['hidden']
            if config['USE_CUDA']:
                word_input = word_input.cuda(config['cuda_num'])
                last_hidden = last_hidden.cuda(config['cuda_num'])
            # word_input: (batch, 1)  last_hidden: (layers * directions, batch, hidden) encoder_outputs: (S, B, hidden)
            # label_prob: (B, output_size)  cur_hidden: (num_layers * num_directions, B, hidden)  att_weights: (B, 1, S)
            label_prob, cur_hidden, attn_weights = decoder(
                0, word_input, last_hidden, encoder_outputs)
            cur_hidden_lst.append(cur_hidden)
            log_label_prob = F.log_softmax(label_prob)
            next_prob.append(beam_i['prob'].expand_as(log_label_prob) +
                             log_label_prob)  # (batch_size, output_size)
        # cat = torch.cat(next_prob, 1)  # (batch, outputs_size*beam_size)
        batch_best_indices = []
        for batch_i in range(this_batch_num):
            cat = [prob[batch_i, :].unsqueeze(0) for prob in next_prob]
            cat = torch.cat(cat, 0)
            values, indices = cat.topk(1, 0)  # indices: (1, tag_size)
            batch_best_indices.append(indices.data)

        new_beam = []
        for beam_i in range(tag_size):
            prob_init = Variable(torch.zeros(this_batch_num, 1))
            hidden_init = Variable(
                torch.randn(config['decoder_layers'], this_batch_num,
                            config['hidden_size']))
            if config['USE_CUDA']:
                prob_init = prob_init.cuda(config['cuda_num'])
                hidden_init = hidden_init.cuda(config['cuda_num'])
            one_beam = {'paths': [], 'prob': prob_init, 'hidden': hidden_init}
            new_beam.append(one_beam)
        for i_batch in range(this_batch_num):
            for i_beam in range(tag_size):
                this_beam_num = batch_best_indices[i_batch][0, i_beam]
                a = beam[this_beam_num]['paths'][i_batch][:]
                a.append(i_beam)
                new_beam[i_beam]['paths'].append(a)
                new_beam[i_beam]['hidden'] = cur_hidden_lst[this_beam_num]
                new_beam[i_beam]['prob'][i_batch, 0] = \
                    next_prob[this_beam_num][i_batch, i_beam]

        beam = new_beam
        batch_best_path = []
        for i_batch in range(this_batch_num):
            this_batch_best_path = beam[0]['paths'][i_batch]
            this_batch_best_prob = beam[0]['prob'][i_batch, 0]
            for i_beam in range(tag_size):
                if beam[i_beam]['prob'][i_batch, 0] > this_batch_best_prob:
                    this_batch_best_path = beam[i_beam]['paths'][i_batch]
            batch_best_path.append(this_batch_best_path)

        top_path = batch_best_path[0]
        if top_path.count(
                config['X']
        ) == this_batch_max_seq and top_path[-1] == config['EOS_token']:
            break

        if top_path.count(config['X']) > this_batch_max_seq:
            top_path[-1] = config['EOS_token']
            break

    # print beam[0]['paths']  # (output_size, B, 1)
    return top_path
Example #15
def train_iteration(logger, config, my_arg, step, encoder, decoder, encoder_optimizer, decoder_optimizer, this_batch):
    # encoder_outputs = Variable(torch.randn(config['max_seq_length'], config['batch_size'], config['hidden_size']))
    decoder_optimizer.zero_grad()
    encoder_optimizer.zero_grad()
    this_batch_num = len(this_batch[2])
    this_batch_max_target = max(this_batch[2])
    last_hidden = Variable(torch.zeros(config['decoder_layers']*2, this_batch_num, config['hidden_size']))
    word_input = Variable(torch.zeros(this_batch_num, 1).type(torch.LongTensor))
    print('seq_length', max(this_batch[3]), 'label_length', this_batch_max_target)  # (output_size, B, 1)

    data = Variable(this_batch[0])
    target = Variable(this_batch[1])
    target_length = Variable(torch.LongTensor(this_batch[2]))
    h_0 = Variable(torch.zeros(2, this_batch_num, config['hidden_size'] // 2))  # encoder GRU initial hidden state

    if config['USE_CUDA']:
        last_hidden = last_hidden.cuda(config['cuda_num'])
        word_input = word_input.cuda(config['cuda_num'])
        data = data.cuda(config['cuda_num'])
        target = target.cuda(config['cuda_num'])
        target_length = target_length.cuda(config['cuda_num'])
        h_0 = h_0.cuda(config['cuda_num'])

    encoder_outputs = encoder(step, data, h_0, this_batch[3])
    # encoder_outputs = encoder_outputs.transpose(1,2)
    # encoder_outputs = encoder_outputs.transpose(0,1)
    source_mask = Variable(get_source_mask(this_batch_num, config['encoder_filter_num'], max(this_batch[3]), this_batch[3]))
    if config['USE_CUDA']:
        source_mask = source_mask.cuda(config['cuda_num'])
    encoder_outputs = encoder_outputs * source_mask
    seq_label_prob = decoder(last_hidden, encoder_outputs, this_batch[3])

    loss = masked_cross_entropy(seq_label_prob.transpose(0,1).contiguous(), target, target_length)
    # loss = masked_cross_entropy(F.softmax(decoder_prob.transpose(0,1).contiguous()), target, length)
    print('loss: ', loss.data[0])
    logger.scalar_summary('loss', loss.data[0], step)
    loss.backward()
    e_before_step = [(tag, to_np(value)) for tag, value in encoder.named_parameters()]
    d_before_step = [(tag, to_np(value)) for tag, value in decoder.named_parameters()]

    clip_grad_norm(decoder.parameters(), config['clip_norm'])
    clip_grad_norm(encoder.parameters(), config['clip_norm'])
    # for tag, value in encoder.named_parameters():
    #     tag = tag.replace('.', '/')
    #     if value is not None and value.grad is not None:
    #         logger.histo_summary(tag, to_np(value), step)
    #         logger.histo_summary(tag + '/grad', to_np(value.grad), step)
    # for tag, value in decoder.named_parameters():
    #     tag = tag.replace('.', '/')
    #     if value is not None and value.grad is not None:
    #         logger.histo_summary(tag, to_np(value), step)
    #         logger.histo_summary(tag + '/grad', to_np(value.grad), step)
    decoder_optimizer.step()
    encoder_optimizer.step()
    e_after_step = [(tag, to_np(value)) for tag, value in encoder.named_parameters()]
    d_after_step = [(tag, to_np(value)) for tag, value in decoder.named_parameters()]
    for before, after in zip(e_before_step, e_after_step):
        if before[0] == after[0]:
            tag = before[0]
            value = LA.norm(after[1] - before[1]) / LA.norm(before[1])
            tag = tag.replace('.', '/')
            if value is not None:
                logger.scalar_summary(tag + '/grad_ratio', value, step)

    for before, after in zip(d_before_step, d_after_step):
        if before[0] == after[0]:
            tag = before[0]
            value = LA.norm(after[1] - before[1]) / LA.norm(before[1])
            tag = tag.replace('.', '/')
            if value is not None:
                logger.scalar_summary(tag + '/grad_ratio', value, step)