def train_iteration(logger, step, embedding_layer, att_layer, model_layer,
                    start_layer, end_layer, emb_opt, att_opt, model_opt,
                    start_opt, end_opt, this_batch):
    emb_opt.zero_grad()
    att_opt.zero_grad()
    model_opt.zero_grad()
    start_opt.zero_grad()
    end_opt.zero_grad()

    d = config['hidden_size']
    this_batch_num = len(this_batch['con_lens'])
    question = Variable(this_batch['questions'])
    question_lengths = this_batch['q_lens']
    context = Variable(this_batch['contexts'])  # (batch, T, 51)
    context_lengths = this_batch['con_lens']  # list

    start_target = Variable(this_batch['start'])
    end_target = Variable(this_batch['end'])
    emb_h_0 = Variable(torch.zeros(2, this_batch_num, d))
    model_h_0 = Variable(
        torch.zeros(2 * model_layer.num_layers, this_batch_num, d))
    end_h_0 = Variable(torch.zeros(2, this_batch_num, d))

    if config['USE_CUDA']:
        question = question.cuda(config['cuda_num'])
        context = context.cuda(config['cuda_num'])
        emb_h_0 = emb_h_0.cuda(config['cuda_num'])
        model_h_0 = model_h_0.cuda(config['cuda_num'])
        end_h_0 = end_h_0.cuda(config['cuda_num'])
        start_target = start_target.cuda(config['cuda_num'])
        end_target = end_target.cuda(config['cuda_num'])

    c_emb = embedding_layer(
        context, emb_h_0, context_lengths, step,
        'C')  # (seq_len, batch, hidden_size(d=100) * num_directions(2))
    q_emb = embedding_layer(
        question, emb_h_0, question_lengths, step,
        'Q')  # (seq_len, batch, hidden_size(d=100) * num_directions(2))
    G = att_layer(c_emb, q_emb, context_lengths, question_lengths,
                  step)  # (batch, T, 8d)
    M = model_layer(model_h_0, G, context_lengths, step)  # M: (batch, T, 2d)
    start_logits = start_layer(M, G, context_lengths)  # (batch, T)
    end_logits = end_layer(M, G, end_h_0, context_lengths)  # (batch, T)
    # negative log-likelihood of the gold start/end positions (start_layer and
    # end_layer are assumed to return log-probabilities over positions)
    loss = -torch.sum(start_logits * start_target +
                      end_logits * end_target) / this_batch_num
    print('loss: ', loss.data[0])
    logger.scalar_summary('loss', loss.data[0], step)
    loss.backward()
    # e_before_step = [(tag, to_np(value)) for tag, value in embedding_layer.named_parameters()]
    # a_before_step = [(tag, to_np(value)) for tag, value in att_layer.named_parameters()]
    # m_before_step = [(tag, to_np(value)) for tag, value in model_layer.named_parameters()]
    # start_before_step = [(tag, to_np(value)) for tag, value in start_layer.named_parameters()]
    # end_before_step = [(tag, to_np(value)) for tag, value in end_layer.named_parameters()]

    clip_grad_norm(embedding_layer.parameters(), config['clip_norm'])
    clip_grad_norm(att_layer.parameters(), config['clip_norm'])
    clip_grad_norm(model_layer.parameters(), config['clip_norm'])
    clip_grad_norm(start_layer.parameters(), config['clip_norm'])
    clip_grad_norm(end_layer.parameters(), config['clip_norm'])
    for tag, value in embedding_layer.named_parameters():
        tag = tag.replace('.', '/')
        if value is not None and value.grad is not None:
            logger.histo_summary(tag, to_np(value), step)
            logger.histo_summary(tag + '/grad', to_np(value.grad), step)
    for tag, value in att_layer.named_parameters():
        tag = tag.replace('.', '/')
        if value is not None and value.grad is not None:
            logger.histo_summary(tag, to_np(value), step)
            logger.histo_summary(tag + '/grad', to_np(value.grad), step)
    for tag, value in model_layer.named_parameters():
        tag = tag.replace('.', '/')
        if value is not None and value.grad is not None:
            logger.histo_summary(tag, to_np(value), step)
            logger.histo_summary(tag + '/grad', to_np(value.grad), step)
    for tag, value in start_layer.named_parameters():
        tag = tag.replace('.', '/')
        if value is not None and value.grad is not None:
            logger.histo_summary(tag, to_np(value), step)
            logger.histo_summary(tag + '/grad', to_np(value.grad), step)
    for tag, value in end_layer.named_parameters():
        tag = tag.replace('.', '/')
        if value is not None and value.grad is not None:
            logger.histo_summary(tag, to_np(value), step)
            logger.histo_summary(tag + '/grad', to_np(value.grad), step)
    emb_opt.step()
    att_opt.step()
    model_opt.step()
    start_opt.step()
    end_opt.step()
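
# `to_np` (used throughout these iterations to snapshot parameters and
# gradients as NumPy arrays for logging) is not defined in this snippet, and
# the module-level imports (torch, Variable, clip_grad_norm, numpy.linalg as
# LA, the config dicts, the Logger) are assumed to live elsewhere in the
# module. A minimal sketch of what `to_np` presumably does with the pre-0.4
# Variable API used here:
def to_np(x):
    # move the underlying tensor to host memory and return a NumPy array
    return x.data.cpu().numpy()
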
def train_iteration(logger, config, my_arg, step,
                    encoder, bidencoder, decoder, encoder_optimizer, bidencoder_optimizer, decoder_optimizer, this_batch):
    # encoder_outputs = Variable(torch.randn(config['max_seq_length'], config['batch_size'], config['hidden_size']))
    decoder_optimizer.zero_grad()
    encoder_optimizer.zero_grad()
    bidencoder_optimizer.zero_grad()  # also zeroed, since it is stepped below
    this_batch_num = len(this_batch[2])
    this_batch_max_target = max(this_batch[2])
    last_hidden = Variable(torch.zeros(1, this_batch_num, config['hidden_size']))
    bid_init_hidden = Variable(torch.zeros(config['decoder_layers']*2, this_batch_num, config['hidden_size']))
    word_input = Variable(torch.zeros(this_batch_num, 1).type(torch.LongTensor))
    print('seq_length', max(this_batch[3]), 'label_length', this_batch_max_target)  # (output_size, B, 1)

    data = Variable(this_batch[0])
    target = Variable(this_batch[1])
    target_length = Variable(torch.LongTensor(this_batch[2]))
    h_0 = Variable(torch.zeros(2, this_batch_num, config['hidden_size'] // 2))  # encoder GRU initial hidden state (// keeps the size an int)

    if config['USE_CUDA']:
        last_hidden = last_hidden.cuda(config['cuda_num'])
        word_input = word_input.cuda(config['cuda_num'])
        data = data.cuda(config['cuda_num'])
        target = target.cuda(config['cuda_num'])
        target_length = target_length.cuda(config['cuda_num'])
        h_0 = h_0.cuda(config['cuda_num'])
        bid_init_hidden = bid_init_hidden.cuda(config['cuda_num'])

    encoder_outputs = encoder(step, data, h_0, this_batch[3])
    source_mask = Variable(get_source_mask(this_batch_num, config['encoder_filter_num'], max(this_batch[3]), this_batch[3]))
    if config['USE_CUDA']:
        source_mask = source_mask.cuda(config['cuda_num'])
    encoder_outputs = encoder_outputs * source_mask
    encoder_outputs = bidencoder(bid_init_hidden, encoder_outputs, this_batch[3])

    seq_label_prob = Variable(torch.zeros(this_batch_max_target, this_batch_num, config['decoder_output_size']))
    if config['USE_CUDA']:
        seq_label_prob = seq_label_prob.cuda(config['cuda_num'])

    rate = schedule_samp_rate(step)
    # rate=0
    for time_step in range(this_batch_max_target):
        label_logits, cur_hidden = decoder(step, word_input, last_hidden, encoder_outputs[time_step])
        last_hidden = cur_hidden
        seq_label_prob[time_step] = label_logits
        # Choose top word from label_prob
        # value, label = label_prob.topk(1)
        # decoder_out_label.append(label)
        # not teacher-forcing
        # word_input = label

        # teacher-forcing
        if my_arg == 0:
            word_input = target[:, time_step]
        else:
            # value, label = label_logits.data.topk(1)
            # decoder_out_label.append(label)
            # word_input = Variable(label)  # Chosen word is next input
            # if config['USE_CUDA']:
            #     word_input = word_input.cuda(config['cuda_num'])
            a = random_pick([0, 1], [rate, 1 - rate])
            if a == 0:
                word_input = target[:, time_step]
            else:
                value, label = label_logits.data.topk(1)
                # decoder_out_label.append(label)
                word_input = Variable(label)  # Chosen word is next input
                if config['USE_CUDA']:
                    word_input = word_input.cuda(config['cuda_num'])

    loss = masked_cross_entropy(seq_label_prob.transpose(0, 1).contiguous(), target, target_length)
    # loss = masked_cross_entropy(F.softmax(decoder_prob.transpose(0,1).contiguous()), target, length)
    print('loss: ', loss.data[0])
    logger.scalar_summary('loss', loss.data[0], step)
    loss.backward()
    e_before_step = [(tag, to_np(value)) for tag, value in encoder.named_parameters()]
    b_before_step = [(tag, to_np(value)) for tag, value in bidencoder.named_parameters()]
    d_before_step = [(tag, to_np(value)) for tag, value in decoder.named_parameters()]

    clip_grad_norm(decoder.parameters(), config['clip_norm'])
    clip_grad_norm(encoder.parameters(), config['clip_norm'])
    decoder_optimizer.step()
    encoder_optimizer.step()
    bidencoder_optimizer.step()
    e_after_step = [(tag, to_np(value)) for tag, value in encoder.named_parameters()]
    b_after_step = [(tag, to_np(value)) for tag, value in bidencoder.named_parameters()]
    d_after_step = [(tag, to_np(value)) for tag, value in decoder.named_parameters()]
    for before, after in zip(e_before_step, e_after_step):
        if before[0] == after[0]:
            tag = before[0]
            value = LA.norm(after[1] - before[1]) / LA.norm(before[1])
            tag = tag.replace('.', '/')
            if value is not None:
                logger.scalar_summary(tag + '/grad_ratio', value, step)
    for before, after in zip(b_before_step, b_after_step):
        if before[0] == after[0]:
            tag = before[0]
            value = LA.norm(after[1] - before[1]) / LA.norm(before[1])
            tag = tag.replace('.', '/')
            if value is not None:
                logger.scalar_summary(tag + '/grad_ratio', value, step)

    for before, after in zip(d_before_step, d_after_step):
        if before[0] == after[0]:
            tag = before[0]
            value = LA.norm(after[1] - before[1]) / LA.norm(before[1])
            tag = tag.replace('.', '/')
            if value is not None:
                logger.scalar_summary(tag + '/grad_ratio', value, step)
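
# `schedule_samp_rate` and `random_pick` drive the scheduled sampling above
# but are not defined in this snippet. Plausible minimal versions (assumed:
# inverse-sigmoid decay of the teacher-forcing rate, and a weighted draw):
import math
import random

def schedule_samp_rate(step, k=1000.0):
    # probability of feeding the gold label at this step; starts near 1 and
    # decays toward 0 as training progresses
    return k / (k + math.exp(step / k))

def random_pick(items, probs):
    # draw a single element of `items` according to the weights in `probs`
    x = random.random()
    cum = 0.0
    for item, p in zip(items, probs):
        cum += p
        if x < cum:
            return item
    return items[-1]
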
def train_iteration(logger, step, embedding_layer, ctx_lstm, ctx_att,
                    sigmoid_loss, ctx_lstm_opt, ctx_att_opt, sig_opt,
                    this_batch):
    # clear any accumulated gradients before this iteration (assumed: the
    # caller does not do this)
    ctx_lstm_opt.zero_grad()
    ctx_att_opt.zero_grad()
    sig_opt.zero_grad()

    l_ctx = Variable(this_batch['l_ctx_tensor'])
    mentions = Variable(this_batch['mentions_tensor'])
    r_ctx = Variable(this_batch['r_ctx_tensor'])
    types = Variable(this_batch['types_tensor'])
    labels = Variable(this_batch['labels_tensor'])
    if fg_config['USE_CUDA']:
        l_ctx = l_ctx.cuda(fg_config['cuda_num'])
        mentions = mentions.cuda(fg_config['cuda_num'])
        r_ctx = r_ctx.cuda(fg_config['cuda_num'])
        types = types.cuda(fg_config['cuda_num'])
        labels = labels.cuda(fg_config['cuda_num'])
    l_ctx_lens = this_batch['l_ctx_lens']
    r_ctx_lens = this_batch['r_ctx_lens']
    men_lens = this_batch['men_lens']

    l_ctx_emb = embedding_layer(l_ctx)  # (B, S, word_emb)
    mentions_emb = embedding_layer(mentions)  # (B, S, word_emb)
    r_ctx_emb = embedding_layer(r_ctx)  # (B, S, word_emb)
    types_emb = embedding_layer(types)  # (B, word_emb)
    l_ctx_lstm, r_ctx_lstm = ctx_lstm(l_ctx_emb, r_ctx_emb, l_ctx_lens,
                                      r_ctx_lens)
    ctx_rep, men_rep = ctx_att(l_ctx_lstm, r_ctx_lstm, types_emb, mentions_emb,
                               l_ctx_lens, r_ctx_lens, men_lens)
    loss, _ = sigmoid_loss(ctx_rep, men_rep, labels, types_emb)
    if step % 100 == 0:
        print('loss: ', loss.data[0])
    logger.scalar_summary('loss', loss.data[0], step)

    loss.backward()
    ctx_lstm_before_step = [(tag, to_np(value))
                            for tag, value in ctx_lstm.named_parameters()]
    ctx_att_before_step = [(tag, to_np(value))
                           for tag, value in ctx_att.named_parameters()]
    sig_before_step = [(tag, to_np(value))
                       for tag, value in sigmoid_loss.named_parameters()]

    # clip_grad_norm(embedding_layer.parameters(), fg_config['clip_norm'])
    clip_grad_norm(ctx_lstm.parameters(), fg_config['clip_norm'])
    clip_grad_norm(ctx_att.parameters(), fg_config['clip_norm'])
    clip_grad_norm(sigmoid_loss.parameters(), fg_config['clip_norm'])
    # for tag, value in embedding_layer.named_parameters():
    #     tag = tag.replace('.', '/')
    #     if value is not None and value.grad is not None:
    #         logger.histo_summary(tag, to_np(value), step)
    #         logger.histo_summary(tag + '/grad', to_np(value.grad), step)
    # for tag, value in att_layer.named_parameters():
    #     tag = tag.replace('.', '/')
    #     if value is not None and value.grad is not None:
    #         logger.histo_summary(tag, to_np(value), step)
    #         logger.histo_summary(tag + '/grad', to_np(value.grad), step)
    # for tag, value in model_out_layer.named_parameters():
    #     tag = tag.replace('.', '/')
    #     if value is not None and value.grad is not None:
    #         logger.histo_summary(tag, to_np(value), step)
    #         logger.histo_summary(tag + '/grad', to_np(value.grad), step)

    # for tag, value in ner_out_layer.named_parameters():
    #     tag = tag.replace('.', '/')
    #     if value is not None and value.grad is not None:
    #         logger.histo_summary(tag, to_np(value), step)
    #         logger.histo_summary(tag + '/grad', to_np(value.grad), step)
    #
    # for tag, value in crf.named_parameters():
    #     tag = tag.replace('.', '/')
    #     if value is not None and value.grad is not None:
    #         logger.histo_summary(tag, to_np(value), step)
    #         logger.histo_summary(tag + '/grad', to_np(value.grad), step)
    ctx_lstm_opt.step()
    ctx_att_opt.step()
    sig_opt.step()

    grad_ratio_lst = []
    ctx_lstm_after_step = [(tag, to_np(value))
                           for tag, value in ctx_lstm.named_parameters()]
    grad_ratio_lst.append((ctx_lstm_before_step, ctx_lstm_after_step))
    ctx_att_after_step = [(tag, to_np(value))
                          for tag, value in ctx_att.named_parameters()]
    grad_ratio_lst.append((ctx_att_before_step, ctx_att_after_step))
    sig_after_step = [(tag, to_np(value))
                      for tag, value in sigmoid_loss.named_parameters()]
    grad_ratio_lst.append((sig_before_step, sig_after_step))

    utils.log_grad_ratio(logger, step, grad_ratio_lst)
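
# `utils.log_grad_ratio` is not shown; judging from the explicit loops in the
# other variants, it presumably logs the relative parameter update
# ||after - before|| / ||before|| for every weight. A sketch of what it is
# assumed to do (shown as a standalone function), using the same `LA`
# (numpy.linalg) and logger API as above:
def log_grad_ratio(logger, step, grad_ratio_lst):
    for before_step, after_step in grad_ratio_lst:
        for (tag, before), (tag_after, after) in zip(before_step, after_step):
            if tag != tag_after:
                continue
            ratio = LA.norm(after - before) / LA.norm(before)
            logger.scalar_summary(tag.replace('.', '/') + '/grad_ratio',
                                  ratio, step)
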
def train_iteration(logger, step, embedding_layer, att_layer, model_out_layer,
                    emb_opt, att_opt, model_out_opt, this_batch):
    emb_opt.zero_grad()
    att_opt.zero_grad()
    model_out_opt.zero_grad()

    d = embedding_layer.out_dim
    this_batch_num = len(this_batch[2])

    # question = Variable(get_question('%GPE%', this_batch_num))  # (batch, J=1, 51)
    question = Variable(this_batch[4])
    # question_lengths = [1 for _ in range(this_batch_num)]
    question_lengths = this_batch[5]
    context = Variable(this_batch[0])  # (batch, T, 51)
    context_lengths = this_batch[3]  # list
    target = Variable(this_batch[1])  # (batch, T)
    emb_h_0 = Variable(torch.zeros(2, this_batch_num, d))
    model_out_h_0 = Variable(
        torch.zeros(2 * model_out_layer.num_layers, this_batch_num, d))
    con_lens_var = Variable(torch.LongTensor(context_lengths))

    if config['USE_CUDA']:
        question = question.cuda(config['cuda_num'])
        context = context.cuda(config['cuda_num'])
        target = target.cuda(config['cuda_num'])
        emb_h_0 = emb_h_0.cuda(config['cuda_num'])
        model_out_h_0 = model_out_h_0.cuda(config['cuda_num'])
        con_lens_var = con_lens_var.cuda(config['cuda_num'])

    c_emb = embedding_layer(context, emb_h_0, context_lengths, step, 'C')
    q_emb = embedding_layer(question, emb_h_0, question_lengths, step, 'Q')
    G = att_layer(c_emb, q_emb, context_lengths, question_lengths, step)
    prob = model_out_layer(model_out_h_0, G, context_lengths, step)
    loss = masked_cross_entropy(prob, target, con_lens_var)
    print('loss: ', loss.data[0])
    logger.scalar_summary('loss', loss.data[0], step)
    loss.backward()
    e_before_step = [(tag, to_np(value))
                     for tag, value in embedding_layer.named_parameters()]
    a_before_step = [(tag, to_np(value))
                     for tag, value in att_layer.named_parameters()]
    m_before_step = [(tag, to_np(value))
                     for tag, value in model_out_layer.named_parameters()]

    clip_grad_norm(embedding_layer.parameters(), config['clip_norm'])
    clip_grad_norm(att_layer.parameters(), config['clip_norm'])
    clip_grad_norm(model_out_layer.parameters(), config['clip_norm'])
    # for tag, value in embedding_layer.named_parameters():
    #     tag = tag.replace('.', '/')
    #     if value is not None and value.grad is not None:
    #         logger.histo_summary(tag, to_np(value), step)
    #         logger.histo_summary(tag + '/grad', to_np(value.grad), step)
    # for tag, value in att_layer.named_parameters():
    #     tag = tag.replace('.', '/')
    #     if value is not None and value.grad is not None:
    #         logger.histo_summary(tag, to_np(value), step)
    #         logger.histo_summary(tag + '/grad', to_np(value.grad), step)
    # for tag, value in model_out_layer.named_parameters():
    #     tag = tag.replace('.', '/')
    #     if value is not None and value.grad is not None:
    #         logger.histo_summary(tag, to_np(value), step)
    #         logger.histo_summary(tag + '/grad', to_np(value.grad), step)
    emb_opt.step()
    att_opt.step()
    model_out_opt.step()

    e_after_step = [(tag, to_np(value))
                    for tag, value in embedding_layer.named_parameters()]
    a_after_step = [(tag, to_np(value))
                    for tag, value in att_layer.named_parameters()]
    m_after_step = [(tag, to_np(value))
                    for tag, value in model_out_layer.named_parameters()]

    for before, after in zip(e_before_step, e_after_step):
        if before[0] == after[0]:
            tag = before[0]
            value = LA.norm(after[1] - before[1]) / LA.norm(before[1])
            tag = tag.replace('.', '/')
            if value is not None:
                logger.scalar_summary(tag + '/grad_ratio', value, step)

    for before, after in zip(a_before_step, a_after_step):
        if before[0] == after[0]:
            tag = before[0]
            value = LA.norm(after[1] - before[1]) / LA.norm(before[1])
            tag = tag.replace('.', '/')
            if value is not None:
                logger.scalar_summary(tag + '/grad_ratio', value, step)
    for before, after in zip(m_before_step, m_after_step):
        if before[0] == after[0]:
            tag = before[0]
            value = LA.norm(after[1] - before[1]) / LA.norm(before[1])
            tag = tag.replace('.', '/')
            if value is not None:
                logger.scalar_summary(tag + '/grad_ratio', value, step)
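
# `masked_cross_entropy(logits, target, lengths)` is not defined in this
# snippet. It is assumed to be the usual padded-sequence loss: per-token cross
# entropy with padded positions masked out, normalized by the number of real
# tokens. A minimal sketch, written against current PyTorch rather than the
# pre-0.4 Variable API used above:
import torch.nn.functional as F

def masked_cross_entropy(logits, target, lengths):
    # logits: (B, T, V) unnormalized scores, target: (B, T), lengths: (B,)
    B, T, V = logits.size()
    log_probs = F.log_softmax(logits.reshape(-1, V), dim=-1)
    token_loss = -log_probs.gather(1, target.reshape(-1, 1)).view(B, T)
    # mask out positions beyond each sequence's true length
    positions = torch.arange(T, device=logits.device).unsqueeze(0)  # (1, T)
    mask = (positions < lengths.unsqueeze(1)).float()               # (B, T)
    return (token_loss * mask).sum() / mask.sum()
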
def train_iteration(logger, config, my_arg, step, encoder, decoder, encoder_optimizer, decoder_optimizer, this_batch):
    # encoder_outputs = Variable(torch.randn(config['max_seq_length'], config['batch_size'], config['hidden_size']))
    decoder_optimizer.zero_grad()
    encoder_optimizer.zero_grad()
    this_batch_num = len(this_batch[2])
    this_batch_max_target = max(this_batch[2])
    last_hidden = Variable(torch.zeros(config['decoder_layers']*2, this_batch_num, config['hidden_size']))
    word_input = Variable(torch.zeros(this_batch_num, 1).type(torch.LongTensor))
    print('seq_length', max(this_batch[3]), 'label_length', this_batch_max_target)  # (output_size, B, 1)

    data = Variable(this_batch[0])
    target = Variable(this_batch[1])
    target_length = Variable(torch.LongTensor(this_batch[2]))
    h_0 = Variable(torch.zeros(2, this_batch_num, config['hidden_size'] // 2))  # encoder GRU initial hidden state (// keeps the size an int)

    if config['USE_CUDA']:
        last_hidden = last_hidden.cuda(config['cuda_num'])
        word_input = word_input.cuda(config['cuda_num'])
        data = data.cuda(config['cuda_num'])
        target = target.cuda(config['cuda_num'])
        target_length = target_length.cuda(config['cuda_num'])
        h_0 = h_0.cuda(config['cuda_num'])

    encoder_outputs = encoder(step, data, h_0, this_batch[3])
    # encoder_outputs = encoder_outputs.transpose(1,2)
    # encoder_outputs = encoder_outputs.transpose(0,1)
    source_mask = Variable(get_source_mask(this_batch_num, config['encoder_filter_num'], max(this_batch[3]), this_batch[3]))
    if config['USE_CUDA']:
        source_mask = source_mask.cuda(config['cuda_num'])
    encoder_outputs = encoder_outputs * source_mask
    seq_label_prob = decoder(last_hidden, encoder_outputs, this_batch[3])

    loss = masked_cross_entropy(seq_label_prob.transpose(0,1).contiguous(), target, target_length)
    # loss = masked_cross_entropy(F.softmax(decoder_prob.transpose(0,1).contiguous()), target, length)
    print('loss: ', loss.data[0])
    logger.scalar_summary('loss', loss.data[0], step)
    loss.backward()
    e_before_step = [(tag, to_np(value)) for tag, value in encoder.named_parameters()]
    d_before_step = [(tag, to_np(value)) for tag, value in decoder.named_parameters()]

    clip_grad_norm(decoder.parameters(), config['clip_norm'])
    clip_grad_norm(encoder.parameters(), config['clip_norm'])
    # for tag, value in encoder.named_parameters():
    #     tag = tag.replace('.', '/')
    #     if value is not None and value.grad is not None:
    #         logger.histo_summary(tag, to_np(value), step)
    #         logger.histo_summary(tag + '/grad', to_np(value.grad), step)
    # for tag, value in decoder.named_parameters():
    #     tag = tag.replace('.', '/')
    #     if value is not None and value.grad is not None:
    #         logger.histo_summary(tag, to_np(value), step)
    #         logger.histo_summary(tag + '/grad', to_np(value.grad), step)
    decoder_optimizer.step()
    encoder_optimizer.step()
    e_after_step = [(tag, to_np(value)) for tag, value in encoder.named_parameters()]
    d_after_step = [(tag, to_np(value)) for tag, value in decoder.named_parameters()]
    for before, after in zip(e_before_step, e_after_step):
        if before[0] == after[0]:
            tag = before[0]
            value = LA.norm(after[1] - before[1]) / LA.norm(before[1])
            tag = tag.replace('.', '/')
            if value is not None:
                logger.scalar_summary(tag + '/grad_ratio', value, step)

    for before, after in zip(d_before_step, d_after_step):
        if before[0] == after[0]:
            tag = before[0]
            value = LA.norm(after[1] - before[1]) / LA.norm(before[1])
            tag = tag.replace('.', '/')
            if value is not None:
                logger.scalar_summary(tag + '/grad_ratio', value, step)
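
# `get_source_mask(batch, dim, max_len, lengths)` is not defined here. Given
# how its result is used (`encoder_outputs * source_mask` with encoder outputs
# of shape (S, B, dim)), it is assumed to build a (max_len, batch, dim) float
# mask that is 1 for real timesteps and 0 for padding. A minimal sketch:
def get_source_mask(batch, dim, max_len, lengths):
    mask = torch.zeros(max_len, batch, dim)
    for b, length in enumerate(lengths):
        # keep the first `length` timesteps of example `b`, zero the rest
        mask[:length, b, :] = 1.0
    return mask
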
def train_iteration(logger, step, word_embedding_layer, type_embedding_layer, ctx_lstm, ctx_att, warp_loss,
                    ctx_lstm_opt, ctx_att_opt, sig_opt, this_batch):
    # if step == 398:
    #     pydevd.settrace('10.214.129.230', port=31235, stdoutToServer=True, stderrToServer=True)
    # clear any accumulated gradients before this iteration (assumed: the
    # caller does not do this)
    ctx_lstm_opt.zero_grad()
    ctx_att_opt.zero_grad()
    sig_opt.zero_grad()

    l_ctx = Variable(this_batch['l_ctx_tensor'])
    mentions = Variable(this_batch['mentions_tensor'])
    r_ctx = Variable(this_batch['r_ctx_tensor'])
    types = Variable(this_batch['types_tensor'])
    labels = Variable(this_batch['labels_tensor'])
    if fg_config['USE_CUDA']:
        l_ctx = l_ctx.cuda(fg_config['cuda_num'])
        mentions = mentions.cuda(fg_config['cuda_num'])
        r_ctx = r_ctx.cuda(fg_config['cuda_num'])
        types = types.cuda(fg_config['cuda_num'])
        labels = labels.cuda(fg_config['cuda_num'])
    l_ctx_lens = this_batch['l_ctx_lens']
    r_ctx_lens = this_batch['r_ctx_lens']
    men_lens = this_batch['men_lens']


    l_ctx_emb = word_embedding_layer(l_ctx)  # (B, S, word_emb)
    mentions_emb = word_embedding_layer(mentions)  # (B, S, word_emb)
    r_ctx_emb = word_embedding_layer(r_ctx)  # (B, S, word_emb)
    types_emb = type_embedding_layer(types)  # (89, word_emb)
    l_ctx_lstm, r_ctx_lstm = ctx_lstm(l_ctx_emb, r_ctx_emb, l_ctx_lens, r_ctx_lens)
    ctx_rep, men_rep = ctx_att(l_ctx_lstm, r_ctx_lstm, mentions_emb, l_ctx_lens, r_ctx_lens, men_lens, types_emb)

    loss = warp_loss(ctx_rep, men_rep, labels, types_emb)
    if step > 100:
        print('loss: ', loss.data[0])
        logger.scalar_summary('loss', loss.data[0], step)


    loss.backward()

    ctx_lstm_before_step = [(tag, to_np(value)) for tag, value in ctx_lstm.named_parameters()]
    ctx_att_before_step = [(tag, to_np(value)) for tag, value in ctx_att.named_parameters()]
    sig_before_step = [(tag, to_np(value)) for tag, value in warp_loss.named_parameters()]

    # clip_grad_norm(embedding_layer.parameters(), fg_config['clip_norm'])
    clip_grad_norm(ctx_lstm.parameters(), fg_config['clip_norm'])
    clip_grad_norm(ctx_att.parameters(), fg_config['clip_norm'])
    clip_grad_norm(warp_loss.parameters(), fg_config['clip_norm'])
    # for tag, value in embedding_layer.named_parameters():
    #     tag = tag.replace('.', '/')
    #     if value is not None and value.grad is not None:
    #         logger.histo_summary(tag, to_np(value), step)
    #         logger.histo_summary(tag + '/grad', to_np(value.grad), step)
    # for tag, value in att_layer.named_parameters():
    #     tag = tag.replace('.', '/')
    #     if value is not None and value.grad is not None:
    #         logger.histo_summary(tag, to_np(value), step)
    #         logger.histo_summary(tag + '/grad', to_np(value.grad), step)
    # for tag, value in model_out_layer.named_parameters():
    #     tag = tag.replace('.', '/')
    #     if value is not None and value.grad is not None:
    #         logger.histo_summary(tag, to_np(value), step)
    #         logger.histo_summary(tag + '/grad', to_np(value.grad), step)

    # for tag, value in ner_out_layer.named_parameters():
    #     tag = tag.replace('.', '/')
    #     if value is not None and value.grad is not None:
    #         logger.histo_summary(tag, to_np(value), step)
    #         logger.histo_summary(tag + '/grad', to_np(value.grad), step)
    #
    # for tag, value in crf.named_parameters():
    #     tag = tag.replace('.', '/')
    #     if value is not None and value.grad is not None:
    #         logger.histo_summary(tag, to_np(value), step)
    #         logger.histo_summary(tag + '/grad', to_np(value.grad), step)
    ctx_lstm_opt.step()
    ctx_att_opt.step()
    sig_opt.step()

    grad_ratio_lst = []

    ctx_lstm_after_step = [(tag, to_np(value)) for tag, value in ctx_lstm.named_parameters()]
    grad_ratio_lst.append((ctx_lstm_before_step, ctx_lstm_after_step))
    ctx_att_after_step = [(tag, to_np(value)) for tag, value in ctx_att.named_parameters()]
    grad_ratio_lst.append((ctx_att_before_step, ctx_att_after_step))
    sig_after_step = [(tag, to_np(value)) for tag, value in warp_loss.named_parameters()]
    grad_ratio_lst.append((sig_before_step, sig_after_step))
    # h_after_step = [(tag, to_np(value)) for tag, value in ner_hw_layer.named_parameters()]
    # grad_ratio_lst.append((h_before_step, h_after_step))
    # n_after_step = [(tag, to_np(value)) for tag, value in ner_out_layer.named_parameters()]
    # grad_ratio_lst.append((n_before_step, n_after_step))
    # c_after_step = [(tag, to_np(value)) for tag, value in crf.named_parameters()]
    # grad_ratio_lst.append((c_before_step, c_after_step))
    # q_after_step = [(tag, to_np(value)) for tag, value in q_emb_layer.named_parameters()]
    #
    utils.log_grad_ratio(logger, step, grad_ratio_lst)
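
# A minimal sketch of how one of these iteration functions might be driven.
# `layers`, `optimizers` and `batches` are placeholders for illustration, not
# names from the original code; the tuples must follow the argument order of
# the chosen `train_iteration` variant (here the first one: logger, step,
# layers..., optimizers..., this_batch):
def train(logger, layers, optimizers, batches, num_epochs=1):
    step = 0
    for _ in range(num_epochs):
        for this_batch in batches:
            train_iteration(logger, step, *layers, *optimizers, this_batch)
            step += 1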