def train_iteration(logger, step, embedding_layer, att_layer, model_layer, start_layer, end_layer,
                    emb_opt, att_opt, model_opt, start_opt, end_opt, this_batch):
    emb_opt.zero_grad()
    att_opt.zero_grad()
    model_opt.zero_grad()
    start_opt.zero_grad()
    end_opt.zero_grad()

    d = config['hidden_size']
    this_batch_num = len(this_batch['con_lens'])
    question = Variable(this_batch['questions'])
    question_lengths = this_batch['q_lens']
    context = Variable(this_batch['contexts'])      # (batch, T, 51)
    context_lengths = this_batch['con_lens']        # list
    start_target = Variable(this_batch['start'])
    end_target = Variable(this_batch['end'])
    emb_h_0 = Variable(torch.zeros(2, this_batch_num, d))
    model_h_0 = Variable(torch.zeros(2 * model_layer.num_layers, this_batch_num, d))
    end_h_0 = Variable(torch.zeros(2, this_batch_num, d))
    if config['USE_CUDA']:
        question = question.cuda(config['cuda_num'])
        context = context.cuda(config['cuda_num'])
        emb_h_0 = emb_h_0.cuda(config['cuda_num'])
        model_h_0 = model_h_0.cuda(config['cuda_num'])
        end_h_0 = end_h_0.cuda(config['cuda_num'])
        start_target = start_target.cuda(config['cuda_num'])
        end_target = end_target.cuda(config['cuda_num'])

    c_emb = embedding_layer(context, emb_h_0, context_lengths, step, 'C')    # (seq_len, batch, hidden_size(d=100) * num_directions(2))
    q_emb = embedding_layer(question, emb_h_0, question_lengths, step, 'Q')  # (seq_len, batch, hidden_size(d=100) * num_directions(2))
    G = att_layer(c_emb, q_emb, context_lengths, question_lengths, step)     # (batch, T, 8d)
    M = model_layer(model_h_0, G, context_lengths, step)                     # (batch, T, 2d)
    start_logits = start_layer(M, G, context_lengths)                        # (batch, T)
    end_logits = end_layer(M, G, end_h_0, context_lengths)                   # (batch, T)

    # Negative log-likelihood of the gold span, assuming start_logits/end_logits are
    # log-probabilities and start_target/end_target are one-hot over positions.
    loss = -torch.sum(start_logits * start_target + end_logits * end_target) / this_batch_num
    print('loss: ', loss.data[0])
    logger.scalar_summary('loss', loss.data[0], step)
    loss.backward()

    clip_grad_norm(embedding_layer.parameters(), config['clip_norm'])
    clip_grad_norm(att_layer.parameters(), config['clip_norm'])
    clip_grad_norm(model_layer.parameters(), config['clip_norm'])
    clip_grad_norm(start_layer.parameters(), config['clip_norm'])
    clip_grad_norm(end_layer.parameters(), config['clip_norm'])

    # Log parameter and gradient histograms for every sub-module.
    for layer in (embedding_layer, att_layer, model_layer, start_layer, end_layer):
        for tag, value in layer.named_parameters():
            tag = tag.replace('.', '/')
            if value is not None and value.grad is not None:
                logger.histo_summary(tag, to_np(value), step)
                logger.histo_summary(tag + '/grad', to_np(value.grad), step)

    emb_opt.step()
    att_opt.step()
    model_opt.step()
    start_opt.step()
    end_opt.step()
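# `to_np` is used throughout this file but not defined here. The sketch below is an
# assumption inferred from its call sites (it receives Variables / Parameters and their
# .grad and must return numpy arrays for the logger); the original helper may differ.
def to_np(x):
    """Convert a PyTorch 0.3-era Variable/Parameter (or plain tensor) to a numpy array."""
    if hasattr(x, 'data'):   # Variable / nn.Parameter
        x = x.data
    return x.cpu().numpy()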
def train_iteration(logger, config, my_arg, step, encoder, bidencoder, decoder,
                    encoder_optimizer, bidencoder_optimizer, decoder_optimizer, this_batch):
    # encoder_outputs = Variable(torch.randn(config['max_seq_length'], config['batch_size'], config['hidden_size']))
    decoder_optimizer.zero_grad()
    encoder_optimizer.zero_grad()
    bidencoder_optimizer.zero_grad()

    this_batch_num = len(this_batch[2])
    this_batch_max_target = max(this_batch[2])
    last_hidden = Variable(torch.zeros(1, this_batch_num, config['hidden_size']))
    bid_init_hidden = Variable(torch.zeros(config['decoder_layers'] * 2, this_batch_num, config['hidden_size']))
    word_input = Variable(torch.zeros(this_batch_num, 1).type(torch.LongTensor))
    print('seq_length', max(this_batch[3]), 'label_length', this_batch_max_target)

    data = Variable(this_batch[0])          # (output_size, B, 1)
    target = Variable(this_batch[1])
    target_length = Variable(torch.LongTensor(this_batch[2]))
    h_0 = Variable(torch.zeros(2, this_batch_num, config['hidden_size'] // 2))  # encoder gru initial hidden state
    if config['USE_CUDA']:
        last_hidden = last_hidden.cuda(config['cuda_num'])
        word_input = word_input.cuda(config['cuda_num'])
        data = data.cuda(config['cuda_num'])
        target = target.cuda(config['cuda_num'])
        target_length = target_length.cuda(config['cuda_num'])
        h_0 = h_0.cuda(config['cuda_num'])
        bid_init_hidden = bid_init_hidden.cuda(config['cuda_num'])

    encoder_outputs = encoder(step, data, h_0, this_batch[3])
    source_mask = Variable(get_source_mask(this_batch_num, config['encoder_filter_num'], max(this_batch[3]), this_batch[3]))
    if config['USE_CUDA']:
        source_mask = source_mask.cuda(config['cuda_num'])
    encoder_outputs = encoder_outputs * source_mask
    encoder_outputs = bidencoder(bid_init_hidden, encoder_outputs, this_batch[3])

    seq_label_prob = Variable(torch.zeros(this_batch_max_target, this_batch_num, config['decoder_output_size']))
    if config['USE_CUDA']:
        seq_label_prob = seq_label_prob.cuda(config['cuda_num'])

    rate = schedule_samp_rate(step)
    # rate = 0
    for time_step in range(this_batch_max_target):
        label_logits, cur_hidden = decoder(step, word_input, last_hidden, encoder_outputs[time_step])
        last_hidden = cur_hidden
        seq_label_prob[time_step] = label_logits
        if my_arg == 0:
            # teacher forcing: always feed the gold label
            word_input = target[:, time_step]
        else:
            # scheduled sampling: feed the gold label with probability `rate`,
            # otherwise feed the decoder's own top prediction
            a = random_pick([0, 1], [rate, 1 - rate])
            if a == 0:
                word_input = target[:, time_step]
            else:
                value, label = label_logits.data.topk(1)
                word_input = Variable(label)  # chosen word is next input
                if config['USE_CUDA']:
                    word_input = word_input.cuda(config['cuda_num'])

    loss = masked_cross_entropy(seq_label_prob.transpose(0, 1).contiguous(), target, target_length)
    # loss = masked_cross_entropy(F.softmax(decoder_prob.transpose(0,1).contiguous()), target, length)
    print('loss: ', loss.data[0])
    logger.scalar_summary('loss', loss.data[0], step)
    loss.backward()

    e_before_step = [(tag, to_np(value)) for tag, value in encoder.named_parameters()]
    b_before_step = [(tag, to_np(value)) for tag, value in bidencoder.named_parameters()]
    d_before_step = [(tag, to_np(value)) for tag, value in decoder.named_parameters()]

    clip_grad_norm(decoder.parameters(), config['clip_norm'])
    clip_grad_norm(encoder.parameters(), config['clip_norm'])

    decoder_optimizer.step()
    encoder_optimizer.step()
    bidencoder_optimizer.step()

    e_after_step = [(tag, to_np(value)) for tag, value in encoder.named_parameters()]
    b_after_step = [(tag, to_np(value)) for tag, value in bidencoder.named_parameters()]
    d_after_step = [(tag, to_np(value)) for tag, value in decoder.named_parameters()]

    # Log how much each parameter moved relative to its norm (update ratio).
    for before_step, after_step in ((e_before_step, e_after_step),
                                    (b_before_step, b_after_step),
                                    (d_before_step, d_after_step)):
        for before, after in zip(before_step, after_step):
            if before[0] == after[0]:
                tag = before[0].replace('.', '/')
                value = LA.norm(after[1] - before[1]) / LA.norm(before[1])
                if value is not None:
                    logger.scalar_summary(tag + '/grad_ratio', value, step)
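# `schedule_samp_rate` and `random_pick` are not defined in this file. These are minimal
# sketches of what the scheduled-sampling branch above assumes: `schedule_samp_rate(step)`
# returns the probability of feeding the gold label, and `random_pick(values, probs)` draws
# one value under those probabilities. The inverse-sigmoid decay and its constant `k` are
# assumptions, not taken from the original code.
import math
import random

def schedule_samp_rate(step, k=1000.0):
    """Teacher-forcing probability, decaying from ~1.0 towards 0 as `step` grows."""
    return k / (k + math.exp(min(step / k, 700.0)))

def random_pick(values, probs):
    """Return one element of `values`, drawn with the matching probability in `probs`."""
    r = random.random()
    cumulative = 0.0
    for v, p in zip(values, probs):
        cumulative += p
        if r <= cumulative:
            return v
    return values[-1]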
def train_iteration(logger, step, embedding_layer, ctx_lstm, ctx_att, sigmoid_loss,
                    ctx_lstm_opt, ctx_att_opt, sig_opt, this_batch):
    ctx_lstm_opt.zero_grad()
    ctx_att_opt.zero_grad()
    sig_opt.zero_grad()

    l_ctx = Variable(this_batch['l_ctx_tensor'])
    mentions = Variable(this_batch['mentions_tensor'])
    r_ctx = Variable(this_batch['r_ctx_tensor'])
    types = Variable(this_batch['types_tensor'])
    labels = Variable(this_batch['labels_tensor'])
    if fg_config['USE_CUDA']:
        l_ctx = l_ctx.cuda(fg_config['cuda_num'])
        mentions = mentions.cuda(fg_config['cuda_num'])
        r_ctx = r_ctx.cuda(fg_config['cuda_num'])
        types = types.cuda(fg_config['cuda_num'])
        labels = labels.cuda(fg_config['cuda_num'])
    l_ctx_lens = this_batch['l_ctx_lens']
    r_ctx_lens = this_batch['r_ctx_lens']
    men_lens = this_batch['men_lens']

    l_ctx_emb = embedding_layer(l_ctx)        # (B, S, word_emb)
    mentions_emb = embedding_layer(mentions)  # (B, S, word_emb)
    r_ctx_emb = embedding_layer(r_ctx)        # (B, S, word_emb)
    types_emb = embedding_layer(types)        # (B, word_emb)
    l_ctx_lstm, r_ctx_lstm = ctx_lstm(l_ctx_emb, r_ctx_emb, l_ctx_lens, r_ctx_lens)
    ctx_rep, men_rep = ctx_att(l_ctx_lstm, r_ctx_lstm, types_emb, mentions_emb,
                               l_ctx_lens, r_ctx_lens, men_lens)
    loss, _ = sigmoid_loss(ctx_rep, men_rep, labels, types_emb)
    if step % 100 == 0:
        print('loss: ', loss.data[0])
    logger.scalar_summary('loss', loss.data[0], step)
    loss.backward()

    ctx_lstm_before_step = [(tag, to_np(value)) for tag, value in ctx_lstm.named_parameters()]
    ctx_att_before_step = [(tag, to_np(value)) for tag, value in ctx_att.named_parameters()]
    sig_before_step = [(tag, to_np(value)) for tag, value in sigmoid_loss.named_parameters()]

    # clip_grad_norm(embedding_layer.parameters(), fg_config['clip_norm'])
    clip_grad_norm(ctx_lstm.parameters(), fg_config['clip_norm'])
    clip_grad_norm(ctx_att.parameters(), fg_config['clip_norm'])
    clip_grad_norm(sigmoid_loss.parameters(), fg_config['clip_norm'])

    # (per-parameter value/gradient histogram logging disabled)

    ctx_lstm_opt.step()
    ctx_att_opt.step()
    sig_opt.step()

    # Snapshot parameters after the update and log the relative update size per parameter.
    grad_ratio_lst = []
    ctx_lstm_after_step = [(tag, to_np(value)) for tag, value in ctx_lstm.named_parameters()]
    grad_ratio_lst.append((ctx_lstm_before_step, ctx_lstm_after_step))
    ctx_att_after_step = [(tag, to_np(value)) for tag, value in ctx_att.named_parameters()]
    grad_ratio_lst.append((ctx_att_before_step, ctx_att_after_step))
    sig_after_step = [(tag, to_np(value)) for tag, value in sigmoid_loss.named_parameters()]
    grad_ratio_lst.append((sig_before_step, sig_after_step))
    utils.log_grad_ratio(logger, step, grad_ratio_lst)
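# Sketch of `utils.log_grad_ratio` as the call above assumes it: for every (before, after)
# parameter snapshot pair it logs ||after - before|| / ||before|| under `<tag>/grad_ratio`,
# mirroring the inline update-ratio loops in the other train_iteration variants in this file.
# The real helper may differ in detail.
from numpy import linalg as LA

def log_grad_ratio(logger, step, grad_ratio_lst):
    for before_step, after_step in grad_ratio_lst:
        for (before_tag, before_val), (after_tag, after_val) in zip(before_step, after_step):
            if before_tag == after_tag:
                ratio = LA.norm(after_val - before_val) / LA.norm(before_val)
                logger.scalar_summary(before_tag.replace('.', '/') + '/grad_ratio', ratio, step)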
def train_iteration(logger, step, embedding_layer, att_layer, model_out_layer,
                    emb_opt, att_opt, model_out_opt, this_batch):
    emb_opt.zero_grad()
    att_opt.zero_grad()
    model_out_opt.zero_grad()

    d = embedding_layer.out_dim
    this_batch_num = len(this_batch[2])
    # question = Variable(get_question('%GPE%', this_batch_num))  # (batch, J=1, 51)
    question = Variable(this_batch[4])
    # question_lengths = [1 for _ in range(this_batch_num)]
    question_lengths = this_batch[5]
    context = Variable(this_batch[0])  # (batch, T, 51)
    context_lengths = this_batch[3]    # list
    target = Variable(this_batch[1])   # (batch, T)
    emb_h_0 = Variable(torch.zeros(2, this_batch_num, d))
    model_out_h_0 = Variable(torch.zeros(2 * model_out_layer.num_layers, this_batch_num, d))
    con_lens_var = Variable(torch.LongTensor(context_lengths))
    if config['USE_CUDA']:
        question = question.cuda(config['cuda_num'])
        context = context.cuda(config['cuda_num'])
        target = target.cuda(config['cuda_num'])
        emb_h_0 = emb_h_0.cuda(config['cuda_num'])
        model_out_h_0 = model_out_h_0.cuda(config['cuda_num'])
        con_lens_var = con_lens_var.cuda(config['cuda_num'])

    c_emb = embedding_layer(context, emb_h_0, context_lengths, step, 'C')
    q_emb = embedding_layer(question, emb_h_0, question_lengths, step, 'Q')
    G = att_layer(c_emb, q_emb, context_lengths, question_lengths, step)
    prob = model_out_layer(model_out_h_0, G, context_lengths, step)

    loss = masked_cross_entropy(prob, target, con_lens_var)
    print('loss: ', loss.data[0])
    logger.scalar_summary('loss', loss.data[0], step)
    loss.backward()

    e_before_step = [(tag, to_np(value)) for tag, value in embedding_layer.named_parameters()]
    a_before_step = [(tag, to_np(value)) for tag, value in att_layer.named_parameters()]
    m_before_step = [(tag, to_np(value)) for tag, value in model_out_layer.named_parameters()]

    clip_grad_norm(embedding_layer.parameters(), config['clip_norm'])
    clip_grad_norm(att_layer.parameters(), config['clip_norm'])
    clip_grad_norm(model_out_layer.parameters(), config['clip_norm'])

    # (per-parameter value/gradient histogram logging disabled)

    emb_opt.step()
    att_opt.step()
    model_out_opt.step()

    e_after_step = [(tag, to_np(value)) for tag, value in embedding_layer.named_parameters()]
    a_after_step = [(tag, to_np(value)) for tag, value in att_layer.named_parameters()]
    m_after_step = [(tag, to_np(value)) for tag, value in model_out_layer.named_parameters()]

    # Log how much each parameter moved relative to its norm (update ratio).
    for before_step, after_step in ((e_before_step, e_after_step),
                                    (a_before_step, a_after_step),
                                    (m_before_step, m_after_step)):
        for before, after in zip(before_step, after_step):
            if before[0] == after[0]:
                tag = before[0].replace('.', '/')
                value = LA.norm(after[1] - before[1]) / LA.norm(before[1])
                if value is not None:
                    logger.scalar_summary(tag + '/grad_ratio', value, step)
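# `masked_cross_entropy` is used by several train_iteration variants here but not defined in
# this file. The sketch below assumes `logits` of shape (batch, max_len, num_classes), an
# integer `target` of shape (batch, max_len), and `length` a (batch,) LongTensor Variable of
# valid lengths; padded positions are excluded from the average. The original helper may
# normalize or mask differently (e.g. expect log-probabilities instead of raw scores).
import torch
import torch.nn.functional as F
from torch.autograd import Variable

def masked_cross_entropy(logits, target, length):
    batch, max_len, num_classes = logits.size()
    log_probs = F.log_softmax(logits.view(-1, num_classes), dim=1)            # (B*T, C)
    losses = -log_probs.gather(1, target.view(-1, 1)).view(batch, max_len)    # (B, T)
    # Mask out positions beyond each sequence's true length.
    positions = Variable(torch.arange(0, max_len).long().unsqueeze(0).expand(batch, max_len))
    if length.is_cuda:
        positions = positions.cuda()
    mask = (positions < length.unsqueeze(1)).float()
    return (losses * mask).sum() / mask.sum()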
def train_iteration(logger, config, my_arg, step, encoder, decoder,
                    encoder_optimizer, decoder_optimizer, this_batch):
    # encoder_outputs = Variable(torch.randn(config['max_seq_length'], config['batch_size'], config['hidden_size']))
    decoder_optimizer.zero_grad()
    encoder_optimizer.zero_grad()

    this_batch_num = len(this_batch[2])
    this_batch_max_target = max(this_batch[2])
    last_hidden = Variable(torch.zeros(config['decoder_layers'] * 2, this_batch_num, config['hidden_size']))
    word_input = Variable(torch.zeros(this_batch_num, 1).type(torch.LongTensor))
    print('seq_length', max(this_batch[3]), 'label_length', this_batch_max_target)

    data = Variable(this_batch[0])          # (output_size, B, 1)
    target = Variable(this_batch[1])
    target_length = Variable(torch.LongTensor(this_batch[2]))
    h_0 = Variable(torch.zeros(2, this_batch_num, config['hidden_size'] // 2))  # encoder gru initial hidden state
    if config['USE_CUDA']:
        last_hidden = last_hidden.cuda(config['cuda_num'])
        word_input = word_input.cuda(config['cuda_num'])
        data = data.cuda(config['cuda_num'])
        target = target.cuda(config['cuda_num'])
        target_length = target_length.cuda(config['cuda_num'])
        h_0 = h_0.cuda(config['cuda_num'])

    encoder_outputs = encoder(step, data, h_0, this_batch[3])
    # encoder_outputs = encoder_outputs.transpose(1, 2)
    # encoder_outputs = encoder_outputs.transpose(0, 1)
    source_mask = Variable(get_source_mask(this_batch_num, config['encoder_filter_num'], max(this_batch[3]), this_batch[3]))
    if config['USE_CUDA']:
        source_mask = source_mask.cuda(config['cuda_num'])
    encoder_outputs = encoder_outputs * source_mask

    seq_label_prob = decoder(last_hidden, encoder_outputs, this_batch[3])
    loss = masked_cross_entropy(seq_label_prob.transpose(0, 1).contiguous(), target, target_length)
    # loss = masked_cross_entropy(F.softmax(decoder_prob.transpose(0,1).contiguous()), target, length)
    print('loss: ', loss.data[0])
    logger.scalar_summary('loss', loss.data[0], step)
    loss.backward()

    e_before_step = [(tag, to_np(value)) for tag, value in encoder.named_parameters()]
    d_before_step = [(tag, to_np(value)) for tag, value in decoder.named_parameters()]

    clip_grad_norm(decoder.parameters(), config['clip_norm'])
    clip_grad_norm(encoder.parameters(), config['clip_norm'])

    # (per-parameter value/gradient histogram logging disabled)

    decoder_optimizer.step()
    encoder_optimizer.step()

    e_after_step = [(tag, to_np(value)) for tag, value in encoder.named_parameters()]
    d_after_step = [(tag, to_np(value)) for tag, value in decoder.named_parameters()]

    # Log how much each parameter moved relative to its norm (update ratio).
    for before_step, after_step in ((e_before_step, e_after_step),
                                    (d_before_step, d_after_step)):
        for before, after in zip(before_step, after_step):
            if before[0] == after[0]:
                tag = before[0].replace('.', '/')
                value = LA.norm(after[1] - before[1]) / LA.norm(before[1])
                if value is not None:
                    logger.scalar_summary(tag + '/grad_ratio', value, step)
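# `get_source_mask` is not defined in this file. The sketch below assumes it returns a
# (max_len, batch, filter_num) FloatTensor that is 1 for time steps below each sequence's
# length and 0 afterwards, so multiplying it with the encoder outputs zeroes out padding.
# The argument order follows the call sites; the actual shape convention may differ.
import torch

def get_source_mask(batch_size, filter_num, max_len, lengths):
    mask = torch.zeros(max_len, batch_size, filter_num)
    for b, seq_len in enumerate(lengths):
        mask[:seq_len, b, :] = 1.0
    return mask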
def train_iteration(logger, step, word_embedding_layer, type_embedding_layer, ctx_lstm, ctx_att,
                    warp_loss, ctx_lstm_opt, ctx_att_opt, sig_opt, this_batch):
    # if step == 398:
    #     pydevd.settrace('10.214.129.230', port=31235, stdoutToServer=True, stderrToServer=True)
    ctx_lstm_opt.zero_grad()
    ctx_att_opt.zero_grad()
    sig_opt.zero_grad()

    l_ctx = Variable(this_batch['l_ctx_tensor'])
    mentions = Variable(this_batch['mentions_tensor'])
    r_ctx = Variable(this_batch['r_ctx_tensor'])
    types = Variable(this_batch['types_tensor'])
    labels = Variable(this_batch['labels_tensor'])
    if fg_config['USE_CUDA']:
        l_ctx = l_ctx.cuda(fg_config['cuda_num'])
        mentions = mentions.cuda(fg_config['cuda_num'])
        r_ctx = r_ctx.cuda(fg_config['cuda_num'])
        types = types.cuda(fg_config['cuda_num'])
        labels = labels.cuda(fg_config['cuda_num'])
    l_ctx_lens = this_batch['l_ctx_lens']
    r_ctx_lens = this_batch['r_ctx_lens']
    men_lens = this_batch['men_lens']

    l_ctx_emb = word_embedding_layer(l_ctx)        # (B, S, word_emb)
    mentions_emb = word_embedding_layer(mentions)  # (B, S, word_emb)
    r_ctx_emb = word_embedding_layer(r_ctx)        # (B, S, word_emb)
    types_emb = type_embedding_layer(types)        # (89, word_emb)
    l_ctx_lstm, r_ctx_lstm = ctx_lstm(l_ctx_emb, r_ctx_emb, l_ctx_lens, r_ctx_lens)
    ctx_rep, men_rep = ctx_att(l_ctx_lstm, r_ctx_lstm, mentions_emb,
                               l_ctx_lens, r_ctx_lens, men_lens, types_emb)
    loss = warp_loss(ctx_rep, men_rep, labels, types_emb)
    if step > 100:
        print('loss: ', loss.data[0])
    logger.scalar_summary('loss', loss.data[0], step)
    loss.backward()

    ctx_lstm_before_step = [(tag, to_np(value)) for tag, value in ctx_lstm.named_parameters()]
    ctx_att_before_step = [(tag, to_np(value)) for tag, value in ctx_att.named_parameters()]
    sig_before_step = [(tag, to_np(value)) for tag, value in warp_loss.named_parameters()]

    # clip_grad_norm(embedding_layer.parameters(), fg_config['clip_norm'])
    clip_grad_norm(ctx_lstm.parameters(), fg_config['clip_norm'])
    clip_grad_norm(ctx_att.parameters(), fg_config['clip_norm'])
    clip_grad_norm(warp_loss.parameters(), fg_config['clip_norm'])

    # (per-parameter value/gradient histogram logging disabled)

    ctx_lstm_opt.step()
    ctx_att_opt.step()
    sig_opt.step()

    # Snapshot parameters after the update; the ratio logging itself is currently disabled.
    grad_ratio_lst = []
    ctx_lstm_after_step = [(tag, to_np(value)) for tag, value in ctx_lstm.named_parameters()]
    grad_ratio_lst.append((ctx_lstm_before_step, ctx_lstm_after_step))
    ctx_att_after_step = [(tag, to_np(value)) for tag, value in ctx_att.named_parameters()]
    grad_ratio_lst.append((ctx_att_before_step, ctx_att_after_step))
    sig_after_step = [(tag, to_np(value)) for tag, value in warp_loss.named_parameters()]
    grad_ratio_lst.append((sig_before_step, sig_after_step))
    # utils.log_grad_ratio(logger, step, grad_ratio_lst)