def test(training_data_index_seq, training_masks_seq, test_lines_seq, model,
         label_sent, label_mask, id2word):
    # model.load_state_dict(torch.load(save_model_path))
    model.eval()
    '''minibatches_idx already accounts for the remainder samples'''
    data_id = 0
    for training_data_index, training_masks, test_lines in zip(
            training_data_index_seq, training_masks_seq, test_lines_seq):
        minibatches_idx = get_minibatches_idx(
            len(training_data_index),
            minibatch_size=config['batch_size'],
            shuffle=False)
        n_test_remain = len(training_data_index) % config['batch_size']
        pred_types = []
        pred_confs = []
        pred_others = []
        with torch.no_grad():
            for i, minibatch in enumerate(minibatches_idx):
                '''padding is deferred until a minibatch is formed, which is somewhat inefficient'''
                sentence_batch, mask_batch = get_mask_test(
                    training_data_index, training_masks, minibatch)
                sentence_batch = autograd.Variable(
                    torch.cuda.LongTensor(sentence_batch))
                mask_batch = autograd.Variable(
                    torch.cuda.FloatTensor(mask_batch))
                '''sort the samples within a minibatch by length, descending'''
                lengths_batch = mask_batch.sum(dim=-1)  # length of each sample
                seq_lengths, seq_idx = lengths_batch.sort(0, descending=True)
                seq_lengths = seq_lengths.int().data.tolist()
                sentence_batch = sentence_batch[seq_idx]
                mask_batch = mask_batch[seq_idx]
                '''build the inverse permutation that restores the original order'''
                seq_idx_2_list = seq_idx.int().data.tolist()
                return_map = {val: i for i, val in enumerate(seq_idx_2_list)}
                recover_seq_idx = [
                    return_map[i] for i in range(len(seq_idx_2_list))
                ]
                recover_seq_idx = autograd.Variable(
                    torch.cuda.LongTensor(np.array(recover_seq_idx)))
                sent_list = recover_pytorch_idmatrix_2_text(
                    sentence_batch, id2word)
                bert_rep_batch = []
                if use_bert:
                    for sent in sent_list:
                        bert_rep = sent_to_embedding_last4(
                            sent, bert_tokenizer, bert_model, True)
                        bert_rep_batch.append(bert_rep.reshape(1, -1))
                    bert_rep_batch = torch.cat(bert_rep_batch, 0)  # (batch, 768)
                tag_scores, tag_scores_task2 = model(sentence_batch,
                                                     seq_lengths, mask_batch,
                                                     label_sent, label_mask,
                                                     bert_rep_batch)
                '''recover the order'''
                tag_scores = tag_scores[recover_seq_idx]
                tag_scores_task2 = (tag_scores_task2.reshape(
                    len(minibatch), 4, 4))[recover_seq_idx]
                tag_scores_2_array = tag_scores.cpu().numpy()
                mean = np.mean(tag_scores_2_array)
                # binarize using the batch mean as the threshold
                pred_labels = np.where(tag_scores_2_array > mean, 1, 0)
                pred_conf = tag_scores_2_array
                pred_other = tag_scores_task2.cpu().numpy()
                if i < len(minibatches_idx) - 1:
                    pred_types.append(pred_labels)
                    pred_confs.append(pred_conf)
                    pred_others.append(pred_other)
                else:
                    '''last minibatch: keep only the genuinely new samples'''
                    pred_types.append(pred_labels[-n_test_remain:])
                    pred_confs.append(pred_conf[-n_test_remain:])
                    pred_others.append(pred_other[-n_test_remain:])
            pred_types = np.concatenate(pred_types, axis=0)
            pred_confs = np.concatenate(pred_confs, axis=0)
            pred_others = np.concatenate(pred_others, axis=0)
            '''start generating the official output'''
            min_mean_frame = 100.0
            output_file_path = output_file_head + output_file_path_codes[
                data_id] + '.json'
            print('generating ...', output_file_path)
            mean_frame = generate_2019_official_output(
                test_lines, output_file_path, pred_types, pred_confs,
                pred_others)
            if mean_frame < min_mean_frame:
                min_mean_frame = mean_frame
            print('\t\t\t test over, min_mean_frame:', min_mean_frame)
            validate_output_schema(output_file_path,
                                   'LoReHLT19-schema_V1.json')
        data_id += 1
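
# Note: the length-sort / order-recovery pattern used in test() above (and in
# the functions below) is the standard preparation for
# torch.nn.utils.rnn.pack_padded_sequence, which in older PyTorch versions
# required batches sorted by descending length. A minimal self-contained
# sketch of the inverse-permutation step; the helper name is illustrative and
# not part of this codebase:
def _recover_order_sketch():
    import torch
    lengths = torch.tensor([3.0, 5.0, 4.0])
    _, sort_idx = lengths.sort(0, descending=True)  # sort_idx: tensor([1, 2, 0])
    return_map = {val: i for i, val in enumerate(sort_idx.tolist())}
    recover_idx = [return_map[i] for i in range(len(lengths))]  # [2, 0, 1]
    # For any batch x: x[sort_idx][recover_idx] == x, i.e. recover_idx
    # undoes the descending-length sort.
    return recover_idx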
def test(training_data_index, training_masks, testing_labels, model,
         label_sent, label_mask, id2word):
    model.eval()
    '''minibatches_idx already accounts for the remainder samples'''
    # output_file_path = '/scratch/wyin3/dickens_save_dataset/LORELEI/il3_Uyghur/il3_system_output.json'
    output_file_path = 'il9_system_output.json'
    minibatches_idx = get_minibatches_idx(len(training_data_index),
                                          minibatch_size=config['batch_size'],
                                          shuffle=False)
    n_test_remain = len(training_data_index) % config['batch_size']
    pred_types = []
    pred_confs = []
    pred_others = []
    with torch.no_grad():
        for i, minibatch in enumerate(minibatches_idx):
            '''padding is deferred until a minibatch is formed, which is somewhat inefficient'''
            sentence_batch, mask_batch, label_batch = get_mask(
                training_data_index, training_masks, testing_labels,
                minibatch)
            sentence_batch = autograd.Variable(
                torch.cuda.LongTensor(sentence_batch))
            mask_batch = autograd.Variable(torch.cuda.FloatTensor(mask_batch))
            '''sort the samples within a minibatch by length, descending'''
            lengths_batch = mask_batch.sum(dim=-1)  # length of each sample
            seq_lengths, seq_idx = lengths_batch.sort(0, descending=True)
            seq_lengths = seq_lengths.int().data.tolist()
            sentence_batch = sentence_batch[seq_idx]
            mask_batch = mask_batch[seq_idx]
            '''build the inverse permutation that restores the original order'''
            seq_idx_2_list = seq_idx.int().data.tolist()
            return_map = {val: i for i, val in enumerate(seq_idx_2_list)}
            recover_seq_idx = [
                return_map[i] for i in range(len(seq_idx_2_list))
            ]
            recover_seq_idx = autograd.Variable(
                torch.cuda.LongTensor(np.array(recover_seq_idx)))
            sent_list = recover_pytorch_idmatrix_2_text(
                sentence_batch, id2word)
            bert_rep_batch = []
            bert2_rep_batch = []
            if use_bert:
                for sent in sent_list:
                    bert_rep = sent_to_embedding_last4(sent, bert_tokenizer,
                                                       bert_model, True)
                    bert_rep_batch.append(bert_rep.reshape(1, -1))
                    bert2_rep = sent_to_embedding_last4(
                        sent, bert2_tokenizer, bert2_model, True)
                    bert2_rep_batch.append(bert2_rep.reshape(1, -1))
                bert_rep_batch = torch.cat(bert_rep_batch, 0)    # (batch, 768)
                bert2_rep_batch = torch.cat(bert2_rep_batch, 0)  # (batch, 768)
            tag_scores, tag_scores_task2 = model(sentence_batch, seq_lengths,
                                                 mask_batch, label_sent,
                                                 label_mask, bert_rep_batch,
                                                 bert2_rep_batch)
            '''recover the order'''
            tag_scores = tag_scores[recover_seq_idx]
            tag_scores_task2 = (tag_scores_task2.reshape(
                len(minibatch), 4, 4))[recover_seq_idx]
            tag_scores_2_array = tag_scores.cpu().numpy()
            mean = np.mean(tag_scores_2_array)
            # binarize using the batch mean as the threshold
            pred_labels = np.where(tag_scores_2_array > mean, 1, 0)
            pred_conf = tag_scores_2_array
            pred_other = tag_scores_task2.cpu().numpy()
            if i < len(minibatches_idx) - 1:
                pred_types.append(pred_labels)
                pred_confs.append(pred_conf)
                pred_others.append(pred_other)
            else:
                '''last minibatch: keep only the genuinely new samples'''
                pred_types.append(pred_labels[-n_test_remain:])
                pred_confs.append(pred_conf[-n_test_remain:])
                pred_others.append(pred_other[-n_test_remain:])
        pred_types = np.concatenate(pred_types, axis=0)
        pred_confs = np.concatenate(pred_confs, axis=0)
        pred_others = np.concatenate(pred_others, axis=0)
        test_mean_f1, test_weight_f1 = average_f1_two_array_by_col(
            pred_types, np.array(testing_labels))
        print('test over, test_mean_f1:', test_mean_f1, 'test_weight_f1:',
              test_weight_f1)
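
# The tail handling in both test() functions (keeping only
# pred[-n_test_remain:] from the final minibatch) assumes that
# get_minibatches_idx pads the last minibatch back to full batch_size by
# re-reading earlier samples, so its leading predictions are duplicates.
# A toy walk-through of that assumption with hypothetical numbers:
def _remainder_handling_sketch():
    batch_size = 4
    n_samples = 9
    n_test_remain = n_samples % batch_size  # 1
    # e.g. minibatches over indices [0..3], [4..7], and a padded [5..8]
    last_minibatch = [5, 6, 7, 8]
    kept = last_minibatch[-n_test_remain:]  # only index 8 is genuinely new
    return kept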
def train(task1_data, task2_data, test_data, label_sent, label_mask, id2word,
          epoch_num, model, loss_function, optimizer):
    '''combine train set and dev set'''
    training_data_index, training_masks, training_labels = task1_data
    training_data_task2_index, training_task2_masks, training_task2_labels, train_task2_other_labels = task2_data
    testing_data_index, testing_masks, test_lines = test_data
    label_sent = autograd.Variable(torch.cuda.LongTensor(label_sent))
    label_mask = autograd.Variable(torch.cuda.FloatTensor(label_mask))
    print("training...")
    iter = 0
    for epoch in range(epoch_num):
        print('current epoch: ', epoch)
        minibatches_idx = get_minibatches_idx(
            len(training_data_index),
            minibatch_size=config['batch_size'],
            shuffle=True)
        minibatches_idx_task2 = get_minibatches_idx(
            len(training_data_task2_index),
            minibatch_size=config['batch_size'],
            shuffle=True)
        for i, minibatch in enumerate(minibatches_idx):
            model.train()
            '''padding is deferred until a minibatch is formed, which is somewhat inefficient'''
            sentence_batch, mask_batch, targets_batch = get_mask(
                training_data_index, training_masks, training_labels,
                minibatch)
            sentence_batch = autograd.Variable(
                torch.cuda.LongTensor(sentence_batch))
            targets_batch = autograd.Variable(
                torch.cuda.FloatTensor(targets_batch))
            mask_batch = autograd.Variable(torch.cuda.FloatTensor(mask_batch))
            '''sort the samples within a minibatch by length, descending'''
            lengths_batch = mask_batch.sum(dim=-1)  # length of each sample
            seq_lengths, seq_idx = lengths_batch.sort(0, descending=True)
            seq_lengths = seq_lengths.int().data.tolist()
            sentence_batch = sentence_batch[seq_idx]
            targets_batch = targets_batch[seq_idx]
            mask_batch = mask_batch[seq_idx]
            model.zero_grad()
            '''BERT sentence representations'''
            sent_list = recover_pytorch_idmatrix_2_text(
                sentence_batch, id2word)
            bert_rep_batch = []
            if use_bert:
                for sent in sent_list:
                    bert_rep = sent_to_embedding_last4(sent, bert_tokenizer,
                                                       bert_model, True)
                    bert_rep_batch.append(bert_rep.reshape(1, -1))
                bert_rep_batch = torch.cat(bert_rep_batch, 0)  # (batch, 768)
            tag_scores, _ = model(sentence_batch, seq_lengths, mask_batch,
                                  label_sent, label_mask, bert_rep_batch)
            '''binary cross entropy'''
            temp_loss_matrix = torch_where(
                targets_batch[:, :-1].reshape(-1) < 1,
                1.0 - tag_scores[:, :-1].reshape(-1),
                tag_scores[:, :-1].reshape(-1))
            loss = -torch.mean(torch.log(temp_loss_matrix))
            loss.backward()
            optimizer.step()
            '''task 2'''
            if i < len(minibatches_idx_task2):
                model.train()
                minibatch_task2 = minibatches_idx_task2[i]
                sentence_batch, mask_batch, targets_batch, others_batch = get_mask_task2(
                    training_data_task2_index, training_task2_masks,
                    training_task2_labels, train_task2_other_labels,
                    minibatch_task2)
                sentence_batch = autograd.Variable(
                    torch.cuda.LongTensor(sentence_batch))
                targets_batch = autograd.Variable(
                    torch.cuda.FloatTensor(targets_batch))
                mask_batch = autograd.Variable(
                    torch.cuda.FloatTensor(mask_batch))
                others_batch = autograd.Variable(
                    torch.cuda.LongTensor(others_batch))
                lengths_batch = mask_batch.sum(dim=-1)
                seq_lengths, seq_idx = lengths_batch.sort(0, descending=True)
                seq_lengths = seq_lengths.int().data.tolist()
                sentence_batch = sentence_batch[seq_idx]
                targets_batch = targets_batch[seq_idx]
                mask_batch = mask_batch[seq_idx]
                others_batch = others_batch[seq_idx]
                model.zero_grad()
                sent_list = recover_pytorch_idmatrix_2_text(
                    sentence_batch, id2word)
                bert_rep_batch = []
                if use_bert:
                    for sent in sent_list:
                        bert_rep = sent_to_embedding_last4(
                            sent, bert_tokenizer, bert_model, True)
                        bert_rep_batch.append(bert_rep.reshape(1, -1))
                    bert_rep_batch = torch.cat(bert_rep_batch, 0)  # (batch, 768)
                tag_scores, tag_scores_task2 = model(sentence_batch,
                                                     seq_lengths, mask_batch,
                                                     label_sent, label_mask,
                                                     bert_rep_batch)
                '''binary cross entropy for task 1'''
                temp_loss_matrix = torch_where(
                    targets_batch[:, :-1].reshape(-1) < 1,
                    1.0 - tag_scores[:, :-1].reshape(-1),
                    tag_scores[:, :-1].reshape(-1))
                loss_task1 = -torch.mean(torch.log(temp_loss_matrix))
                '''task 2 loss'''
                other_label_scores = tag_scores_task2.index_select(
                    1, others_batch.view(-1))
                loss_task2 = -torch.mean(torch.log(other_label_scores))
                loss = loss_task1 + loss_task2
                loss.backward()
                optimizer.step()
            iter += 1
            if iter % 20 == 0:
                print(iter, ' loss: ', loss)
        # torch.save(model.state_dict(), save_model_path)  # optionally save the model
        '''test periodically after enough epochs'''
        if epoch > 18 and (epoch + 1) % 10 == 0:
            print('testing....')
            test(testing_data_index, testing_masks, test_lines, model,
                 label_sent, label_mask, id2word)
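
# The hand-rolled loss in train() evaluates -mean(log(p)) with p = score for
# positive targets and p = 1 - score for negative targets, which is exactly
# binary cross entropy over probabilities. A minimal equivalence check (a
# sketch assuming tag_scores already lie in (0, 1), and using torch.where in
# place of the project's torch_where helper):
def _bce_equivalence_sketch():
    import torch
    import torch.nn.functional as F
    scores = torch.tensor([0.9, 0.2, 0.7])
    targets = torch.tensor([1.0, 0.0, 1.0])
    p = torch.where(targets < 1, 1.0 - scores, scores)
    manual = -torch.mean(torch.log(p))
    builtin = F.binary_cross_entropy(scores, targets)  # reduction='mean'
    assert torch.allclose(manual, builtin)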
def test(training_data_index, training_masks, model, label_sent, label_mask,
         id2word):
    model.eval()
    '''minibatches_idx already accounts for the remainder samples'''
    # output_file_path = '/scratch/wyin3/dickens_save_dataset/LORELEI/il3_Uyghur/il3_system_output.json'
    output_file_path = 'il3_uyghur_system_output.json'
    # minibatches_idx = get_minibatches_idx(len(training_data_index), minibatch_size=config['batch_size'], shuffle=False)
    # n_test_remain = len(training_data_index) % config['batch_size']
    with torch.no_grad():
        '''padding is deferred until the batch is formed, which is somewhat inefficient'''
        sentence_batch, mask_batch = get_mask_demo(training_data_index,
                                                   training_masks,
                                                   config['batch_size'])
        sentence_batch = autograd.Variable(
            torch.cuda.LongTensor(sentence_batch))
        mask_batch = autograd.Variable(torch.cuda.FloatTensor(mask_batch))
        '''sort the samples within the batch by length, descending'''
        lengths_batch = mask_batch.sum(dim=-1)  # length of each sample
        seq_lengths, seq_idx = lengths_batch.sort(0, descending=True)
        seq_lengths = seq_lengths.int().data.tolist()
        sentence_batch = sentence_batch[seq_idx]
        mask_batch = mask_batch[seq_idx]
        '''build the inverse permutation that restores the original order'''
        seq_idx_2_list = seq_idx.int().data.tolist()
        return_map = {val: i for i, val in enumerate(seq_idx_2_list)}
        recover_seq_idx = [return_map[i] for i in range(len(seq_idx_2_list))]
        recover_seq_idx = autograd.Variable(
            torch.cuda.LongTensor(np.array(recover_seq_idx)))
        sent_list = recover_pytorch_idmatrix_2_text(sentence_batch, id2word)
        bert_rep_batch = []
        for sent in sent_list:
            bert_rep = sent_to_embedding_last4(sent, bert_tokenizer,
                                               bert_model, True)
            bert_rep_batch.append(bert_rep.reshape(1, -1))
        bert_rep_batch = torch.cat(bert_rep_batch, 0)  # (batch, 768)
        tag_scores, _ = model(sentence_batch, seq_lengths, mask_batch,
                              label_sent, label_mask, bert_rep_batch)
        '''recover the order'''
        tag_scores = tag_scores[recover_seq_idx]
        tag_scores_2_array = tag_scores.cpu().numpy()  # (batch, 12)
        '''
        type2label_id = {'crimeviolence': 8, 'med': 3, 'search': 4, 'food': 1,
                         'out-of-domain': 11, 'infra': 2, 'water': 7,
                         'shelter': 5, 'regimechange': 9, 'evac': 0,
                         'terrorism': 10, 'utils': 6}
        '''
        # map each situation type to the score of the first (demo) sample
        typ_2_score = {}
        for id, typ in enumerate([
                'evac', 'food', 'infra', 'med', 'search', 'shelter', 'utils',
                'water', 'crimeviolence', 'regimechange', 'terrorism'
        ]):
            typ_2_score[typ] = tag_scores_2_array[0][id]
        return typ_2_score
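
# Example of consuming the demo test() above (a sketch; model and data setup
# are elided and the variable names are hypothetical). The returned dict maps
# each LORELEI situation type to the model's confidence for the demo input:
#
#   typ_2_score = test(data_index, masks, model, label_sent, label_mask, id2word)
#   best_type = max(typ_2_score, key=typ_2_score.get)
#   print('predicted type:', best_type, 'confidence:', typ_2_score[best_type])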