if opt.task_sc: model_class.train() nsentences = len(train_data_index) piece_sentences = opt.batchSize if int( nsentences * 0.1 / opt.batchSize) == 0 else int(nsentences * 0.1 / opt.batchSize) * opt.batchSize for j in range(0, nsentences, opt.batchSize): words, tags, raw_tags, classes, raw_classes, lens = data_reader.get_minibatch_with_class( train_feats['data'], train_tags['data'], train_class['data'], tag_to_idx, class_to_idx, train_data_index, j, opt.batchSize, add_start_end=opt.bos_eos, multiClass=opt.multiClass, enc_dec_focus=opt.enc_dec, device=opt.device) inputs = prepare_inputs_for_bert_xlnet( words, lens, tokenizer, cls_token_at_end=bool(opt.pretrained_model_type in ['xlnet'] ), # xlnet has a cls token at the end cls_token=tokenizer.cls_token, sep_token=tokenizer.sep_token, cls_token_segment_id=2
def decode(data_feats, data_tags, data_class, output_path): data_index = np.arange(len(data_feats)) losses = [] TP, FP, FN, TN = 0.0, 0.0, 0.0, 0.0 TP2, FP2, FN2, TN2 = 0.0, 0.0, 0.0, 0.0 with open(output_path, 'w') as f: for j in range(0, len(data_index), opt.test_batchSize): if opt.testing: words, tags, raw_tags, classes, raw_classes, lens, line_nums = data_reader.get_minibatch_with_class( data_feats, data_tags, data_class, tag_to_idx, class_to_idx, data_index, j, opt.test_batchSize, add_start_end=opt.bos_eos, multiClass=opt.multiClass, keep_order=opt.testing, enc_dec_focus=opt.enc_dec, device=opt.device) else: words, tags, raw_tags, classes, raw_classes, lens = data_reader.get_minibatch_with_class( data_feats, data_tags, data_class, tag_to_idx, class_to_idx, data_index, j, opt.test_batchSize, add_start_end=opt.bos_eos, multiClass=opt.multiClass, keep_order=opt.testing, enc_dec_focus=opt.enc_dec, device=opt.device) inputs = prepare_inputs_for_bert_xlnet( words, lens, tokenizer, cls_token_at_end=bool(opt.pretrained_model_type in ['xlnet'] ), # xlnet has a cls token at the end cls_token=tokenizer.cls_token, sep_token=tokenizer.sep_token, cls_token_segment_id=2 if opt.pretrained_model_type in ['xlnet'] else 0, pad_on_left=bool(opt.pretrained_model_type in ['xlnet']), # pad on the left for xlnet pad_token_segment_id=4 if opt.pretrained_model_type in ['xlnet'] else 0, device=opt.device) if opt.enc_dec: opt.greed_decoding = True if opt.greed_decoding: tag_scores_1best, outputs_1best, encoder_info = model_tag.decode_greed( inputs, tags[:, 0:1], lens, with_snt_classifier=True) tag_loss = tag_loss_function( tag_scores_1best.contiguous().view( -1, len(tag_to_idx)), tags[:, 1:].contiguous().view(-1)) top_pred_slots = outputs_1best.cpu().numpy() else: beam_size = 2 beam_scores_1best, top_path_slots, encoder_info = model_tag.decode_beam_search( inputs, lens, beam_size, tag_to_idx, with_snt_classifier=True) top_pred_slots = [[item[0].item() for item in seq] for seq in top_path_slots] ppl = beam_scores_1best.cpu() / torch.tensor( lens, dtype=torch.float) tag_loss = ppl.exp().sum() #tags = tags[:, 1:].data.cpu().numpy() elif opt.crf: max_len = max(lens) masks = [([1] * l) + ([0] * (max_len - l)) for l in lens] masks = torch.tensor(masks, dtype=torch.uint8, device=opt.device) crf_feats, encoder_info = model_tag._get_lstm_features( inputs, lens, with_snt_classifier=True) tag_path_scores, tag_path = model_tag.forward(crf_feats, masks) tag_loss = model_tag.neg_log_likelihood(crf_feats, masks, tags) top_pred_slots = tag_path.data.cpu().numpy() else: tag_scores, encoder_info = model_tag(inputs, lens, with_snt_classifier=True) tag_loss = tag_loss_function( tag_scores.contiguous().view(-1, len(tag_to_idx)), tags.view(-1)) top_pred_slots = tag_scores.data.cpu().numpy().argmax(axis=-1) #tags = tags.data.cpu().numpy() if opt.task_sc: class_scores = model_class(encoder_info_filter(encoder_info)) class_loss = class_loss_function(class_scores, classes) if opt.multiClass: snt_probs = class_scores.data.cpu().numpy() else: snt_probs = class_scores.data.cpu().numpy().argmax(axis=-1) losses.append([ tag_loss.item() / sum(lens), class_loss.item() / len(lens) ]) else: losses.append([tag_loss.item() / sum(lens), 0]) #classes = classes.data.cpu().numpy() for idx, pred_line in enumerate(top_pred_slots): length = lens[idx] pred_seq = [idx_to_tag[tag] for tag in pred_line][:length] lab_seq = [ idx_to_tag[tag] if type(tag) == int else tag for tag in raw_tags[idx] ] pred_chunks = acc.get_chunks(['O'] + pred_seq + ['O']) label_chunks = acc.get_chunks(['O'] + lab_seq + ['O']) for pred_chunk in pred_chunks: if pred_chunk in label_chunks: TP += 1 else: FP += 1 for label_chunk in label_chunks: if label_chunk not in pred_chunks: FN += 1 input_line = words[idx] word_tag_line = [ input_line[_idx] + ':' + lab_seq[_idx] + ':' + pred_seq[_idx] for _idx in range(len(input_line)) ] if opt.task_sc: if opt.multiClass: pred_classes = [ idx_to_class[i] for i, p in enumerate(snt_probs[idx]) if p > 0.5 ] gold_classes = [ idx_to_class[i] for i in raw_classes[idx] ] for pred_class in pred_classes: if pred_class in gold_classes: TP2 += 1 else: FP2 += 1 for gold_class in gold_classes: if gold_class not in pred_classes: FN2 += 1 gold_class_str = ';'.join(gold_classes) pred_class_str = ';'.join(pred_classes) else: pred_class = idx_to_class[snt_probs[idx]] if type(raw_classes[idx]) == int: gold_classes = {idx_to_class[raw_classes[idx]]} else: gold_classes = set(raw_classes[idx]) if pred_class in gold_classes: TP2 += 1 else: FP2 += 1 FN2 += 1 gold_class_str = ';'.join(list(gold_classes)) pred_class_str = pred_class else: gold_class_str = '' pred_class_str = '' if opt.testing: f.write( str(line_nums[idx]) + ' : ' + ' '.join(word_tag_line) + ' <=> ' + gold_class_str + ' <=> ' + pred_class_str + '\n') else: f.write(' '.join(word_tag_line) + ' <=> ' + gold_class_str + ' <=> ' + pred_class_str + '\n') if TP == 0: p, r, f = 0, 0, 0 else: p, r, f = 100 * TP / (TP + FP), 100 * TP / (TP + FN), 100 * 2 * TP / ( 2 * TP + FN + FP) mean_losses = np.mean(losses, axis=0) return mean_losses, p, r, f, 0 if 2 * TP2 + FN2 + FP2 == 0 else 100 * 2 * TP2 / ( 2 * TP2 + FN2 + FP2)
def decode(data_feats, data_tags, data_class, output_path): data_index = np.arange(len(data_feats)) losses = [] TP, FP, FN, TN = 0.0, 0.0, 0.0, 0.0 TP2, FP2, FN2, TN2 = 0.0, 0.0, 0.0, 0.0 with open(output_path, 'w') as f: for j in range(0, len(data_index), opt.test_batchSize): if opt.testing: words, tags, raw_tags, classes, raw_classes, lens, line_nums = data_reader.get_minibatch_with_class(data_feats, data_tags, data_class, tag_to_idx, class_to_idx, data_index, j, opt.test_batchSize, add_start_end=opt.bos_eos, multiClass=opt.multiClass, keep_order=opt.testing, enc_dec_focus=False, device=opt.device) else: words, tags, raw_tags, classes, raw_classes, lens = data_reader.get_minibatch_with_class(data_feats, data_tags, data_class, tag_to_idx, class_to_idx, data_index, j, opt.test_batchSize, add_start_end=opt.bos_eos, multiClass=opt.multiClass, keep_order=opt.testing, enc_dec_focus=False, device=opt.device) inputs = prepare_inputs_for_bert(words, lens) if opt.task_st == 'NN': tag_scores, class_scores = model_tag_and_class(inputs, lens) tag_loss = tag_loss_function(tag_scores.contiguous().view(-1, len(tag_to_idx)), tags.view(-1)) top_pred_slots = tag_scores.data.cpu().numpy().argmax(axis=-1) else: max_len = max(lens) masks = [([1] * l) + ([0] * (max_len - l)) for l in lens] masks = torch.tensor(masks, dtype=torch.uint8, device=opt.device) crf_feats, class_scores = model_tag_and_class(inputs, lens) tag_path_scores, tag_path = model_tag_and_class.crf_viterbi_decode(crf_feats, masks) tag_loss = model_tag_and_class.crf_neg_log_likelihood(crf_feats, masks, tags) top_pred_slots = tag_path.data.cpu().numpy() #tags = tags.data.cpu().numpy() if opt.task_sc: class_loss = class_loss_function(class_scores, classes) if opt.multiClass: snt_probs = class_scores.data.cpu().numpy() else: snt_probs = class_scores.data.cpu().numpy().argmax(axis=-1) losses.append([tag_loss.item()/sum(lens), class_loss.item()/len(lens)]) else: losses.append([tag_loss.item()/sum(lens), 0]) #classes = classes.data.cpu().numpy() for idx, pred_line in enumerate(top_pred_slots): length = lens[idx] pred_seq = [idx_to_tag[tag] for tag in pred_line][:length] lab_seq = [idx_to_tag[tag] if type(tag) == int else tag for tag in raw_tags[idx]] pred_chunks = acc.get_chunks(['O']+pred_seq+['O']) label_chunks = acc.get_chunks(['O']+lab_seq+['O']) for pred_chunk in pred_chunks: if pred_chunk in label_chunks: TP += 1 else: FP += 1 for label_chunk in label_chunks: if label_chunk not in pred_chunks: FN += 1 input_line = words[idx] word_tag_line = [input_line[_idx]+':'+lab_seq[_idx]+':'+pred_seq[_idx] for _idx in range(len(input_line))] if opt.task_sc: if opt.multiClass: pred_classes = [idx_to_class[i] for i,p in enumerate(snt_probs[idx]) if p > 0.5] gold_classes = [idx_to_class[i] for i in raw_classes[idx]] for pred_class in pred_classes: if pred_class in gold_classes: TP2 += 1 else: FP2 += 1 for gold_class in gold_classes: if gold_class not in pred_classes: FN2 += 1 gold_class_str = ';'.join(gold_classes) pred_class_str = ';'.join(pred_classes) else: pred_class = idx_to_class[snt_probs[idx]] if type(raw_classes[idx]) == int: gold_classes = {idx_to_class[raw_classes[idx]]} else: gold_classes = set(raw_classes[idx]) if pred_class in gold_classes: TP2 += 1 else: FP2 += 1 FN2 += 1 gold_class_str = ';'.join(list(gold_classes)) pred_class_str = pred_class else: gold_class_str = '' pred_class_str = '' if opt.testing: f.write(str(line_nums[idx])+' : '+' '.join(word_tag_line)+' <=> '+gold_class_str+' <=> '+pred_class_str+'\n') else: f.write(' '.join(word_tag_line)+' <=> '+gold_class_str+' <=> '+pred_class_str+'\n') if TP == 0: p, r, f = 0, 0, 0 else: p, r, f = 100*TP/(TP+FP), 100*TP/(TP+FN), 100*2*TP/(2*TP+FN+FP) mean_losses = np.mean(losses, axis=0) return mean_losses, p, r, f, 0 if 2*TP2+FN2+FP2 == 0 else 100*2*TP2/(2*TP2+FN2+FP2)