def decode(pred_path):
    """Score a prediction file against the validation slot and intent tags.

    Predictions are loaded from ``pred_path`` with
    ``data_reader.read_seqtag_data`` and compared, line by line, with the
    module-level ``valid_slot_tags`` and ``valid_intent_tags``.

    Slots are scored on the chunks returned by ``acc.get_chunks``; intents
    are scored per label on a multi-hot encoding of each line.

    Returns:
        ``((p_1, r_1, f_1), (p_2, r_2, f_2))``: precision, recall and F1 as
        percentages, slots first and intents second. A triple is ``0, 0, 0``
        when its true-positive count is zero.
    """
    pred_slot_tags, pred_intent_tags = data_reader.read_seqtag_data(
        pred_path, slot_tag_to_idx, intent_tag_to_idx)

    # ---- slot chunks ----
    slot_tp, slot_fp, slot_fn = 0.0, 0.0, 0.0
    for line_no, predicted in enumerate(pred_slot_tags['data']):
        pred_names = [idx_to_slot_tag[tag] for tag in predicted]
        gold_names = [
            idx_to_slot_tag[tag] for tag in valid_slot_tags['data'][line_no]
        ]
        # Pad with 'O' on both sides before chunk extraction.
        pred_chunks = acc.get_chunks(['O'] + pred_names + ['O'])
        gold_chunks = acc.get_chunks(['O'] + gold_names + ['O'])
        for chunk in pred_chunks:
            if chunk in gold_chunks:
                slot_tp += 1
            else:
                slot_fp += 1
        slot_fn += sum(1 for chunk in gold_chunks if chunk not in pred_chunks)

    # ---- intent labels ----
    intent_tp, intent_fp, intent_fn = 0.0, 0.0, 0.0
    n_intents = len(intent_tag_to_idx)
    for line_no, predicted in enumerate(pred_intent_tags['data']):
        pred_flags = [0] * n_intents
        gold_flags = [0] * n_intents
        for tag in predicted:
            pred_flags[tag] = 1
        for tag in valid_intent_tags['data'][line_no]:
            gold_flags[tag] = 1
        for pred_flag, gold_flag in zip(pred_flags, gold_flags):
            if pred_flag and gold_flag:
                intent_tp += 1
            elif pred_flag:
                intent_fp += 1
            elif gold_flag:
                intent_fn += 1

    if slot_tp == 0:
        slot_prf = (0, 0, 0)
    else:
        slot_prf = (
            100 * slot_tp / (slot_tp + slot_fp),
            100 * slot_tp / (slot_tp + slot_fn),
            100 * 2 * slot_tp / (2 * slot_tp + slot_fn + slot_fp),
        )

    if intent_tp == 0:
        intent_prf = (0, 0, 0)
    else:
        intent_prf = (
            100 * intent_tp / (intent_tp + intent_fp),
            100 * intent_tp / (intent_tp + intent_fn),
            100 * 2 * intent_tp / (2 * intent_tp + intent_fn + intent_fp),
        )

    return slot_prf, intent_prf
def decode(data_feats, data_tags, data_class, output_path):
    """Decode a dataset with the tagger and write per-sentence results.

    The data is processed in batches of ``opt.test_batchSize``. Each batch
    is converted with ``prepare_inputs_for_bert_xlnet`` and decoded by the
    module-level ``model_tag`` (greedy / beam search when ``opt.enc_dec``,
    CRF when ``opt.crf``, plain argmax otherwise). When ``opt.task_sc`` is
    set, ``model_class`` additionally predicts sentence classes.

    One line per sentence is written to ``output_path``::

        word:gold_tag:pred_tag ... <=> gold_classes <=> pred_classes

    prefixed with ``<line_num> : `` when ``opt.testing`` is set.

    Side effect: when ``opt.enc_dec`` is set, ``opt.greed_decoding`` is
    forced to ``True`` on the shared ``opt`` object.

    Returns:
        ``(mean_losses, p, r, f, class_f)``: the column-wise mean of the
        per-batch ``[tag_loss / sum(lens), class_loss / len(lens)]`` pairs,
        chunk-level slot precision / recall / F1 in percent, and the
        sentence-class F1 in percent (``0`` when nothing was counted).
    """
    data_index = np.arange(len(data_feats))
    losses = []
    slot_tp, slot_fp, slot_fn = 0.0, 0.0, 0.0
    cls_tp, cls_fp, cls_fn = 0.0, 0.0, 0.0

    with open(output_path, 'w') as out_file:
        for start in range(0, len(data_index), opt.test_batchSize):
            batch = data_reader.get_minibatch_with_class(
                data_feats, data_tags, data_class, tag_to_idx, class_to_idx,
                data_index, start, opt.test_batchSize,
                add_start_end=opt.bos_eos,
                multiClass=opt.multiClass,
                keep_order=opt.testing,
                enc_dec_focus=opt.enc_dec,
                device=opt.device)
            # The reader returns an extra line-number field in testing mode.
            if opt.testing:
                (words, tags, raw_tags, classes, raw_classes, lens,
                 line_nums) = batch
            else:
                words, tags, raw_tags, classes, raw_classes, lens = batch

            is_xlnet = bool(opt.pretrained_model_type in ['xlnet'])
            inputs = prepare_inputs_for_bert_xlnet(
                words, lens, tokenizer,
                cls_token_at_end=is_xlnet,  # xlnet has a cls token at the end
                cls_token=tokenizer.cls_token,
                sep_token=tokenizer.sep_token,
                cls_token_segment_id=2 if is_xlnet else 0,
                pad_on_left=is_xlnet,  # pad on the left for xlnet
                pad_token_segment_id=4 if is_xlnet else 0,
                device=opt.device)

            if opt.enc_dec:
                # Greedy decoding is forced here, so the beam-search branch
                # below only runs if this assignment is edited.
                opt.greed_decoding = True
                if opt.greed_decoding:
                    step_scores, best_tags, encoder_info = \
                        model_tag.decode_greed(
                            inputs, tags[:, 0:1], lens,
                            with_snt_classifier=True)
                    tag_loss = tag_loss_function(
                        step_scores.contiguous().view(-1, len(tag_to_idx)),
                        tags[:, 1:].contiguous().view(-1))
                    top_pred_slots = best_tags.cpu().numpy()
                else:
                    beam_size = 2
                    beam_scores, beam_paths, encoder_info = \
                        model_tag.decode_beam_search(
                            inputs, lens, beam_size, tag_to_idx,
                            with_snt_classifier=True)
                    top_pred_slots = [[step[0].item() for step in path]
                                      for path in beam_paths]
                    ppl = beam_scores.cpu() / torch.tensor(
                        lens, dtype=torch.float)
                    tag_loss = ppl.exp().sum()
            elif opt.crf:
                longest = max(lens)
                masks = torch.tensor(
                    [([1] * n) + ([0] * (longest - n)) for n in lens],
                    dtype=torch.uint8, device=opt.device)
                crf_feats, encoder_info = model_tag._get_lstm_features(
                    inputs, lens, with_snt_classifier=True)
                tag_path_scores, tag_path = model_tag.forward(
                    crf_feats, masks)
                tag_loss = model_tag.neg_log_likelihood(
                    crf_feats, masks, tags)
                top_pred_slots = tag_path.data.cpu().numpy()
            else:
                tag_scores, encoder_info = model_tag(
                    inputs, lens, with_snt_classifier=True)
                tag_loss = tag_loss_function(
                    tag_scores.contiguous().view(-1, len(tag_to_idx)),
                    tags.view(-1))
                top_pred_slots = tag_scores.data.cpu().numpy().argmax(axis=-1)

            if opt.task_sc:
                class_scores = model_class(encoder_info_filter(encoder_info))
                class_loss = class_loss_function(class_scores, classes)
                snt_probs = class_scores.data.cpu().numpy()
                if not opt.multiClass:
                    snt_probs = snt_probs.argmax(axis=-1)
                losses.append([
                    tag_loss.item() / sum(lens),
                    class_loss.item() / len(lens),
                ])
            else:
                losses.append([tag_loss.item() / sum(lens), 0])

            for idx, pred_line in enumerate(top_pred_slots):
                length = lens[idx]
                pred_seq = [idx_to_tag[tag] for tag in pred_line][:length]
                # Gold tags may arrive as indices or as tag strings.
                lab_seq = [
                    idx_to_tag[tag] if type(tag) is int else tag
                    for tag in raw_tags[idx]
                ]

                pred_chunks = acc.get_chunks(['O'] + pred_seq + ['O'])
                label_chunks = acc.get_chunks(['O'] + lab_seq + ['O'])
                for chunk in pred_chunks:
                    if chunk in label_chunks:
                        slot_tp += 1
                    else:
                        slot_fp += 1
                slot_fn += sum(
                    1 for chunk in label_chunks if chunk not in pred_chunks)

                input_line = words[idx]
                word_tag_line = [
                    word + ':' + lab_seq[pos] + ':' + pred_seq[pos]
                    for pos, word in enumerate(input_line)
                ]

                if not opt.task_sc:
                    gold_class_str = ''
                    pred_class_str = ''
                elif opt.multiClass:
                    # A class is predicted when its score exceeds 0.5.
                    pred_classes = [
                        idx_to_class[i]
                        for i, p in enumerate(snt_probs[idx]) if p > 0.5
                    ]
                    gold_classes = [idx_to_class[i] for i in raw_classes[idx]]
                    for pred_class in pred_classes:
                        if pred_class in gold_classes:
                            cls_tp += 1
                        else:
                            cls_fp += 1
                    cls_fn += sum(
                        1 for gold_class in gold_classes
                        if gold_class not in pred_classes)
                    gold_class_str = ';'.join(gold_classes)
                    pred_class_str = ';'.join(pred_classes)
                else:
                    pred_class = idx_to_class[snt_probs[idx]]
                    gold_raw = raw_classes[idx]
                    if type(gold_raw) is int:
                        gold_classes = {idx_to_class[gold_raw]}
                    else:
                        gold_classes = set(gold_raw)
                    if pred_class in gold_classes:
                        cls_tp += 1
                    else:
                        # A wrong single prediction is both a miss and a
                        # false alarm.
                        cls_fp += 1
                        cls_fn += 1
                    gold_class_str = ';'.join(list(gold_classes))
                    pred_class_str = pred_class

                record = (' '.join(word_tag_line) + ' <=> ' + gold_class_str
                          + ' <=> ' + pred_class_str + '\n')
                if opt.testing:
                    record = str(line_nums[idx]) + ' : ' + record
                out_file.write(record)

    if slot_tp == 0:
        p, r, f1 = 0, 0, 0
    else:
        p = 100 * slot_tp / (slot_tp + slot_fp)
        r = 100 * slot_tp / (slot_tp + slot_fn)
        f1 = 100 * 2 * slot_tp / (2 * slot_tp + slot_fn + slot_fp)

    class_denominator = 2 * cls_tp + cls_fn + cls_fp
    if class_denominator == 0:
        class_f1 = 0
    else:
        class_f1 = 100 * 2 * cls_tp / class_denominator

    mean_losses = np.mean(losses, axis=0)
    return mean_losses, p, r, f1, class_f1
# Script fragment: builds 'submission_slot.csv' under result_root.
# (The loop header that supplies `item` to the first two tests is outside
# the visible source.)
#
# 1. Tags starting with 'B' / 'I' have their type (item[2:]) collected into
#    B_type_list / I_type_list.
# 2. type_list pairs every B type with a span length: 3 when the same type
#    also occurs in I_type_list, otherwise 1.
# 3. After writing the 'Id,Expected' header, candidate chunks (i, j, Type)
#    are enumerated for every line of test_slot_tags['data'] and compared
#    with the chunks acc.get_chunks extracts from the line's own tags; a
#    row '<line_id>,1' is written for a match and '<line_id>,0' otherwise.
#
# NOTE(review): the inner range uses max(i + slot_length, sentence_length + 1),
# so slot_length never shortens a span and j can run past the sentence end;
# min(...) looks like the intent -- confirm against the expected submission
# row count before changing.
# NOTE(review): the nesting of `line_id += 1` (per candidate vs. per line)
# cannot be determined from this collapsed line -- verify in the original.
if item[0] == 'B': B_type_list.append(item[2:]) if item[0] == 'I': I_type_list.append(item[2:]) type_list = [] for item in B_type_list: if item in I_type_list: type_list.append((item, 3)) else: type_list.append((item, 1)) out_slot_path = os.path.join(result_root, 'submission_slot.csv') line_id = 1 with open(out_slot_path, 'w') as f: f.write('Id,Expected\n') for line in test_slot_tags['data']: all_chunks = [] sentence_length = len(line) for Type, slot_length in type_list: for i in range(1, sentence_length + 1): for j in range(i, max(i + slot_length, sentence_length + 1)): all_chunks.append((i, j, Type)) lab_seq = [idx_to_slot_tag[slot] for slot in line] label_chunks = acc.get_chunks(['O'] + lab_seq + ['O']) for k in range(len(all_chunks)): if all_chunks[k] in label_chunks: f.write(str(line_id) + ',1\n') else: f.write(str(line_id) + ',0\n') line_id += 1
def decode(data_feats, data_slot_tags, output_path):
    """Decode a dataset with the slot tagger and write the tagged words.

    The data is processed in batches of ``opt.test_batchSize`` and decoded
    by the module-level ``model_tag`` (greedy / beam search when
    ``opt.enc_dec``, CRF when ``opt.crf``, plain argmax otherwise).

    One line per sentence, ``word:pred_tag ...``, is written to
    ``output_path``; it is prefixed with ``<line_num> : `` when
    ``opt.testing`` is set, in which case the raw words returned by the
    reader are used instead of ``idx_to_word`` lookups.

    Side effect: when ``opt.enc_dec`` is set, ``opt.greed_decoding`` is
    forced to ``True`` on the shared ``opt`` object.

    Returns:
        ``(mean_losses, p, r, f)``: the mean of the per-batch
        ``slot_tag_loss / sum(lens)`` values and the chunk-level slot
        precision / recall / F1 in percent (all ``0`` without true
        positives).
    """
    data_index = np.arange(len(data_feats))
    losses = []
    tp, fp, fn = 0.0, 0.0, 0.0

    with open(output_path, 'w') as out_file:
        for start in range(0, len(data_index), opt.test_batchSize):
            # Raw words and line numbers are only requested in testing mode.
            batch = data_reader.get_minibatch_with_unali_act(
                data_feats, data_slot_tags, word_to_idx, slot_tag_to_idx,
                data_index, start, opt.test_batchSize,
                add_start_end=opt.bos_eos,
                keep_order=opt.testing,
                raw_word=bool(opt.testing),
                enc_dec_focus=opt.enc_dec,
                device=opt.device)
            if opt.testing:
                inputs, slot_tags, lens, line_nums, raw_words = batch
            else:
                inputs, slot_tags, lens = batch

            if opt.enc_dec:
                # Greedy decoding is forced here, so the beam-search branch
                # below only runs if this assignment is edited.
                opt.greed_decoding = True
                if opt.greed_decoding:
                    step_scores, pred_slot_tag_1best, h_t_c_t = \
                        model_tag.decode_greed(
                            inputs, slot_tags[:, 0:1], lens)
                    slot_tag_loss = slot_tag_loss_function(
                        step_scores.contiguous().view(
                            -1, len(slot_tag_to_idx)),
                        slot_tags[:, 1:].contiguous().view(-1))
                    pred_slot_tag_1best = pred_slot_tag_1best.cpu().numpy()
                else:
                    beam_size = 2
                    beam_scores, beam_paths, _ = model_tag.decode_beam_search(
                        inputs, lens, beam_size, slot_tag_to_idx)
                    ppl = beam_scores.cpu() / torch.tensor(
                        lens, dtype=torch.float)
                    slot_tag_loss = ppl.exp().sum()
                    pred_slot_tag_1best = [[step[0].item() for step in path]
                                           for path in beam_paths]
                # Drop the first column of the gold tags before comparing.
                slot_tags = slot_tags[:, 1:].data.cpu().numpy()
            elif opt.crf:
                longest = max(lens)
                masks = torch.tensor(
                    [([1] * n) + ([0] * (longest - n)) for n in lens],
                    dtype=torch.uint8, device=opt.device)
                crf_feats, h_t_c_t = model_tag._get_lstm_features(
                    inputs, lens)
                slot_tag_path_scores, slot_tag_path = model_tag.forward(
                    crf_feats, masks)
                slot_tag_loss = model_tag.neg_log_likelihood(
                    crf_feats, masks, slot_tags)
                pred_slot_tag_1best = slot_tag_path.data.cpu().numpy()
                slot_tags = slot_tags.data.cpu().numpy()
            else:
                slot_tag_scores, h_t_c_t = model_tag(inputs, lens)
                slot_tag_loss = slot_tag_loss_function(
                    slot_tag_scores.contiguous().view(
                        -1, len(slot_tag_to_idx)),
                    slot_tags.view(-1))
                pred_slot_tag_1best = \
                    slot_tag_scores.data.cpu().numpy().argmax(axis=-1)
                slot_tags = slot_tags.data.cpu().numpy()

            losses.append(slot_tag_loss.item() / sum(lens))
            inputs = inputs.data.cpu().numpy()

            for idx, pred_line in enumerate(pred_slot_tag_1best):
                length = lens[idx]
                pred_seq = [idx_to_slot_tag[tag] for tag in pred_line[:length]]
                lab_seq = [
                    idx_to_slot_tag[tag] for tag in slot_tags[idx][:length]
                ]

                pred_chunks = acc.get_chunks(['O'] + pred_seq + ['O'])
                label_chunks = acc.get_chunks(['O'] + lab_seq + ['O'])
                for chunk in pred_chunks:
                    if chunk in label_chunks:
                        tp += 1
                    else:
                        fp += 1
                fn += sum(
                    1 for chunk in label_chunks if chunk not in pred_chunks)

                if opt.testing:
                    input_line = raw_words[idx]
                else:
                    input_line = [
                        idx_to_word[word] for word in inputs[idx]
                    ][:length]
                word_tag_line = [
                    word + ':' + pred_seq[pos]
                    for pos, word in enumerate(input_line)
                ]

                record = ' '.join(word_tag_line) + '\n'
                if opt.testing:
                    record = str(line_nums[idx]) + ' : ' + record
                out_file.write(record)

    if tp == 0:
        p, r, f1 = 0, 0, 0
    else:
        p = 100 * tp / (tp + fp)
        r = 100 * tp / (tp + fn)
        f1 = 100 * 2 * tp / (2 * tp + fn + fp)

    mean_losses = np.mean(losses, axis=0)
    return mean_losses, p, r, f1
def decode(sen_feats, data_feats, data_tags, data_class, output_path):
    """Decode a dataset that carries both word and sentence-level features.

    The data is processed in batches of ``opt.test_batchSize`` and decoded
    by the module-level ``model_tag`` (CRF when ``opt.crf``, plain argmax
    otherwise). When ``opt.task_sc`` is set, ``model_class`` additionally
    predicts sentence classes.

    One line per sentence is written to ``output_path``::

        word:gold_tag:pred_tag ... <=> gold_classes <=> pred_classes

    prefixed with ``<line_num> : `` when ``opt.testing`` is set.

    Args:
        sen_feats: sentence-level features, batched with
            ``data_reader.get_sen_minibatch``.
        data_feats: word-level features, batched together with
            ``data_tags`` and ``data_class`` by
            ``data_reader.get_minibatch_with_class``.
        data_tags: gold slot tags.
        data_class: gold sentence classes.
        output_path: file the per-sentence result lines are written to.

    Returns:
        ``(mean_losses, p, r, f, cp, cr, cf)``: the column-wise mean of the
        per-batch ``[tag_loss / sum(lens), class_loss / len(lens)]`` pairs,
        chunk-level slot precision / recall / F1 in percent, and
        sentence-class precision / recall / F1 in percent. Each triple is
        ``0, 0, 0`` when its true-positive count is zero.
    """
    data_index = np.arange(len(data_feats))
    losses = []
    slot_tp, slot_fp, slot_fn = 0.0, 0.0, 0.0
    cls_tp, cls_fp, cls_fn = 0.0, 0.0, 0.0

    with open(output_path, 'w') as out_file:
        for start in range(0, len(data_index), opt.test_batchSize):
            batch = data_reader.get_minibatch_with_class(
                data_feats, data_tags, data_class, word_to_idx, tag_to_idx,
                class_to_idx, data_index, start, opt.test_batchSize,
                add_start_end=False,
                multiClass=opt.multiClass,
                keep_order=opt.testing,
                enc_dec_focus=False,
                device=opt.device)
            # The reader returns an extra line-number field in testing mode.
            if opt.testing:
                (inputs, tags, raw_tags, classes, raw_classes, lens,
                 line_nums) = batch
            else:
                inputs, tags, raw_tags, classes, raw_classes, lens = batch

            # Fix: slice the sentence features with the same index array and
            # batch size as the word batch above. The original used the
            # global `train_data_index` in testing mode and `opt.batchSize`
            # in both modes, which can select different sentences than the
            # ones in `inputs`.
            input_sens = data_reader.get_sen_minibatch(
                sen_feats, data_index, start, opt.test_batchSize,
                device=opt.device)

            if opt.crf:
                longest = max(lens)
                masks = torch.tensor(
                    [([1] * n) + ([0] * (longest - n)) for n in lens],
                    dtype=torch.uint8, device=opt.device)
                crf_feats, encoder_info = model_tag._get_lstm_features(
                    input_sens, lens, with_snt_classifier=True)
                tag_path_scores, tag_path = model_tag.forward(
                    crf_feats, masks)
                tag_loss = model_tag.neg_log_likelihood(
                    crf_feats, masks, tags)
                top_pred_slots = tag_path.data.cpu().numpy()
            else:
                # NOTE(review): this branch feeds `inputs` while the CRF
                # branch feeds `input_sens` -- confirm which one the model
                # expects.
                tag_scores, encoder_info = model_tag(
                    inputs, lens, with_snt_classifier=True)
                tag_loss = tag_loss_function(
                    tag_scores.contiguous().view(-1, len(tag_to_idx)),
                    tags.view(-1))
                top_pred_slots = tag_scores.data.cpu().numpy().argmax(axis=-1)

            if opt.task_sc:
                class_scores = model_class(encoder_info_filter(encoder_info))
                class_loss = class_loss_function(class_scores, classes)
                snt_probs = class_scores.data.cpu().numpy()
                if not opt.multiClass:
                    snt_probs = snt_probs.argmax(axis=-1)
                losses.append([
                    tag_loss.item() / sum(lens),
                    class_loss.item() / len(lens),
                ])
            else:
                losses.append([tag_loss.item() / sum(lens), 0])

            inputs = inputs.data.cpu().numpy()

            for idx, pred_line in enumerate(top_pred_slots):
                length = lens[idx]
                pred_seq = [idx_to_tag[tag] for tag in pred_line][:length]
                # Gold tags may arrive as indices or as tag strings.
                lab_seq = [
                    idx_to_tag[tag] if type(tag) is int else tag
                    for tag in raw_tags[idx]
                ]

                pred_chunks = acc.get_chunks(['O'] + pred_seq + ['O'])
                label_chunks = acc.get_chunks(['O'] + lab_seq + ['O'])
                for chunk in pred_chunks:
                    if chunk in label_chunks:
                        slot_tp += 1
                    else:
                        slot_fp += 1
                slot_fn += sum(
                    1 for chunk in label_chunks if chunk not in pred_chunks)

                input_line = [
                    idx_to_word[word] for word in inputs[idx]
                ][:length]
                word_tag_line = [
                    word + ':' + lab_seq[pos] + ':' + pred_seq[pos]
                    for pos, word in enumerate(input_line)
                ]

                if not opt.task_sc:
                    gold_class_str = ''
                    pred_class_str = ''
                elif opt.multiClass:
                    # A class is predicted when its score exceeds 0.5.
                    pred_classes = [
                        idx_to_class[i]
                        for i, p in enumerate(snt_probs[idx]) if p > 0.5
                    ]
                    gold_classes = [idx_to_class[i] for i in raw_classes[idx]]
                    for pred_class in pred_classes:
                        if pred_class in gold_classes:
                            cls_tp += 1
                        else:
                            cls_fp += 1
                    cls_fn += sum(
                        1 for gold_class in gold_classes
                        if gold_class not in pred_classes)
                    gold_class_str = ';'.join(gold_classes)
                    pred_class_str = ';'.join(pred_classes)
                else:
                    pred_class = idx_to_class[snt_probs[idx]]
                    gold_raw = raw_classes[idx]
                    if type(gold_raw) is int:
                        gold_classes = {idx_to_class[gold_raw]}
                    else:
                        gold_classes = set(gold_raw)
                    if pred_class in gold_classes:
                        cls_tp += 1
                    else:
                        # A wrong single prediction is both a miss and a
                        # false alarm.
                        cls_fp += 1
                        cls_fn += 1
                    gold_class_str = ';'.join(list(gold_classes))
                    pred_class_str = pred_class

                record = (' '.join(word_tag_line) + ' <=> ' + gold_class_str
                          + ' <=> ' + pred_class_str + '\n')
                if opt.testing:
                    record = str(line_nums[idx]) + ' : ' + record
                out_file.write(record)

    if slot_tp == 0:
        p, r, f1 = 0, 0, 0
    else:
        p = 100 * slot_tp / (slot_tp + slot_fp)
        r = 100 * slot_tp / (slot_tp + slot_fn)
        f1 = 100 * 2 * slot_tp / (2 * slot_tp + slot_fn + slot_fp)

    if cls_tp == 0:
        cp, cr, cf = 0, 0, 0
    else:
        cp = 100 * cls_tp / (cls_tp + cls_fp)
        cr = 100 * cls_tp / (cls_tp + cls_fn)
        cf = 100 * 2 * cls_tp / (2 * cls_tp + cls_fn + cls_fp)

    mean_losses = np.mean(losses, axis=0)
    return mean_losses, p, r, f1, cp, cr, cf