def evaluate(data, model, name):
    if name == "train":
        instances = data.train_Ids
    elif name == "dev":
        instances = data.dev_Ids
    elif name == 'test':
        instances = data.test_Ids
    elif name == 'raw':
        instances = data.raw_Ids
    else:
        print("Error: wrong evaluate name,", name)
        exit(1)
    right_token = 0
    whole_token = 0
    pred_results = []
    gold_results = []
    ## set model in eval mode
    model.eval()
    batch_size = 1
    start_time = time.time()
    train_num = len(instances)
    total_batch = train_num // batch_size + 1
    gazes = []
    for batch_id in range(total_batch):
        with torch.no_grad():
            start = batch_id * batch_size
            end = (batch_id + 1) * batch_size
            if end > train_num:
                end = train_num
            instance = instances[start:end]
            if not instance:
                continue
            gaz_list, batch_word, batch_biword, batch_wordlen, batch_label, layer_gaz, gaz_count, gaz_chars, \
                gaz_mask, gazchar_mask, mask, batch_bert, bert_mask = batchify_with_label(
                    instance, data.HP_gpu, data.HP_num_layer, True)
            tag_seq, gaz_match = model(gaz_list, batch_word, batch_biword, batch_wordlen, layer_gaz, gaz_count,
                                       gaz_chars, gaz_mask, gazchar_mask, mask, batch_bert, bert_mask)
            gaz_list = [data.gaz_alphabet.get_instance(id) for batchlist in gaz_match if len(batchlist) > 0
                        for id in batchlist]
            gazes.append(gaz_list)
            pred_label, gold_label = recover_label(tag_seq, batch_label, mask, data.label_alphabet)
            pred_results += pred_label
            gold_results += gold_label
    decode_time = time.time() - start_time
    speed = len(instances) / decode_time
    acc, p, r, f = get_ner_fmeasure(gold_results, pred_results, data.tagScheme)
    return speed, acc, p, r, f, pred_results, gazes
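# Usage illustration only (not part of the snippet above): a hedged sketch of how an evaluate
# variant returning (speed, acc, p, r, f, pred_results, gazes) is typically driven from a
# training loop. `HP_iteration`, the checkpoint path, and the training step are assumptions.
best_dev_f = -1.0
for epoch in range(data.HP_iteration):          # HP_iteration is assumed to exist on `data`
    # ... one epoch of training would go here ...
    speed, acc, p, r, f, dev_preds, dev_gazes = evaluate(data, model, "dev")
    print("dev: %.1f sent/s, acc %.4f, p %.4f, r %.4f, f %.4f" % (speed, acc, p, r, f))
    if f > best_dev_f:                          # keep the checkpoint with the best dev F1
        best_dev_f = f
        torch.save(model.state_dict(), "best_model.ckpt")   # hypothetical path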
def evaluate(data, model, name, nbest=None, label_flag=True): ''' Evaluation of the model on a test data (or raw data wo labels) ''' if name == "train": instances = data.train_Ids elif name == "dev": instances = data.dev_Ids elif name == "test": instances = data.test_Ids elif name == "raw": instances = data.raw_Ids else: print("Error: wrong evaluate name,", name) exit(1) right_token = 0 whole_token = 0 nbest_pred_results = [] pred_scores = [] pred_results = [] gold_results = [] ## set model in eval mode model.eval() batch_size = data.batch_size start_time = time.time() train_num = len(instances) total_batch = train_num // batch_size + 1 for batch_id in range(total_batch): start = batch_id * batch_size end = (batch_id + 1) * batch_size if end > train_num: end = train_num instance = instances[start:end] if not instance: continue batch_word, batch_features, batch_wordlen, batch_wordrecover, batch_char, batch_charlen, batch_charrecover, batch_label, mask = batchify_with_label( instance, data.HP_gpu, True, label_flag=label_flag) # here in ncrfpp code, there is a nbest condition that I wont code tag_seq = model(batch_word, batch_features, batch_wordlen, batch_char, batch_charlen, batch_charrecover, mask) if label_flag: pred_label, gold_label = recover_label(tag_seq, batch_label, mask, data.label_alphabet, batch_wordrecover) gold_results += gold_label else: pred_label = recover_pred_label(tag_seq, mask, data.label_alphabet, batch_wordrecover) pred_results += pred_label decode_time = time.time() - start_time speed = len(instances) / decode_time if label_flag: acc, p, r, f = get_ner_fmeasure(gold_results, pred_results, data.tagScheme) else: acc, p, r, f = (0, 0, 0, 0) return speed, acc, p, r, f, pred_results, pred_scores
def evaluate(data, model, name): instances = [] if name == "train": instances = data.train_Ids elif name == "dev": instances = data.dev_Ids elif name == 'test': instances = data.test_Ids else: print("Error: wrong evaluate name,", name) pred_results = [] gold_results = [] # set model in eval model model.eval() batch_size = data.HP_batch_size start_time = time.time() train_num = len(instances) total_batch = train_num // batch_size + 1 for batch_id in range(total_batch): start = batch_id * batch_size end = (batch_id + 1) * batch_size if end > train_num: end = train_num instance = instances[start:end] if not instance: continue pred_label, gold_label = -1, -1 if data.model_name == 'WC-LSTM_model': gaz_list, reverse_gaz_list, batch_char, batch_bichar, batch_charlen, batch_charrecover, batch_label, mask = batchify_with_label_3( instance, data.HP_gpu, data.HP_num_layer) tag_seq = model(gaz_list, reverse_gaz_list, batch_char, batch_charlen, mask) pred_label, gold_label = recover_label(tag_seq, batch_label, mask, data.label_alphabet, batch_charrecover) elif data.model_name == 'CNN_model': gaz_list, batch_char, batch_bichar, batch_charlen, batch_label, layer_gaz, gaz_mask, mask = batchify_with_label_2( instance, data.HP_gpu, data.HP_num_layer, True) tag_seq = model(gaz_list, batch_char, batch_bichar, batch_charlen, layer_gaz, gaz_mask, mask) pred_label, gold_label = recover_label_2(tag_seq, batch_label, mask, data.label_alphabet) elif data.model_name == 'LSTM_model': gaz_list, batch_char, batch_bichar, batch_charlen, batch_charrecover, batch_label, mask = batchify_with_label( instance, data.HP_gpu, True) tag_seq = model(gaz_list, batch_char, batch_bichar, batch_charlen, mask) pred_label, gold_label = recover_label(tag_seq, batch_label, mask, data.label_alphabet, batch_charrecover) pred_results += pred_label gold_results += gold_label decode_time = time.time() - start_time speed = len(instances) / decode_time acc, p, r, f = get_ner_fmeasure(gold_results, pred_results, data.tagScheme) return speed, acc, p, r, f, pred_results
def evaluate(data, model, name, nbest=None): if name == "train": instances = data.train_Ids instances_text = data.train_texts elif name == "dev": instances = data.dev_Ids instances_text = data.dev_texts elif name == 'test': instances = data.test_Ids instances_text = data.test_texts elif name == 'raw': instances = data.raw_Ids instances_text = data.raw_texts else: print("Error: wrong evaluate name,", name) exit(1) right_token = 0 whole_token = 0 nbest_pred_results = [] pred_scores = [] pred_results = [] gold_results = [] ## set model in eval model model.eval() batch_size = data.HP_batch_size start_time = time.time() train_num = len(instances) total_batch = train_num//batch_size+1 for batch_id in range(total_batch): start = batch_id*batch_size end = (batch_id+1)*batch_size if end > train_num: end = train_num instance = instances[start:end] instance_text = instances_text[start:end] if not instance: continue batch_word, batch_features, batch_wordlen, batch_wordrecover, batch_char, batch_charlen, batch_charrecover, batch_label, mask, batch_elmo_char = batchify_with_label(instance, instance_text, data.HP_gpu, False, data.sentence_classification) if nbest and not data.sentence_classification: scores, nbest_tag_seq = model.decode_nbest(batch_word,batch_features, batch_wordlen, batch_char, batch_charlen, batch_charrecover, mask, nbest, batch_elmo_char) nbest_pred_result = recover_nbest_label(nbest_tag_seq, mask, data.label_alphabet, batch_wordrecover) nbest_pred_results += nbest_pred_result pred_scores += scores[batch_wordrecover].cpu().data.numpy().tolist() ## select the best sequence to evalurate tag_seq = nbest_tag_seq[:,:,0] else: tag_seq = model(batch_word, batch_features, batch_wordlen, batch_char, batch_charlen, batch_charrecover, mask, batch_elmo_char) # print("tag:",tag_seq) pred_label, gold_label = recover_label(tag_seq, batch_label, mask, data.label_alphabet, batch_wordrecover, data.sentence_classification) pred_results += pred_label gold_results += gold_label decode_time = time.time() - start_time speed = len(instances)/decode_time acc, p, r, f = get_ner_fmeasure(gold_results, pred_results, data.tagScheme) if nbest and not data.sentence_classification: return speed, acc, p, r, f, nbest_pred_results, pred_scores return speed, acc, p, r, f, pred_results, pred_scores
def evaluate(data, model, name, nbest=None): if name == "train": instances = data.train_Ids elif name == "dev": instances = data.dev_Ids elif name == 'test': instances = data.test_Ids elif name == 'raw': instances = data.raw_Ids else: print("Error: wrong evaluate name,", name) exit(1) right_token = 0 whole_token = 0 total_label_loss = 0 nbest_pred_results = [] pred_scores = [] pred_results = [] gold_results = [] ## set model in eval model model.eval() batch_size = data.HP_batch_size start_time = time.time() train_num = len(instances) total_batch = train_num // batch_size + 1 for batch_id in range(total_batch): start = batch_id * batch_size end = (batch_id + 1) * batch_size if end > train_num: end = train_num instance = instances[start:end] if not instance: continue batch_word, batch_features, batch_wordlen, batch_wordrecover, batch_char, batch_charlen, batch_charrecover, batch_label, mask, batch_type, mask_type, type_seq_lengths, word_seq_bert_tensor = batchify_with_label( instance, data.HP_gpu, data.label_alphabet_size, data.type_alphabet_size - 1) #) label_loss, tag_seq = model(batch_word, batch_features, batch_wordlen, batch_char, batch_charlen, batch_charrecover, batch_label, mask, batch_type, mask_type, type_seq_lengths, word_seq_bert_tensor) total_label_loss += label_loss.item() pred_label, gold_label = recover_label(tag_seq, batch_label, mask, data.label_alphabet, batch_wordrecover) pred_results += pred_label gold_results += gold_label decode_time = time.time() - start_time speed = len(instances) / decode_time acc, p, r, f = get_ner_fmeasure(gold_results, pred_results, data.tagScheme) if nbest: return speed, acc, p, r, f, nbest_pred_results, pred_scores return speed, acc, p, r, f, pred_results, pred_scores, total_label_loss / total_batch
def evaluate(data, model, name): instances = None if name == "train": instances = data.train_Ids elif name == "dev": instances = data.dev_Ids elif name == 'test': instances = data.test_Ids elif name == 'raw': instances = data.raw_Ids else: print("Error: wrong evaluate name,", name) exit(1) start_time = time.time() pred_results = [] gold_results = [] ## set model in eval model model.eval() start = 0 end = start + data.HP_batch_size eval_epochs = [] while end <= len(instances): eval_epochs.append((start, end)) start = end end = end + data.HP_batch_size if end > len(instances) > start: eval_epochs.append((start, len(instances))) for idx, (start, end) in enumerate(eval_epochs): instance = instances[start:end] batch_word, batch_word_len, word_perm_idx, batch_word_recover, batch_label, mask, input_label_seq_tensor = batchify_with_label( instance, data.HP_gpu, data) with torch.no_grad(): tag_seq = model.evaluate(batch_word, batch_word_len, mask, input_label_seq_tensor) pred_label, gold_label = recover_label(tag_seq, batch_label, mask, data, batch_word_recover, data.use_crf) pred_results.extend(pred_label) gold_results.extend(gold_label) decode_time = time.time() - start_time report = classification_report(gold_results, pred_results, target_names=data.label_alphabet.instances) f_value = f1_score(gold_results, pred_results, average="macro") acc = accuracy_score(gold_results, pred_results) ner_acc, ner_p, ner_r, ner_f = get_ner_fmeasure(gold_results, pred_results, data.label_alphabet, data.tagScheme, name='ner', need_save_matrix=name == 'test') speed = len(instances) / decode_time return speed, acc, report, f_value, ner_acc, ner_p, ner_r, ner_f
def train_crf():
    word2id, id2word = load_data(TOKEN_DATA)
    tag2id, id2tag = load_data(TAG_DATA)
    _, _, train_, x_train, y_train = generate_data(TRAIN_DATA, word2id, tag2id, max_len=hp.max_len)
    _, _, dev_seq_lens, x_dev, y_dev = generate_data(DEV_DATA, word2id, tag2id, max_len=hp.max_len)
    model_file = "logdir/model_crf"
    model = CRF()
    model.fit(x_train, y_train, template_file='model/module/templates.txt', model_file=model_file, max_iter=20)
    pre_seq = model.predict(x_dev, model_file=model_file)
    acc, p, r, f = get_ner_fmeasure(y_dev, pre_seq)
    print('acc:\t{}\tp:\t{}\tr:\t{}\tf:\t{}\n'.format(acc, p, r, f))
def evaluate(data, wordseq, model, name, nbest=None): if name == "train": instances = data.train_Ids elif name == "dev": instances = data.dev_Ids elif name == 'test': instances = data.test_Ids elif name == 'raw': instances = data.raw_Ids else: print "Error: wrong evaluate name,", name right_token = 0 whole_token = 0 nbest_pred_results = [] pred_scores = [] pred_results = [] gold_results = [] ## set model in eval model wordseq.eval() model.eval() batch_size = data.HP_batch_size start_time = time.time() train_num = len(instances) total_batch = train_num//batch_size+1 for batch_id in range(total_batch): start = batch_id*batch_size end = (batch_id+1)*batch_size if end > train_num: end = train_num instance = instances[start:end] if not instance: continue batch_word, batch_features, batch_wordlen, batch_wordrecover, batch_char, batch_charlen, batch_charrecover, batch_label, mask, _ = batchify_with_label(instance, data.HP_gpu, True) if nbest: hidden = wordseq.forward(batch_word, batch_features, batch_wordlen, batch_char, batch_charlen, batch_charrecover, None, None) scores, nbest_tag_seq = model.decode_nbest(hidden, mask, nbest) nbest_pred_result = recover_nbest_label(nbest_tag_seq, mask, data.label_alphabet, batch_wordrecover) nbest_pred_results += nbest_pred_result pred_scores += scores[batch_wordrecover].cpu().data.numpy().tolist() ## select the best sequence to evalurate tag_seq = nbest_tag_seq[:,:,0] else: hidden = wordseq.forward(batch_word, batch_features, batch_wordlen, batch_char, batch_charlen,batch_charrecover, None, None) tag_seq = model(hidden, mask) # print "tag:",tag_seq pred_label, gold_label = recover_label(tag_seq, batch_label, mask, data.label_alphabet, batch_wordrecover) pred_results += pred_label gold_results += gold_label decode_time = time.time() - start_time speed = len(instances)/decode_time acc, p, r, f = get_ner_fmeasure(gold_results, pred_results, data.tagScheme) if nbest: return speed, acc, p, r, f, nbest_pred_results, pred_scores return speed, acc, p, r, f, pred_results, pred_scores
def train_hmm():
    word2id, id2word = load_data(TOKEN_DATA)
    tag2id, id2tag = load_data(TAG_DATA)
    _, _, train_, x_train, y_train = generate_data(TRAIN_DATA, word2id, tag2id, max_len=hp.max_len)
    _, _, dev_seq_lens, x_dev, y_dev = generate_data(DEV_DATA, word2id, tag2id, max_len=hp.max_len)
    model_file = "logdir/model_hmm"
    model = HMM()
    model.fit(x_train, y_train, model_file=model_file)
    pre_seq = model.predict(x_dev, model_file=model_file)
    acc, p, r, f = get_ner_fmeasure(y_dev, pre_seq)
    print('acc:\t{}\tp:\t{}\tr:\t{}\tf:\t{}\n'.format(acc, p, r, f))
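# Every snippet in this section scores its predictions with get_ner_fmeasure(gold, pred, ...).
# As a point of reference only, below is a minimal, self-contained sketch of the kind of
# span-level metric such a function computes for BIO-tagged sequences (token accuracy plus
# entity precision/recall/F1). It is an illustration under that assumption, not the actual
# get_ner_fmeasure used above, which also handles BMES-style tag schemes.
def bio_spans(labels):
    """Collect (start, end, type) spans from one BIO label sequence (illustrative helper)."""
    spans, start, etype = [], None, None
    for i, lab in enumerate(labels + ["O"]):          # sentinel flushes the last open span
        if lab.startswith("B-") or lab == "O" or (lab.startswith("I-") and lab[2:] != etype):
            if start is not None:
                spans.append((start, i, etype))
                start, etype = None, None
        if lab.startswith("B-"):
            start, etype = i, lab[2:]
        elif lab.startswith("I-") and start is None:  # tolerate an I- tag that opens a span
            start, etype = i, lab[2:]
    return spans

def span_f1(gold_results, pred_results):
    """Token accuracy plus span-level precision/recall/F1 over BIO sequences (sketch only)."""
    correct_tokens = total_tokens = 0
    gold_spans = pred_spans = matched = 0
    for gold, pred in zip(gold_results, pred_results):
        correct_tokens += sum(g == p for g, p in zip(gold, pred))
        total_tokens += len(gold)
        g, p = set(bio_spans(gold)), set(bio_spans(pred))
        gold_spans += len(g)
        pred_spans += len(p)
        matched += len(g & p)
    acc = correct_tokens / total_tokens if total_tokens else 0.0
    precision = matched / pred_spans if pred_spans else 0.0
    recall = matched / gold_spans if gold_spans else 0.0
    f = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return acc, precision, recall, f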
def evaluate(data, model, name, nbest=None): if name == "train": instances = data.train_Ids elif name == "dev": instances = data.dev_Ids elif name == 'test': instances = data.test_Ids elif name == 'raw': instances = data.raw_Ids else: print("Error: wrong evaluate name,", name) exit(1) right_token = 0 whole_token = 0 nbest_pred_results = [] pred_scores = [] pred_results = [] gold_results = [] ## set model in eval model model.eval() batch_size = data.HP_batch_size start_time = time.time() train_num = len(instances) total_batch = train_num//batch_size+1 for batch_id in range(total_batch): start = batch_id*batch_size end = (batch_id+1)*batch_size if end > train_num: end = train_num instance = instances[start:end] if not instance: continue batch_word, batch_features, batch_wordlen, batch_wordrecover, batch_char, batch_charlen, batch_charrecover, batch_label, mask = batchify_with_label(instance, data.HP_gpu, True) if nbest: scores, nbest_tag_seq = model.decode_nbest(batch_word,batch_features, batch_wordlen, batch_char, batch_charlen, batch_charrecover, mask, nbest) nbest_pred_result = recover_nbest_label(nbest_tag_seq, mask, data.label_alphabet, batch_wordrecover) nbest_pred_results += nbest_pred_result pred_scores += scores[batch_wordrecover].cpu().data.numpy().tolist() ## select the best sequence to evalurate tag_seq = nbest_tag_seq[:,:,0] else: tag_seq = model(batch_word, batch_features, batch_wordlen, batch_char, batch_charlen, batch_charrecover, mask) # print("tag:",tag_seq) pred_label, gold_label = recover_label(tag_seq, batch_label, mask, data.label_alphabet, batch_wordrecover) pred_results += pred_label gold_results += gold_label decode_time = time.time() - start_time speed = len(instances)/decode_time acc, p, r, f = get_ner_fmeasure(gold_results, pred_results, data.tagScheme) if nbest: return speed, acc, p, r, f, nbest_pred_results, pred_scores return speed, acc, p, r, f, pred_results, pred_scores
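# Hedged usage sketch for the nbest branch of the variant above; assumes `data`/`model` follow
# the NCRF++-style setup it uses. The [sentence][rank] indexing of nbest_pred_results and
# pred_scores matches how those structures are indexed elsewhere in this section.
speed, acc, p, r, f, nbest_pred_results, pred_scores = evaluate(data, model, "test", nbest=5)
for rank, (labels, score) in enumerate(zip(nbest_pred_results[0], pred_scores[0])):
    print("rank %d  score %.6f  %s" % (rank, score, " ".join(labels)))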
def evaluate(data, model, name, external_pos={}): if name == "train": instances = data.train_Ids instance_texts = data.train_texts elif name == "dev": instances = data.dev_Ids instance_texts = data.dev_texts elif name == 'test': instances = data.test_Ids instance_texts = data.test_texts elif name == 'raw': instances = data.raw_Ids instance_texts = data.raw_texts else: print("Error: wrong evaluate name,", name) right_token = 0 whole_token = 0 pred_results = [] gold_results = [] # set model in eval model model.eval() batch_size = 64 start_time = time.time() train_num = len(instances) total_batch = train_num // batch_size + 1 for batch_id in range(total_batch): start = batch_id * batch_size end = (batch_id + 1) * batch_size if end > train_num: end = train_num instance = instances[start:end] instance_text = [sent[0] for sent in instance_texts[start:end]] if not instance: continue batch_word, batch_biword, batch_wordlen, batch_wordrecover, batch_label, mask, rearrange_instance_texts, batch_pos = \ batchify_with_label(instance_text, instance, data.HP_gpu) with torch.no_grad(): tag_seq = model.forward(rearrange_instance_texts, batch_word, batch_biword, batch_wordlen, mask, batch_pos, external_pos) pred_label, gold_label = recover_label(tag_seq, batch_label, mask, data.label_alphabet, batch_wordrecover) pred_results += pred_label gold_results += gold_label decode_time = time.time() - start_time speed = len(instances) / decode_time acc, p, r, f = get_ner_fmeasure(gold_results, pred_results) model.train() return speed, acc, p, r, f, pred_results
def evaluate(data, model, name, is_ner): if name == "train": instances = data.train_Ids elif name == "dev": instances = data.dev_Ids elif name == 'test': instances = data.test_Ids elif name == "train_ner": instances = data.train_Ids elif name == "dev_ner": instances = data.dev_Ids elif name == 'test_ner': instances = data.test_Ids elif name == 'raw': instances = data.raw_Ids else: print("Error: wrong evaluate name,", name) right_token = 0 whole_token = 0 pred_results = [] gold_results = [] ## set model in eval model model.eval() batch_size = 1 start_time = time.time() train_num = len(instances) total_batch = train_num // batch_size + 1 for batch_id in range(total_batch): start = batch_id * batch_size end = (batch_id + 1) * batch_size if end > train_num: end = train_num instance = instances[start:end] if not instance: continue gaz_list, batch_word, batch_biword, batch_wordlen, batch_wordrecover, batch_char, batch_charlen, batch_charrecover, batch_label, mask = batchify_with_label( instance, data.HP_gpu, True) tag_seq = model(is_ner, gaz_list, batch_word, batch_biword, batch_wordlen, batch_char, batch_charlen, batch_charrecover, mask) # print( "tag:",tag_seq) pred_label, gold_label = recover_label(tag_seq, batch_label, mask, data.label_alphabet, batch_wordrecover) pred_results += pred_label gold_results += gold_label decode_time = time.time() - start_time speed = len(instances) / decode_time acc, p, r, f = get_ner_fmeasure(gold_results, pred_results, data.tagScheme) return speed, acc, p, r, f, pred_results
def evaluate(data, args, model, name): if name == "train": instances = data.train_Ids elif name == "dev": instances = data.dev_Ids elif name == 'test': instances = data.test_Ids elif name == 'raw': instances = data.raw_Ids else: print("Error: wrong evaluate name,", name) exit(0) pred_results = [] gold_results = [] # set model in eval model model.eval() batch_size = args.batch_size start_time = time.time() train_num = len(instances) total_batch = train_num // batch_size + 1 for batch_id in range(total_batch): start = batch_id * batch_size end = (batch_id + 1) * batch_size if end > train_num: end = train_num instance = instances[start:end] if not instance: continue word_list, batch_char, batch_label, mask = batchify_with_label( instance, args.use_gpu) _, tag_seq = model(word_list, batch_char, mask) pred_label, gold_label = recover_label(tag_seq, batch_label, mask, data.label_alphabet) pred_results += pred_label gold_results += gold_label decode_time = time.time() - start_time speed = len(instances) / decode_time acc, p, r, f = get_ner_fmeasure(gold_results, pred_results) return speed, acc, p, r, f, pred_results
def train(network='rnn'): word2id, id2word = load_data(TOKEN_DATA) tag2id, id2tag = load_data(TAG_DATA) x_train, y_train, seq_lens, _, _ = generate_data(TRAIN_DATA, word2id, tag2id, max_len=hp.max_len) x_dev, y_dev, dev_seq_lens, _, source_tag = generate_data(DEV_DATA, word2id, tag2id, max_len=hp.max_len) vocab_size = len(word2id) num_tags = len(tag2id) if network == "transformer": model = TransformerCRFModel(vocab_size, num_tags, is_training=True) elif network == 'rnn': model = BiRnnCRF(vocab_size, num_tags) elif network == 'cnn': model = CnnCRF(vocab_size, num_tags) elif network == 'match-pyramid': model = CnnCRF(vocab_size, num_tags) else: return sv = tf.train.Supervisor(graph=model.graph, logdir=logdir, save_model_secs=0) with sv.managed_session() as sess: for epoch in range(1, hp.num_epochs + 1): if sv.should_stop(): break train_loss = [] for x_batch, y_batch, len_batch in batch_data(x_train, y_train, seq_lens, hp.batch_size): feed_dict = {model.x: x_batch, model.y: y_batch, model.seq_lens: len_batch} loss, _ = sess.run([model.loss, model.train_op], feed_dict=feed_dict) train_loss.append(loss) dev_loss = [] predict_lists = [] for x_batch, y_batch, len_batch in batch_data(x_dev, y_dev, dev_seq_lens, hp.batch_size): feed_dict = {model.x: x_batch, model.y: y_batch, model.seq_lens: len_batch} loss, logits = sess.run([model.loss, model.logits], feed_dict) dev_loss.append(loss) transition = model.transition.eval(session=sess) pre_seq = model.predict(logits, transition, len_batch) pre_label = recover_label(pre_seq, len_batch, id2tag) predict_lists.extend(pre_label) train_loss_v = np.round(float(np.mean(train_loss)), 4) dev_loss_v = np.round(float(np.mean(dev_loss)), 4) print('****************************************************') acc, p, r, f = get_ner_fmeasure(source_tag, predict_lists) print('epoch:\t{}\ttrain loss:\t{}\tdev loss:\t{}'.format(epoch, train_loss_v, dev_loss_v)) print('acc:\t{}\tp:\t{}\tr:\t{}\tf:\t{}'.format(acc, p, r, f)) print('****************************************************\n\n')
def evaluate(data, model, args, name): if name == "train": instances = data.train_ids texts = data.train_texts elif name == "dev": instances = data.dev_ids texts = data.dev_texts elif name == 'test': instances = data.test_ids texts = data.test_texts else: print("Error: wrong evaluate name,", name) pred_results = [] gold_results = [] model.eval() batch_size = args.batch_size start_time = time.time() train_num = len(instances) total_batch = train_num // batch_size + 1 o_label = data.label_alphabet.get_index("O") for batch_id in range(total_batch): start = batch_id * batch_size end = (batch_id + 1) * batch_size if end > train_num: end = train_num instance = instances[start:end] text = texts[start:end] if not instance: continue input_ids, attention_mask, label_seq_tensor, loss_mask, crf_mask, scope = batchify( instance, args, o_label) tag_seq = model(input_ids, attention_mask, crf_mask, scope) pred_label, gold_label = recover_label(tag_seq, label_seq_tensor, attention_mask, data.label_alphabet) pred_results += pred_label gold_results += gold_label decode_time = time.time() - start_time speed = len(instances) / decode_time acc, p, r, f = get_ner_fmeasure(gold_results, pred_results, data.tagscheme) return speed, acc, p, r, f, pred_results
def evaluate(data, model, name, gpu):
    if name == "dev":
        instances = data.dev_Ids
    elif name == "test":
        instances = data.test_Ids
    else:
        print("Error: wrong evaluate name,", name)
        exit(1)
    right_token = 0
    whole_token = 0
    pred_results = []
    gold_results = []
    ## set model in eval mode
    model.eval()
    for words, chars, label in instances:
        label = autograd.Variable(torch.LongTensor(label))
        pred_score, tag_seq = model([words, chars], gpu)
        pred_label, gold_label = recover_label(tag_seq, label, data.label_alphabet)
        pred_results.append(pred_label)
        gold_results.append(gold_label)
    acc, p, r, f = get_ner_fmeasure(gold_results, pred_results, data.tagScheme)
    return acc, p, r, f
def evaluate(data, model, name): if name == "train": instances = data.train_Ids elif name == "dev": instances = data.dev_Ids elif name == 'test': instances = data.test_Ids elif name == 'raw': instances = data.raw_Ids else: print ("Error: wrong evaluate name,", name) pred_results = [] gold_results = [] ## set model in eval model model.eval() batch_size = 10 start_time = time.time() train_num = len(instances) total_batch = train_num//batch_size+1 for batch_id in range(total_batch): start = batch_id*batch_size end = (batch_id+1)*batch_size if end >train_num: end = train_num instance = instances[start:end] if not instance: continue gaz_list,batch_word, batch_biword, batch_wordlen, batch_wordrecover, batch_char, batch_charlen, batch_charrecover, batch_label, mask = batchify_with_label(instance, data.HP_gpu, True) tag_seq = model(gaz_list,batch_word, batch_biword, batch_wordlen, batch_char, batch_charlen, batch_charrecover, mask) # print "tag:",tag_seq pred_label, gold_label = recover_label(tag_seq, batch_label, mask, data.label_alphabet, batch_wordrecover) pred_results += pred_label gold_results += gold_label decode_time = time.time() - start_time speed = len(instances)/decode_time acc, p, r, f = get_ner_fmeasure(gold_results, pred_results, data.tagScheme) return speed, acc, p, r, f, pred_results
def evaluate(dataUniqueId, dataIds, dataMask, dataVectors, dataLabels, dataKnowledgeVector, model,
             padding_label, labelRindex):
    ## evaluation function
    pred_results = []
    gold_results = []
    ## set model in eval mode
    model.eval()
    batch_size = 1
    start_time = time.time()
    eval_data = TensorDataset(dataUniqueId, dataIds, dataMask, dataVectors, dataLabels)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=batch_size)
    for uniqueIds, input_batch_list, mask, instanceVectors, input_batch_label in eval_dataloader:
        knowledgeExamples = [dataKnowledgeVector[int(idx)] for idx in uniqueIds]
        max_entity_num = max([len(xxx) for yyy in knowledgeExamples for xxx in yyy])
        batch_word, batch_knowledge, word_seq_tensor, batch_wordlen, batch_wordrecover, batch_label, mask, \
            knowledge_mask = batchify_with_label(instanceVectors, input_batch_list, input_batch_label,
                                                 knowledgeExamples, mask, GPU, padding_label, max_entity_num)
        tag_seq = model.forward(batch_word, batch_knowledge, mask, knowledge_mask, batch_label, batch_wordlen,
                                dynanmic_meta_embedding)
        pred_label, gold_label = recover_label(tag_seq, batch_label, mask, batch_wordrecover, labelRindex)
        pred_results += pred_label
        gold_results += gold_label
    decode_time = time.time() - start_time
    speed = len(dataIds) / decode_time
    fmeasure, acc = get_ner_fmeasure(gold_results, pred_results)
    return speed, fmeasure, acc
def evaluate(data, model, name, nbest=None): if name == "train": instances_1 = data.source_train_idx instances_2 = data.target_train_idx elif name == "dev-test": instances_1 = data.source_dev_idx instances_2 = data.target_dev_idx elif name == 'test': instances_1 = data.source_test_idx instances_2 = data.target_test_idx else: print("Error: wrong evaluate name,", name) exit(1) ## set model in eval model model.eval() batch_size = data.HP_batch_size start_time = time.time() nbest_pred_results_1 = [] pred_scores_1 = [] pred_results_1 = [] gold_results_1 = [] train_num = len(instances_1) total_batch = train_num // batch_size + 1 for batch_id in range(total_batch): start = batch_id * batch_size end = (batch_id + 1) * batch_size if end > train_num: end = train_num instance = instances_1[start:end] if not instance: continue batch_word, batch_wordlen, batch_wordrecover, batch_char, batch_charlen, batch_charrecover, batch_label, \ lm_seq_tensor, mask = batchify_with_label(instance, data.HP_gpu, True) if nbest: scores, nbest_tag_seq = model.decode_nbest('model2', batch_word, batch_wordlen, batch_char, batch_charlen, batch_charrecover, mask, nbest) nbest_pred_result = recover_nbest_label(nbest_tag_seq, mask, data.source_label_alphabet, batch_wordrecover) nbest_pred_results_1 += nbest_pred_result pred_scores_1 += scores[batch_wordrecover].cpu().data.numpy().tolist() ## select the best sequence to evalurate tag_seq_1 = nbest_tag_seq[:, :, 0] else: tag_seq_1 = model('model2', batch_word, batch_wordlen, batch_char, batch_charlen, batch_charrecover, mask) # print("tag:",tag_seq) pred_label, gold_label = recover_label(tag_seq_1, batch_label, mask, data.source_label_alphabet, batch_wordrecover) pred_results_1 += pred_label gold_results_1 += gold_label # decode_time = time.time() - start_time # speed = len(instances)/decode_time acc_1, p_1, r_1, f_1 = get_ner_fmeasure(gold_results_1, pred_results_1, "BMES") nbest_pred_results_2 = [] pred_scores_2 = [] pred_results_2 = [] gold_results_2 = [] train_num = len(instances_2) total_batch = train_num // batch_size + 1 for batch_id in range(total_batch): start = batch_id * batch_size end = (batch_id + 1) * batch_size if end > train_num: end = train_num instance = instances_2[start:end] if not instance: continue batch_word, batch_wordlen, batch_wordrecover, batch_char, batch_charlen, batch_charrecover, batch_label,\ lm_seq_tensor, mask = batchify_with_label(instance, data.HP_gpu, True) if nbest: scores, nbest_tag_seq = model.decode_nbest('model4', batch_word, batch_wordlen, batch_char, batch_charlen, batch_charrecover, mask, nbest) nbest_pred_result = recover_nbest_label(nbest_tag_seq, mask, data.target_label_alphabet, batch_wordrecover) nbest_pred_results_2 += nbest_pred_result pred_scores_2 += scores[batch_wordrecover].cpu().data.numpy().tolist() ## select the best sequence to evalurate tag_seq_2 = nbest_tag_seq[:, :, 0] else: tag_seq_2 = model('model4', batch_word, batch_wordlen, batch_char, batch_charlen, batch_charrecover, mask) # print("tag:",tag_seq) pred_label, gold_label = recover_label(tag_seq_2, batch_label, mask, data.target_label_alphabet, batch_wordrecover) pred_results_2 += pred_label gold_results_2 += gold_label # decode_time = time.time() - start_time # speed = len(instances)/decode_time acc_2, p_2, r_2, f_2 = get_ner_fmeasure(gold_results_2, pred_results_2, "BMES") acc = [acc_1, acc_2] p = [p_1, p_2] r = [r_1, r_2] f = [f_1, f_2] pred_results = [pred_results_1, pred_results_2] pred_scores = [pred_scores_1, pred_scores_2] nbest_pred_results = 
[nbest_pred_results_1, nbest_pred_results_2] decode_time = time.time() - start_time speed = (len(instances_1) + len(instances_2)) / decode_time if nbest: return speed, acc, p, r, f, nbest_pred_results, pred_scores return speed, acc, p, r, f, pred_results, pred_scores
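# Hedged sketch of consuming the paired source/target metrics returned by the two-domain
# variant above; indices 0 and 1 correspond to the source and target lists it builds.
# `data` and `model` are assumed to match that cross-domain setup.
speed, acc, p, r, f, pred_results, pred_scores = evaluate(data, model, "dev-test")
for idx, side in enumerate(("source", "target")):
    print("%s dev: acc %.4f, p %.4f, r %.4f, f %.4f" % (side, acc[idx], p[idx], r[idx], f[idx]))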
def evaluate(data, model, name, inference, nbest=None):
    if name == "train":
        instances = data.train_Ids
    elif name == "dev":
        instances = data.dev_Ids
    elif name == "test":
        instances = data.test_Ids
    elif name == 'raw':
        instances = data.raw_Ids
    else:
        print("Error: wrong evaluate name,", name)
        exit(1)
    right_token = 0
    whole_token = 0
    nbest_pred_results = []
    pred_scores = []
    pred_results = []
    gold_results = []
    ## set model in eval mode
    model.eval()
    batch_size = 128  # len(instances)  # 128 for comparison against Vinyals et al. (2015)
    start_time = time.time()
    train_num = len(instances)
    total_batch = train_num // batch_size + 1
    pred_labels = {idtask: [] for idtask in range(data.HP_tasks)}
    gold_labels = {idtask: [] for idtask in range(data.HP_tasks)}
    nbest_pred_labels = {idtask: [] for idtask in range(data.HP_tasks)}
    nbest_pred_scores = {idtask: [] for idtask in range(data.HP_tasks)}
    if data.disjoint:
        treebank_indexes = {}
        for idxsample, sample in enumerate(instances):
            if sample[-1] not in treebank_indexes:
                treebank_indexes[sample[-1]] = []
            treebank_indexes[sample[-1]].append(idxsample)
    for batch_id in range(total_batch):
        start = batch_id * batch_size
        end = (batch_id + 1) * batch_size
        if end > train_num:
            end = train_num
        instance = instances[start:end]
        if not instance:
            continue
        batch_word, batch_features, batch_wordlen, batch_wordrecover, batch_char, batch_charlen, \
            batch_charrecover, batch_label, mask = batchify_with_label(instance, data.HP_gpu, inference, True)
        if nbest:
            # scores, nbest_tag_seq = model.decode_nbest(batch_word, batch_features, batch_wordlen, batch_char,
            #                                            batch_charlen, batch_charrecover, mask, nbest)
            scores, nbest_tag_seq = model.decode_nbest(batch_word, batch_features, batch_wordlen, batch_char,
                                                       batch_charlen, batch_charrecover, mask, inference, nbest)
            tag_seq = []
            for idtask, task_nbest_tag_seq in enumerate(nbest_tag_seq):
                nbest_pred_result = recover_nbest_label(task_nbest_tag_seq, mask, data.label_alphabet[idtask],
                                                        batch_wordrecover)
                nbest_pred_labels[idtask] += nbest_pred_result
                nbest_pred_scores[idtask] += scores[idtask][batch_wordrecover].cpu().data.numpy().tolist()
                tag_seq.append(task_nbest_tag_seq[:, :, 0])
        else:
            tag_seq = model(batch_word, batch_features, batch_wordlen, batch_char, batch_charlen,
                            batch_charrecover, mask, inference=inference)
        if not inference:
            for idtask, task_tag_seq in enumerate(tag_seq):
                pred_label, gold_label = recover_label(task_tag_seq, batch_label[idtask], mask,
                                                       data.label_alphabet[idtask], batch_wordrecover,
                                                       inference=inference)
                pred_labels[idtask] += pred_label
                gold_labels[idtask] += gold_label
        else:
            for idtask, task_tag_seq in enumerate(tag_seq):
                pred_label, _ = recover_label(task_tag_seq, None, mask, data.label_alphabet[idtask],
                                              batch_wordrecover, inference=inference)
                pred_labels[idtask] += pred_label
    decode_time = time.time() - start_time
    speed = len(instances) / decode_time
    # Evaluating the different tasks
    tasks_results = []
    range_tasks = data.HP_tasks if not inference else data.HP_main_tasks
    for idtask in range(range_tasks):
        valid_indexes = None
        if not inference:
            valid_gold_labels = [g for idx, g in enumerate(gold_labels[idtask])
                                 if not data.disjoint or idx in treebank_indexes[data.inv_dataset_ids[idtask]]]
            valid_pred_labels = [p for idx, p in enumerate(pred_labels[idtask])
                                 if not data.disjoint or idx in treebank_indexes[data.inv_dataset_ids[idtask]]]
            valid_indexes = [idx for idx, p in enumerate(pred_labels[idtask])
                             if not data.disjoint or idx in treebank_indexes[data.inv_dataset_ids[idtask]]]
            acc, p, r, f = get_ner_fmeasure(valid_gold_labels, valid_pred_labels, data.tagScheme)
        else:
            acc, p, r, f = -1, -1, -1, -1
        if nbest:
            raise NotImplementedError
        else:
            tasks_results.append((speed, acc, p, r, f, pred_labels[idtask], nbest_pred_scores[idtask],
                                  valid_indexes))
    return tasks_results
def evaluate(data, model, name, nbest=None): if name == "train": instances = data.train_Ids elif name == "dev": instances = data.dev_Ids elif name == 'test': instances = data.test_Ids elif name == 'raw': instances = data.raw_Ids else: print("Error: wrong evaluate name,", name) exit(1) right_token = 0 whole_token = 0 nbest_pred_results = [] pred_scores = [] pred_results = [] gold_results = [] ## set model in eval model model.eval() batch_size = data.HP_batch_size start_time = time.time() train_num = len(instances) total_batch = train_num // batch_size + 1 for batch_id in range(total_batch): start = batch_id * batch_size end = (batch_id + 1) * batch_size if end > train_num: end = train_num instance = instances[start:end] if not instance: continue batch_word, batch_features, batch_wordlen, batch_wordrecover, batch_char, batch_charlen, batch_charrecover, batch_label, mask = batchify_with_label( instance, data.HP_gpu, False, data.sentence_classification) if nbest and not data.sentence_classification: scores, nbest_tag_seq = model.decode_nbest( batch_word, batch_features, batch_wordlen, batch_char, batch_charlen, batch_charrecover, mask, nbest) nbest_pred_result = recover_nbest_label(nbest_tag_seq, mask, data.label_alphabet, batch_wordrecover) nbest_pred_results += nbest_pred_result pred_scores += scores[batch_wordrecover].cpu().data.numpy().tolist( ) ## select the best sequence to evalurate tag_seq = nbest_tag_seq[:, :, 0] else: tag_seq = model(batch_word, batch_features, batch_wordlen, batch_char, batch_charlen, batch_charrecover, mask) # print("tag:",tag_seq) pred_label, gold_label = recover_label(tag_seq, batch_label, mask, data.label_alphabet, batch_wordrecover, data.sentence_classification) pred_results += pred_label gold_results += gold_label decode_time = time.time() - start_time speed = len(instances) / decode_time internal_acc, internal_p, internal_r, internal_f = get_ner_fmeasure( gold_results, pred_results, data.tagScheme) # Do a second evaluation using seqeval acc, p, r, f = seqeval_score(gold_results, pred_results) if acc != internal_acc: print( f"Accuracies disagree: {acc} (seqeval), {internal_acc} (NCRFpp), delta {internal_acc - acc}" ) if p != internal_p: print( f"Precisions disagree: {p} (seqeval), {internal_p} (NCRFpp), delta {internal_p - p}" ) if r != internal_r: print( f"Recalls disagree: {r} (seqeval), {internal_r} (NCRFpp), delta {internal_r - r}" ) if f != internal_f: print( f"F1s disagree: {f} (seqeval), {internal_f} (NCRFpp), delta {internal_f - f}" ) if nbest and not data.sentence_classification: return speed, (acc, p, r, f), (internal_acc, internal_p, internal_r, internal_f), nbest_pred_results, pred_scores return speed, (acc, p, r, f), (internal_acc, internal_p, internal_r, internal_f), pred_results, pred_scores
def evaluate(data, model, name, nbest=None): if name == "train": instances = data.train_Ids elif name == "dev": instances = data.dev_Ids elif name == 'test': instances = data.test_Ids elif name == 'raw': instances = data.raw_Ids else: print("Error: wrong evaluate name,", name) exit(1) nbest_pred_results = [] pred_scores = [] nb_results = 3 pred_results = {i: [] for i in range(nb_results)} all_sorted_probs = {i: [] for i in range(nb_results)} gold_results = [] model.eval() batch_size = 128 start_time = time.time() train_num = len(instances) total_batch = train_num // batch_size + 1 for batch_id in range(total_batch): start = batch_id * batch_size end = (batch_id + 1) * batch_size if end > train_num: end = train_num instance = instances[start:end] if not instance: continue batch_word, batch_features, batch_wordlen, batch_wordrecover, batch_char, batch_charlen, batch_charrecover, batch_label, mask = batchify_with_label( instance, data.HP_gpu, True) if nbest: scores, nbest_tag_seq = model.decode_nbest( batch_word, batch_features, batch_wordlen, batch_char, batch_charlen, batch_charrecover, mask, nbest) nbest_pred_result = recover_nbest_label(nbest_tag_seq, mask, data.label_alphabet, batch_wordrecover) nbest_pred_results += nbest_pred_result pred_scores += scores[batch_wordrecover].cpu().data.numpy().tolist( ) else: best_indices, sorted_probs = model(batch_word, batch_features, batch_wordlen, batch_char, batch_charlen, batch_charrecover, mask) gold_label = None for i, best_index in enumerate(best_indices): specific_pred, gold_label = recover_label(best_index, batch_label, mask, data.label_alphabet, batch_wordrecover) pred_results[i] += specific_pred gold_results += gold_label for k, sorted_prob in enumerate(sorted_probs): all_sorted_probs[k] += sorted_prob.data.cpu().numpy().tolist() decode_time = time.time() - start_time speed = len(instances) / decode_time acc, p, r, f = get_ner_fmeasure(gold_results, pred_results[0], data.tagScheme) acc_instances = len(instances) acc_speed = decode_time if nbest: return speed, acc, p, r, f, nbest_pred_results, pred_scores return speed, acc, p, r, f, pred_results, pred_scores, all_sorted_probs, acc_instances, acc_speed
def evaluate(data, model, name, inference, nbest=None): if name == "train": instances = data.train_Ids elif name == "dev": instances = data.dev_Ids elif name == "test": instances = data.test_Ids elif name == 'raw': instances = data.raw_Ids else: print("Error: wrong evaluate name,", name) right_token = 0 whole_token = 0 nbest_pred_results = [] pred_scores = [] pred_results = [] gold_results = [] # set model in eval model model.eval() # len(instances)#128 #For comparison against Vinyals et al. (2015) batch_size = 128 start_time = time.time() train_num = len(instances) total_batch = train_num // batch_size + 1 # Variable to collect the preds and gold prediction in multitask # learning pred_labels = {idtask: [] for idtask in range(data.HP_tasks)} gold_labels = {idtask: [] for idtask in range(data.HP_tasks)} nbest_pred_labels = {idtask: [] for idtask in range(data.HP_tasks)} nbest_pred_scores = {idtask: [] for idtask in range(data.HP_tasks)} for batch_id in range(total_batch): start = batch_id * batch_size end = (batch_id + 1) * batch_size if end > train_num: end = train_num instance = instances[start:end] if not instance: continue batch_word, batch_features, batch_wordlen, batch_wordrecover, batch_char, batch_charlen, batch_charrecover, batch_label, mask = batchify_with_label( instance, data.HP_gpu, inference, True) if nbest: scores, nbest_tag_seq = model.decode_nbest( batch_word, batch_features, batch_wordlen, batch_char, batch_charlen, batch_charrecover, mask, inference, nbest) tag_seq = [] for idtask, task_nbest_tag_seq in enumerate(nbest_tag_seq): nbest_pred_result = recover_nbest_label( task_nbest_tag_seq, mask, data.label_alphabet[idtask], batch_wordrecover) nbest_pred_labels[idtask] += nbest_pred_result nbest_pred_scores[idtask] += scores[idtask][ batch_wordrecover].cpu().data.numpy().tolist() tag_seq.append(task_nbest_tag_seq[:, :, 0]) else: tag_seq = model(batch_word, batch_features, batch_wordlen, batch_char, batch_charlen, batch_charrecover, mask, inference=inference) if not inference: for idtask, task_tag_seq in enumerate(tag_seq): pred_label, gold_label = recover_label( task_tag_seq, batch_label[idtask], mask, data.label_alphabet[idtask], batch_wordrecover, inference=inference) pred_labels[idtask] += pred_label gold_labels[idtask] += gold_label else: if len(data.index_of_main_tasks) == data.HP_tasks: for idtask, task_tag_seq in enumerate(tag_seq): pred_label, _ = recover_label(task_tag_seq, None, mask, data.label_alphabet[idtask], batch_wordrecover, inference=inference) pred_labels[idtask] += pred_label else: index_task = data.index_of_main_tasks[0] for idtask, task_tag_seq in enumerate(tag_seq): pred_label, _ = recover_label( task_tag_seq, None, mask, data.label_alphabet[index_task], batch_wordrecover, inference=inference) pred_labels[idtask] += pred_label index_task += 1 decode_time = time.time() - start_time speed = len(instances) / decode_time tasks_results = [] range_tasks = data.HP_tasks if not inference else len( data.index_of_main_tasks) for idtask in range(range_tasks): if not inference: acc, p, r, f = get_ner_fmeasure(gold_labels[idtask], pred_labels[idtask], data.tagScheme) else: acc, p, r, f = -1, -1, -1, -1 if nbest: tasks_results.append( (speed, acc, p, r, f, nbest_pred_labels[idtask], nbest_pred_scores[idtask])) else: tasks_results.append((speed, acc, p, r, f, pred_labels[idtask], nbest_pred_scores[idtask])) return tasks_results
def evaluate(domain_tag, data, model, name, nbest=None):
    if name == "train":
        if domain_tag == "Source":
            instances = data.train_Ids_S
        elif domain_tag == "Target":
            instances = data.train_Ids_T
    elif name == "dev":
        if domain_tag == "Source":
            instances = data.dev_Ids_S
        elif domain_tag == "Target":
            instances = data.dev_Ids_T
    elif name == 'test':
        if domain_tag == "Source":
            instances = data.test_Ids_S
        elif domain_tag == "Target":
            instances = data.test_Ids_T
    elif name == 'raw':
        if domain_tag == "Target" or domain_tag == "Source":
            instances = data.raw_Ids
    if domain_tag == "Source":
        label_alphabet = data.label_alphabet_S
        entity_alphabet = data.entity_alphabet_S
    elif domain_tag == "Target":
        label_alphabet = data.label_alphabet_T
        entity_alphabet = data.entity_alphabet_T
    else:
        print("Error: wrong evaluate name,", name)
        exit(1)
    nbest_pred_results = []
    pred_scores = []
    pred_results = []
    gold_results = []
    gold_entity_results = []
    pred_entity_results = []
    gold_probs_results = []
    pred_probs_results = []
    ## set model in eval mode
    model.eval()
    batch_size = data.HP_batch_size
    start_time = time.time()
    train_num = len(instances)
    total_batch = train_num // batch_size + 1
    for batch_id in range(total_batch):
        start = batch_id * batch_size
        end = (batch_id + 1) * batch_size
        if end > train_num:
            end = train_num
        instance = instances[start:end]
        if not instance:
            continue
        original_words_batch, batch_word, batch_features, batch_wordlen, batch_wordrecover, batch_char, \
            batch_charlen, batch_charrecover, batch_label, batch_entity, mask = batchify_with_label(
                instance, data.HP_gpu, False, data.sentence_classification)
        if nbest and not data.sentence_classification:
            scores, nbest_tag_seq, entity_seq, atten_probs_seq = model.decode_nbest(
                original_words_batch, domain_tag, batch_word, batch_features, batch_wordlen, batch_char,
                batch_charlen, batch_charrecover, mask, nbest, batch_entity)
            nbest_pred_result = recover_nbest_label(nbest_tag_seq, mask, label_alphabet, batch_wordrecover)
            nbest_pred_results += nbest_pred_result
            pred_scores += scores[batch_wordrecover].cpu().data.numpy().tolist()
            ## select the best sequence to evaluate
            tag_seq = nbest_tag_seq[:, :, 0]
        else:
            tag_seq, entity_seq, atten_probs_seq = model(original_words_batch, domain_tag, batch_word,
                                                         batch_features, batch_wordlen, batch_char, batch_charlen,
                                                         batch_charrecover, mask)
        # recover entity and probs results
        if entity_seq is not None:
            pred_entity, gold_entity = recover_label(entity_seq, batch_entity, mask, entity_alphabet,
                                                     batch_wordrecover, data.sentence_classification)
            pred_entity_results += pred_entity
            gold_entity_results += gold_entity
        if atten_probs_seq is not None:
            pred_probs, gold_probs = recover_label(atten_probs_seq, batch_entity, mask, entity_alphabet,
                                                   batch_wordrecover, data.sentence_classification)
            pred_probs_results += pred_probs
            gold_probs_results += gold_probs
        pred_label, gold_label = recover_label(tag_seq, batch_label, mask, label_alphabet, batch_wordrecover,
                                               data.sentence_classification)
        pred_results += pred_label
        gold_results += gold_label
    decode_time = time.time() - start_time
    speed = len(instances) / decode_time
    print("word acc:")
    acc, p, r, f = get_ner_fmeasure(gold_results, pred_results, data.tagScheme)
    if len(gold_entity_results) > 0:
        print("entity acc")
        entity_acc, _, _, _ = get_ner_fmeasure(gold_entity_results, pred_entity_results, "entity predict")
    if len(gold_probs_results) > 0:
        print("probs acc:")
        probs_acc, _, _, _ = get_ner_fmeasure(gold_probs_results, pred_probs_results, "probs predict")
    if nbest and not data.sentence_classification:
        return speed, acc, p, r, f, nbest_pred_results, pred_scores, pred_entity_results, pred_probs_results
    return speed, acc, p, r, f, pred_results, pred_scores
def evaluate(data, model, name): instances = None if name == "train": instances = data.train_Ids elif name == "dev": instances = data.dev_Ids elif name == 'test': instances = data.test_Ids elif name == 'raw': instances = data.raw_Ids else: logger.info("Error: wrong evaluate name,", name) exit(1) start_time = time.time() sent_pred_results = [] sent_gold_results = [] word_pred_results = [] word_gold_results = [] ## set model in eval model model.eval() for idx, instance in enumerate(instances): if not instance or len(instance) <= 1: continue batch_word, batch_word_len, word_perm_idx, batch_word_recover, batch_label, batch_sent_type, mask, sent_mask, input_label_seq_tensor, input_sent_type_tensor = batchify_with_label( instance, data.HP_gpu, data) with torch.no_grad(): words_tag_seq, sent_tag_seq = model.evaluate( None, batch_word, batch_word_len, batch_sent_type, mask, sent_mask, input_label_seq_tensor, input_sent_type_tensor, batch_word_recover, word_perm_idx, need_cat=False) # with codecs.open("attention_input.txt", "a", "utf-8") as w: # obj = ["".join([data.word_alphabet.get_instance(w_idx - 1) if w_idx != 0 else "" for w_idx in sent]) for # sent in batch_word.data.cpu().numpy().tolist()] # json.dump(obj, w) # w.write("\n") sent_pred, sent_gold, word_pred_label, word_gold_label = recover_label( words_tag_seq, sent_tag_seq, batch_label, batch_sent_type, mask, batch_word_recover, data.use_crf) sent_pred_results.extend(sent_pred) sent_gold_results.extend(sent_gold) word_pred_results.extend(word_pred_label) word_gold_results.extend(word_gold_label) decode_time = time.time() - start_time sent_f1 = f1_score(sent_gold_results, sent_pred_results, average="macro") sent_report = classification_report( sent_gold_results, sent_pred_results, target_names=data.sentence_type_alphabet.instances, digits=4) speed = len(instances) / decode_time word_acc = accuracy_score(word_gold_results, word_pred_results) word_f1 = f1_score(word_gold_results, word_pred_results, average='macro') word_report = classification_report( word_gold_results, word_pred_results, target_names=data.label_alphabet.instances, digits=4) word_ner_acc, word_ner_p, word_ner_r, word_ner_f = get_ner_fmeasure( word_gold_results, word_pred_results, data.label_alphabet, data.tagScheme, need_save_matrix=name == 'test') return speed, word_acc, word_report, word_f1, \ word_ner_acc, word_ner_p, word_ner_r, word_ner_f, sent_f1, sent_report
def evaluate(data, model, name, qleft=None,qright=None,batch_size=1): """ input: name: our current dataset for evaluation qleft,qright: the start and end point of the validation data set. When the validation data set is huge, we can use these parameters to sample the dataset output: speed: acc: accuracy p:precision r:recall f:f1 score pred_results:the predict results as a list of string p,r,f are useful when you switch to NER dataset """ if name == "train": instances = data.train_Ids elif name == "dev": instances = data.dev_Ids elif name == 'test': instances = data.test_Ids elif name == 'raw': instances = data.raw_Ids else: print("Error: wrong evaluate name,", name) right_token = 0 whole_token = 0 pred_results = [] gold_results = [] ## set model in eval mode model.examiner.eval() start_time = time.time() train_num = len(instances) total_batch = train_num//batch_size+1 if qleft==None: qleft=0 qright=int(total_batch/10) if name=="test": print("start test") for batch_id in range(qleft,qright): start = batch_id*batch_size end = (batch_id+1)*batch_size if end >train_num: end = train_num instance = instances[start:end] if not instance: continue batch_word, batch_wordlen, batch_wordrecover, batch_char, batch_charlen, batch_charrecover, batch_label, mask = batchify_with_label(instance, data.HP_gpu, True) tag_seq = model.test(batch_word) pred_label, gold_label = recover_label(tag_seq, batch_label, mask, data.label_alphabet, batch_wordrecover) batch_label,_tag_seq,_tag_prob,tag_mask,score,indices,scores_ref=model.pos_selection(batch_word, batch_wordlen, batch_char, batch_charlen, batch_charrecover, batch_label, mask) pred_results += pred_label gold_results += gold_label decode_time = time.time() - start_time speed = len(instances)/decode_time acc, p, r, f = get_ner_fmeasure(gold_results, pred_results, data.tagScheme) return speed, acc, p, r, f, pred_results
def evaluate(data, model, name, nbest=None): if name == "train": instances = data.train_Ids elif name == "dev": instances = data.dev_Ids elif name == 'test': instances = data.test_Ids elif name == 'raw': instances = data.raw_Ids # print(data.__dict__) # print('instances[1]:', instances[1]) # [word,features,char,label] else: print("Error: wrong evaluate name,", name) exit(1) # print('data.__dict__:', data.__dict__) right_token = 0 whole_token = 0 nbest_pred_results = [] pred_scores = [] pred_results = [] gold_results = [] # set torch_model in eval torch_model model.eval() batch_size = data.HP_batch_size # 10 # print(batch_size) start_time = time.time() train_num = len(instances) # 112 total_batch = train_num // batch_size + 1 # 把raw整体迭代完的batch数,不算epoch # print(total_batch) for batch_id in range(total_batch): # 每10个instance为1个要预测的batch start = batch_id * batch_size end = (batch_id + 1) * batch_size if end > train_num: end = train_num # 120>112,则end=112 instance = instances[start:end] # 预测数据最终处理的格式如instance[0],raw.bmes的标签全部设为O # print('instance:', len(instance), instance[0]) # for i in instance: # print(len(i[0]), i[0]) if not instance: continue # zero padding for word and char, 用batch中的max_seq_length # batchify_with_label:需要有实际的labels # 预测: batch_word, batch_features, batch_wordlen, batch_wordrecover, batch_char, \ batch_charlen, batch_charrecover, batch_label, mask = batchify_with_label( instance, data.HP_gpu, False, data.sentence_classification) if nbest and not data.sentence_classification: # 预测结果的输入如下: scores, nbest_tag_seq = model.decode_nbest( batch_word, batch_features, batch_wordlen, batch_char, batch_charlen, batch_charrecover, mask, nbest) # print('scores:', scores) # print('nbest_tag_seq:', nbest_tag_seq.shape, nbest_tag_seq) # 每个sen,每个word的标签给出预测,nbest = shape[-1] # recover_nbest_label:将顺序调整与input对应,输出nbest个预测结果 nbest_pred_result = recover_nbest_label(nbest_tag_seq, mask, data.label_alphabet, batch_wordrecover) nbest_pred_results += nbest_pred_result pred_scores += scores[batch_wordrecover].cpu().data.numpy().tolist( ) # 调整pred_scores与input顺序一致 # select the best sequence to evalurate tag_seq = nbest_tag_seq[:, :, 0] # 只选了nbest的第一列 # print('tag_seq:', tag_seq) # 最终预测结果的序列 else: tag_seq = model(batch_word, batch_features, batch_wordlen, batch_char, batch_charlen, batch_charrecover, mask) # print("tag:",tag_seq) # recover_label:根据tag_seq还原出预测的label,根据batch_label还原出真实的label # batch_label:batch真实的label值 pred_label, gold_label = recover_label(tag_seq, batch_label, mask, data.label_alphabet, batch_wordrecover, data.sentence_classification) # print('pred_label:', pred_label) # 预测的标签 # print('gold_label:', gold_label) # 真实的标签 pred_results += pred_label gold_results += gold_label # print('pred_results:', len(pred_results)) # print('gold_results:', len(gold_results)) print(name + ' ' + 'pred_results: ', len(pred_results)) print(name + ' ' + 'gold_results:', len(gold_results)) decode_time = time.time() - start_time # print('decode_time:', decode_time) speed = len(instances) / decode_time # 每秒处理的句子数量 acc, p, r, f = get_ner_fmeasure(gold_results, pred_results, data.tagScheme) if nbest and not data.sentence_classification: return speed, acc, p, r, f, nbest_pred_results, pred_scores return speed, acc, p, r, f, pred_results, pred_scores
def evaluate_batch(self, eva_data): wl = self.args.vocab.wl cl = self.args.vocab.cl batch_size = self.args.batch_size ## set model in eval model self.model.eval() correct_preds = 0. total_preds = 0. total_correct = 0. accs = [] pred_results = [] gold_results = [] for i, (words, label_ids) in enumerate( self.args.vocab.minibatches(eva_data, batch_size=batch_size)): char_ids, word_ids = zip(*words) word_ids, sequence_lengths = seqPAD.pad_sequences(word_ids, pad_tok=0, wthres=wl, cthres=cl) char_ids, word_lengths = seqPAD.pad_sequences(char_ids, pad_tok=0, nlevels=2, wthres=wl, cthres=cl) label_ids, _ = seqPAD.pad_sequences(label_ids, pad_tok=0, wthres=wl, cthres=cl) data_tensors = Data2tensor.sort_tensors(label_ids, word_ids, sequence_lengths, char_ids, word_lengths, volatile_flag=True) label_tensor, word_tensor, sequence_lengths, word_seq_recover, char_tensor, word_lengths, char_seq_recover = data_tensors mask_tensor = word_tensor > 0 label_score = self.model(word_tensor, sequence_lengths, char_tensor, word_lengths, char_seq_recover) label_prob, label_pred = self.model.inference( label_score, mask_tensor) pred_label, gold_label = recover_label(label_pred, label_tensor, mask_tensor, self.args.vocab.l2i, word_seq_recover) pred_results += pred_label gold_results += gold_label acc, p, r, f = get_ner_fmeasure(gold_results, pred_results) # label_pred = label_pred.cpu().data.numpy() # label_tensor = label_tensor.cpu().data.numpy() # sequence_lengths = sequence_lengths.cpu().data.numpy() # # for lab, lab_pred, length in zip(label_tensor, label_pred, sequence_lengths): # lab = lab[:length] # lab_pred = lab_pred[:length] # accs += [a==b for (a, b) in zip(lab, lab_pred)] # # lab_chunks = set(NERchunks.get_chunks(lab, self.args.vocab.l2i)) # lab_pred_chunks = set(NERchunks.get_chunks(lab_pred, self.args.vocab.l2i)) # # correct_preds += len(lab_chunks & lab_pred_chunks) # total_preds += len(lab_pred_chunks) # total_correct += len(lab_chunks) # # p = correct_preds / total_preds if correct_preds > 0 else 0 # r = correct_preds / total_correct if correct_preds > 0 else 0 # f = 2 * p * r / (p + r) if correct_preds > 0 else 0 # acc = np.mean(accs) return acc, f
def evaluate(data, model, name):
    if name == "train":
        instances = data.train_Ids
    elif name == "dev":
        instances = data.dev_Ids
    elif name == 'test':
        instances = data.test_Ids
    elif name == 'raw':
        instances = data.raw_Ids
    else:
        print("Error: wrong evaluate name,", name)
        exit(1)
    gold_results = []
    pred_results = []
    batch_size = data.batch_size
    start_time = time.time()
    train_num = len(instances)  # number of documents
    total_batch = train_num // batch_size + 1
    # set model to evaluation mode
    model.eval()
    with torch.no_grad():  # no gradient tracking during evaluation
        for batch_id in range(total_batch):
            # take instances[start:end]
            start = batch_id * batch_size
            end = (batch_id + 1) * batch_size
            if end > train_num:
                end = train_num
            instance = instances[start:end]
            if not instance:
                continue
            # batchify the data instances[start:end]
            batch_word, batch_wordlen, batch_wordrecover, \
                batch_char, batch_charlen, batch_charrecover, batch_label, mask, doc_idx, word_idx \
                = batchify_with_label(instance, data.use_gpu, False)
            mask = mask.eq(1)  # convert to True/False
            # run several stochastic forward passes and average the results
            p, lstm_out, outs, word_represent = model.MC_sampling(
                batch_word, batch_wordlen, batch_char, batch_charlen, batch_charrecover, data.nsamples)
            # get the alphabet index of the predicted label for each token
            model1_preds = decode_seq(outs, mask)
            # get the uncertainty of each token
            uncertainty = epistemic_uncertainty(p, mask)
            # total number of sentences x [O O O O]
            pred_labels, gold_label = recover_label(model1_preds, batch_label, mask, data.label_alphabet,
                                                    batch_wordrecover)
            gold_results += gold_label
            pred_results += pred_labels
    decode_time = time.time() - start_time
    speed = train_num / decode_time
    # compute the evaluation metrics
    acc, p, r, f = get_ner_fmeasure(gold_results, pred_results, data.tagScheme)
    score = f
    print("%s: time: %.2f s, speed: %.2f doc/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f; \n"
          % (name, decode_time, speed, acc, p, r, f))
    # save the predicted results
    if name == 'raw':
        print("save predicted results to %s" % data.decode_dir)
        data.convert_doc_to_sent(name)
        data.write_decoded_results(pred_results, name)
    return score, pred_results
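# The MC_sampling / epistemic_uncertainty pair above points to Monte Carlo dropout. Purely as an
# assumption-labelled sketch of what such a token-level uncertainty can look like (predictive
# entropy of the averaged sampled distributions), not the repository's own implementation:
import torch

def predictive_entropy(p, mask):
    """Hypothetical sketch. p: (nsamples, batch, seq_len, num_labels) softmax outputs from
    repeated stochastic forward passes; mask: (batch, seq_len) boolean tensor.
    Returns per-token entropy of the mean distribution, zeroed on padding positions."""
    mean_p = p.mean(dim=0)                                   # average over MC samples
    entropy = -(mean_p * torch.log(mean_p + 1e-12)).sum(-1)  # (batch, seq_len)
    return entropy * mask.float()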
def evaluate(data, model, name, nbest=None):
    if name == "train":
        instances = data.train_Ids
    elif name == "dev":
        instances = data.dev_Ids
    elif name == 'test':
        instances = data.test_Ids
    elif name == 'raw':
        instances = data.raw_Ids
    else:
        print("Error: wrong evaluate name,", name)
        exit(1)
    show_nbest = False
    right_token = 0
    whole_token = 0
    nbest_pred_results = []
    pred_scores = []
    pred_results = []   # total pred results
    gold_results = []   # total gold results
    # set model in eval mode
    model.eval()
    batch_size = data.batch_size
    start_time = time.time()
    train_num = len(instances)
    total_batch = train_num // batch_size + 1
    for batch_id in range(total_batch):
        start = batch_id * batch_size
        end = (batch_id + 1) * batch_size
        if end > train_num:
            end = train_num
        instance = instances[start:end]
        if not instance:
            continue
        batch_word, batch_features, batch_wordlen, batch_wordrecover, batch_char, batch_charlen, \
            batch_charrecover, batch_label, batch_trans, trans_seq_lengths, trans_seq_recover, mask \
            = batchify_with_label(instance, data.gpu, False)
        if nbest and nbest >= 2:
            scores, nbest_tag_seq = model.decode_nbest(
                batch_word, batch_features, batch_wordlen, batch_char, batch_charlen,
                batch_charrecover, mask, nbest, batch_trans, trans_seq_lengths, trans_seq_recover)
            nbest_pred_result = recover_nbest_label(nbest_tag_seq, mask, data.label_alphabet,
                                                    batch_wordrecover)
            nbest_pred_results += nbest_pred_result
            pred_scores += scores[batch_wordrecover].cpu().data.numpy().tolist()
            ## select the best sequence to evaluate
            tag_seq = nbest_tag_seq[:, :, 0]
        else:
            tag_seq = model(batch_word, batch_features, batch_wordlen, batch_char, batch_charlen,
                            batch_charrecover, mask, batch_trans, trans_seq_lengths, trans_seq_recover)
        # print("tag:", tag_seq)
        pred_label, gold_label = recover_label(tag_seq, batch_label, mask, data.label_alphabet,
                                               batch_wordrecover)
        pred_results += pred_label
        gold_results += gold_label
    # show nbest output
    instance_count = len(pred_scores)
    if show_nbest:
        for i in range(10):
            x = random.randint(0, instance_count - 1)
            print('---' * 10)
            print('gold: ' + ','.join(gold_results[x]))
            for j in range(nbest):
                print('%.8f: ' % (pred_scores[x][j]) + ','.join(nbest_pred_results[x][j]))
    decode_time = time.time() - start_time
    speed = len(instances) / decode_time
    acc, p, r, f = get_ner_fmeasure(gold_results, pred_results, data.tagScheme)
    if nbest:
        return speed, acc, p, r, f, nbest_pred_results, pred_scores
    return speed, acc, p, r, f, pred_results, pred_scores
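# Usage sketch for the nbest branch above: nbest=5 asks the decoder for the five
# highest-scoring label sequences per sentence, and the returned lists line up as
# nbest_pred_results[sentence][rank] / pred_scores[sentence][rank]. `data` and
# `model` are assumed to be the already-loaded Data object and trained model from
# the surrounding training script; only the call shape is taken from the function.
speed, acc, p, r, f, nbest_pred_results, pred_scores = evaluate(data, model, "test", nbest=5)
for sent_preds, sent_scores in zip(nbest_pred_results[:3], pred_scores[:3]):
    # each sentence carries up to `nbest` candidate label sequences with their scores
    for rank, (labels, score) in enumerate(zip(sent_preds, sent_scores)):
        print("rank %d (score %.6f): %s" % (rank, score, ",".join(labels)))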
def evaluate(data, model, logger, name, best_dev=-1):
    if name == "train":
        instances = data.train_Ids
    elif name == "dev":
        instances = data.dev_Ids
    elif name == 'test':
        instances = data.test_Ids
    else:
        print("Error: wrong evaluate name,", name)
        exit(1)
    H2BH_pred_results = []
    H2BB_pred_results = []
    B2HH_pred_results = []
    B2HB_pred_results = []
    hgold_results = []
    lgold_results = []
    ## set model in eval mode
    model.eval()
    batch_size = model.batch_size
    train_num = len(instances)
    total_batch = train_num // batch_size + 1
    for batch_id in range(total_batch):
        start = batch_id * batch_size
        end = (batch_id + 1) * batch_size
        if end > train_num:
            end = train_num
        instance = instances[start:end]
        if not instance:
            continue
        batch_word, batch_wordlen, batch_wordrecover, batch_char, batch_charlen, batch_charrecover, \
            batch_hlabel, batch_llabel, mask = batchify_sequence_labeling_with_label(
                instance, args.gpu, args.max_sent_length, False)
        if args.model == "DUAL":
            H2BH_tag_seqs, H2BB_tag_seqs, B2HB_tag_seqs, B2HH_tag_seqs = model(
                batch_word, batch_wordlen, batch_char, batch_charlen, batch_charrecover, mask)
            H2BHpred_label, H2BBpred_label, hgold_label, lgold_label = recover_label(
                H2BH_tag_seqs, H2BB_tag_seqs, batch_hlabel, batch_llabel, mask,
                data.hlabelset, data.llabelset, batch_wordrecover)
            B2HHpred_label, B2HBpred_label, _, _ = recover_label(
                B2HH_tag_seqs, B2HB_tag_seqs, batch_hlabel, batch_llabel, mask,
                data.hlabelset, data.llabelset, batch_wordrecover)
            H2BH_pred_results += H2BHpred_label
            H2BB_pred_results += H2BBpred_label
            B2HH_pred_results += B2HHpred_label
            B2HB_pred_results += B2HBpred_label
            hgold_results += hgold_label
            lgold_results += lgold_label
        elif args.model == "H2B":
            H2BH_tag_seqs, H2BB_tag_seqs = model(
                batch_word, batch_wordlen, batch_char, batch_charlen, batch_charrecover, mask)
            hpred_label, lpred_label, hgold_label, lgold_label = recover_label(
                H2BH_tag_seqs, H2BB_tag_seqs, batch_hlabel, batch_llabel, mask,
                data.hlabelset, data.llabelset, batch_wordrecover)
            H2BH_pred_results += hpred_label
            H2BB_pred_results += lpred_label
            hgold_results += hgold_label
            lgold_results += lgold_label
        elif args.model == "B2H":
            B2HB_tag_seqs, B2HH_tag_seqs = model(
                batch_word, batch_wordlen, batch_char, batch_charlen, batch_charrecover, mask)
            hpred_label, lpred_label, hgold_label, lgold_label = recover_label(
                B2HH_tag_seqs, B2HB_tag_seqs, batch_hlabel, batch_llabel, mask,
                data.hlabelset, data.llabelset, batch_wordrecover)
            B2HH_pred_results += hpred_label
            B2HB_pred_results += lpred_label
            hgold_results += hgold_label
            lgold_results += lgold_label
    if args.model == "DUAL":
        H2BH_evals, H2BB_evals, H2B_evals = get_ner_fmeasure(
            hgold_results, lgold_results, H2BH_pred_results, H2BB_pred_results)
        B2HH_evals, B2HB_evals, B2H_evals = get_ner_fmeasure(
            hgold_results, lgold_results, B2HH_pred_results, B2HB_pred_results)
    elif args.model == "H2B":
        H2BH_evals, H2BB_evals, H2B_evals = get_ner_fmeasure(
            hgold_results, lgold_results, H2BH_pred_results, H2BB_pred_results)
        B2HH_evals, B2HB_evals, B2H_evals = [0, 0, 0, 0], [0, 0, 0], [0, 0, 0]
    elif args.model == "B2H":
        H2BH_evals, H2BB_evals, H2B_evals = [0, 0, 0, 0], [0, 0, 0], [0, 0, 0]
        B2HH_evals, B2HB_evals, B2H_evals = get_ner_fmeasure(
            hgold_results, lgold_results, B2HH_pred_results, B2HB_pred_results)
    H2B_results = [H2BH_pred_results, H2BB_pred_results]
    B2H_results = [B2HH_pred_results, B2HB_pred_results]
    logger.info(
        "%s --HIGH layer: H2B MODEL acc: %.4f, p: %.4f, r: %.4f, f: %.4f ||||| B2H MODEL acc: %.4f, p: %.4f, r: %.4f, f: %.4f."
        % (name.upper(), H2BH_evals[0], H2BH_evals[1], H2BH_evals[2], H2BH_evals[3],
           B2HH_evals[0], B2HH_evals[1], B2HH_evals[2], B2HH_evals[3]))
    logger.info(
        "%s --BOT layer: H2B MODEL p: %.4f, r: %.4f, f: %.4f ||||| B2H MODEL p: %.4f, r: %.4f, f: %.4f."
        % (name.upper(), H2BB_evals[0], H2BB_evals[1], H2BB_evals[2],
           B2HB_evals[0], B2HB_evals[1], B2HB_evals[2]))
    logger.info(
        "%s --ALL layer: H2B MODEL p: %.4f, r: %.4f, f: %.4f ||||| B2H MODEL p: %.4f, r: %.4f, f: %.4f. best_f: %.4f"
        % (name.upper(), H2B_evals[0], H2B_evals[1], H2B_evals[2],
           B2H_evals[0], B2H_evals[1], B2H_evals[2], best_dev))
    print(
        "%s --ALL layer: H2B MODEL p: %.4f, r: %.4f, f: %.4f ||||| B2H MODEL p: %.4f, r: %.4f, f: %.4f. best_f: %.4f"
        % (name.upper(), H2B_evals[0], H2B_evals[1], H2B_evals[2],
           B2H_evals[0], B2H_evals[1], B2H_evals[2], best_dev))
    return H2B_evals, B2H_evals, H2B_results, B2H_results
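# A hedged sketch of how the evaluate() above could drive dev-based model
# selection: in the ALL-layer log line, H2B_evals is (p, r, f), so its last
# element is the F-score compared against best_dev. The trainer-loop names
# (num_epochs, train_one_epoch, checkpoint path) are placeholders, not part of
# this file.
best_dev = -1
for epoch in range(num_epochs):          # num_epochs: assumed trainer setting
    train_one_epoch(model, data)         # hypothetical training step
    H2B_evals, B2H_evals, _, _ = evaluate(data, model, logger, "dev", best_dev)
    dev_f = H2B_evals[2]                 # ALL-layer F1 of the H2B direction, as logged above
    if dev_f > best_dev:
        best_dev = dev_f
        torch.save(model.state_dict(), "best_model.ckpt")   # hypothetical path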
# -*- coding: utf-8 -*-