def test(args):
    """Evaluate a saved BiLSTM-CRF model on the test corpus and print metrics.

    Loads the vocabularies, corpus and model named in ``args``, runs
    prediction batch by batch, accumulates the three counts returned by
    ``cal_statistics`` (kept here as tp / fp / fn) and prints precision,
    recall and F1, both periodically and at the end.

    Args:
        args: parsed command-line options, read as attributes:
            ``SENT_VOCAB``, ``TAG_VOCAB``, ``TEST``, ``MODEL``,
            ``batch_size`` and ``log_every``.
    """

    def _ratio(numerator, denominator):
        # A ratio with a zero denominator (no predictions yet, empty test
        # set, zero elapsed time) is reported as 0.0 instead of raising
        # ZeroDivisionError.
        return numerator / denominator if denominator else 0.0

    sent_vocab = Vocab.load(args.SENT_VOCAB)
    tag_vocab = Vocab.load(args.TAG_VOCAB)
    sentences, tags = utils.read_corpus(args.TEST)
    sentences = utils.words2indices(sentences, sent_vocab)
    tags = utils.words2indices(tags, tag_vocab)
    test_data = list(zip(sentences, tags))
    print('num of test samples: %d' % (len(test_data)))

    # Prefer the first GPU, but fall back to the CPU so the evaluation still
    # runs on hosts without CUDA.
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    model = bilstm_crf.BiLSTMCRF.load(args.MODEL, device)
    print('start testing...')
    print('using device', device)

    start = time.time()
    n_iter, num_words = 0, 0
    tp, fp, fn = 0, 0, 0
    batch_size = int(args.batch_size)
    log_every = int(args.log_every)

    model.eval()
    with torch.no_grad():
        for batch_sentences, batch_tags in utils.batch_iter(
                test_data, batch_size=batch_size, shuffle=False):
            padded, sent_lengths = utils.pad(batch_sentences,
                                             sent_vocab[sent_vocab.PAD],
                                             device)
            predicted_tags = model.predict(padded, sent_lengths)
            n_iter += 1
            num_words += sum(sent_lengths)
            for tag, predicted_tag in zip(batch_tags, predicted_tags):
                current_tp, current_fp, current_fn = cal_statistics(
                    tag, predicted_tag, tag_vocab)
                tp += current_tp
                fp += current_fp
                fn += current_fn
            if n_iter % log_every == 0:
                elapsed = time.time() - start
                print(
                    'log: iter %d, %.1f words/sec, precision %f, recall %f, f1_score %f, time %.1f sec'
                    % (n_iter, _ratio(num_words, elapsed),
                       _ratio(tp, tp + fp), _ratio(tp, tp + fn),
                       _ratio(2 * tp, 2 * tp + fp + fn), elapsed))
                # Throughput is measured per logging window, so reset both
                # the word counter and the clock.
                num_words = 0
                start = time.time()

    print('tp = %d, fp = %d, fn = %d' % (tp, fp, fn))
    precision = _ratio(tp, tp + fp)
    recall = _ratio(tp, tp + fn)
    f1_score = _ratio(2 * tp, 2 * tp + fp + fn)
    print('Precision: %f, Recall: %f, F1 score: %f' %
          (precision, recall, f1_score))
def test(args):
    """Run a saved model over the test corpus and write per-token results.

    For every sentence, one line per token is written to the file named by
    ``args['RESULT']`` in the form ``<token> <true tag> <predicted tag>``,
    followed by an empty line that separates sentences.

    Args:
        args: dict of command-line options; the keys read here are
            ``SENT_VOCAB``, ``TAG_VOCAB``, ``TEST``, ``MODEL``, ``RESULT``,
            ``--cuda`` and ``--batch-size``.
    """
    sent_vocab = Vocab.load(args['SENT_VOCAB'])
    tag_vocab = Vocab.load(args['TAG_VOCAB'])
    sentences, tags = utils.read_corpus(args['TEST'])
    sentences = utils.words2indices(sentences, sent_vocab)
    tags = utils.words2indices(tags, tag_vocab)
    test_data = list(zip(sentences, tags))
    print('num of test samples: %d' % (len(test_data)))

    device = torch.device('cuda' if args['--cuda'] else 'cpu')
    model = bilstm_crf.BiLSTMCRF.load(args['MODEL'], device)
    print('start testing...')
    print('using device', device)

    batch_size = int(args['--batch-size'])
    model.eval()
    # The context manager guarantees the result file is flushed and closed
    # even if prediction raises part-way through.
    with open(args['RESULT'], 'w') as result_file, torch.no_grad():
        for batch_sentences, batch_tags in utils.batch_iter(
                test_data, batch_size=batch_size, shuffle=False):
            padded_sentences, sent_lengths = utils.pad(
                batch_sentences, sent_vocab[sent_vocab.PAD], device)
            predicted_tags = model.predict(padded_sentences, sent_lengths)
            for sent, true_tags, pred_tags in zip(batch_sentences,
                                                  batch_tags,
                                                  predicted_tags):
                # Drop the first and last position of every sequence
                # (presumably the start/end markers - confirm against
                # utils.read_corpus).
                for token, true_tag, pred_tag in zip(sent[1:-1],
                                                     true_tags[1:-1],
                                                     pred_tags[1:-1]):
                    result_file.write(' '.join([
                        sent_vocab.id2word(token),
                        tag_vocab.id2word(true_tag),
                        tag_vocab.id2word(pred_tag)
                    ]) + '\n')
                result_file.write('\n')
def main(args):
    """Tag the sample text with a saved model and print it with inline tags.

    The text is read and split into morphemes through ``dataEdit``, run
    through the model on the CPU in batches of 64, and every sentence is
    printed with ``<TYPE>`` inserted in front of each morpheme whose tag is
    not ``'-'`` (``TYPE`` is the part of the tag before the first ``_``).

    Args:
        args: parsed command-line options, read as attributes: ``MODEL``,
            ``sample_data``, ``SENT_VOCAB`` and ``TAG_VOCAB``.
    """
    device = torch.device('cpu')
    model = bilstm_crf.BiLSTMCRF.load(args.MODEL, device)
    # Inference only: switch to eval mode, as the test routines of this
    # project do before calling predict.
    model.eval()

    text = dataEdit.get_lines(args.sample_data)
    morph_lines = dataEdit.make_morphs(text)
    # Longest sentences first.
    lines = sorted(morph_lines, key=len, reverse=True)
    sent_vocab = Vocab.load(args.SENT_VOCAB)
    tag_vocab = Vocab.load(args.TAG_VOCAB)
    indexed = utils.words2indices(lines, sent_vocab)

    for batch in batch_iter(indexed, 64, shuffle=False):
        with torch.no_grad():
            padded, sent_lengths = utils.pad(batch,
                                             sent_vocab[sent_vocab.PAD],
                                             device)
            predicted_tags = model.predict(padded, sent_lengths)
        tags = utils.indices2words(predicted_tags, tag_vocab)

        # Map indices back to words, dropping padding and restoring the
        # original surface form for out-of-vocabulary tokens.
        # NOTE(review): `row` is the position inside the current batch while
        # `lines` / `morph_lines` cover the whole input, so the lookups below
        # only line up for the first batch - confirm inputs stay <= 64
        # sentences or add a batch offset.
        sent = []
        for row in range(len(padded)):
            words = []
            for col in range(len(padded[row])):
                w = sent_vocab.id2word(padded[row][col])
                if w == '<PAD>':
                    continue
                if w == '<UNK>':
                    w = lines[row][col]
                words.append(w)
            sent.append(words)

        answer = [''.join(words) for words in sent]
        answer, tag_list = compare_sentence(text, answer, tags)

        for i, sentence in enumerate(answer):
            temp = sentence
            for j, morph in enumerate(morph_lines[i]):
                tag = tag_list[i][j]
                if tag == '-':
                    continue
                # Only the first occurrence of the morpheme is considered.
                pos = temp.find(morph)
                if pos == -1:
                    # The morpheme does not appear verbatim in the sentence;
                    # slicing with -1 would insert the tag before the last
                    # character, so leave the sentence untouched instead.
                    continue
                temp = (temp[:pos] + '<' + tag.split('_')[0] + '>' +
                        temp[pos:])
            print(temp)
def test(args, weights_matrix):
    """Evaluate a saved model on the test corpus and print metrics.

    Loads the vocabularies, corpus and model named in ``args``, runs
    prediction batch by batch, accumulates the three counts returned by
    ``cal_statistics`` (kept here as tp / fp / fn) and prints precision,
    recall and F1, both periodically and at the end.

    Args:
        args: dict of command-line options; the keys read here are
            ``SENT_VOCAB``, ``TAG_VOCAB_NER``, ``TEST``, ``METHOD``,
            ``MODEL``, ``--cuda``, ``--batch-size`` and ``--log-every``.
        weights_matrix: forwarded unchanged as the first argument of
            ``bilstm_crf.BiLSTMCRF.load``.
    """

    def _ratio(numerator, denominator):
        # A ratio with a zero denominator (no predictions yet, empty test
        # set, zero elapsed time) is reported as 0.0 instead of raising
        # ZeroDivisionError.
        return numerator / denominator if denominator else 0.0

    sent_vocab = Vocab.load(args['SENT_VOCAB'])
    tag_vocab = Vocab.load(args['TAG_VOCAB_NER'])
    sentences, tags = utils.read_corpus(args['TEST'])
    sentences = utils.words2indices(sentences, sent_vocab)

    # Method, passed through to model.predict
    method = args['METHOD']

    # Convert to binary tags (if there is a tag or not)
    tags_entity = utils.entity_or_not(tags)

    # Convert from IOBES to IOB
    tags = iobes_iob(tags)
    tags = utils.words2indices(tags, tag_vocab)

    test_data = list(zip(sentences, tags, tags_entity))
    print('num of test samples: %d' % (len(test_data)))

    device = torch.device('cuda' if args['--cuda'] else 'cpu')
    model = bilstm_crf.BiLSTMCRF.load(weights_matrix, args['MODEL'], device)
    print('start testing...')
    print('using device', device)

    start = time.time()
    n_iter, num_words = 0, 0
    tp, fp, fn = 0, 0, 0
    batch_size = int(args['--batch-size'])
    log_every = int(args['--log-every'])

    model.eval()
    with torch.no_grad():
        # The binary entity tags travel with each batch but are not used by
        # the scoring below.
        for batch_sentences, batch_tags, _batch_entity in utils.batch_iter(
                test_data, batch_size=batch_size, shuffle=False):
            padded, sent_lengths = utils.pad(batch_sentences,
                                             sent_vocab[sent_vocab.PAD],
                                             device)
            predicted_tags = model.predict(padded, sent_lengths, method)
            n_iter += 1
            num_words += sum(sent_lengths)
            for tag, predicted_tag in zip(batch_tags, predicted_tags):
                current_tp, current_fp, current_fn = cal_statistics(
                    tag, predicted_tag, tag_vocab)
                tp += current_tp
                fp += current_fp
                fn += current_fn
            if n_iter % log_every == 0:
                elapsed = time.time() - start
                print(
                    'log: iter %d, %.1f words/sec, precision %f, recall %f, f1_score %f, time %.1f sec'
                    % (n_iter, _ratio(num_words, elapsed),
                       _ratio(tp, tp + fp), _ratio(tp, tp + fn),
                       _ratio(2 * tp, 2 * tp + fp + fn), elapsed))
                # Throughput is measured per logging window, so reset both
                # the word counter and the clock.
                num_words = 0
                start = time.time()

    print('tp = %d, fp = %d, fn = %d' % (tp, fp, fn))
    precision = _ratio(tp, tp + fp)
    recall = _ratio(tp, tp + fn)
    f1_score = _ratio(2 * tp, 2 * tp + fp + fn)
    print('Precision: %f, Recall: %f, F1 score: %f' %
          (precision, recall, f1_score))
def print_line(line,
               model_path='./model/model.pth',
               sent_vocab_path='./vocab/sent_vocab.json',
               tag_vocab_path='./vocab/tag_vocab.json'):
    """Tag one line of text and print the entities found, grouped by type.

    The line is printed first, followed by one list per entity type
    (``OG``, ``PS``, ``DT``, ``LC``, ``TI``). Tags are expected in the form
    ``<TYPE>_B`` / ``<TYPE>_I``: a ``B`` tag starts a new entity and each
    directly following ``I`` tag of the same type appends its morpheme to
    that entity. An ``I`` tag that does not continue an open entity of the
    same type is ignored, as are tags of any other type.

    :param line: text line
    :param model_path: model path
    :param sent_vocab_path: sentence vocab path
    :param tag_vocab_path: tag vocab path
    :return: None; the text and the grouped entities are printed
    """
    # Prefer the first GPU, but fall back to the CPU so the function still
    # works on hosts without CUDA.
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    sent_vocab = Vocab.load(sent_vocab_path)
    tag_vocab = Vocab.load(tag_vocab_path)
    model = bilstm_crf.BiLSTMCRF.load(model_path, device)
    # Inference only: switch to eval mode, as the test routines of this
    # project do before calling predict.
    model.eval()

    # The tags returned alongside the morphemes are not used; only the
    # model's predictions are reported.
    morph_line, _ = dataEdit.make_morph_tag(line)
    sentences = utils.words2indices([morph_line], sent_vocab)
    with torch.no_grad():
        sentences, sent_lengths = utils.pad(sentences,
                                            sent_vocab[sent_vocab.PAD],
                                            device)
        predicted_tags = model.predict(sentences, sent_lengths)
    tags = utils.indices2words(predicted_tags, tag_vocab)
    print(line)

    entity_types = ('OG', 'PS', 'DT', 'LC', 'TI')
    entities = {entity_type: [] for entity_type in entity_types}
    non_entity_tags = ('-', '<START>', '<END>', '<PAD>')
    # Type of the entity the previous token belonged to, or None when the
    # previous token was not part of a reported entity.
    open_type = None

    for word, tag in zip(morph_line, tags[0]):
        parts = tag.split('_')
        if (tag in non_entity_tags or len(parts) < 2
                or parts[0] not in entities):
            open_type = None
            continue
        entity_type, position = parts[0], parts[1]
        if position == 'B':
            # Start a new entity (every type appends, TI included).
            entities[entity_type].append(word)
            open_type = entity_type
        elif position == 'I' and open_type == entity_type:
            # Continue the entity opened by the preceding B/I tokens, so
            # entities longer than two tokens are kept whole.
            entities[entity_type][-1] += word
        else:
            # Orphan I tag or unknown position marker: nothing to extend.
            open_type = None

    for entity_type in entity_types:
        print('%s: ' % entity_type, end='')
        print(entities[entity_type])
    return