Python words2indicesの例、utils.words2indices Pythonの例

コード例 #1

0

ファイルを表示

def test(args):
    """Evaluate a saved BiLSTM-CRF model on a test corpus and print metrics.

    Loads both vocabularies and the test corpus, runs the model batch by
    batch without gradients, accumulates true-positive / false-positive /
    false-negative counts via ``cal_statistics`` and prints precision,
    recall and F1 (periodically and once at the end).

    Args:
        args: parsed command-line options, read as attributes:
            ``SENT_VOCAB``, ``TAG_VOCAB``, ``TEST``, ``MODEL``,
            ``batch_size`` and ``log_every``.
    """

    def _ratio(numerator, denominator):
        # A model that predicts no entities (or a corpus with none) makes the
        # denominators zero; report 0.0 instead of raising ZeroDivisionError.
        return numerator / denominator if denominator else 0.0

    sent_vocab = Vocab.load(args.SENT_VOCAB)
    tag_vocab = Vocab.load(args.TAG_VOCAB)
    sentences, tags = utils.read_corpus(args.TEST)
    sentences = utils.words2indices(sentences, sent_vocab)
    tags = utils.words2indices(tags, tag_vocab)
    test_data = list(zip(sentences, tags))
    print('num of test samples: %d' % (len(test_data)))

    # Prefer the first GPU, but fall back to CPU so the evaluation still runs
    # on machines without CUDA.
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    model = bilstm_crf.BiLSTMCRF.load(args.MODEL, device)
    print('start testing...')
    print('using device', device)

    start = time.time()
    n_iter, num_words = 0, 0
    tp, fp, fn = 0, 0, 0
    batch_size = int(args.batch_size)
    log_every = int(args.log_every)

    model.eval()
    with torch.no_grad():
        for batch_sentences, batch_tags in utils.batch_iter(
                test_data, batch_size=batch_size, shuffle=False):
            padded, sent_lengths = utils.pad(batch_sentences,
                                             sent_vocab[sent_vocab.PAD],
                                             device)
            predicted_tags = model.predict(padded, sent_lengths)
            n_iter += 1
            num_words += sum(sent_lengths)
            for tag, predicted_tag in zip(batch_tags, predicted_tags):
                current_tp, current_fp, current_fn = cal_statistics(
                    tag, predicted_tag, tag_vocab)
                tp += current_tp
                fp += current_fp
                fn += current_fn
            if n_iter % log_every == 0:
                elapsed = time.time() - start
                print(
                    'log: iter %d, %.1f words/sec, precision %f, recall %f, f1_score %f, time %.1f sec'
                    % (n_iter, _ratio(num_words, elapsed),
                       _ratio(tp, tp + fp), _ratio(tp, tp + fn),
                       _ratio(2 * tp, 2 * tp + fp + fn), elapsed))
                # Throughput is measured per logging window, so reset both.
                num_words = 0
                start = time.time()
    print('tp = %d, fp = %d, fn = %d' % (tp, fp, fn))
    precision = _ratio(tp, tp + fp)
    recall = _ratio(tp, tp + fn)
    f1_score = _ratio(2 * tp, 2 * tp + fp + fn)
    print('Precision: %f, Recall: %f, F1 score: %f' %
          (precision, recall, f1_score))

コード例 #2

0

ファイルを表示

ファイル: run.py プロジェクト: gjt9274/BiLSTM-CRF

def test(args):
    """Run a saved BiLSTM-CRF model on a test corpus and write the tags to a file.

    For every token the result file gets one line of the form
    ``<token> <true tag> <predicted tag>``; sentences are separated by an
    empty line.

    Args:
        args: dict of command-line options. Keys read here: ``SENT_VOCAB``,
            ``TAG_VOCAB``, ``TEST``, ``MODEL``, ``RESULT``, ``--cuda`` and
            ``--batch-size``.
    """
    sent_vocab = Vocab.load(args['SENT_VOCAB'])
    tag_vocab = Vocab.load(args['TAG_VOCAB'])
    sentences, tags = utils.read_corpus(args['TEST'])
    sentences = utils.words2indices(sentences, sent_vocab)
    tags = utils.words2indices(tags, tag_vocab)
    test_data = list(zip(sentences, tags))
    print('num of test samples: %d' % (len(test_data)))

    device = torch.device('cuda' if args['--cuda'] else 'cpu')
    model = bilstm_crf.BiLSTMCRF.load(args['MODEL'], device)
    print('start testing...')
    print('using device', device)

    batch_size = int(args['--batch-size'])
    model.eval()
    # The context manager guarantees the result file is flushed and closed,
    # even if prediction fails part-way through.
    with open(args['RESULT'], 'w') as result_file, torch.no_grad():
        for batch_sentences, batch_tags in utils.batch_iter(
                test_data, batch_size=batch_size, shuffle=False):
            padded_sentences, sent_lengths = utils.pad(
                batch_sentences, sent_vocab[sent_vocab.PAD], device)
            predicted_tags = model.predict(padded_sentences, sent_lengths)
            for sent, true_tags, pred_tags in zip(batch_sentences, batch_tags,
                                                  predicted_tags):
                # Drop the first and last position of each sequence
                # (presumably the <START>/<END> markers -- TODO confirm
                # against utils.read_corpus).
                rows = zip(sent[1:-1], true_tags[1:-1], pred_tags[1:-1])
                for token, true_tag, pred_tag in rows:
                    result_file.write(' '.join([
                        sent_vocab.id2word(token),
                        tag_vocab.id2word(true_tag),
                        tag_vocab.id2word(pred_tag)
                    ]) + '\n')
                result_file.write('\n')

コード例 #3

0

ファイルを表示

ファイル: sample.py プロジェクト: wisenutgolead/NER_project

def main(args):
    """Tag the sentences of a sample file and print them with inline tags.

    Reads the lines of ``args.sample_data``, splits them into morphemes,
    predicts a tag per morpheme with the model at ``args.MODEL`` (on CPU) and
    prints each sentence with ``<TYPE>`` inserted in front of every morpheme
    whose tag is not ``'-'``.

    Args:
        args: parsed command-line options, read as attributes:
            ``MODEL``, ``sample_data``, ``SENT_VOCAB`` and ``TAG_VOCAB``.
    """
    device = torch.device('cpu')
    model = bilstm_crf.BiLSTMCRF.load(args.MODEL, device)

    text = dataEdit.get_lines(args.sample_data)
    morph_lines = dataEdit.make_morphs(text)

    # Longest sentence first (presumably what utils.pad / the model expect
    # -- TODO confirm).
    lines = sorted(morph_lines, key=len, reverse=True)

    sent_vocab = Vocab.load(args.SENT_VOCAB)
    tag_vocab = Vocab.load(args.TAG_VOCAB)
    sentences = utils.words2indices(lines, sent_vocab)

    # Collect the predictions of *every* batch. Previously each iteration
    # overwrote both `sentences` and `predicted_tags`, so only the last batch
    # survived and no longer lined up with `lines`.
    # NOTE(review): assumes model.predict returns one tag sequence per
    # sentence, in batch order -- confirm.
    predicted_tags = []
    for batch in batch_iter(sentences, 64, shuffle=False):
        padded, sent_lengths = utils.pad(batch, sent_vocab[sent_vocab.PAD],
                                         device)
        predicted_tags.extend(model.predict(padded, sent_lengths))

    tags = utils.indices2words(predicted_tags, tag_vocab)

    # Rebuild each sentence from its vocabulary entries, falling back to the
    # original morpheme wherever the vocabulary only knows <UNK>.
    answer = []
    for morphs, indices in zip(lines, sentences):
        pieces = []
        for morph, index in zip(morphs, indices):
            w = sent_vocab.id2word(index)
            if w == '<PAD>':
                continue
            if w == '<UNK>':
                w = morph
            pieces.append(w)
        answer.append(''.join(pieces))

    answer, tag_list = compare_sentence(text, answer, tags)

    for i, sentence in enumerate(answer):
        temp = sentence
        for j, morph in enumerate(morph_lines[i]):
            tag = tag_list[i][j]
            if tag == '-':
                continue
            position = temp.find(morph)
            if position == -1:
                # Morpheme not present in the rebuilt sentence: slicing with
                # -1 would insert the marker before the last character.
                continue
            temp = (temp[:position] + '<' + tag.split('_')[0] + '>' +
                    temp[position:])
        print(temp)

コード例 #4

0

ファイルを表示

ファイル: run.py プロジェクト: nitinkarolla/zero-resource

def test(args, weights_matrix):
    """Evaluate a saved BiLSTM-CRF model on a test corpus and print metrics.

    Gold tags are converted with ``iobes_iob`` before being indexed. The
    model is run batch by batch without gradients; true-positive /
    false-positive / false-negative counts from ``cal_statistics`` are
    accumulated and precision, recall and F1 are printed periodically and
    once at the end.

    Args:
        args: dict of command-line options. Keys read here: ``SENT_VOCAB``,
            ``TAG_VOCAB_NER``, ``TEST``, ``METHOD``, ``MODEL``, ``--cuda``,
            ``--batch-size`` and ``--log-every``.
        weights_matrix: passed through unchanged as the first argument of
            ``bilstm_crf.BiLSTMCRF.load``.
    """

    def _ratio(numerator, denominator):
        # A model that predicts no entities (or a corpus with none) makes the
        # denominators zero; report 0.0 instead of raising ZeroDivisionError.
        return numerator / denominator if denominator else 0.0

    sent_vocab = Vocab.load(args['SENT_VOCAB'])
    tag_vocab = Vocab.load(args['TAG_VOCAB_NER'])
    sentences, tags = utils.read_corpus(args['TEST'])
    sentences = utils.words2indices(sentences, sent_vocab)

    # Forwarded to model.predict for every batch.
    method = args['METHOD']

    # Binary "entity or not" view of the gold tags; it travels with each
    # sample through batch_iter but is not used for the metrics below.
    tags_entity = utils.entity_or_not(tags)

    # Convert from IOBES to IOB
    tags = iobes_iob(tags)

    tags = utils.words2indices(tags, tag_vocab)
    test_data = list(zip(sentences, tags, tags_entity))
    print('num of test samples: %d' % (len(test_data)))

    device = torch.device('cuda' if args['--cuda'] else 'cpu')
    model = bilstm_crf.BiLSTMCRF.load(weights_matrix, args['MODEL'], device)
    print('start testing...')
    print('using device', device)

    start = time.time()
    n_iter, num_words = 0, 0
    tp, fp, fn = 0, 0, 0
    batch_size = int(args['--batch-size'])
    log_every = int(args['--log-every'])

    model.eval()
    with torch.no_grad():
        for batch_sentences, batch_tags, _batch_entity in utils.batch_iter(
                test_data, batch_size=batch_size, shuffle=False):
            padded, sent_lengths = utils.pad(batch_sentences,
                                             sent_vocab[sent_vocab.PAD],
                                             device)
            predicted_tags = model.predict(padded, sent_lengths, method)
            n_iter += 1
            num_words += sum(sent_lengths)
            for tag, predicted_tag in zip(batch_tags, predicted_tags):
                current_tp, current_fp, current_fn = cal_statistics(
                    tag, predicted_tag, tag_vocab)
                tp += current_tp
                fp += current_fp
                fn += current_fn
            if n_iter % log_every == 0:
                elapsed = time.time() - start
                print(
                    'log: iter %d, %.1f words/sec, precision %f, recall %f, f1_score %f, time %.1f sec'
                    % (n_iter, _ratio(num_words, elapsed),
                       _ratio(tp, tp + fp), _ratio(tp, tp + fn),
                       _ratio(2 * tp, 2 * tp + fp + fn), elapsed))
                # Throughput is measured per logging window, so reset both.
                num_words = 0
                start = time.time()
    print('tp = %d, fp = %d, fn = %d' % (tp, fp, fn))
    precision = _ratio(tp, tp + fp)
    recall = _ratio(tp, tp + fn)
    f1_score = _ratio(2 * tp, 2 * tp + fp + fn)
    print('Precision: %f, Recall: %f, F1 score: %f' %
          (precision, recall, f1_score))

コード例 #5

0

ファイルを表示

ファイル: testing.py プロジェクト: wisenutgolead/NER_project

def print_line(line,
               model_path='./model/model.pth',
               sent_vocab_path='./vocab/sent_vocab.json',
               tag_vocab_path='./vocab/tag_vocab.json'):
    """
    Tag one text line and print the entities found, grouped by type.

    The line is printed first, followed by one list per entity type
    (``OG``, ``PS``, ``DT``, ``LC``, ``TI``). Tags are expected in the form
    ``<TYPE>_B`` (entity start) / ``<TYPE>_I`` (entity continuation); the
    morphemes of one entity are concatenated into a single string.

    :param line: text line
    :param model_path: model path
    :param sent_vocab_path: sentence vocab path
    :param tag_vocab_path: tag vocab path
    :return: None; the text and the tagged entities are printed
    """
    # Prefer the first GPU, but fall back to CPU so the function still works
    # on machines without CUDA.
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    sent_vocab = Vocab.load(sent_vocab_path)
    tag_vocab = Vocab.load(tag_vocab_path)
    model = bilstm_crf.BiLSTMCRF.load(model_path, device)

    # Only the morphemes are needed; the second return value is discarded in
    # favour of the model's predictions.
    morph_line, _ = dataEdit.make_morph_tag(line)

    sentences = utils.words2indices([morph_line], sent_vocab)
    sentences, sent_lengths = utils.pad(sentences, sent_vocab[sent_vocab.PAD],
                                        device)
    predicted_tags = model.predict(sentences, sent_lengths)
    tags = utils.indices2words(predicted_tags, tag_vocab)

    print(line)
    entities = {'OG': [], 'PS': [], 'DT': [], 'LC': [], 'TI': []}
    # Type of the entity currently being built, or None when the previous
    # token did not belong to one.
    open_type = None
    for word, tag in zip(morph_line, tags[0]):
        # Tags without an underscore ('-', '<START>', '<END>', '<PAD>', ...)
        # yield an empty position and therefore close any open entity.
        parts = tag.split('_')
        entity_type = parts[0]
        position = parts[1] if len(parts) > 1 else ''

        if position == 'B' and entity_type in entities:
            entities[entity_type].append(word)
            open_type = entity_type
        elif position == 'I' and entity_type == open_type:
            # Extend the open entity; this also covers the third and later
            # tokens of a B-I-I... sequence.
            entities[entity_type][-1] += word
        else:
            open_type = None

    for label, found in entities.items():
        print(label + ': ', end='')
        print(found)

    return