def test_encode_word_with_addition_m_tag_num(addition_m_tag_num, word,
                                             gold_tags):
    """Encode ``word`` with a configured encoder and compare to ``gold_tags``.

    The encoder is built with ``addition_m_tag_num`` as its only constructor
    argument; the comparison is delegated to
    ``pytest.helpers.assert_sequence_equals``.
    """
    encoder = BMESEncoderDecoder(addition_m_tag_num)
    actual_tags = encoder.encode_word(word)
    pytest.helpers.assert_sequence_equals(actual_tags, gold_tags)
Example #2
0
def evaluate_token_by_conll(input_file,
                            gold_column_index=1,
                            test_column_index=2):
    """Score predicted tags against gold tags read from ``input_file``.

    Args:
        input_file: Passed unchanged to ``read_conll``. Each sentence it
            yields is iterated as a sequence of per-token column lists.
        gold_column_index: Index of the column holding each token's gold tag.
        test_column_index: Index of the column holding each token's
            predicted tag.

    Returns:
        The value of ``OffsetEvaluator.get_score()`` after every sentence has
        been fed to the evaluator.
    """
    sentence_list = read_conll(input_file)
    decoder = BMESEncoderDecoder()
    evaluator = OffsetEvaluator()

    for sentence in sentence_list:
        # Pull the two tag columns out of the per-token rows of this sentence.
        gold_tag = [item_list[gold_column_index] for item_list in sentence]
        test_tag = [item_list[test_column_index] for item_list in sentence]

        # The evaluator is fed the decoded form of each tag sequence, gold
        # first and prediction second.
        gold_tag_offset = decoder.decode_tag(gold_tag)
        test_tag_offset = decoder.decode_tag(test_tag)

        evaluator.process_one_batch(gold_tag_offset, test_tag_offset)

    return evaluator.get_score()
Example #3
0
def evaluate_by_files_at_tag_level(test_file, gold_file):
    """Compare two whitespace-segmented text files tag by tag.

    Both files are read completely and split on whitespace into word lists
    (line boundaries are treated like any other whitespace). Each word list
    is encoded with ``BMESEncoderDecoder.encode_word_list_as_string`` and the
    two encodings are handed to a ``TagEvaluator``.

    Args:
        test_file: Path of the file holding the segmentation under test.
        gold_file: Path of the file holding the reference segmentation.

    Returns:
        The value of ``TagEvaluator.get_score()``.
    """
    with open(test_file) as test_fd:
        test_word_list = test_fd.read().split()

    with open(gold_file) as gold_fd:
        gold_word_list = gold_fd.read().split()

    encoder = BMESEncoderDecoder()
    tag_evaluator = TagEvaluator()

    test_tags = encoder.encode_word_list_as_string(test_word_list)
    gold_tags = encoder.encode_word_list_as_string(gold_word_list)

    # Gold goes first, prediction second.
    tag_evaluator.process_one_batch(gold_tags, test_tags)

    return tag_evaluator.get_score()
Example #4
0
import copy
from functools import reduce

import pycrfsuite
from tokenizer_tools.tagset.BMES import BMESEncoderDecoder

# Module-level BMES encoder/decoder instance, created once at import time.
tag_encoder_decoder = BMESEncoderDecoder()


class CRFTrainer:
    # Default trainer parameters. NOTE(review): presumably handed to the
    # pycrfsuite trainer; the code that consumes this dict is outside this
    # view, so confirm there.
    _default_params = {
        'c1': 0.1,  # coefficient for L1 penalty
        'c2': 0.01,  # coefficient for L2 penalty
        'max_iterations': 200,  # stop earlier

        # include transitions that are possible, but not observed
        'feature.possible_transitions': True
    }

    def __init__(self, char2feature_func=None):
        """Create the underlying pycrfsuite trainer and pick a feature function.

        Args:
            char2feature_func: Optional callable stored as
                ``self.char2feature_func``. When omitted (or falsy), the
                instance's ``_default_word2features`` is used instead.
        """
        self.crf_trainer = pycrfsuite.Trainer(verbose=False)

        # Always set the attribute: previously a caller-supplied function was
        # silently dropped, leaving ``self.char2feature_func`` undefined.
        if char2feature_func:
            self.char2feature_func = char2feature_func
        else:
            self.char2feature_func = self._default_word2features

    def train_one_raw_line(self, blank_splittable_string):
        """Split a whitespace-separated line and train on the resulting tokens.

        Args:
            blank_splittable_string: Text whose tokens are separated by
                whitespace; the token list goes to
                ``train_one_line_by_token``.
        """
        self.train_one_line_by_token(blank_splittable_string.split())

    def train_one_line_by_char_tag(self, char_list, tag_list):
Example #5
0
import numpy as np
import paddle.fluid as fluid
from tokenizer_tools.tagset.BMES import BMESEncoderDecoder

from paddle_tokenizer.data_reader import read_vocabulary

# Executor bound to a CPU place; used to load the model below.
exe = fluid.Executor(fluid.CPUPlace())
# Directory the inference model is loaded from.
path = "./test.inference.model"

# Load the saved inference model: the program to run, the names of its feed
# targets, and the targets to fetch.
[inference_program, feed_target_names,
 fetch_targets] = fluid.io.load_inference_model(dirname=path, executor=exe)

# Separate CPU place object (distinct from the one given to the executor).
place = fluid.CPUPlace()

# Module-level BMES encoder/decoder instance.
decoder = BMESEncoderDecoder()

# Vocabulary read from file, plus its inverse (value -> key) mapping.
vocabulary = read_vocabulary("data/unicode_char_list.txt")
reverse_vocabulary = {v: k for k, v in vocabulary.items()}

# Tag table read from file, plus its inverse (value -> key) mapping.
tag = read_vocabulary("data/tags.txt")
reverse_tag = {v: k for k, v in tag.items()}


def infer(data):
    word = fluid.create_lod_tensor([data], [[len(data)]], place)

    results, = exe.run(
        inference_program,
        feed={feed_target_names[0]: word},
        fetch_list=fetch_targets,
        return_numpy=False,
Example #6
0
from __future__ import print_function

import json
import requests

from tokenizer_tools.tagset.BMES import BMESEncoderDecoder

# Module-level BMES encoder/decoder instance, created once at import time.
bmes_decoder = BMESEncoderDecoder()


class HTTPClient(object):
    def __init__(self, host, model_name='seq2label', port=8501, https=False):
        self.server_url = '{protocol}://{host}:{port}/v1/models/{model_name}:predict'.format(
            protocol='https' if https else 'http',
            host=host,
            port=port,
            model_name=model_name
        )

    def segment(self, input_str):
        # Compose a JSON predict request with a single instance: the elements
        # of input_str as "words" and its length as "words_len".
        request_object = {
            "instances":
                [
                    {
                        "words": [i for i in input_str],
                        "words_len": len(input_str)
                    },
                ]
        }
        predict_request = json.dumps(request_object)
def test_decode_char_tag_pair(char_tag_pair, gold_word_list):
    """Decode ``char_tag_pair`` and compare the result to ``gold_word_list``.

    The comparison is delegated to ``pytest.helpers.assert_sequence_equals``.
    """
    decoded_words = BMESEncoderDecoder().decode_char_tag_pair(char_tag_pair)
    pytest.helpers.assert_sequence_equals(decoded_words, gold_word_list)
def test_encode_word_list_as_string(word_list, gold_str):
    """Encode ``word_list`` as a string and compare it to ``gold_str``.

    The comparison is delegated to ``pytest.helpers.assert_sequence_equals``.
    """
    encoded = BMESEncoderDecoder().encode_word_list_as_string(word_list)
    pytest.helpers.assert_sequence_equals(encoded, gold_str)
def test_encode_word_list(word_list, gold_tags_list):
    """Encode ``word_list`` and compare the result to ``gold_tags_list``.

    The comparison is delegated to ``pytest.helpers.assert_sequence_equals``.
    """
    encoded_tags = BMESEncoderDecoder().encode_word_list(word_list)
    pytest.helpers.assert_sequence_equals(encoded_tags, gold_tags_list)
def test_encode_word(input_word, output_tags):
    """Encode ``input_word`` and compare the result to ``output_tags``.

    The comparison is delegated to ``pytest.helpers.assert_sequence_equals``.
    """
    actual_tags = BMESEncoderDecoder().encode_word(input_word)
    pytest.helpers.assert_sequence_equals(actual_tags, output_tags)