def conllz_to_offset(sentence_data: Sentence,
                     raise_exception=False,
                     attr_index=0) -> Tuple[Document, bool]:
    decoder = BILUOSequenceEncoderDecoder()

    input_text = sentence_data.word_lines
    tags_seq = sentence_data.get_attribute_by_index(attr_index)

    failed = False
    meta = copy.deepcopy(sentence_data.meta)

    try:
        seq = decoder.to_offset(tags_seq,
                                input_text,
                                label=meta.pop('label', None),
                                id=sentence_data.id,
                                **meta)
    except TagSetDecodeError as e:
        if not raise_exception:
            # an invalid tag sequence raises TagSetDecodeError,
            # so return an empty result instead
            seq = Document(input_text)
            failed = True
        else:
            raise

    return seq, failed
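
# For reference, a minimal, library-free sketch of what BILUO-to-offset
# decoding does. This is a simplified stand-in for decoder.to_offset, not
# the tokenizer_tools implementation:
def biluo_to_spans(tags):
    """Decode a BILUO tag sequence into (start, end, label) spans."""
    spans = []
    start = None
    for index, tag in enumerate(tags):
        prefix, _, label = tag.partition('-')
        if prefix == 'U':  # single-token entity
            spans.append((index, index + 1, label))
        elif prefix == 'B':  # entity opens
            start = index
        elif prefix == 'L' and start is not None:  # entity closes
            spans.append((start, index + 1, label))
            start = None
        # 'I' continues an open entity; 'O' is outside any entity
    return spans


assert biluo_to_spans(['B-PER', 'I-PER', 'L-PER', 'O', 'U-LOC']) == [
    (0, 3, 'PER'), (4, 5, 'LOC')]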
Example #2
def evaluate_NER_by_conll(input_file,
                          gold_column_index=1,
                          test_column_index=2):
    sentence_list = read_conll(input_file)
    decoder = BILUOSequenceEncoderDecoder()

    gold_tag_list = []
    test_tag_list = []
    for sentence in sentence_list:

        sentence_gold_tag = []
        sentence_test_tag = []
        for item_list in sentence:
            sentence_gold_tag.append(item_list[gold_column_index])
            sentence_test_tag.append(item_list[test_column_index])

        gold_tag_list.append(sentence_gold_tag)
        test_tag_list.append(sentence_test_tag)

    evaluator = OffsetEvaluator()

    for gold_tag, test_tag in zip(gold_tag_list, test_tag_list):
        gold_tag_offset = decoder.decode_to_offset(gold_tag)
        test_tag_offset = decoder.decode_to_offset(test_tag)

        evaluator.process_one_batch(gold_tag_offset, test_tag_offset)

    metrics = evaluator.get_score()
    return metrics
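
# At its core the evaluator's score is span-level precision/recall/F1 over
# the decoded offsets. A minimal sketch (hypothetical; OffsetEvaluator may
# track more state than this):
def span_prf1(gold_spans, pred_spans):
    gold, pred = set(gold_spans), set(pred_spans)
    true_positive = len(gold & pred)  # exact-match spans only
    precision = true_positive / len(pred) if pred else 0.0
    recall = true_positive / len(gold) if gold else 0.0
    f1 = (2 * precision * recall / (precision + recall)
          if precision + recall else 0.0)
    return {'precision': precision, 'recall': recall, 'f1': f1}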
Example #3
def test_is_prefix_legal():
    decoder = BILUOSequenceEncoderDecoder()

    assert decoder.is_prefix_legal('B', 'I') is True
    assert decoder.is_prefix_legal('F', 'G') is False
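
# A sketch of the transition rules such a check can encode. This table is an
# assumption about the BILUO scheme, not the library's actual implementation:
LEGAL_NEXT_PREFIX = {
    'B': {'I', 'L'},  # an opened entity must continue (I) or close (L)
    'I': {'I', 'L'},
    'L': {'B', 'U', 'O'},  # after a close, only a new entity or O may follow
    'U': {'B', 'U', 'O'},
    'O': {'B', 'U', 'O'},
}


def is_prefix_legal(previous, current):
    # unknown prefixes such as 'F' or 'G' are never legal
    return current in LEGAL_NEXT_PREFIX.get(previous, set())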
Example #5
    def load(cls, parameter: dict, asset_dir) -> "ProcessorBase":
        from tokenizer_tools.tagset.NER.BILUO import BILUOSequenceEncoderDecoder

        decoder = BILUOSequenceEncoderDecoder()

        return cls(decoder)
Example #6
import pytest


def test_parse_tag():
    decoder = BILUOSequenceEncoderDecoder()

    assert decoder.parse_tag('O') == ('O', None)
    assert decoder.parse_tag('B-I') == ('B', 'I')

    # an illegal prefix must raise ValueError
    with pytest.raises(ValueError):
        decoder.parse_tag('H-Z')
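
# A minimal sketch of what parse_tag has to do, judging from the assertions
# above (hypothetical; the library may validate differently):
def parse_tag(tag):
    if tag == 'O':
        return 'O', None
    prefix, _, label = tag.partition('-')
    if prefix not in {'B', 'I', 'L', 'U'} or not label:
        raise ValueError("{!r} is not a legal BILUO tag".format(tag))
    return prefix, label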
Example #7
class Inference(object):
    def __init__(self, model_path):
        # load model
        self.place = fluid.CPUPlace()
        self.exe = fluid.Executor(self.place)
        [self.inference_program, self.feed_target_names, self.fetch_targets] = fluid.io.load_inference_model(dirname=model_path, executor=self.exe)

        # load vocabulary
        self.vocabulary = read_vocabulary(os.path.join(model_path, 'data/vocabulary.txt'))
        self.tag = read_vocabulary(os.path.join(model_path, 'data/tags.txt'))

        # prepare tag set decoder
        self.decoder = BILUOSequenceEncoderDecoder()

    def infer(self, input_text):
        data = [self.vocabulary.lookup(i) for i in input_text]
        print(data)

        word = fluid.create_lod_tensor([data], [[len(data)]], self.place)

        results, = self.exe.run(
            self.inference_program,
            feed={self.feed_target_names[0]: word},
            fetch_list=self.fetch_targets,
            return_numpy=False
        )

        # translate to str list
        result = np.array(results).reshape([-1]).tolist()
        print(result)
        output_tag = [self.tag.id_to_str(i) for i in result]
        print(output_tag)

        # decode to offset
        result = self.decoder.to_offset(output_tag, input_text)

        return result
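
# Hypothetical usage of the class above; the model directory is an assumption
# and must contain the files load_inference_model and read_vocabulary expect:
inference = Inference('./ner_model')
result = inference.infer('王小明在北京读书。')
print(result)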
Example #8
import sys

from flask import Flask, request, jsonify
from flask_cors import CORS
from tokenizer_tools.tagset.NER.BILUO import BILUOSequenceEncoderDecoder
from hanzi_char_lookup_feature import load_trie_from_files, generate_lookup_feature

decoder = BILUOSequenceEncoderDecoder()

app = Flask(__name__)
app.config['JSON_AS_ASCII'] = False
# app.config['DEBUG'] = True
CORS(app)

from tensorflow.contrib import predictor

predict_fn = None

t = None


def load_t(t_data):
    global t
    t = load_trie_from_files({'person': [t_data]})


def load_predict_fn(export_dir):
    global predict_fn
    predict_fn = predictor.from_saved_model(export_dir)

    return predict_fn
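
# The snippet above stops before defining any route. A hypothetical endpoint
# wiring the predictor to the decoder might look like this; the route name,
# query parameter, and response shape are assumptions, while the feed keys
# ('words', 'words_len') and span attributes mirror the other examples:
@app.route('/parse')
def parse():
    text = request.args.get('q', '')
    predictions = predict_fn({'words': [list(text)], 'words_len': [len(text)]})
    tags = [tag.decode() for tag in predictions['tags'][0]]
    spans = decoder.decode_to_offset(tags)
    return jsonify({'text': text, 'spans': [
        {'entity': span.entity, 'start': span.start, 'end': span.end}
        for span in spans
    ]})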
Example #9
    def process(self, message: Message, **kwargs: Any) -> None:
        from tokenizer_tools.tagset.NER.BILUO import BILUOSequenceEncoderDecoder
        from tokenizer_tools.tagset.offset.sequence import Sequence

        decoder = BILUOSequenceEncoderDecoder()

        real_result_dir = os.path.join(self.model_dir, self.result_dir)
        print(real_result_dir)

        input_text = message.text

        input_feature = {
            'words': [[i for i in input_text]],
            'words_len': [len(input_text)],
        }

        print(input_feature)

        predictions = self.predict_fn(input_feature)
        tags = predictions['tags'][0]
        # print(predictions['tags'])

        # decode Unicode
        tags_seq = [i.decode() for i in tags]

        print(tags_seq)

        # BILUO to offset
        failed = False
        try:
            seq = decoder.to_offset(tags_seq, input_text)
        except Exception as e:
            # an invalid tag sequence raises an exception,
            # so return an empty result instead
            logger.error("Decode error: {}".format(e))
            seq = Sequence(input_text)
            failed = True
        # print(seq)

        print(seq, tags_seq, failed)

        entity_set = []

        seq.span_set.fill_text(input_text)

        for span in seq.span_set:
            ent = {
                "entity": span.entity,
                "value": span.value,
                "start": span.start,
                "confidence": None,
                "end": span.end
            }

            entity_set.append(ent)

        extracted = self.add_extractor_name(entity_set)

        message.set("entities",
                    message.get("entities", []) + extracted,
                    add_to_output=True)
Example #10
def test_to_offset():
    decoder = BILUOSequenceEncoderDecoder()
    result = decoder.to_offset(['B-I'], 'sadines')
    print(result)
Example #11
def test_decode_to_offset():
    sequence = ['B-L']
    decoder = BILUOSequenceEncoderDecoder()
    result = decoder.decode_to_offset(sequence)
    assert len(result) == 0
Example #12
from tensorflow.contrib import predictor

from tokenizer_tools.tagset.NER.BILUO import BILUOSequenceEncoderDecoder

decoder = BILUOSequenceEncoderDecoder()

export_dir = 'results/saved_model/1542732555'

predict_fn = predictor.from_saved_model(export_dir)

text_msg = "王小明在北京的情话大学读书。"

predictions = predict_fn(
    {
        'words': [[i for i in text_msg]],
        'words_len': [len(text_msg)]
    }
)
print(predictions['tags'])

tags_seq = [i.decode() for i in predictions['tags'][0]]

word_list = decoder.decode_to_offset(tags_seq)
print(word_list)
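
# Hypothetical follow-up: slice the input with the decoded offsets to recover
# the surface strings (span attribute names assumed from the examples above):
for span in word_list:
    print(span.start, span.end, text_msg[span.start:span.end])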