Example #1
def create_new_corpus(data_dict, corpus_vol, **kwargs):
    new_corpus = Corpus([])
    sem_nums = kwargs['sem_nums']
    intents = data_dict.keys()
    if not corpus_vol:
        # nothing to generate
        return
    elif sem_nums > len(intents):
        # cannot sample more distinct intents than there are intents
        return
    else:
        for i in range(corpus_vol):
            intent_sam = set()
            while len(intent_sam) < sem_nums:
                intent_sam.add(random.choice(list(intents)))
            spanset = SpanSet()
            sentences = []
            start_position = 0
            for intent in list(intent_sam):
                if intent == 'noise':
                    txt = random.choice(list(data_dict[intent]))
                    sentences.append(txt)
                    start_position += len(txt)
                else:
                    txt = random.choice(list(data_dict[intent]))
                    sentences.append(txt)
                    spanset.append(
                        Span(start=start_position,
                             end=start_position + len(txt),
                             entity=intent))
                    start_position += len(txt)
            doc = Document(text=''.join(sentences),
                           label='|'.join(intent_sam),
                           span_set=spanset)
            new_corpus.append(doc)

    return new_corpus
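
All of the examples on this page combine the same four classes. A minimal sketch of the pattern, assuming the usual tokenizer_tools.tagset.offset module layout (only the sequence, span, and span_set import paths appear verbatim elsewhere on this page; the corpus and document paths are assumptions):

from tokenizer_tools.tagset.offset.corpus import Corpus
from tokenizer_tools.tagset.offset.document import Document
from tokenizer_tools.tagset.offset.span import Span
from tokenizer_tools.tagset.offset.span_set import SpanSet

# one annotated document: character offsets index into the text
doc = Document(
    text="看一下上海的天气。",
    label="weather",
    span_set=SpanSet([Span(start=3, end=5, entity="地点")]),
)

corpus = Corpus([doc])
corpus.write_to_file("./data/demo.conllx")  # serialize to .conllx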
Example #2
def two_add_link(map_data, file1, file2, link, domain):
    list1 = read_raw_data(file1)
    list2 = read_raw_data(file2)
    link_list = read_raw_data(link)
    len_all = max(len(list1), len(list2))
    name1 = os.path.basename(file1)[:-4]
    name2 = os.path.basename(file2)[:-4]
    doc_list = []
    dict_list = read_map(map_data)
    intent = dict_list[name1] + ": " + name1 + "||" + dict_list[name2] + ": " + name2
    # generate max(len(list1), len(list2)) pairs, each in both orders
    for i in range(0, len_all):
        l1 = choice(list1)
        l2 = choice(list2)
        l3 = choice(link_list)
        # line endings stay in the text but are excluded from span ends
        l1end = line_end_remove(l1)
        l2end = line_end_remove(l2)

        # file1 + link + file2
        doc1 = Document(l1 + l3 + l2)
        doc1.domain = domain
        doc1.intent = intent
        doc1.entities = SpanSet([
            Span(start=0, end=len(l1end), entity=name1),
            Span(start=len(l1 + l3),
                 end=len(l1 + l3 + l2end),
                 entity=name2),
        ])
        doc_list.append(doc1)

        # file2 + link + file1
        doc2 = Document(l2 + l3 + l1)
        doc2.domain = domain
        doc2.intent = intent
        doc2.entities = SpanSet([
            Span(start=0, end=len(l2end), entity=name2),
            Span(start=len(l2 + l3),
                 end=len(l2 + l3 + l1end),
                 entity=name1),
        ])
        doc_list.append(doc2)

    doc_list = list(set(doc_list))  # drop duplicate documents
    corpus = Corpus(doc_list)
    res_path = "./data/" + name1 + '-' + name2 + '-' + 'link' + ".conllx"
    corpus.write_to_file(res_path)
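
The corpus builders on this page (two_add_link, one_add_noise, one_to_conllx, ...) all lean on three helpers that are not part of the listing. A minimal sketch of plausible implementations, under the assumption that raw files hold one utterance per line and that map_data is a tab-separated name-to-label file (both formats are guesses from the call sites):

import os  # the builders above also rely on os.path.basename
from random import choice

def read_raw_data(path):
    # one raw sample per line; line endings are kept, matching the
    # len(l1) vs len(l1end) distinction in the builders above
    with open(path, encoding="utf-8") as f:
        return f.readlines()

def line_end_remove(line):
    # strip the trailing newline so span ends exclude it
    return line.rstrip("\n")

def read_map(path):
    # file stem -> human-readable intent label (assumed TSV format)
    mapping = {}
    with open(path, encoding="utf-8") as f:
        for raw in f:
            key, value = raw.rstrip("\n").split("\t")
            mapping[key] = value
    return mapping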
Example #3
def test_tensorflow_inference(datadir):
    # TODO(howl-anderson): skip this test until the model-file oversize issue is solved
    return  # everything below is intentionally unreachable for now

    workshop_dir = datadir

    model_dir = os.path.join(workshop_dir, "./saved_model")

    inference = TensorFlowInference(model_dir)
    result = inference.infer("看一下上海的天气。")

    print(result)

    expected = (
        "看一下上海的天气。",
        Sequence(
            text=["看", "一", "下", "上", "海", "的", "天", "气", "。"],
            span_set=SpanSet([Span(3, 5, "地点", value=None,
                                   normal_value=None)]),
            id=None,
            label=None,
            extra_attr={},
        ),
        ["O", "O", "O", "B-地点", "L-地点", "O", "O", "O", "O"],
        False,
    )

    assert expected == result
Example #4
def test_tensorflow_keras_h5_inference(datadir):
    workshop_dir = datadir

    model_file = os.path.join(workshop_dir, "./h5_model/model.h5")
    tag_lookup_file = os.path.join(workshop_dir,
                                   "./h5_model/tag_lookup_table.json")
    vocabulary_lookup_file = os.path.join(
        workshop_dir, "./h5_model/vocabulary_lookup_table.json")

    inference = Inference(model_file, tag_lookup_file, vocabulary_lookup_file)
    result = inference.infer("看一下上海的天气。")

    expected = (
        "看一下上海的天气。",
        Sequence(
            text=["看", "一", "下", "上", "海", "的", "天", "气", "。"],
            span_set=SpanSet(
                [Span(3, 5, "城市名", value=None, normal_value=None)]),
            id=None,
            label=None,
            extra_attr={},
        ),
        ["O", "O", "O", "B-城市名", "L-城市名", "O", "O", "O", "O"],
        False,
    )

    assert expected == result
Example #5
    def __init__(
        self,
        text: Union[List[str], str],
        span_set: SpanSet = None,
        id: Union[str, None] = None,
        label: Union[str, None] = None,
        extra_attr: Union[Mapping[str, Any], None] = None,
    ):
        # TODO:
        #   1. rename extra_attr to attr
        #   2. move label into attr
        #   3. span_set should be included in some column data

        # convert text from string to a list of characters, if needed
        if isinstance(text, str):
            text = list(text)

        self.text = text
        self._span_set = None
        self.span_set = span_set or SpanSet()
        self.id = id if id is not None else str(uuid.uuid4())
        self.label = label  # for future usage
        self.extra_attr = extra_attr if extra_attr else {}

        self._compare_method = None
        self._hash_method = None
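
A minimal usage sketch for this constructor (import paths as shown in Example #7; the printed values follow directly from the code above):

from tokenizer_tools.tagset.offset.sequence import Sequence
from tokenizer_tools.tagset.offset.span import Span
from tokenizer_tools.tagset.offset.span_set import SpanSet

# a plain string is split into a character list; id defaults to a fresh uuid4
seq = Sequence("看一下上海的天气。",
               span_set=SpanSet([Span(3, 5, "地点")]),
               label="weather")

print(seq.text)   # ['看', '一', '下', '上', '海', '的', '天', '气', '。']
print(seq.label)  # 'weather'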
Example #6
def test_span_set_bind():
    seq = Sequence("abce", span_set=SpanSet([Span(1, 2, '1-2')]))

    result = seq.span_set[0].value
    expected = ["b"]

    assert result == expected
Example #7
    def _turn_training_data_to_offset(training_data):
        from tokenizer_tools.tagset.offset.sequence import Sequence
        from tokenizer_tools.tagset.offset.span import Span
        from tokenizer_tools.tagset.offset.span_set import SpanSet

        for example in training_data.training_examples:
            span_set = SpanSet()

            text = [i for i in example.text]  # must be a list of str, not a str
            intent = example.get("intent")

            for ent in example.get("entities", []):
                start, end, entity = ent["start"], ent["end"], ent["entity"]

                span_set.append(Span(start, end, entity))

            seq = Sequence(text, span_set, label=intent)

            yield seq
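
A sketch of how this generator might be driven, assuming it is reachable as a plain function (it takes no self). FakeExample and FakeTrainingData are hypothetical stand-ins that model only the attributes the generator touches:

class FakeExample:
    def __init__(self, text, intent, entities):
        self.text = text
        self._data = {"intent": intent, "entities": entities}

    def get(self, key, default=None):
        return self._data.get(key, default)

class FakeTrainingData:
    def __init__(self, examples):
        self.training_examples = examples

data = FakeTrainingData([
    FakeExample("看一下上海的天气。", "weather",
                [{"start": 3, "end": 5, "entity": "地点"}]),
])

for seq in _turn_training_data_to_offset(data):
    print(seq.label, seq.span_set)  # weather, SpanSet with one 地点 span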
Example #8
def test_difference(datadir):
    corpus_one = Corpus.read_from_file(datadir / "corpus_one.conllx")
    corpus_two = Corpus.read_from_file(datadir / "corpus_two.conllx")

    result = corpus_one.difference(corpus_two)
    expected = Corpus([
        Document(
            "王小明在台北新竹的清华大学读书。",
            span_set=SpanSet(
                [Span(0, 3, "PERSON"),
                 Span(4, 8, "GPE"),
                 Span(9, 13, "ORG")]),
            id="3",
        )
    ])

    assert result == expected
Example #9
def one_add_noise(map_data, file1, noise, domain, pos=''):
    list1 = read_raw_data(file1)
    noise_list = read_raw_data(noise)
    len_all = max(len(list1), len(noise_list))

    dict_list = read_map(map_data)
    path1 = os.path.basename(file1)
    doc_list = []
    # generate max(len(list1), len(noise_list)) documents
    for i in range(0, len_all):
        l1 = choice(list1)
        l2 = choice(noise_list)
        l1end = line_end_remove(l1)
        l2end = line_end_remove(l2)
        if pos == 'before':
            l = l2 + l1
            span_list1 = [
                Span(start=len(l2), end=len(l1end + l2), entity=path1[:-4]),
            ]
        elif pos == 'after':
            l = l1 + l2
            span_list1 = [
                Span(start=0, end=len(l1end), entity=path1[:-4]),
            ]
        else:
            l = l1
            span_list1 = [
                Span(start=0, end=len(l1end), entity=path1[:-4]),
            ]

        doc1 = Document(l)
        doc1.domain = domain
        doc1.intent = dict_list[path1[:-4]] + ": " + path1[:-4]

        doc1.entities = SpanSet(span_list1)
        # print(doc1)
        doc_list.append(doc1)

    doc_list = list(set(doc_list))
    corpus = Corpus(doc_list)
    res_path = "./data/" + path1[:-4] + '-' + 'noise' + '_' + pos + ".conllx"
    corpus.write_to_file(res_path)
Example #10
def one_to_conllx(map_data, file1, domain):
    list1 = read_raw_data(file1)
    path1 = os.path.basename(file1)
    dict_list = read_map(map_data)
    doc_list = []
    # one document per raw line
    for i in list1:
        doc1 = Document(i)
        doc1.domain = domain
        doc1.intent = dict_list[path1[:-4]] + ": " + path1[:-4]
        stripped = line_end_remove(i)  # span end excludes the trailing newline
        span_list1 = [
            Span(start=0, end=len(stripped), entity=path1[:-4]),
        ]
        doc1.entities = SpanSet(span_list1)
        doc_list.append(doc1)

    corpus = Corpus(doc_list)
    res_path = "./data/" + path1[:-4] + ".conllx"
    corpus.write_to_file(res_path)
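
A hypothetical invocation of the builders above. The file names and the map format are assumptions; each .txt is expected to hold one raw utterance per line, and the output path is derived from the input file stem:

# writes ./data/weather.conllx, one document per line of weather.txt
one_to_conllx("map.txt", "weather.txt", "life")

# writes ./data/weather-noise_before.conllx, noise prepended to each sample
one_add_noise("map.txt", "weather.txt", "noise.txt", "life", pos="before")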
Example #11
def read_data(data_path, output_file):
    data_path = Path(data_path)

    doc_list = []

    for data_file in data_path.glob("*.json"):
        with jsonlines.open(str(data_file)) as reader:
            for obj in reader:
                text = [i for i in obj["content"]]
                doc = Document(text)
                doc.sub_function = obj["childFunction"]
                doc.domain = obj["domain"]
                doc.function = obj["function"]
                doc.intent = obj["intent"]

                span_list = []
                for entity in obj["marked"]:
                    record = entity["record"]
                    if not record:
                        continue

                    start = int(record[0])
                    end = int(record[-1]) + 1
                    entity_type = entity["titleIndex"]

                    span = Span(start, end, entity_type)

                    span_list.append(span)

                entities = SpanSet(span_list)

                doc.entities = entities

                doc_list.append(doc)

    corpus = Corpus(doc_list)
    corpus.write_to_file(output_file)
Example #12
def test_document_pattern():
    # construct a pattern object
    dp = DocumentPattern("name 的 goods".split())

    dp.entities = SpanSet([
        EntityPlaceholder(start=0, end=1, entity="name"),
        EntityPlaceholder(start=2, end=3, entity="goods"),
    ])

    # test if render method works

    doc = dp.render(name=["Real", "Name"], goods=["RealGoods"])

    expected_doc_snippet = "[Real Name](name) 的 [RealGoods](goods)"
    result_doc = str(doc)

    assert expected_doc_snippet in result_doc

    # make sure DocumentPattern is untouched

    expected_pattern_snippet = "<name> 的 <goods>"
    result_pattern = str(dp)

    assert expected_pattern_snippet in result_pattern
Example #13
def test_check_overlap():
    span_set = SpanSet()
    span_set.append(Span(1, 2, 'entity'))
    span_set.append(Span(2, 3, 'entity'))
    assert span_set.check_overlap()[0]  # adjacent spans do not overlap

    span_set = SpanSet()
    span_set.append(Span(1, 2, 'entity'))
    span_set.append(Span(4, 6, 'entity'))
    assert span_set.check_overlap()[0]  # disjoint spans do not overlap

    span_set = SpanSet()
    span_set.append(Span(1, 4, 'entity'))
    span_set.append(Span(2, 3, 'entity'))
    check_result = span_set.check_overlap()
    assert not check_result[0]  # nested spans overlap, so the check fails
    assert check_result[1] == [(Span(1, 4, 'entity'), Span(2, 3, 'entity'))]
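
The assertions above imply that check_overlap returns a (passed, overlapping_pairs) tuple. A small sketch of how calling code might consume it:

span_set = SpanSet()
span_set.append(Span(0, 3, "PERSON"))
span_set.append(Span(2, 5, "GPE"))  # overlaps the first span

passed, overlapping_pairs = span_set.check_overlap()
if not passed:
    for first, second in overlapping_pairs:
        print("overlap:", first, second)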
Example #14
def one_before_after_add_noise(map_data, file1, before_noise, after_noise,
                               domain):
    list1 = read_raw_data(file1)
    before_list = read_raw_data(before_noise)
    after_list = read_raw_data(after_noise)
    len_all = max(len(list1), len(before_list), len(after_list))

    dict_list = read_map(map_data)
    path1 = os.path.basename(file1)
    doc_list = []
    # generate max(len(list1), len(before_list), len(after_list)) rounds
    for i in range(0, len_all):
        l1 = choice(list1)
        before = choice(before_list)
        after = choice(after_list)

        l1end = line_end_remove(l1)

        # file1 alone
        text1 = l1
        doc1 = Document(text1)
        doc1.domain = domain
        doc1.intent = dict_list[path1[:-4]] + ": " + path1[:-4]
        span_list1 = [
            Span(start=0, end=len(l1end), entity=path1[:-4]),
        ]
        doc1.entities = SpanSet(span_list1)
        doc_list.append(doc1)

        # before + file1
        text2 = before + l1
        doc2 = Document(text2)
        doc2.domain = domain
        doc2.intent = dict_list[path1[:-4]] + ": " + path1[:-4]
        span_list2 = [
            Span(start=len(before), end=len(before + l1end),
                 entity=path1[:-4]),
        ]
        doc2.entities = SpanSet(span_list2)
        doc_list.append(doc2)

        # before + file1 + after
        text3 = before + l1 + after
        doc3 = Document(text3)
        doc3.domain = domain
        doc3.intent = dict_list[path1[:-4]] + ": " + path1[:-4]
        span_list3 = [
            Span(start=len(before), end=len(before + l1end),
                 entity=path1[:-4]),
        ]
        doc3.entities = SpanSet(span_list3)
        doc_list.append(doc3)

        # file1 + after
        text4 = l1 + after
        doc4 = Document(text4)
        doc4.domain = domain
        doc4.intent = dict_list[path1[:-4]] + ": " + path1[:-4]
        span_list4 = [
            Span(start=0, end=len(l1end), entity=path1[:-4]),
        ]
        doc4.entities = SpanSet(span_list4)
        doc_list.append(doc4)

    doc_list = list(set(doc_list))
    corpus = Corpus(doc_list)
    res_path = "./data/" + path1[:-4] + ".conllx"
    corpus.write_to_file(res_path)
Example #15
corpus = Corpus.read_from_file("./data/all_data.conllx")

list1 = list(corpus)  # materialize the corpus as a list of documents

len_all = len(list1)
doc_list = []
for i in range(0, len_all):
    l1 = choice(list1)
    len1 = len(l1.text)
    # spans from the first document keep their offsets
    span_list = list(l1.span_set)

    l2 = choice(list1)
    # spans from the second document shift right by the first text's length
    for span in l2.span_set:
        span_ll = Span(start=len1 + span.start,
                       end=len1 + span.end,
                       entity=span.entity)
        span_list.append(span_ll)

    text = "".join(l1.text) + "".join(l2.text)
    doc1 = Document(text)
    doc1.entities = SpanSet(span_list)
    doc1.domain = l1.domain
    doc_list.append(doc1)

doc_list = list(set(doc_list))
corpus = Corpus(doc_list)
corpus.write_to_file('./data/data_all.conllx')
Example #16
def test_check_match():
    span_set = SpanSet()
    span_set.append(Span(1, 2, 'entity', '春'))
    span_set.append(Span(2, 3, 'entity', '秋'))
    assert span_set.check_match('赛春秋')[0]  # every span value matches its slice

    span_set = SpanSet()
    span_set.append(Span(1, 2, 'entity', '春'))
    span_set.append(Span(4, 6, 'entity', '秋天'))
    assert span_set.check_match('赛春秋赛秋天')[0]

    span_set = SpanSet()
    span_set.append(Span(1, 4, 'entity', '赛春秋'))
    span_set.append(Span(2, 3, 'entity', '春'))
    assert span_set.check_match('赛赛春秋')[0]

    span_set = SpanSet()
    span_set.append(Span(1, 4, 'entity', '赛春秋'))
    span_set.append(Span(2, 3, 'entity', '春'))
    check_result = span_set.check_match('不不不不')
    assert not check_result[0]  # no span value matches, so the check fails
    assert check_result[1] == [
        Span(1, 4, 'entity', '赛春秋'),
        Span(2, 3, 'entity', '春')
    ]
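
check_match complements check_overlap: rather than comparing span ranges against each other, it verifies each span's stored value against the corresponding slice of the text and returns (passed, mismatched_spans). A minimal sketch:

span_set = SpanSet()
span_set.append(Span(3, 5, "地点", "上海"))

passed, mismatched = span_set.check_match("看一下上海的天气。")
assert passed  # "看一下上海的天气。"[3:5] == "上海"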
Example #17
def two_add_before_link_after(map_data, file1, file2, before, link, after,
                              domain):
    list1 = read_raw_data(file1)
    list2 = read_raw_data(file2)
    before_list = read_raw_data(before)
    link_list = read_raw_data(link)
    after_list = read_raw_data(after)
    len_all = min(len(list1), len(list2))

    name1 = os.path.basename(file1)[:-4]
    name2 = os.path.basename(file2)[:-4]
    doc_list = []
    dict_list = read_map(map_data)
    intent = dict_list[name1] + ": " + name1 + "||" + dict_list[name2] + ": " + name2

    # generate min(len(list1), len(list2)) rounds; each round emits all 16
    # combinations of optional prefix noise, connector/infix noise between
    # the two samples, and suffix noise
    for i in range(0, len_all):
        l1 = choice(list1)
        l2 = choice(list2)
        link_txt = choice(link_list)
        before_txt = choice(before_list)
        before2_txt = choice(before_list)
        after_txt = choice(after_list)

        # line endings stay in the text but are excluded from span ends
        l1end = line_end_remove(l1)
        l2end = line_end_remove(l2)

        for prefix in ("", before_txt):
            # a fresh noise line is used between the files when one
            # already prefixes the document
            infix_noise = before_txt if prefix == "" else before2_txt
            for mid in ("", link_txt, infix_noise, link_txt + infix_noise):
                for suffix in ("", after_txt):
                    doc = Document(prefix + l1 + mid + l2 + suffix)
                    doc.domain = domain
                    doc.intent = intent
                    doc.entities = SpanSet([
                        Span(start=len(prefix),
                             end=len(prefix + l1end),
                             entity=name1),
                        Span(start=len(prefix + l1 + mid),
                             end=len(prefix + l1 + mid + l2end),
                             entity=name2),
                    ])
                    doc_list.append(doc)

    doc_list = list(set(doc_list))  # drop duplicate documents
    corpus = Corpus(doc_list)
    res_path = "./data/" + name1 + '-' + name2 + ".conllx"
    corpus.write_to_file(res_path)
Example #18
def fasttext_parser(data_generator_func):
    for sentence in data_generator_func():
        # fastText expects labels with the "__label__" prefix
        data = '__label__{}  {}'.format(sentence.label, ' '.join(sentence.text))
        yield data


if __name__ == "__main__":
    from uuid import UUID

    from tokenizer_tools.tagset.offset.sequence import Sequence
    from tokenizer_tools.tagset.offset.span import Span
    from tokenizer_tools.tagset.offset.span_set import SpanSet

    data = [
        Sequence(text='我要听周杰伦的青花瓷', span_set=SpanSet([Span(7, 10, '地点')]),
                 id=UUID('59139985-367e-44c3-8540-b6340d07f79e'), label='媒体'),
        Sequence(text='我要听周杰伦的夜曲', span_set=SpanSet([Span(7, 9, '地点')]),
                 id=UUID('59139985-367e-44c3-8540-b6340d07f79e'), label='媒体')
    ]

    def faked_data_generator_func():
        return data

    result = fasttext_parser(faked_data_generator_func)

    for i in result:
        print(i)
Example #19
def test_eq_():
    a = SpanSet()
    a.append(Span(1, 2, 'entity'))
    a.append(Span(2, 3, 'entity'))

    b = SpanSet()
    b.append(Span(1, 2, 'entity'))
    b.append(Span(2, 3, 'entity'))

    assert a == b

    c = SpanSet()  # empty SpanSet

    assert a != c

    d = SpanSet()  # same as `a` but with a different span order
    d.append(Span(2, 3, 'entity'))
    d.append(Span(1, 2, 'entity'))

    assert a == d

    e = SpanSet()  # different spans from `a`
    e.append(Span(0, 1, 'entity'))
    e.append(Span(1, 2, 'entity'))

    assert a != e
Example #20
def test_express_pattern(datadir):
    corpus = Corpus.read_from_file(datadir / "corpus.conllx")

    express_pattern = ExpressPattern(corpus)
    result = express_pattern.compute()

    expected = {
        ("<PERSON>", "在", "<GPE>", "的", "<ORG>", "读", "书", "。"): [
            Document(
                text=list("王小明在北京的清华大学读书。"),
                span_set=SpanSet([
                    Span(0, 3, "PERSON", value=None, normal_value=None),
                    Span(4, 6, "GPE", value=None, normal_value=None),
                    Span(7, 11, "ORG", value=None, normal_value=None),
                ]),
                id="1",
                label=None,
                extra_attr={},
            ),
            Document(
                text=list("王小明在台北新竹的清华大学读书。"),
                span_set=SpanSet([
                    Span(0, 3, "PERSON", value=None, normal_value=None),
                    Span(4, 8, "GPE", value=None, normal_value=None),
                    Span(9, 13, "ORG", value=None, normal_value=None),
                ]),
                id="3",
                label=None,
                extra_attr={},
            ),
        ],
        ("来", "一", "首", "<歌手名>", "的", "歌", "。"): [
            Document(
                text=list("来一首蓝泽雨的歌。"),
                span_set=SpanSet([Span(3, 6, "歌手名", value=None, normal_value=None)]),
                id="2",
                label=None,
                extra_attr={},
            )
        ],
    }

    assert result == expected