Example #1
import random

# Corpus, Document, Span, and SpanSet are assumed to come from
# tokenizer_tools.tagset.offset (see Example #3 for the Span/SpanSet imports).


def create_new_corpus(data_dict, corpus_vol, **kwargs):
    """Build a synthetic corpus of `corpus_vol` documents, each made of
    `sem_nums` randomly sampled intents from `data_dict`."""
    new_corpus = Corpus([])
    sem_nums = kwargs['sem_nums']
    intents = data_dict.keys()

    # Return the (empty) corpus rather than a bare `return`, so callers
    # always receive a Corpus.
    if not corpus_vol or sem_nums > len(intents):
        return new_corpus

    for _ in range(corpus_vol):
        # Sample `sem_nums` distinct intents for this document.
        intent_sam = set()
        while len(intent_sam) < sem_nums:
            intent_sam.add(random.choice(list(intents)))

        spanset = SpanSet()
        sentences = []
        start_position = 0
        intent_list = list(intent_sam)
        for intent in intent_list:
            txt = random.choice(list(data_dict[intent]))
            sentences.append(txt)
            # 'noise' text contributes to the document but gets no span.
            if intent != 'noise':
                spanset.append(
                    Span(start=start_position,
                         end=start_position + len(txt),
                         entity=intent))
            start_position += len(txt)

        # Join the same ordering used for the spans, so the label order
        # matches the span order.
        doc = Document(text=''.join(sentences),
                       label='|'.join(intent_list),
                       span_set=spanset)
        new_corpus.append(doc)

    return new_corpus
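
A quick usage sketch, assuming a toy `data_dict` that maps intent names to candidate sentences; the intents and sentences below are made up for illustration, 'noise' is the key the function treats as span-less filler, and `Document` is assumed to expose its constructor arguments as attributes:

# Illustrative input only: intent name -> candidate sentences.
data = {
    'greet': ['hello there', 'hi'],
    'bye': ['goodbye', 'see you later'],
    'noise': ['umm', 'well'],
}

corpus = create_new_corpus(data, corpus_vol=5, sem_nums=2)
for doc in corpus:
    print(doc.text, doc.label, doc.span_set)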
Example #2
def test_eq_():
    a = SpanSet()
    a.append(Span(1, 2, 'entity'))
    a.append(Span(2, 3, 'entity'))

    b = SpanSet()
    b.append(Span(1, 2, 'entity'))
    b.append(Span(2, 3, 'entity'))

    assert a == b

    c = SpanSet()  # empty SpanSet

    assert a != c

    d = SpanSet()  # same spans as `a`, appended in a different order
    d.append(Span(2, 3, 'entity'))
    d.append(Span(1, 2, 'entity'))

    assert a == d

    e = SpanSet()  # different spans from `a`
    e.append(Span(0, 1, 'entity'))
    e.append(Span(1, 2, 'entity'))

    assert a != e
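
These assertions imply that SpanSet equality is order-insensitive: `a == d` holds even though the spans were appended in a different order. A minimal sketch of that comparison logic, written as a standalone helper rather than the library's actual `__eq__`:

def spansets_equal(a, b):
    # Order-insensitive comparison: treat each SpanSet as a multiset of
    # (start, end, entity) triples; the attribute names mirror the Span
    # constructor arguments used above.
    def key(s):
        return (s.start, s.end, s.entity)
    return sorted(a, key=key) == sorted(b, key=key)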
Example #3
    def _turn_training_data_to_offset(training_data):
        from tokenizer_tools.tagset.offset.sequence import Sequence
        from tokenizer_tools.tagset.offset.span import Span
        from tokenizer_tools.tagset.offset.span_set import SpanSet

        for example in training_data.training_examples:
            span_set = SpanSet()

            text = list(example.text)  # Sequence expects a list of characters, not a str
            intent = example.get("intent")

            for ent in example.get("entities", []):
                start, end, entity = ent["start"], ent["end"], ent["entity"]

                span_set.append(Span(start, end, entity))

            seq = Sequence(text, span_set, label=intent)

            yield seq
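
The `example` objects here look like Rasa NLU training messages (a `.text` attribute plus a dict-style `.get` for 'intent' and 'entities'). A hedged sketch of driving the generator with hand-built stand-ins, assuming the function is reachable at module level:

from types import SimpleNamespace


class _FakeExample:
    # Minimal stand-in for a training example: .text plus dict-style .get.
    def __init__(self, text, intent, entities):
        self.text = text
        self._data = {'intent': intent, 'entities': entities}

    def get(self, key, default=None):
        return self._data.get(key, default)


fake_data = SimpleNamespace(training_examples=[
    _FakeExample('赛春秋', 'poetry',
                 [{'start': 1, 'end': 3, 'entity': 'work'}]),
])

for seq in _turn_training_data_to_offset(fake_data):
    print(seq)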
def test_check_overlap():
    # check_overlap() returns a (passed, overlapping_pairs) tuple;
    # passed is True when no two spans overlap.
    span_set = SpanSet()
    span_set.append(Span(1, 2, 'entity'))
    span_set.append(Span(2, 3, 'entity'))
    assert span_set.check_overlap()[0]  # adjacent spans do not overlap

    span_set = SpanSet()
    span_set.append(Span(1, 2, 'entity'))
    span_set.append(Span(4, 6, 'entity'))
    assert span_set.check_overlap()[0]  # disjoint spans do not overlap

    span_set = SpanSet()
    span_set.append(Span(1, 4, 'entity'))
    span_set.append(Span(2, 3, 'entity'))
    check_result = span_set.check_overlap()
    assert not check_result[0]  # Span(2, 3) lies inside Span(1, 4)
    assert check_result[1] == [(Span(1, 4, 'entity'), Span(2, 3, 'entity'))]
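
From these assertions, `check_overlap()` returns a `(passed, overlapping_pairs)` tuple, where `passed` is True when no two spans overlap (end offsets are exclusive, so Span(1, 2) and Span(2, 3) merely touch). A sketch of equivalent logic, not necessarily the library's implementation:

import itertools


def check_overlap_sketch(spans):
    # Half-open [start, end) intervals overlap when each starts before
    # the other ends.
    pairs = [(x, y) for x, y in itertools.combinations(spans, 2)
             if x.start < y.end and y.start < x.end]
    return (not pairs, pairs)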
def test_check_match():
    # check_match(text) returns a (passed, mismatched_spans) tuple;
    # passed is True when every span's stored string equals the slice
    # of `text` at its offsets.
    span_set = SpanSet()
    span_set.append(Span(1, 2, 'entity', '春'))
    span_set.append(Span(2, 3, 'entity', '秋'))
    assert span_set.check_match('赛春秋')[0]

    span_set = SpanSet()
    span_set.append(Span(1, 2, 'entity', '春'))
    span_set.append(Span(4, 6, 'entity', '秋天'))
    assert span_set.check_match('赛春秋赛秋天')[0]

    # Nested spans are fine as long as each one matches its own slice.
    span_set = SpanSet()
    span_set.append(Span(1, 4, 'entity', '赛春秋'))
    span_set.append(Span(2, 3, 'entity', '春'))
    assert span_set.check_match('赛赛春秋')[0]

    span_set = SpanSet()
    span_set.append(Span(1, 4, 'entity', '赛春秋'))
    span_set.append(Span(2, 3, 'entity', '春'))
    check_result = span_set.check_match('不不不不')
    assert not check_result[0]  # neither span matches '不不不不'
    assert check_result[1] == [
        Span(1, 4, 'entity', '赛春秋'),
        Span(2, 3, 'entity', '春')
    ]
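
Similarly, `check_match(text)` returns a `(passed, mismatched_spans)` tuple: each span carries its expected surface string (the fourth constructor argument) and passes when that string equals the slice of `text` at its offsets. A sketch of that check, assuming the stored string is exposed as a `value` attribute (the attribute name is an assumption):

def check_match_sketch(spans, text):
    # A span matches when text[start:end] equals its stored string
    # (exposed here as `value`, an assumed attribute name).
    mismatched = [s for s in spans if text[s.start:s.end] != s.value]
    return (not mismatched, mismatched)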