Ejemplo n.º 1
0
 def __read_conllu_vocabs(file: Path, *args, **kwargs) -> Tuple[Vocab, Vocab]:
     """Build word and tag vocabularies from the token lines of a file.

     Lines that are blank or start with ``#`` (after stripping whitespace)
     are skipped. Every other line is stripped, split on tab characters and
     unpacked into a ``CoNLLUToken``; the token's ``form`` goes into the word
     vocabulary and its ``upos`` into the tag vocabulary.

     Args:
         file: Path of the file to read.
         *args: Accepted but not used by this function.
         **kwargs: Forwarded unchanged to ``open`` (e.g. ``encoding``).

     Returns:
         The ``(word_vocab, tag_vocab)`` pair.
     """
     words = Vocab()
     tags = Vocab()
     with open(file, **kwargs) as handle:
         for raw_line in handle:
             stripped = raw_line.strip()
             # Only non-empty, non-comment lines carry a token.
             if stripped and not stripped.startswith("#"):
                 token = CoNLLUToken(*stripped.split("\t"))
                 words += token.form
                 tags += token.upos
     return words, tags
Ejemplo n.º 2
0
def test_unknown_token_in_example_yields_unk_udposdataset():
    """An unseen form yields the word unk index; a known tag does not yield the tag unk index."""
    unseen_form, known_tag = "Pierre", "B-PER"
    words = Vocab()
    tags = Vocab()
    assert unseen_form not in words
    # Only the tag is registered, so only the word side should fall back to unk.
    tags += known_tag
    token = CoNLL2003Token(unseen_form, "", "", known_tag)
    dataset = CoNLL2003NERDataset([CoNLL2003Example([token])], words, tags)
    token_vector, tag_vector = dataset[0]
    assert words.unk_index in token_vector
    assert tags.unk_index not in tag_vector
Ejemplo n.º 3
0
def test_unknown_token_and_tag_in_example_yields_unk():
    """With empty vocabularies, both the form and the tag yield their unk index."""
    unseen_form, unseen_tag = "Pierre", "B-PER"
    words = Vocab()
    tags = Vocab()
    assert unseen_form not in words
    assert unseen_tag not in tags
    token = CoNLL2003Token(unseen_form, "", "", unseen_tag)
    dataset = CoNLL2003Dataset(
        [CoNLL2003Example([token])],
        words,
        tags,
        lambda t: t.entity_tag,
    )
    token_vector, tag_vector = dataset[0]
    assert words.unk_index in token_vector
    assert tags.unk_index in tag_vector
Ejemplo n.º 4
0
def test_uddataset_from_file(tmp_path, example_dataset, cls, func):
    """Loading the example dataset through ``cls.from_file`` gives two items.

    ``func`` is passed as an extra positional argument only when it is
    truthy; otherwise ``from_file`` is called without it.
    """
    words = Vocab()
    tags = Vocab()

    path = tmp_path / "tmp.txt"
    path.write_text(example_dataset, encoding="utf-8")

    # Empty tuple when no function is supplied, so the call omits it.
    extra_args = (func,) if func else ()
    dataset = cls.from_file(path, words, tags, *extra_args, encoding="utf-8")

    assert len(dataset) == 2
Ejemplo n.º 5
0
def test_unknown_token_and_tag_in_example_yields_unk():
    """With empty vocabularies, both the form and the tag yield their unk index."""
    unseen_form, unseen_pos = "Pierre", "NNP"
    words = Vocab()
    tags = Vocab()
    assert unseen_form not in words
    assert unseen_pos not in tags
    token = CoNLLUToken(1, unseen_form, "", unseen_pos, "", "", 0, "", "", "")
    dataset = UDDataset([CoNLLUExample([token])], words, tags, lambda t: t.upos)
    token_vector, tag_vector = dataset[0]
    assert words.unk_index in token_vector
    assert tags.unk_index in tag_vector
Ejemplo n.º 6
0
def test_unknown_token_in_example_yields_unk_udposdataset():
    """An unseen form yields the word unk index; a known tag does not yield the tag unk index."""
    unseen_form, known_pos = "Pierre", "NNP"
    words = Vocab()
    tags = Vocab()
    assert unseen_form not in words
    # Only the tag is registered, so only the word side should fall back to unk.
    tags += known_pos
    token = CoNLLUToken(1, unseen_form, "", known_pos, "", "", 0, "", "", "")
    dataset = UDUPOSDataset([CoNLLUExample([token])], words, tags)
    token_vector, tag_vector = dataset[0]
    assert words.unk_index in token_vector
    assert tags.unk_index not in tag_vector
Ejemplo n.º 7
0
def tag_vocab():
    """Return a vocabulary with "ODD" and "EVEN" added, in that order."""
    tags = Vocab()
    for label in ("ODD", "EVEN"):
        tags += label
    return tags
Ejemplo n.º 8
0
def word_vocab():
    """Return a vocabulary with the strings "0" through "10" added in order."""
    words = Vocab()
    for digits in map(str, range(11)):
        words += digits
    return words
Ejemplo n.º 9
0
def test_uddataset_len_is_len_examples(examples):
    """The dataset must report as many items as the examples it was built from."""
    word_vocab = Vocab()
    tag_vocab = Vocab()
    ds = UDDataset(examples, word_vocab, tag_vocab, lambda t: t.upos)
    # Compare against the input examples: comparing len(ds) with itself is
    # always true and would pass for any dataset implementation.
    assert len(ds) == len(examples)
Ejemplo n.º 10
0
def tag_vocab():
    """Return a vocabulary with "EVEN" and "ODD" added, in that order."""
    tags = Vocab()
    for label in ("EVEN", "ODD"):
        tags += label
    return tags
Ejemplo n.º 11
0
def word_vocab():
    """Return a vocabulary with the strings "0" through "10" added in order."""
    words = Vocab()
    for digits in map(str, range(11)):
        words += digits
    return words
Ejemplo n.º 12
0
def test_uddataset_len_is_len_examples(examples):
    """The dataset must report as many items as the examples it was built from."""
    word_vocab = Vocab()
    tag_vocab = Vocab()
    ds = CoNLL2003NERDataset(examples, word_vocab, tag_vocab, lambda t: t.entity_tag)
    # Compare against the input examples: comparing len(ds) with itself is
    # always true and would pass for any dataset implementation.
    assert len(ds) == len(examples)
Ejemplo n.º 13
0
def nonempty_vocab(example_embedding):
    """Return a vocabulary holding the first whitespace-separated field of the first embedding entry."""
    first_entry = example_embedding[0]
    first_field = first_entry.split()[0]
    vocab = Vocab()
    vocab += first_field
    return vocab
Ejemplo n.º 14
0
def empty_vocab(example_embedding):
    """Return a freshly constructed vocabulary; ``example_embedding`` is not read."""
    vocab = Vocab()
    return vocab