def __read_conllu_vocabs(file: Path, *args, **kwargs) -> Tuple[Vocab, Vocab]:
    """Collect word and UPOS tag vocabularies from a CoNLL-U file."""
    word_vocab, tag_vocab = Vocab(), Vocab()
    with open(file, **kwargs) as f:
        for line in f:
            line = line.strip()
            # Skip blank lines and CoNLL-U comment lines.
            if not line or line.startswith("#"):
                continue
            tok = CoNLLUToken(*line.split("\t"))
            word_vocab += tok.form
            tag_vocab += tok.upos
    return word_vocab, tag_vocab

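# A minimal usage sketch for the helper above. "train.conllu" is a
# hypothetical path; Vocab membership via `in` is assumed from the tests below.
#
#     word_vocab, tag_vocab = __read_conllu_vocabs(
#         Path("train.conllu"), encoding="utf-8"
#     )
#     assert "Pierre" in word_vocab
#     assert "NNP" in tag_vocab
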
def test_unknown_token_in_example_yields_unk_conll2003nerdataset():
    word_vocab = Vocab()
    tag_vocab = Vocab()
    form = "Pierre"
    entity_tag = "B-PER"
    assert form not in word_vocab
    tag_vocab += entity_tag
    examples = [CoNLL2003Example([CoNLL2003Token(form, "", "", entity_tag)])]
    ds = CoNLL2003NERDataset(examples, word_vocab, tag_vocab)
    token_vector, tag_vector = ds[0]
    # The unknown form maps to UNK; the known tag must not.
    assert word_vocab.unk_index in token_vector
    assert tag_vocab.unk_index not in tag_vector

def test_unknown_token_and_tag_in_example_yields_unk():
    word_vocab = Vocab()
    tag_vocab = Vocab()
    form = "Pierre"
    entity_tag = "B-PER"
    assert form not in word_vocab
    assert entity_tag not in tag_vocab
    examples = [CoNLL2003Example([CoNLL2003Token(form, "", "", entity_tag)])]
    ds = CoNLL2003Dataset(examples, word_vocab, tag_vocab, lambda t: t.entity_tag)
    token_vector, tag_vector = ds[0]
    assert word_vocab.unk_index in token_vector
    assert tag_vocab.unk_index in tag_vector

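# Hypothetical parametrization for the from_file test below; the actual
# (cls, func) pairs come from the surrounding test module. `func` is a tag
# extractor for the generic dataset class and None for the specialized one:
#
#     @pytest.mark.parametrize(
#         "cls,func",
#         [(UDDataset, lambda t: t.upos), (UDUPOSDataset, None)],
#     )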
def test_uddataset_from_file(tmp_path, example_dataset, cls, func):
    file = tmp_path / "tmp.txt"
    word_vocab, tag_vocab = Vocab(), Vocab()
    with open(file, "w", encoding="utf-8") as f:
        f.write(example_dataset)
    if func:
        ds = cls.from_file(file, word_vocab, tag_vocab, func, encoding="utf-8")
    else:
        ds = cls.from_file(file, word_vocab, tag_vocab, encoding="utf-8")
    assert len(ds) == 2

def test_unknown_token_and_tag_in_example_yields_unk():
    word_vocab = Vocab()
    tag_vocab = Vocab()
    form = "Pierre"
    pos = "NNP"
    assert form not in word_vocab
    assert pos not in tag_vocab
    examples = [
        CoNLLUExample([CoNLLUToken(1, form, "", pos, "", "", 0, "", "", "")])
    ]
    ds = UDDataset(examples, word_vocab, tag_vocab, lambda t: t.upos)
    token_vector, tag_vector = ds[0]
    assert word_vocab.unk_index in token_vector
    assert tag_vocab.unk_index in tag_vector

def test_unknown_token_in_example_yields_unk_udposdataset():
    word_vocab = Vocab()
    tag_vocab = Vocab()
    form = "Pierre"
    pos = "NNP"
    assert form not in word_vocab
    tag_vocab += pos
    examples = [
        CoNLLUExample([CoNLLUToken(1, form, "", pos, "", "", 0, "", "", "")])
    ]
    ds = UDUPOSDataset(examples, word_vocab, tag_vocab)
    token_vector, tag_vector = ds[0]
    assert word_vocab.unk_index in token_vector
    assert tag_vocab.unk_index not in tag_vector

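# Toy fixtures: a tag vocabulary of "ODD"/"EVEN" and a word vocabulary of the
# number strings "0" through "10" (presumably for a parity-tagging example
# used elsewhere in the suite).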
@pytest.fixture
def tag_vocab():
    vocab = Vocab()
    vocab += "ODD"
    vocab += "EVEN"
    return vocab

@pytest.fixture
def word_vocab():
    vocab = Vocab()
    for i in range(11):
        vocab += str(i)
    return vocab

def test_uddataset_len_is_len_examples(examples):
    word_vocab = Vocab()
    tag_vocab = Vocab()
    ds = UDDataset(examples, word_vocab, tag_vocab, lambda t: t.upos)
    assert len(ds) == len(examples)

@pytest.fixture
def tag_vocab():
    tag_vocab_ = Vocab()
    tag_vocab_ += "EVEN"
    tag_vocab_ += "ODD"
    return tag_vocab_

@pytest.fixture
def word_vocab():
    word_vocab_ = Vocab()
    for i in range(11):
        word_vocab_ += str(i)
    return word_vocab_

def test_conll2003nerdataset_len_is_len_examples(examples):
    word_vocab = Vocab()
    tag_vocab = Vocab()
    ds = CoNLL2003NERDataset(examples, word_vocab, tag_vocab)
    assert len(ds) == len(examples)

@pytest.fixture
def nonempty_vocab(example_embedding):
    vocab = Vocab()
    # Seed the vocab with the token of the first embedding entry.
    vocab += example_embedding[0].split()[0]
    return vocab

@pytest.fixture
def empty_vocab():
    return Vocab()