def test_load_conllu_corpus_plus_in_memory(tasks_base_path):
    """Load the ``train.conllup`` fixture as every split with ``in_memory=True``.

    The same file is used for train, dev and test, so each split is expected
    to report a length of 4. The train split is then handed to
    ``_assert_conllu_dataset`` for further checks.
    """
    data_folder = tasks_base_path / "conllu"
    fixture = "train.conllup"

    corpus = CoNLLUCorpus(
        data_folder,
        train_file=fixture,
        dev_file=fixture,
        test_file=fixture,
        in_memory=True,
    )

    # One file backs all three splits, so their sizes must agree.
    for split_name in ("train", "dev", "test"):
        assert len(getattr(corpus, split_name)) == 4

    _assert_conllu_dataset(corpus.train)
# Example #2
# 0
def test_load_conllu_corpus_plus_in_memory(tasks_base_path):
    """Load ``train.conllup`` in memory with explicit token annotation fields.

    ``token_annotation_fields`` is set to ``["upos", "ner"]``. The same file
    is used for train, dev and test, so each split is expected to report a
    length of 4. The train split is then handed to ``_assert_conllu_dataset``
    for further checks.
    """
    data_folder = tasks_base_path / "conllu"
    fixture = "train.conllup"
    annotation_fields = ["upos", "ner"]

    corpus = CoNLLUCorpus(
        data_folder,
        train_file=fixture,
        dev_file=fixture,
        test_file=fixture,
        token_annotation_fields=annotation_fields,
        in_memory=True,
    )

    # One file backs all three splits, so their sizes must agree.
    for split_name in ("train", "dev", "test"):
        assert len(getattr(corpus, split_name)) == 4

    _assert_conllu_dataset(corpus.train)
def test_load_conllu_corpus_in_memory(tasks_base_path):
    """Load ``train.conllu`` in memory with an explicit column layout.

    The columns are declared through ``fields`` as ``id``, ``form``, ``ner``
    and ``misc``. The same file is used for train, dev and test, so each
    split is expected to report a length of 4. The train split is then handed
    to ``_assert_conllu_dataset`` for further checks.
    """
    data_folder = tasks_base_path / "conllu"
    fixture = "train.conllu"
    column_names = ["id", "form", "ner", "misc"]

    corpus = CoNLLUCorpus(
        data_folder,
        fields=column_names,
        train_file=fixture,
        dev_file=fixture,
        test_file=fixture,
        in_memory=True,
    )

    # One file backs all three splits, so their sizes must agree.
    for split_name in ("train", "dev", "test"):
        assert len(getattr(corpus, split_name)) == 4

    _assert_conllu_dataset(corpus.train)
# Example #4
# 0
def test_load_conllu_corpus(tasks_base_path):
    """Load ``train.conllu`` with ``in_memory=False`` and explicit columns.

    The columns are declared through ``fields`` as ``id``, ``form``, ``upos``,
    ``ner`` and ``misc``, and ``token_annotation_fields`` is set to
    ``["upos", "ner"]``. The same file is used for train, dev and test, so
    each split is expected to report a length of 4. The train split is then
    handed to ``_assert_conllu_dataset`` for further checks.
    """
    data_folder = tasks_base_path / "conllu"
    fixture = "train.conllu"
    column_names = ["id", "form", "upos", "ner", "misc"]
    annotation_fields = ["upos", "ner"]

    corpus = CoNLLUCorpus(
        data_folder,
        fields=column_names,
        train_file=fixture,
        dev_file=fixture,
        test_file=fixture,
        token_annotation_fields=annotation_fields,
        in_memory=False,
    )

    # One file backs all three splits, so their sizes must agree.
    for split_name in ("train", "dev", "test"):
        assert len(getattr(corpus, split_name)) == 4

    _assert_conllu_dataset(corpus.train)
# Example #5
# 0
def test_load_universal_dependencies_conllu_corpus(tasks_base_path):
    """Load a basic Universal Dependencies file as every split.

    Only basic Universal Dependencies datasets are covered by this test;
    multi-word tokens and sentence annotations from the "deps" column, for
    example, are not supported yet.
    """
    data_folder = tasks_base_path / "conllu"
    fixture = "universal_dependencies.conllu"

    # No token_annotation_fields are passed, so the corpus defaults apply.
    corpus = CoNLLUCorpus(
        data_folder,
        train_file=fixture,
        dev_file=fixture,
        test_file=fixture,
    )

    # One file backs all three splits, so their sizes must agree.
    for split_name in ("train", "dev", "test"):
        assert len(getattr(corpus, split_name)) == 1

    _assert_universal_dependencies_conllu_dataset(corpus.train)