Esempio n. 1
0
def test_txt(tmp_path):
    """Round-trip plain-text lines through write_file/read_file."""
    original = ["text"] * 10
    path = tmp_path / 'data.txt'
    pipe(original).write_file(path, 'txt')
    loaded = pipe().read_file(path, 'txt').collect()

    # read_file keeps the trailing newline that write_file adds per line.
    expected = [line + '\n' for line in original]
    assert loaded == expected
Esempio n. 2
0
def test_merge_feats(data2):
    """merge_feats rebuilds the 'feats' field from unwound feature columns."""
    unwound = pipe().read_conllu(data2).unwind_feats().remove_fields(
        'feats').collect()
    reference = pipe().read_conllu(data2).collect()

    merged = [[token.get('feats') for token in sentence]
              for sentence in pipe(unwound).merge_feats()]
    expected = [[token.get('feats') for token in sentence]
                for sentence in reference]
    assert merged == expected
Esempio n. 3
0
def test_only_fields(data2):
    """only_fields keeps exactly the requested fields (set or varargs form)."""
    for args in ({'id', 'form'},), ('id', 'form'):
        sentences = pipe().read_conllu(data2).only_fields(*args).collect()
        for sentence in sentences:
            assert [token.keys() for token in sentence
                    ] == [{'id', 'form'}] * len(sentence)
Esempio n. 4
0
def test_pipe():
    """pipe() chains sub-pipes; collecting a source-less pipe raises."""
    chained = pipe(range(10),
                   pipe().filter(lambda x: x < 5),
                   pipe(range(10)).map(lambda x: x * 2))
    assert chained.collect() == [0, 2, 4, 6, 8]

    # A pipe with no data source cannot be collected.
    with pytest.raises(RuntimeError):
        pipe().collect()
Esempio n. 5
0
def test_to_instance(data2):
    """to_instance/to_sentence round-trip via an index and its inverse."""
    original = pipe().read_conllu(data2).collect()
    index = pipe(original).create_index(fields=set(FIELDS) - {ID, HEAD})
    inverse = create_inverse_index(index)

    instances = pipe().read_conllu(data2).to_instance(index)
    restored = pipe(instances).to_sentence(inverse)

    assert restored.collect() == original
Esempio n. 6
0
def test_only_projective(data2, data5):
    """only_projective filters by projectivity; False inverts the filter."""
    kept = pipe().read_conllu(data2).only_projective().collect()
    assert [sentence.is_projective() for sentence in kept] == [True, True]

    # data5 contains no projective sentences at all.
    assert pipe().read_conllu(data5).only_projective().collect() == []

    inverted = pipe().read_conllu(data5).only_projective(False).collect()
    assert [sentence.is_projective() for sentence in inverted] == [False]
Esempio n. 7
0
def test_batch():
    """batch groups items; a size function scales each item's footprint."""
    batched = pipe(range(10)).filter(lambda x: x < 5).stream(10).batch(3)
    assert batched.collect() == [[0, 1, 2], [3, 4, 0], [1, 2, 3], [4]]

    # Each item counts as 2 toward the capacity of 3 -> pairs.
    halved = pipe(range(5)).batch(3, size=lambda _: 2)
    assert halved.collect() == [[0, 1], [2, 3], [4]]

    # Each item fills the whole batch -> singletons.
    singles = pipe(range(5)).batch(3, size=lambda _: 3)
    assert singles.collect() == [[0], [1], [2], [3], [4]]
Esempio n. 8
0
def test_replace_missing(data4):
    """replace_missing stores a substitute in a new field where 'form' is absent."""
    sentences = pipe().read_conllu(data4).collect()
    del sentences[0][0].form
    first = sentences[0]

    pipe(sentences).replace_missing('form', '__missing__', 'new').collect()
    assert [token.get('new') for token in first
            ] == ['__missing__'] + [None] * 6

    # A None substitute leaves the new field unset everywhere.
    pipe(sentences).replace_missing('form', None, 'new').collect()
    assert [token.get('new') for token in first] == [None] * 7
Esempio n. 9
0
def test_from_conllu():
    """from_conllu parses an in-memory string; it must be the pipe's source."""
    parsed = pipe().from_conllu(_DATA1_CONLLU)
    forms = [[token.form for token in sentence] for sentence in parsed.collect()]
    assert forms == [['vámonos', 'vamos', 'nos', 'al', 'a', 'el', 'mar'],
                     ['Sue', 'likes', 'coffee', 'and', 'Bill', 'likes', 'tea']]

    # Not allowed after another operation ...
    with pytest.raises(RuntimeError):
        pipe().map(lambda x: x).from_conllu(_DATA1_CONLLU)

    # ... nor on a pipe that already has a source.
    with pytest.raises(RuntimeError):
        pipe(range(10)).from_conllu(_DATA1_CONLLU)
Esempio n. 10
0
def test_replace(data4):
    """replace substitutes regex matches (or missing values) into a new field."""
    sentences = pipe().read_conllu(data4).replace(
        'form', r"[0-9]+|[0-9]+\.[0-9]+|[0-9]+[0-9,]+", '__number__',
        'new').collect()
    first = sentences[0]
    assert [token.get('new') for token in first] == [
        'Posledná', 'revízia', 'vyšla', 'v', 'roku', '__number__', '.'
    ]

    # A None pattern targets tokens that are missing the field entirely.
    del first[0].form
    pipe(sentences).replace('form', None, '__missing__', 'new').collect()
    assert [token.get('new') for token in first] == [
        '__missing__', 'revízia', 'vyšla', 'v', 'roku', '__number__', '.'
    ]
Esempio n. 11
0
def test_remove_fields(data2):
    """remove_fields drops the named fields (set or varargs form)."""
    for args in ({'id', 'form'},), ('id', 'form'):
        sentences = pipe().read_conllu(data2).remove_fields(*args).collect()
        for field in ('id', 'form'):
            # No token in any sentence retains the removed field.
            assert [[field in token.keys() for token in sentence]
                    for sentence in sentences
                    ] == [[False] * len(sentence) for sentence in sentences]
Esempio n. 12
0
def test_flatten_tokens(data1):
    """flatten yields individual tokens; only_words/lowercase act per token."""
    tokens = (pipe().read_conllu(data1)
              .flatten()
              .only_words()
              .lowercase('form')
              .collect())
    expected = ['vamos', 'nos', 'a', 'el', 'mar',
                'sue', 'likes', 'coffee', 'and', 'bill', 'tea']
    assert [token.form for token in tokens] == expected
Esempio n. 13
0
def test_upos_feats(data2):
    """upos_feats stores combined 'POS=<upos>|<feats>' strings in a new field."""
    sentences = pipe().read_conllu(data2).upos_feats('new').collect()
    expected = [
        'POS=PRON|Case=Nom|Number=Plur',
        'POS=VERB|Number=Plur|Person=3|Tense=Pres',
        'POS=CONJ',
        'POS=VERB|Number=Plur|Person=3|Tense=Pres',
        'POS=NOUN|Number=Plur',
        'POS=PUNCT',
    ]
    assert [token.get('new') for token in sentences[0]] == expected
Esempio n. 14
0
def test_only_universal_deprel(data4):
    """only_universal_deprel strips language-specific subtypes from deprel/deps."""
    sentences = pipe().read_conllu(data4).collect()
    head_token = sentences[0][0]
    head_token.deprel = 'test:test'
    head_token.deps = '1:test1:test:test|2:test2'
    sentences = pipe(sentences).only_universal_deprel().collect()

    # String-encoded deps: everything after the first ':' subtype is removed.
    assert sentences[0][0].deprel == 'test'
    assert sentences[0][0].deps == '1:test1|2:test2'
    assert sentences[0][-1].deps == '3:punct'

    # Parsed deps: same stripping applied to (head, relation) tuples.
    sentences = pipe().read_conllu(data4, parse_deps=True).collect()
    sentences[0][0].deps = {(1, 'test1:test'), (2, 'test2')}
    sentences = pipe(sentences).only_universal_deprel().collect()

    assert sentences[0][0].deprel == 'amod'
    assert sentences[0][0].deps == {(1, 'test1'), (2, 'test2')}
    assert sentences[0][-1].deps == {(3, 'punct')}
Esempio n. 15
0
def test_unwind_feats(data2):
    """unwind_feats expands each morphological feature into a 'feats:<name>' field."""
    sentences = pipe().read_conllu(data2).unwind_feats().collect()

    numbers = [[token.get('feats:Number') for token in s] for s in sentences]
    assert numbers == [['Plur', 'Plur', None, 'Plur', 'Plur', None],
                       ['Sing', 'Sing', None, 'Sing', None]]

    cases = [[token.get('feats:Case') for token in s] for s in sentences]
    assert cases == [['Nom', None, None, None, None, None],
                     ['Nom', None, None, None, None]]
Esempio n. 16
0
def test_hdf5(data2, data3, tmp_path):
    """Instances survive a write_file/read_file round-trip in HDF5 format.

    The original test duplicated the whole round-trip verbatim for data2
    and data3; the shared logic is factored into a local helper.
    """

    def _roundtrip(data):
        # Build instances from the CoNLL-U data via a freshly created index.
        index = pipe().read_conllu(data).create_index()
        instances = pipe().read_conllu(data).to_instance(index).collect()

        # Write and re-read through the HDF5 backend (file is overwritten
        # between calls, matching the original test's behavior).
        filename = tmp_path / 'data.hdf5'
        pipe(instances).write_file(filename, 'hdf5')
        restored = pipe().read_file(filename, 'hdf5').collect()

        for original, loaded in zip(instances, restored):
            equal_instance(original, loaded)

    _roundtrip(data2)
    _roundtrip(data3)
Esempio n. 17
0
def test_to_flatten():
    """flatten undoes batching and leaves an already-flat pipe unchanged."""
    rebatched = pipe(range(10)).batch(3).flatten()
    assert rebatched.collect() == list(range(10))

    untouched = pipe(range(10)).flatten()
    assert untouched.collect() == list(range(10))
Esempio n. 18
0
def test_collu(data2, tmp_path):
    """Sentences survive a write_file/read_file round-trip in CoNLL-U format."""
    # NOTE(review): name looks like a typo for 'test_conllu'; kept so pytest
    # discovery and any external references stay stable.
    sentences = pipe().read_conllu(data2).collect()
    target = tmp_path / 'data.conllu'

    pipe(sentences).write_file(target, 'conllu')
    restored = pipe().read_file(target, 'conllu').collect()
    assert restored == sentences
Esempio n. 19
0
def test_read_write_file():
    """read_file rejects an unknown format name with ValueError."""
    with pytest.raises(ValueError):
        pipe().read_file('temp', 'unknown').collect()
Esempio n. 20
0
def test_collect():
    """collect supports a pre-seeded list `l` and an optional item limit."""
    # The seeded list is extended with every piped item.
    assert pipe(range(5)).collect(l=[-1]) == [-1, 0, 1, 2, 3, 4]
    # A positional limit caps how many items are appended.
    assert pipe(range(5)).collect(3, [-1]) == [-1, 0, 1, 2]
Esempio n. 21
0
def test_first():
    """first returns the first item, or the given default (None) when empty."""
    assert pipe(range(5)).first() == 0
    # Fixed PEP 8 idiom: `is None` instead of `== None`; behavior unchanged.
    assert pipe([]).first() is None
    assert pipe([]).first(0) == 0
Esempio n. 22
0
def test_count():
    """count consumes the pipe and returns the number of items, 0 when empty."""
    assert pipe(range(5)).count() == 5
    assert pipe([]).count() == 0
Esempio n. 23
0
def test_only_words(data1):
    """only_words drops multiword-token ranges; text joins the remaining forms."""
    texts = pipe().read_conllu(data1).only_words().text().collect()
    assert texts == [
        'vamos nos a el mar ', 'Sue likes coffee and Bill tea '
    ]
Esempio n. 24
0
def test_map():
    """map applies the given function to every item in the pipe."""
    doubled = pipe(range(10)).map(lambda value: 2 * value)
    assert doubled.collect() == [2 * i for i in range(10)]
Esempio n. 25
0
def test_filter():
    """filter keeps only items satisfying the predicate."""
    small = pipe(range(10)).filter(lambda value: value < 5)
    assert small.collect() == list(range(5))
Esempio n. 26
0
def test_split_chars(data2):
    """split_chars stores each form's character tuple under 'form:chars'."""
    sentences = pipe().read_conllu(data2).split_chars('form').collect()
    for sentence in sentences:
        assert [token['form:chars'] for token in sentence
                ] == [tuple(token.form) for token in sentence]
Esempio n. 27
0
def test_uppercase(data1):
    """uppercase transforms the 'form' field before the text rendering."""
    texts = pipe().read_conllu(data1).only_words().uppercase('form').text()
    expected = [sentence.upper() for sentence in
                ('vamos nos a el mar ', 'Sue likes coffee and Bill tea ')]
    assert texts.collect() == expected
Esempio n. 28
0
def test_filter_field(data1):
    """filter_field removes the field from every token failing the predicate."""
    tokens = pipe().read_conllu(data1).flatten().filter_field(
        'form', lambda s: False).collect()
    # With an always-false predicate no token keeps a 'form' field.
    assert all('form' not in token for token in tokens)
Esempio n. 29
0
def test_map_token(data1):
    """Tokens mapped to None are dropped, leaving empty sentences."""
    result = pipe().read_conllu(data1).map_token(lambda token: None).collect()
    assert result == [[], []]
Esempio n. 30
0
def test_filter_token(data1):
    """An always-false token predicate empties every sentence."""
    result = pipe().read_conllu(data1).filter_token(
        lambda token: False).collect()
    assert result == [[], []]