Example #1
def test_int_indexing():
    """
    Test indexing by the integer position in the sentence (int).
    """
    source = (
        '# sent_id = fr-ud-dev_00002\n'
        '# text = Les études durent six ans mais leur contenu diffère donc selon les Facultés.\n'
        '1	Les	le	DET	_	Definite=Def|Gender=Fem|Number=Plur|PronType=Art	2	det	_	_\n'
        '2	études	étude	NOUN	_	Gender=Fem|Number=Plur	3	nsubj	_	_\n'
        '3	durent	durer	VERB	_	Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin	0	root	_	_\n'
        '4	six	six	NUM	_	_	5	nummod	_	_\n'
        '5	ans	an	NOUN	_	Gender=Masc|Number=Plur	3	obj	_	_\n'
        '6	mais	mais	CCONJ	_	_	9	cc	_	_\n'
        '7	leur	son	DET	_	Gender=Masc|Number=Sing|Poss=Yes|PronType=Prs	8	det	_	_\n'
        '8	contenu	contenu	NOUN	_	Gender=Masc|Number=Sing	9	nsubj	_	_\n'
        '9	diffère	différer	VERB	_	Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin	3	conj	_	_\n'
        '10	donc	donc	ADV	_	_	9	advmod	_	_\n'
        '11	selon	selon	ADP	_	_	13	case	_	_\n'
        '12	les	le	DET	_	Definite=Def|Number=Plur|PronType=Art	13	det	_	_\n'
        '13	Facultés	Facultés	PROPN	_	_	9	obl	_	SpaceAfter=No\n'
        '14	.	.	PUNCT	_	_	3	punct	_	_')
    sentence = Sentence(source)

    test_token = sentence[7]
    assert_token_members(test_token, '8', 'contenu', 'contenu', 'NOUN', None, {
        'Gender': set(('Masc', )),
        'Number': set(('Sing', ))
    }, '9', 'nsubj', {}, {})
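
All of the examples in this listing omit their imports and two shared test helpers, assert_token_members and fixture_location. The sketch below shows one minimal way to make them runnable; the pyconll import paths are assumptions that may vary between versions, and the two helpers are hypothetical reconstructions inferred from how the examples call them, not the test suite's actual code.

import os

import responses  # only needed for the URL-loading example

# Assumed pyconll imports; exact module paths may differ between versions.
from pyconll import load_from_file, load_from_string, load_from_url
from pyconll.unit.sentence import Sentence
from pyconll.unit.token import Token


def fixture_location(name):
    # Hypothetical helper: resolve a fixture file shipped next to the tests.
    return os.path.join(os.path.dirname(__file__), 'fixtures', name)


def assert_token_members(token, id, form, lemma, upos, xpos, feats, head,
                         deprel, deps, misc):
    # Hypothetical reconstruction: compare every Token attribute against the
    # expected values passed positionally by the examples.
    assert token.id == id
    assert token.form == form
    assert token.lemma == lemma
    assert token.upos == upos
    assert token.xpos == xpos
    assert token.feats == feats
    assert token.head == head
    assert token.deprel == deprel
    assert token.deps == deps
    assert token.misc == misc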
Example #2
def test_multiple_features_modify():
    """
    Test modification of features.
    """
    token_line = '28	une	un	DET	_	' \
        'Definite=Ind|Gender=Fem|Number=Sing|PronType=Art	30	det	_	_\n'
    token = Token(token_line)

    assert_token_members(
        token, '28', 'une', 'un', 'DET', None, {
            'Definite': set(('Ind', )),
            'Gender': set(('Fem', )),
            'Number': set(('Sing', )),
            'PronType': set(('Art', ))
        }, '30', 'det', {}, {})

    # Somehow this word is definite and indefinite!
    token.feats['Definite'].add('Def')

    assert_token_members(
        token, '28', 'une', 'un', 'DET', None, {
            'Definite': set(('Ind', 'Def')),
            'Gender': set(('Fem', )),
            'Number': set(('Sing', )),
            'PronType': set(('Art', ))
        }, '30', 'det', {}, {})
Example #3
@responses.activate  # assumed: activates the responses mock so the GET registered below is intercepted
def test_load_from_file_and_url_equivalence():
    """
    Test that the Conll object created from a URL and from a file is the same
    if the underlying source is the same.
    """
    TEST_CONLL_URL = 'https://myconllrepo.com/english/train'
    with open(fixture_location('long.conll')) as f:
        contents = f.read()
        responses.add(responses.GET, TEST_CONLL_URL, body=contents)

    url_c = load_from_url(TEST_CONLL_URL)
    file_c = load_from_file(fixture_location('long.conll'))

    assert len(url_c) == len(file_c)
    for i in range(len(url_c)):
        assert url_c[i].id == file_c[i].id
        assert url_c[i].text == file_c[i].text
        print(url_c[i].conll())
        print(file_c[i].conll())

        for url_token in url_c[i]:
            file_token = file_c[i][url_token.id]
            assert_token_members(url_token, file_token.id, file_token.form,
                                 file_token.lemma, file_token.upos,
                                 file_token.xpos, file_token.feats,
                                 file_token.head, file_token.deprel,
                                 file_token.deps, file_token.misc)
Example #4
def test_only_form_and_lemma():
    """
    Test construction when token line only has a form and lemma.
    """
    token_line = '10.1	micro-pays	micro-pays	_	_	_	_	_	_	_\n'
    token = Token(token_line)

    assert_token_members(token, '10.1', 'micro-pays', 'micro-pays', None, None,
                         {}, None, None, {}, {})
Example #5
def test_empty_lemma_empty_form_with_assumption():
    """
    Test that a Token with no form or lemma, created with the empty assumption, gets values of None.
    """
    token_line = '33	_	_	SYM	_	_	30	punct	_	SpaceAfter=No'
    token = Token(token_line, empty=True)

    assert_token_members(token, '33', None, None, 'SYM', None, {}, '30',
                         'punct', {}, {'SpaceAfter': set(('No', ))})
Example #6
def test_empty_lemma_present_form():
    """
    Test construction of a token without the empty assumption, with no form but a present lemma.
    """
    token_line = '33	_	hate	VERB	_	_	30	nmod	_	SpaceAfter=No'
    token = Token(token_line)

    assert_token_members(token, '33', None, 'hate', 'VERB', None, {}, '30',
                         'nmod', {}, {'SpaceAfter': set(('No', ))})
Example #7
def test_underscore_construction():
    """
    Test construction of a token with underscore form and lemma, without the empty assumption.
    """
    token_line = '33	_	_	PUN	_	_	30	nmod	_	SpaceAfter=No'
    token = Token(token_line)

    assert_token_members(token, '33', '_', '_', 'PUN', None, {}, '30', 'nmod',
                         {}, {'SpaceAfter': set(('No', ))})
Example #8
def test_multiword_construction():
    """
    Test the creation of a token from a multiword token line.
    """
    token_line = '8-9	du	_	_	_	_	_	_	_	_'
    token = Token(token_line)

    assert_token_members(token, '8-9', 'du', None, None, None, {}, None, None,
                         {}, {})
    assert token.is_multiword()
Example #9
def test_construction():
    """
    Test the normal construction of a general token.
    """
    token_line = '7	vie	vie	NOUN	_	Gender=Fem|Number=Sing	4	nmod	_	SpaceAfter=No\n'
    token = Token(token_line)

    assert_token_members(token, '7', 'vie', 'vie', 'NOUN', None, {
        'Gender': set(('Fem', )),
        'Number': set(('Sing', ))
    }, '4', 'nmod', {}, {'SpaceAfter': set(('No', ))})
Example #10
def test_construction_no_newline():
    """
    Test the construction of a token with no newline at the end of the line.
    """
    token_line = '7	vie	vie	NOUN	_	Gender=Fem|Number=Sing	4	nmod	_	_'
    token = Token(token_line)

    assert_token_members(token, '7', 'vie', 'vie', 'NOUN', None, {
        'Gender': set(('Fem', )),
        'Number': set(('Sing', ))
    }, '4', 'nmod', {}, {})
Example #11
def test_deps_construction():
    """
    Test construction of a token when the deps field is present.
    """
    token_line = '1	They	they	PRON	PRP	Case=Nom|Number=Plur	2	nsubj	2:nsubj|4:nsubj	_\n'
    token = Token(token_line)

    assert_token_members(token, '1', 'They', 'they', 'PRON', 'PRP', {
        'Case': set(('Nom', )),
        'Number': set(('Plur', ))
    }, '2', 'nsubj', {
        '2': ('nsubj', None, None, None),
        '4': ('nsubj', None, None, None)
    }, {})
Example #12
def test_load_from_file_and_string_equivalence():
    """
    Test that the Conll object created from a string and file is the same if
    the underlying source is the same.
    """
    with open(fixture_location('long.conll')) as f:
        contents = f.read()
    str_c = load_from_string(contents)
    file_c = load_from_file(fixture_location('long.conll'))

    assert len(str_c) == len(file_c)
    for i in range(len(str_c)):
        assert str_c[i].id == file_c[i].id
        assert str_c[i].text == file_c[i].text
        print(str_c[i].conll())
        print(file_c[i].conll())

        for str_token in str_c[i]:
            file_token = file_c[i][str_token.id]
            assert_token_members(str_token, file_token.id, file_token.form,
                                 file_token.lemma, file_token.upos,
                                 file_token.xpos, file_token.feats,
                                 file_token.head, file_token.deprel,
                                 file_token.deps, file_token.misc)
Example #13
def test_int_slice_indexing_missing_value_stop():
    """
    Test that the sentence is properly sliced when the slice's stop value is missing.
    """
    source = (
        '# sent_id = fr-ud-dev_00002\n'
        '# text = Les études durent six ans mais leur contenu diffère donc selon les Facultés.\n'
        '1	Les	le	DET	_	Definite=Def|Gender=Fem|Number=Plur|PronType=Art	2	det	_	_\n'
        '2	études	étude	NOUN	_	Gender=Fem|Number=Plur	3	nsubj	_	_\n'
        '3	durent	durer	VERB	_	Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin	0	root	_	_\n'
        '4	six	six	NUM	_	_	5	nummod	_	_\n'
        '5	ans	an	NOUN	_	Gender=Masc|Number=Plur	3	obj	_	_\n'
        '6	mais	mais	CCONJ	_	_	9	cc	_	_\n'
        '7	leur	son	DET	_	Gender=Masc|Number=Sing|Poss=Yes|PronType=Prs	8	det	_	_\n'
        '8	contenu	contenu	NOUN	_	Gender=Masc|Number=Sing	9	nsubj	_	_\n'
        '9	diffère	différer	VERB	_	Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin	3	conj	_	_\n'
        '10	donc	donc	ADV	_	_	9	advmod	_	_\n'
        '11	selon	selon	ADP	_	_	13	case	_	_\n'
        '12	les	le	DET	_	Definite=Def|Number=Plur|PronType=Art	13	det	_	_\n'
        '13	Facultés	Facultés	PROPN	_	_	9	obl	_	SpaceAfter=No\n'
        '14	.	.	PUNCT	_	_	3	punct	_	_')
    sentence = Sentence(source)

    test_tokens = sentence[10:]
    assert_token_members(test_tokens[0], '11', 'selon', 'selon', 'ADP', None,
                         {}, '13', 'case', {}, {})
    assert_token_members(
        test_tokens[1], '12', 'les', 'le', 'DET', None, {
            'Definite': set(('Def', )),
            'Number': set(('Plur', )),
            'PronType': set(('Art', ))
        }, '13', 'det', {}, {})
    assert_token_members(test_tokens[2], '13', 'Facultés', 'Facultés', 'PROPN',
                         None, {}, '9', 'obl', {},
                         {'SpaceAfter': set(('No', ))})
    assert_token_members(test_tokens[3], '14', '.', '.', 'PUNCT', None, {},
                         '3', 'punct', {}, {})
Example #14
def test_simple_sentence_construction():
    """
    Test the construction of a simple sentence.
    """
    source = ('# sent_id = fr-ud-dev_00003\n'
              '# text = Mais comment faire ?\n'
              '1	Mais	mais	CCONJ	_	_	3	cc	_	_\n'
              '2	comment	comment	ADV	_	_	3	advmod	_	_\n'
              '3	faire	faire	VERB	_	VerbForm=Inf	0	root	_	_\n'
              '4	?	?	PUNCT	_	_	3	punct	_	_\n')
    sentence = Sentence(source)

    assert sentence.id == 'fr-ud-dev_00003'
    assert sentence.text == 'Mais comment faire ?'
    assert len(sentence) == 4

    assert_token_members(sentence['1'], '1', 'Mais', 'mais', 'CCONJ', None, {},
                         '3', 'cc', {}, {})
    assert_token_members(sentence['2'], '2', 'comment', 'comment', 'ADV', None,
                         {}, '3', 'advmod', {}, {})
    assert_token_members(sentence['3'], '3', 'faire', 'faire', 'VERB', None,
                         {'VerbForm': set(('Inf', ))}, '0', 'root', {}, {})
    assert_token_members(sentence['4'], '4', '?', '?', 'PUNCT', None, {}, '3',
                         'punct', {}, {})
Example #15
def test_str_slice_indexing_step():
    """
    Test slicing with string indices and with a step size.
    """
    source = (
        '# sent_id = fr-ud-dev_00002\n'
        '# text = Les études durent six ans mais leur contenu diffère donc selon les Facultés.\n'
        '1	Les	le	DET	_	Definite=Def|Gender=Fem|Number=Plur|PronType=Art	2	det	_	_\n'
        '2	études	étude	NOUN	_	Gender=Fem|Number=Plur	3	nsubj	_	_\n'
        '3	durent	durer	VERB	_	Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin	0	root	_	_\n'
        '4	six	six	NUM	_	_	5	nummod	_	_\n'
        '5	ans	an	NOUN	_	Gender=Masc|Number=Plur	3	obj	_	_\n'
        '6	mais	mais	CCONJ	_	_	9	cc	_	_\n'
        '7	leur	son	DET	_	Gender=Masc|Number=Sing|Poss=Yes|PronType=Prs	8	det	_	_\n'
        '8	contenu	contenu	NOUN	_	Gender=Masc|Number=Sing	9	nsubj	_	_\n'
        '9	diffère	différer	VERB	_	Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin	3	conj	_	_\n'
        '10	donc	donc	ADV	_	_	9	advmod	_	_\n'
        '11	selon	selon	ADP	_	_	13	case	_	_\n'
        '12	les	le	DET	_	Definite=Def|Number=Plur|PronType=Art	13	det	_	_\n'
        '13	Facultés	Facultés	PROPN	_	_	9	obl	_	SpaceAfter=No\n'
        '14	.	.	PUNCT	_	_	3	punct	_	_')
    sentence = Sentence(source)

    test_tokens = sentence['1':'6':2]
    assert_token_members(
        test_tokens[0], '1', 'Les', 'le', 'DET', None, {
            'Definite': set(('Def', )),
            'Gender': set(('Fem', )),
            'Number': set(('Plur', )),
            'PronType': set(('Art', ))
        }, '2', 'det', {}, {})
    assert_token_members(
        test_tokens[1], '3', 'durent', 'durer', 'VERB', None, {
            'Mood': set(('Ind', )),
            'Number': set(('Plur', )),
            'Person': set(('3', )),
            'Tense': set(('Pres', )),
            'VerbForm': set(('Fin', ))
        }, '0', 'root', {}, {})
    assert_token_members(test_tokens[2], '5', 'ans', 'an', 'NOUN', None, {
        'Gender': set(('Masc', )),
        'Number': set(('Plur', )),
    }, '3', 'obj', {}, {})