@responses.activate
def test_load_from_file_and_url_equivalence():
    """
    Test that the Conll object created from a url and from a file is the same
    if the underlying source is the same.
    """
    TEST_CONLL_URL = 'https://myconllrepo.com/english/train'
    fixture_path = fixture_location('long.conll')

    # Serve the fixture contents from the mocked URL so that both loaders read
    # exactly the same text. The mock is only installed while the
    # ``responses.activate`` decorator is in effect.
    with open(fixture_path) as f:
        contents = f.read()
    responses.add(responses.GET, TEST_CONLL_URL, body=contents)

    url_c = load_from_url(TEST_CONLL_URL)
    file_c = load_from_file(fixture_path)

    # The length check guards the pairwise walk below, since zip would
    # otherwise silently stop at the shorter of the two.
    assert len(url_c) == len(file_c)
    for url_sent, file_sent in zip(url_c, file_c):
        assert url_sent.id == file_sent.id
        assert url_sent.text == file_sent.text

        # Tokens are matched up by id rather than by position.
        for url_token in url_sent:
            file_token = file_sent[url_token.id]
            assert_token_members(url_token, file_token.id, file_token.form,
                                 file_token.lemma, file_token.upos,
                                 file_token.xpos, file_token.feats,
                                 file_token.head, file_token.deprel,
                                 file_token.deps, file_token.misc)
def test_append_contains():
    """
    Test that contains still works after appending a Sentence.
    """
    with open(fixture_location('long.conll')) as f:
        c = Conll(f)

    sent = c[6]

    # Token columns are TAB-separated, as the CoNLL-U format requires; only
    # the comment lines contain literal spaces.
    source = (
        '# sent_id = fr-ud-dev_00002\n'
        '# text = Les études durent six ans mais leur contenu diffère donc selon les Facultés.\n'
        '1\tLes\tle\tDET\t_\tDefinite=Def|Gender=Fem|Number=Plur|PronType=Art\t2\tdet\t_\t_\n'
        '2\tétudes\tétude\tNOUN\t_\tGender=Fem|Number=Plur\t3\tnsubj\t_\t_\n'
        '3\tdurent\tdurer\tVERB\t_\tMood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin\t0\troot\t_\t_\n'
        '4\tsix\tsix\tNUM\t_\t_\t5\tnummod\t_\t_\n'
        '5\tans\tan\tNOUN\t_\tGender=Masc|Number=Plur\t3\tobj\t_\t_\n'
        '6\tmais\tmais\tCCONJ\t_\t_\t9\tcc\t_\t_\n'
        '7\tleur\tson\tDET\t_\tGender=Masc|Number=Sing|Poss=Yes|PronType=Prs\t8\tdet\t_\t_\n'
        '8\tcontenu\tcontenu\tNOUN\t_\tGender=Masc|Number=Sing\t9\tnsubj\t_\t_\n'
        '9\tdiffère\tdifférer\tVERB\t_\tMood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin\t3\tconj\t_\t_\n'
        '10\tdonc\tdonc\tADV\t_\t_\t9\tadvmod\t_\t_\n'
        '11\tselon\tselon\tADP\t_\t_\t13\tcase\t_\t_\n'
        '12\tles\tle\tDET\t_\tDefinite=Def|Number=Plur|PronType=Art\t13\tdet\t_\t_\n'
        '13\tFacultés\tFacultés\tPROPN\t_\t_\t9\tobl\t_\tSpaceAfter=No\n'
        '14\t.\t.\tPUNCT\t_\t_\t3\tpunct\t_\t_')

    new_sent = Sentence(source)

    # Same content as new_sent but re-labelled with a different id and never
    # appended, so it must not be reported as a member.
    other_sent = Sentence(source)
    other_sent.id = 'xyz'

    c.append(new_sent)

    assert new_sent in c
    assert sent in c
    assert other_sent not in c
def test_setitem():
    """
    Test that Sentences are properly assigned when using setitem.
    """
    with open(fixture_location('basic.conll')) as f:
        c = Conll(f)

    # Token columns are TAB-separated, as the CoNLL-U format requires; only
    # the comment lines contain literal spaces.
    source = (
        '# sent_id = fr-ud-dev_00002\n'
        '# text = Les études durent six ans mais leur contenu diffère donc selon les Facultés.\n'
        '1\tLes\tle\tDET\t_\tDefinite=Def|Gender=Fem|Number=Plur|PronType=Art\t2\tdet\t_\t_\n'
        '2\tétudes\tétude\tNOUN\t_\tGender=Fem|Number=Plur\t3\tnsubj\t_\t_\n'
        '3\tdurent\tdurer\tVERB\t_\tMood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin\t0\troot\t_\t_\n'
        '4\tsix\tsix\tNUM\t_\t_\t5\tnummod\t_\t_\n'
        '5\tans\tan\tNOUN\t_\tGender=Masc|Number=Plur\t3\tobj\t_\t_\n'
        '6\tmais\tmais\tCCONJ\t_\t_\t9\tcc\t_\t_\n'
        '7\tleur\tson\tDET\t_\tGender=Masc|Number=Sing|Poss=Yes|PronType=Prs\t8\tdet\t_\t_\n'
        '8\tcontenu\tcontenu\tNOUN\t_\tGender=Masc|Number=Sing\t9\tnsubj\t_\t_\n'
        '9\tdiffère\tdifférer\tVERB\t_\tMood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin\t3\tconj\t_\t_\n'
        '10\tdonc\tdonc\tADV\t_\t_\t9\tadvmod\t_\t_\n'
        '11\tselon\tselon\tADP\t_\t_\t13\tcase\t_\t_\n'
        '12\tles\tle\tDET\t_\tDefinite=Def|Number=Plur|PronType=Art\t13\tdet\t_\t_\n'
        '13\tFacultés\tFacultés\tPROPN\t_\t_\t9\tobl\t_\tSpaceAfter=No\n'
        '14\t.\t.\tPUNCT\t_\t_\t3\tpunct\t_\t_')
    sentence = Sentence(source)

    c[1] = sentence

    # The stored sentence must serialize back to exactly the text it was
    # built from, and must carry the id from the sent_id comment.
    assert c[1].conll() == source
    assert c[1].id == 'fr-ud-dev_00002'
def test_contains_non_existent_id():
    """
    Test that contains properly executes when the sentence id is unknown.
    """
    with open(fixture_location('basic.conll')) as f:
        conll = Conll(f)

    # The sent_id comment must end with its own newline; otherwise it is
    # fused with the text comment and the sentence never gets the intended
    # id. Token columns are TAB-separated, as the CoNLL-U format requires.
    source = (
        '# sent_id = fr-ud-dev_00037\n'
        '# text = Thionville et Congerville furent créée en 1793 avec leur nom actuel et fusionnèrent en 1973.\n'
        '1\tThionville\tThionville\tPROPN\t_\t_\t5\tnsubj:pass\t_\t_\n'
        '2\tet\tet\tCCONJ\t_\t_\t3\tcc\t_\t_\n'
        '3\tCongerville\tCongerville\tPROPN\t_\t_\t1\tconj\t_\t_\n'
        '4\tfurent\têtre\tAUX\t_\tMood=Ind|Number=Plur|Person=3|Tense=Past|VerbForm=Fin\t5\taux:pass\t_\t_\n'
        '5\tcréée\tcréer\tVERB\t_\tGender=Fem|Number=Sing|Tense=Past|VerbForm=Part\t0\troot\t_\t_\n'
        '6\ten\ten\tADP\t_\t_\t7\tcase\t_\t_\n'
        '7\t1793\t1793\tNUM\t_\t_\t5\tobl\t_\t_\n'
        '8\tavec\tavec\tADP\t_\t_\t10\tcase\t_\t_\n'
        '9\tleur\tson\tDET\t_\tGender=Masc|Number=Sing|Poss=Yes|PronType=Prs\t10\tdet\t_\t_\n'
        '10\tnom\tnom\tNOUN\t_\tGender=Masc|Number=Sing\t5\tobl:mod\t_\t_\n'
        '11\tactuel\tactuel\tADJ\t_\tGender=Masc|Number=Sing\t10\tamod\t_\t_\n'
        '12\tet\tet\tCCONJ\t_\t_\t13\tcc\t_\t_\n'
        '13\tfusionnèrent\tfusionner\tVERB\t_\tMood=Ind|Number=Plur|Person=3|Tense=Past|VerbForm=Fin\t5\tconj\t_\t_\n'
        '14\ten\ten\tADP\t_\t_\t15\tcase\t_\t_\n'
        '15\t1973\t1973\tNUM\t_\t_\t13\tobl\t_\tSpaceAfter=No\n'
        '16\t.\t.\tPUNCT\t_\t_\t5\tpunct\t_\t_\n')
    sentence = Sentence(source)

    assert sentence not in conll
def test_contains_true():
    """
    Test that a Conll object can test for membership presence properly.
    """
    with open(fixture_location('basic.conll')) as f:
        conll = Conll(f)

    # Token columns are TAB-separated, as the CoNLL-U format requires; only
    # the comment lines contain literal spaces.
    source = (
        '# sent_id = fr-ud-dev_00002\n'
        '# text = Les études durent six ans mais leur contenu diffère donc selon les Facultés.\n'
        '1\tLes\tle\tDET\t_\tDefinite=Def|Gender=Fem|Number=Plur|PronType=Art\t2\tdet\t_\t_\n'
        '2\tétudes\tétude\tNOUN\t_\tGender=Fem|Number=Plur\t3\tnsubj\t_\t_\n'
        '3\tdurent\tdurer\tVERB\t_\tMood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin\t0\troot\t_\t_\n'
        '4\tsix\tsix\tNUM\t_\t_\t5\tnummod\t_\t_\n'
        '5\tans\tan\tNOUN\t_\tGender=Masc|Number=Plur\t3\tobj\t_\t_\n'
        '6\tmais\tmais\tCCONJ\t_\t_\t9\tcc\t_\t_\n'
        '7\tleur\tson\tDET\t_\tGender=Masc|Number=Sing|Poss=Yes|PronType=Prs\t8\tdet\t_\t_\n'
        '8\tcontenu\tcontenu\tNOUN\t_\tGender=Masc|Number=Sing\t9\tnsubj\t_\t_\n'
        '9\tdiffère\tdifférer\tVERB\t_\tMood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin\t3\tconj\t_\t_\n'
        '10\tdonc\tdonc\tADV\t_\t_\t9\tadvmod\t_\t_\n'
        '11\tselon\tselon\tADP\t_\t_\t13\tcase\t_\t_\n'
        '12\tles\tle\tDET\t_\tDefinite=Def|Number=Plur|PronType=Art\t13\tdet\t_\t_\n'
        '13\tFacultés\tFacultés\tPROPN\t_\t_\t9\tobl\t_\tSpaceAfter=No\n'
        '14\t.\t.\tPUNCT\t_\t_\t3\tpunct\t_\t_')
    sentence = Sentence(source)

    conll.append(sentence)
    assert sentence in conll

    # Membership must survive a change to one of the sentence's tokens. The
    # change goes through ``upos``, the tag field the other tests in this file
    # read, so that the token is really modified.
    # NOTE(review): confirm ``upos`` is the intended field to mutate here.
    sentence['1'].upos = 'NOUN'
    assert sentence in conll
def test_insert():
    """
    Test that a sentence can be inserted to a Conll object.
    """
    with open(fixture_location('basic.conll')) as f:
        conll = Conll(f)

    orig_length = len(conll)

    # Token columns are TAB-separated, as the CoNLL-U format requires; only
    # the comment lines contain literal spaces.
    source = (
        '# sent_id = fr-ud-dev_00002\n'
        '# text = Les études durent six ans mais leur contenu diffère donc selon les Facultés.\n'
        '1\tLes\tle\tDET\t_\tDefinite=Def|Gender=Fem|Number=Plur|PronType=Art\t2\tdet\t_\t_\n'
        '2\tétudes\tétude\tNOUN\t_\tGender=Fem|Number=Plur\t3\tnsubj\t_\t_\n'
        '3\tdurent\tdurer\tVERB\t_\tMood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin\t0\troot\t_\t_\n'
        '4\tsix\tsix\tNUM\t_\t_\t5\tnummod\t_\t_\n'
        '5\tans\tan\tNOUN\t_\tGender=Masc|Number=Plur\t3\tobj\t_\t_\n'
        '6\tmais\tmais\tCCONJ\t_\t_\t9\tcc\t_\t_\n'
        '7\tleur\tson\tDET\t_\tGender=Masc|Number=Sing|Poss=Yes|PronType=Prs\t8\tdet\t_\t_\n'
        '8\tcontenu\tcontenu\tNOUN\t_\tGender=Masc|Number=Sing\t9\tnsubj\t_\t_\n'
        '9\tdiffère\tdifférer\tVERB\t_\tMood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin\t3\tconj\t_\t_\n'
        '10\tdonc\tdonc\tADV\t_\t_\t9\tadvmod\t_\t_\n'
        '11\tselon\tselon\tADP\t_\t_\t13\tcase\t_\t_\n'
        '12\tles\tle\tDET\t_\tDefinite=Def|Number=Plur|PronType=Art\t13\tdet\t_\t_\n'
        '13\tFacultés\tFacultés\tPROPN\t_\t_\t9\tobl\t_\tSpaceAfter=No\n'
        '14\t.\t.\tPUNCT\t_\t_\t3\tpunct\t_\t_')
    sentence = Sentence(source)

    conll.insert(2, sentence)

    # The container grows by one and the new sentence sits at the insertion
    # index with all fourteen of its tokens.
    assert len(conll) == orig_length + 1
    assert conll[2].id == 'fr-ud-dev_00002'
    assert len(conll[2]) == 14
def test_invalid_conll():
    """
    Test that building a Conll object from an invalid source raises a
    ValueError.
    """
    invalid_path = fixture_location('invalid.conll')

    # The file is opened first and the construction alone is expected to
    # raise.
    with open(invalid_path) as invalid_file, pytest.raises(ValueError):
        Conll(invalid_file)
def test_getitem_raises_typeerror():
    """
    Test that indexing with a key that is neither an integer nor a slice
    raises a TypeError.
    """
    basic_path = fixture_location('basic.conll')
    with open(basic_path) as basic_file:
        conll = Conll(basic_file)

    # A string key is not a supported index type.
    with pytest.raises(TypeError):
        _ = conll['error']
def test_writing_output():
    """
    Test that CoNLL files are properly created.

    The parsed fixture is written to a scratch file next to the fixtures and
    the written text is compared with the original file contents.
    """
    with open(fixture_location('basic.conll')) as f:
        contents_basic = f.read()
        # Rewind so the Conll object parses the same text that was just read.
        f.seek(0)
        conll = Conll(f)

    output_loc = fixture_location('output.conll')
    try:
        with open(output_loc, 'w') as f:
            conll.write(f)

        with open(output_loc) as f:
            contents_write = f.read()
    finally:
        # Always clean up the scratch file, even if writing or reading it
        # failed, so a broken run does not leave stray output behind.
        if os.path.exists(output_loc):
            os.remove(output_loc)

    assert contents_basic == contents_write
def test_ngram_first_word_match():
    """
    Test that matching only the first word of an ngram does not count as a
    match.
    """
    corpus = load_from_file(fixture_location('long.conll'))
    ngram = 'un cabinet'.split()
    matches = find_ngrams(corpus, ngram)

    # No result is expected, so the very first advance must exhaust the
    # iterator.
    with pytest.raises(StopIteration):
        next(matches)
def test_ngram_standard():
    """
    Test that find_ngrams locates an ngram in a standard situation.
    """
    corpus = load_from_file(fixture_location('basic.conll'))
    ngram = 'un film sur la'.split()

    # Each result is a pair; only the first one is inspected.
    first_match = next(find_ngrams(corpus, ngram))
    sentence, index = first_match

    assert sentence.id == 'fr-ud-dev_00001'
    assert index == 2
def test_ngram_none():
    """
    Test that no ngram is identified when none exists.
    """
    corpus = load_from_file(fixture_location('long.conll'))
    ngram = 'cabinet'.split()
    matches = find_ngrams(corpus, ngram)

    # With nothing to find, the first advance must exhaust the iterator.
    with pytest.raises(StopIteration):
        next(matches)
def test_no_nonprojectivities():
    """
    Test a sentence that has no non-projective dependencies.
    """
    corpus = load_from_file(fixture_location('projectivities.conll'))
    first_sentence = corpus[0]

    nonprojective = find_nonprojective_deps(first_sentence)

    # An empty result is expected.
    assert not nonprojective
def test_numeric_indexing():
    """
    Test that sentences can be indexed by their numeric position.
    """
    basic_path = fixture_location('basic.conll')
    with open(basic_path) as basic_file:
        corpus = Conll(basic_file)

    first_sentence = corpus[0]

    assert len(first_sentence) == 10
    assert first_sentence.id == 'fr-ud-dev_00001'
def test_delitem_single_int():
    """
    Test that a Sentence addressed by an integer index is properly deleted
    from a Conll object.
    """
    basic_path = fixture_location('basic.conll')
    with open(basic_path) as basic_file:
        corpus = Conll(basic_file)

    del corpus[2]

    # One sentence fewer remains, and the following sentence now sits at the
    # deleted index.
    remaining = len(corpus)
    assert remaining == 3
    assert corpus[2].id == 'fr-ud-dev_00004'
def test_multiword_ignore():
    """
    Test that multiword tokens are ignored and do not cause errors.
    """
    corpus = load_from_file(fixture_location('projectivities.conll'))
    sentence = corpus[3]

    found = find_nonprojective_deps(sentence)

    # Exactly one pair of tokens is expected.
    expected = [(sentence['16'], sentence['4'])]
    assert found == expected
def test_string_output():
    """
    Test that the string form of a Conll object reproduces its source text.
    """
    basic_path = fixture_location('basic.conll')
    with open(basic_path) as basic_file:
        expected_text = basic_file.read()
        # Rewind so the same text is parsed into the Conll object.
        basic_file.seek(0)
        corpus = Conll(basic_file)

    assert expected_text == corpus.conll()
def test_load_from_file():
    """
    Test that a CoNLL file can be loaded given only its filename.
    """
    basic_path = fixture_location('basic.conll')
    corpus = load_from_file(basic_path)
    second_sentence = corpus[1]

    assert len(corpus) == 4
    assert len(second_sentence) == 14

    # Spot check a single token by its id.
    token = second_sentence['10']
    assert token.form == 'donc'
def test_iter_from_file():
    """
    Test that the sentences of a CoNLL file can be iterated over given only
    the filename.
    """
    expected_ids = list(map('fr-ud-dev_0000{}'.format, range(1, 5)))

    # Collect the ids one sentence at a time, in iteration order.
    actual_ids = []
    for sentence in iter_from_file(fixture_location('basic.conll')):
        actual_ids.append(sentence.id)

    assert expected_ids == actual_ids
def test_iter_from_string():
    """
    Test that the sentences of a CoNLL source held in a string can be
    iterated over.
    """
    basic_path = fixture_location('basic.conll')
    with open(basic_path) as basic_file:
        text = basic_file.read()

    expected_ids = list(map('fr-ud-dev_0000{}'.format, range(1, 5)))

    # Collect the ids one sentence at a time, in iteration order.
    actual_ids = []
    for sentence in iter_from_string(text):
        actual_ids.append(sentence.id)

    assert expected_ids == actual_ids
def test_multiple_nonprojectivities():
    """
    Test that multiple disjoint non-projectivities are properly identified.
    """
    corpus = load_from_file(fixture_location('projectivities.conll'))
    sentence = corpus[5]

    found = find_nonprojective_deps(sentence)

    # Compared as sets, so the order of the reported pairs does not matter.
    expected = {
        (sentence['22'], sentence['3']),
        (sentence['22'], sentence['21']),
        (sentence['28'], sentence['25']),
    }
    assert set(found) == expected
def test_overlapping_nonprojectivities():
    """
    Test that multiple non-projectivities can overlap.
    """
    corpus = load_from_file(fixture_location('projectivities.conll'))
    sentence = corpus[4]

    found = find_nonprojective_deps(sentence)

    # Compared as sets, so the order of the reported pairs does not matter.
    expected = {
        (sentence['16'], sentence['4']),
        (sentence['16'], sentence['11']),
    }
    assert set(found) == expected
def test_delitem_contains():
    """
    Test that membership checks still work after a deletion.
    """
    long_path = fixture_location('long.conll')
    with open(long_path) as long_file:
        corpus = Conll(long_file)

    # Keep a reference to the sentence so it can be checked after removal.
    removed = corpus[1]
    assert removed in corpus

    del corpus[1]
    assert removed not in corpus
def test_par_and_doc_id_basic():
    """
    Test that document ids are properly associated with the Sentences.
    """
    ids_path = fixture_location('par_doc_ids_basic.conll')
    with open(ids_path) as ids_file:
        corpus = Conll(ids_file)

    expected_doc_ids = ['2', '2', '1', '1']

    # Gather each sentence's document id in corpus order.
    actual_doc_ids = []
    for sentence in corpus:
        actual_doc_ids.append(sentence.doc_id)

    assert expected_doc_ids == actual_doc_ids
@responses.activate
def test_iter_from_network():
    """
    Test that a CoNLL file over a network can be iterated.
    """
    TEST_CONLL_URL = 'https://myconllrepo.com/english/train'

    # Register the fixture as the body served for the test URL. The mock is
    # only installed while the ``responses.activate`` decorator is in effect;
    # without it the request would go to the real network.
    with open(fixture_location('basic.conll')) as f:
        responses.add(responses.GET, TEST_CONLL_URL, body=f.read())

    expected_ids = ['fr-ud-dev_0000{}'.format(i) for i in range(1, 5)]
    actual_ids = [sent.id for sent in iter_from_url(TEST_CONLL_URL)]

    assert expected_ids == actual_ids
@responses.activate
def test_iter_from_network_fail():
    """
    Test that iterating over a URL that cannot be fetched raises a requests
    exception.
    """
    TEST_CONLL_URL = 'https://myconllrepo.com/english/train'
    WRONG_URL = 'https://myconllrepo.com/english/gibberish'

    # Only TEST_CONLL_URL is registered. With the mock installed by the
    # ``responses.activate`` decorator, a request to any other URL fails
    # without touching the real network.
    with open(fixture_location('basic.conll')) as f:
        responses.add(responses.GET, TEST_CONLL_URL, body=f.read())

    with pytest.raises(requests.exceptions.RequestException):
        # The iterator has to be consumed for the request to be observed.
        for _ in iter_from_url(WRONG_URL):
            pass
def test_sentence_line_numbers():
    """
    Test that the CoNLL files properly associate line numbers.
    """
    # Expected (start_line_number, end_line_number) per sentence, in file
    # order.
    sentence_bounds = [(1, 12), (14, 29), (31, 41), (43, 96)]

    with open(fixture_location('basic.conll')) as f:
        c = Conll(f)

    # Without this check, a parse that yields fewer sentences than expected
    # would pass after comparing only the sentences that are present.
    assert len(c) == len(sentence_bounds)

    for sent, (start_line, end_line) in zip(c, sentence_bounds):
        assert sent.start_line_number == start_line
        assert sent.end_line_number == end_line
def test_load_from_string():
    """
    Test that a CoNLL source held in a string can be loaded.
    """
    basic_path = fixture_location('basic.conll')
    with open(basic_path) as basic_file:
        text = basic_file.read()

    corpus = load_from_string(text)
    second_sentence = corpus[1]

    assert len(corpus) == 4
    assert len(second_sentence) == 14

    # Spot check a single token by its id.
    token = second_sentence['10']
    assert token.form == 'donc'
def test_simple_nonprojectivities():
    """
    Test the logic on sentences that each have a single non-projectivity.
    """
    corpus = load_from_file(fixture_location('projectivities.conll'))

    # Analyse both sentences before checking either result.
    first_sentence = corpus[3]
    first_found = find_nonprojective_deps(first_sentence)
    second_sentence = corpus[2]
    second_found = find_nonprojective_deps(second_sentence)

    first_expected = [(first_sentence['16'], first_sentence['4'])]
    assert first_found == first_expected

    second_expected = [(second_sentence['8'], second_sentence['5'])]
    assert second_found == second_expected
def test_ngram_multiword_split():
    """
    Test that ngram searches still work when they span a multiword token.
    """
    corpus = load_from_file(fixture_location('long.conll'))
    ngram = 'de " décentrement de le Sujet "'.split()
    matches = find_ngrams(corpus, ngram)

    sentence, index = next(matches)
    assert sentence.id == 'fr-ud-test_00002'
    assert index == 8

    # Only a single match is expected, so a second advance must exhaust the
    # iterator.
    with pytest.raises(StopIteration):
        next(matches)