def test_parse_CoNLL2009_2(self):
    data = dedent("""\
        #\tid='1'-document_id='36:1047'-span='1'
        1\t+\t+\tPunc\tPunc\t_\t0\tROOT\t_\t_
        2\tIn\tin\tr\tr\tr|-|-|-|-|-|-|-|-\t5\tAuxP\t_\t_
        3\tDei\tDeus\tn\tPropn\tn|-|s|-|-|-|m|g|-\t4\tATR\t_\t_
        4\tnomine\tnomen\tn\tn\tn|-|s|-|-|-|n|b|-\t2\tADV\t_\t_
        5\tregnante\tregno\tt\tt\tt|-|s|p|p|a|m|b|-\t0\tADV\t_\t_
    """)
    sentences = parse(
        data,
        fields=('id', 'form', 'lemma', 'upostag', 'xpostag', 'feats',
                'head', 'deprel', 'deps', 'misc'),
        field_parsers={
            "feats": lambda line, i: [feat for feat in line[i].split("|")]
        }
    )
    self.assertEqual(
        sentences[0][4],
        Token([
            ('id', 5),
            ('form', 'regnante'),
            ('lemma', 'regno'),
            ('upostag', 't'),
            ('xpostag', 't'),
            ('feats', ['t', '-', 's', 'p', 'p', 'a', 'm', 'b', '-']),
            ('head', 0),
            ('deprel', 'ADV'),
            ('deps', None),
            ('misc', None),
        ])
    )
    self.assertEqual(
        sentences[0].metadata,
        Token([('id', "'1'-document_id='36:1047'-span='1'")])
    )

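# parse() accepts a `field_parsers` mapping to override how individual columns are
# interpreted: each parser is called with the full list of column values for the line
# plus the index of its own column, which is also what lets a single parser consume
# all trailing columns (see the 'apreds' parser in test_parse_CoNLL2009_1). In
# test_parse_CoNLL2009_2 above, 'feats' is kept as a plain list of annotations
# instead of the default key=value dict.
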
def test_no_root_nodes(self):
    tokenlist = TokenList([
        Token([('id', 1), ('form', 'To'), ('head', 1)]),
        Token([('id', 2), ('form', 'appear'), ('head', 2)]),
    ])
    with self.assertRaises(ParseException):
        tokenlist.to_tree()

def test_invalid_key_access(self):
    token = Token({"id": 1, "xpostag": "DT", "upostag": "DET"})

    with self.assertRaises(KeyError):
        token["inexistent_value"]

    self.assertEqual(token.get("inexistent_value"), None)
    self.assertEqual(token.get("inexistent_value", "HEJ"), "HEJ")

def test_custom_metadata_parsers(self):
    data = dedent("""\
        # global.columns = ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC
        # newdoc id = mf920901-001
        # newpar id = mf920901-001-p1
        # sent_id = mf920901-001-p1s1A
        # text = Slovenská ústava: pro i proti
        # text_en = Slovak constitution: pros and cons
    """)

    _, metadata = parse_token_and_metadata(data)
    self.assertEqual(metadata, Token([
        ("global.columns", "ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC"),
        ("newdoc id", "mf920901-001"),
        ("newpar id", "mf920901-001-p1"),
        ("sent_id", "mf920901-001-p1s1A"),
        ("text", "Slovenská ústava: pro i proti"),
        ("text_en", "Slovak constitution: pros and cons"),
    ]))

    _, metadata = parse_token_and_metadata(
        data,
        metadata_parsers={"global.columns": lambda key, value: (key, value.split())}
    )
    self.assertEqual(metadata, Token([
        ("global.columns", ["ID", "FORM", "LEMMA", "UPOS", "XPOS", "FEATS", "HEAD", "DEPREL", "DEPS", "MISC"]),
        ("newdoc id", "mf920901-001"),
        ("newpar id", "mf920901-001-p1"),
        ("sent_id", "mf920901-001-p1s1A"),
        ("text", "Slovenská ústava: pro i proti"),
        ("text_en", "Slovak constitution: pros and cons"),
    ]))

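# Comment lines are collected into the sentence `metadata`. A `metadata_parsers`
# mapping can override how a given key is handled: the parser receives the key and
# the raw value and returns the (key, value) pair to store, as test_custom_metadata_parsers
# above does to split 'global.columns' into a list of column names.
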
def test_flatten(self):
    tree = TokenTree(token=Token([("id", 2), ("form", "dog")]), children=[
        TokenTree(token=Token([("id", 1), ("form", "a")]), children=[])
    ])
    self.assertEqual(
        tree.serialize(),
        dedent("""\
            1\ta
            2\tdog
        """)
    )

    tree = TokenTree(token=Token([("id", 1), ("form", "dog")]), children=[
        TokenTree(token=Token([("id", 2), ("form", "a")]), children=[])
    ])
    self.assertEqual(
        tree.serialize(),
        dedent("""\
            1\tdog
            2\ta
        """)
    )

def test_parse(self):
    sentences = parse(data)
    self.assertEqual(len(sentences), 1)

    sentence = sentences[0]
    self.assertEqual(
        str(sentence),
        "TokenList<The, quick, brown, fox, jumps, over, the, lazy, dog, .>"
    )
    self.assertEqual(
        sentence[0],
        Token([
            ('id', 1),
            ('form', 'The'),
            ('lemma', 'the'),
            ('upos', 'DET'),
            ('xpos', 'DT'),
            ('feats', Token([('Definite', 'Def'), ('PronType', 'Art')])),
            ('head', 4),
            ('deprel', 'det'),
            ('deps', None),
            ('misc', None),
        ])
    )
    self.assertEqual(
        sentence[8],
        Token([
            ('id', 9),
            ('form', 'dog'),
            ('lemma', 'dog'),
            ('upos', 'NOUN'),
            ('xpos', 'NN'),
            ('feats', Token([('Number', 'Sing')])),
            ('head', 5),
            ('deprel', 'nmod'),
            ('deps', None),
            ('misc', Token([("SpaceAfter", "No")])),
        ])
    )
    self.assertEqual(
        [token["form"] for token in sentence],
        "The quick brown fox jumps over the lazy dog .".split(" ")
    )
    self.assertEqual(
        sentence.metadata["text"],
        "The quick brown fox jumps over the lazy dog."
    )

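# test_parse and test_parse_tree rely on a module-level `data` fixture defined
# outside this excerpt; judging by the assertions, it holds a single CoNLL-U sentence,
# the ten-token "The quick brown fox jumps over the lazy dog." example.
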
def test_ordered_dict(self):
    data = Token()
    self.assertEqual(serialize_field(data), "")

    data = Token([('SpaceAfter', 'No')])
    self.assertEqual(serialize_field(data), "SpaceAfter=No")

    data = Token([('Translit', None)])
    self.assertEqual(serialize_field(data), "Translit=_")

def test_parse_line(self):
    line = "1\tThe\tthe\tDET\tDT\tDefinite=Def|PronType=Art\t4\tdet\t_\t_"
    self.assertEqual(
        parse_line(line, fields=DEFAULT_FIELDS),
        Token([
            ('id', 1),
            ('form', 'The'),
            ('lemma', 'the'),
            ('upos', 'DET'),
            ('xpos', 'DT'),
            ('feats', Token([('Definite', 'Def'), ('PronType', 'Art')])),
            ('head', 4),
            ('deprel', 'det'),
            ('deps', None),
            ('misc', None),
        ])
    )

def test_custom_fields(self):
    data = dedent("""\
        1\t1\t1
        2\t2\t2
    """)
    tokens, _ = parse_token_and_metadata(data, fields=("id", "id", "id"))
    self.assertEqual(tokens, [
        Token([("id", 1), ("id", 1), ("id", 1)]),
        Token([("id", 2), ("id", 2), ("id", 2)]),
    ])

def test_simple_tree(self):
    tokenlist = TokenList([
        Token([("id", 2), ("form", "dog"), ("head", 0)]),
        Token([("id", 1), ("form", "a"), ("head", 2)]),
    ])
    tree = TokenTree(token=Token([("id", 2), ("form", "dog"), ("head", 0)]), children=[
        TokenTree(token=Token([("id", 1), ("form", "a"), ("head", 2)]), children=[])
    ])
    self.assertTreeEqual(tokenlist.to_tree(), tree)

def test_removes_negative_nodes(self):
    tokenlist = TokenList([
        Token([("id", 2), ("form", "dog"), ("head", 0)]),
        Token([("id", 1), ("form", "a"), ("head", 2)]),
        Token([("id", 3), ("form", "😍"), ("head", -1)]),
    ])
    tree = TokenTree(token=Token([("id", 2), ("form", "dog"), ("head", 0)]), children=[
        TokenTree(token=Token([("id", 1), ("form", "a"), ("head", 2)]), children=[])
    ])
    self.assertTreeEqual(tokenlist.to_tree(), tree)

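# TokenList.to_tree() builds a TokenTree rooted at the token whose head is 0, drops
# tokens with a negative head (test_removes_negative_nodes above), and raises
# ParseException when no root token can be found (test_no_root_nodes).
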
def test_default_field_parsers_when_undefined(self):
    data = dedent("""\
        1\tfrom
        2\tparis
    """)
    fields = ("id", "form")
    field_parsers = {
        # Rely on default 'id' field parser
        "form": lambda line, i: line[i].upper()
    }
    tokens, _ = parse_token_and_metadata(data, fields=fields, field_parsers=field_parsers)
    self.assertEqual(tokens, [
        Token([("id", 1), ("form", "FROM")]),
        Token([("id", 2), ("form", "PARIS")]),
    ])

def test_parse_line_nullable_fields(self):
    line = "_\t_\t_\t_\t_\t_\t_\t_\t_\t_"
    self.assertEqual(
        parse_line(line, fields=DEFAULT_FIELDS),
        Token([
            ('id', None),
            ('form', '_'),
            ('lemma', '_'),
            ('upos', '_'),
            ('xpos', None),
            ('feats', None),
            ('head', None),
            ('deprel', '_'),
            ('deps', None),
            ('misc', None),
        ])
    )

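# As test_parse_line_nullable_fields above shows, the default field parsers map the
# "_" placeholder to None for 'id', 'xpos', 'feats', 'head', 'deps' and 'misc', while
# plain string fields such as 'form', 'lemma', 'upos' and 'deprel' keep the literal
# underscore.
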
def test_remove(self):
    tokenlist = TokenList([{"id": 1}, {"id": 2}])

    tokenlist.remove(Token({"id": 1}))
    self.assertEqual(tokenlist, TokenList([{"id": 2}]))

    tokenlist.remove({"id": 2})
    self.assertEqual(tokenlist, TokenList())

def test_parse_line_only_id_head(self):
    line = "1\tThe\tthe\tDET\tDT\tDefinite=Def|PronType=Art\t4\tdet\t_\t_"
    self.assertEqual(parse_line(line, fields=["id", "form"]), Token([
        ('id', 1),
        ('form', 'The'),
    ]))

def test_parse_line_two_spaces(self):
    # Columns separated by two spaces instead of tabs
    line = "1  The  the  DET  DT  Definite=Def|PronType=Art  4  det  _  _"
    self.assertEqual(parse_line(line, fields=["id", "form"]), Token([
        ('id', 1),
        ('form', 'The'),
    ]))

def test_parse_conllu_plus(self):
    data = dedent("""\
        # global.columns = ID FORM UPOS HEAD DEPREL MISC PARSEME:MWE
        # source_sent_id = conllu http://hdl.handle.net/11234/1-2837 UD_German-GSD/de_gsd-ud-train.conllu train-s16
        # sent_id = train-s16
        # text = Der CDU-Politiker strebt
        1\tDer\tDET\t2\tdet\t_\t*
        2\tCDU\tPROPN\t4\tcompound\tSpaceAfter=No\t*
        3\t-\tPUNCT\t2\tpunct\tSpaceAfter=No\t*
        4\tPolitiker\tNOUN\t5\tnsubj\t_\t*
        5\tstrebt\tVERB\t0\troot\t_\t2:VPC.full
    """)
    sentences = parse(data)
    self.assertEqual(
        sentences[0][4],
        Token([
            ('id', 5),
            ('form', 'strebt'),
            ('upos', 'VERB'),
            ('head', 0),
            ('deprel', 'root'),
            ('misc', None),
            ('parseme:mwe', '2:VPC.full'),
        ])
    )

def test_parse_tree(self):
    sentences = parse_tree(data)
    self.assertEqual(len(sentences), 1)

    root = sentences[0]
    self.assertEqual(
        str(root),
        "TokenTree<token={id=5, form=jumps}, children=[...]>"
    )
    self.assertEqual(
        root.token,
        Token([
            ('id', 5),
            ('form', 'jumps'),
            ('lemma', 'jump'),
            ('upos', 'VERB'),
            ('xpos', 'VBZ'),
            ('feats', Token([
                ("Mood", "Ind"),
                ("Number", "Sing"),
                ("Person", "3"),
                ("Tense", "Pres"),
                ("VerbForm", "Fin"),
            ])),
            ('head', 0),
            ('deprel', 'root'),
            ('deps', None),
            ('misc', None),
        ])
    )
    self.assertEqual([str(child) for child in root.children], [
        "TokenTree<token={id=4, form=fox}, children=[...]>",
        "TokenTree<token={id=9, form=dog}, children=[...]>",
        "TokenTree<token={id=10, form=.}, children=None>",
    ])
    self.assertEqual(root.metadata["text"], "The quick brown fox jumps over the lazy dog.")
    self.assertEqual(root.serialize(), data)
    self.assertEqual(
        capture_print(root.print_tree),
        dedent("""\
            (deprel:root) form:jumps lemma:jump upos:VERB [5]
                (deprel:nsubj) form:fox lemma:fox upos:NOUN [4]
                    (deprel:det) form:The lemma:the upos:DET [1]
                    (deprel:amod) form:quick lemma:quick upos:ADJ [2]
                    (deprel:amod) form:brown lemma:brown upos:ADJ [3]
                (deprel:nmod) form:dog lemma:dog upos:NOUN [9]
                    (deprel:case) form:over lemma:over upos:ADP [6]
                    (deprel:det) form:the lemma:the upos:DET [7]
                    (deprel:amod) form:lazy lemma:lazy upos:ADJ [8]
                (deprel:punct) form:. lemma:. upos:PUNCT [10]
        """)
    )

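# `capture_print` is assumed to be a helper defined elsewhere in this suite that runs
# the given callable (here root.print_tree) and returns whatever it wrote to stdout,
# so the indented tree rendering can be compared as a plain string.
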
def test_newlines_in_sentence(self):
    # The blank line between tokens 2 and 3 exercises newline handling inside a sentence
    data = dedent("""\
        # meta = data
        1\thej
        2\tdå

        3\thej
        4\tdå
    """)
    tokens, metadata = parse_token_and_metadata(data)
    self.assertListEqual(tokens, [
        Token([("id", 1), ("form", "hej")]),
        Token([("id", 2), ("form", "då")]),
        Token([("id", 3), ("form", "hej")]),
        Token([("id", 4), ("form", "då")]),
    ])
    self.assertEqual(metadata, Token([("meta", "data")]))

def test_insert(self):
    tokenlist = TokenList()

    tokenlist.insert(0, Token({"id": 1}))
    self.assertEqual(tokenlist, TokenList([{"id": 1}]))

    tokenlist.insert(1, {"id": 2})
    self.assertEqual(tokenlist, TokenList([{"id": 1}, {"id": 2}]))
    self.assertEqual(type(tokenlist[1]), Token)

def test_append(self):
    tokenlist = TokenList()

    tokenlist.append(Token({"id": 1}))
    self.assertEqual(tokenlist, TokenList([{"id": 1}]))

    tokenlist.append({"id": 2})
    self.assertEqual(tokenlist, TokenList([{"id": 1}, {"id": 2}]))
    self.assertEqual(type(tokenlist[1]), Token)

def _tacred_example_to_token_list(self, example: Dict[str, Any]) -> conllu.TokenList:
    id_ = example["id"]
    tokens = example["token"]
    ner = example["stanford_ner"]

    subj_start = example["subj_start"]
    subj_end = example["subj_end"]
    obj_start = example["obj_start"]
    obj_end = example["obj_end"]
    subj_tag = example["subj_type"]
    obj_tag = example["obj_type"]
    label = example["relation"]

    metadata = {
        "text": " ".join(tokens),
        "sentence_id": str(id_),
        "relations": ";".join([
            str(subj_start + 1),
            str(subj_end + 1),
            str(obj_start + 1),
            str(obj_end + 1),
            label,
        ]),
    }

    prev_tag = None
    token_dicts = []
    for idx, (token, tag) in enumerate(zip(tokens, ner)):
        # Subject/object spans override the NER tags of their tokens
        if subj_start <= idx <= subj_end:
            tag = subj_tag
        if obj_start <= idx <= obj_end:
            tag = obj_tag

        # BIO encoding: "B-" opens a span, "I-" continues one
        prefix = ""
        if tag != "O":
            if tag != prev_tag:
                prefix = "B-"
            else:
                prefix = "I-"
        prev_tag = tag

        token_dicts.append(Token({
            "id": str(idx + 1),
            "form": convert_ptb_token(token),
            "ner": prefix + tag,
        }))

    return conllu.TokenList(tokens=token_dicts, metadata=Metadata(metadata))

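# The helper above stores the relation in the sentence metadata as
# "subj_start;subj_end;obj_start;obj_end;label" with 1-based token offsets, and each
# token carries a BIO-encoded NER tag. `convert_ptb_token` and `Metadata` are assumed
# to be defined or imported elsewhere in this file.
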
def test_deep_filtering(self):
    tokenlist = TokenList([
        {"form": "The", "feats": Token([('Definite', 'Def'), ('PronType', 'Art')])},
        {"form": "quick", "feats": Token([('Degree', 'Pos')])},
        {"form": "brown", "feats": Token([('Degree', 'Pos')])},
        {"form": "fox", "feats": Token([('Number', 'Sing')])},
    ])
    self.assertEqual(
        tokenlist.filter(feats__Degree="Pos"),
        TokenList([
            {"form": "quick", "feats": Token([('Degree', 'Pos')])},
            {"form": "brown", "feats": Token([('Degree', 'Pos')])},
        ])
    )
    self.assertEqual(
        tokenlist.filter(form="brown", feats__Degree="Pos"),
        TokenList([
            {"form": "brown", "feats": Token([('Degree', 'Pos')])},
        ])
    )
    self.assertEqual(
        tokenlist.filter(form="brown", feats__Degree="Pos", id=1),
        TokenList([])
    )
    self.assertEqual(
        tokenlist.filter(unknown__property__value="undefined"),
        TokenList([])
    )
    self.assertEqual(
        tokenlist.filter(unknown___property____value="undefined"),
        TokenList([])
    )

def test_nested_filtering(self):
    tokenlist = TokenList([
        {"form": "The", "feats": Token([('Definite', 'Def'), ('PronType', 'Art')])},
        {"form": "quick", "feats": Token([('Degree', 'Pos')])},
        {"form": "brown", "feats": Token([('Degree', 'Pos')])},
        {"form": "fox", "feats": Token([('Number', 'Sing')])},
    ])
    self.assertEqual(
        tokenlist.filter(feats__Degree="Pos").filter(form="brown"),
        TokenList([
            {"form": "brown", "feats": Token([('Degree', 'Pos')])},
        ])
    )
    self.assertEqual(
        tokenlist.filter(form="brown").filter(feats__Degree="Pos"),
        TokenList([
            {"form": "brown", "feats": Token([('Degree', 'Pos')])},
        ])
    )
    self.assertEqual(
        tokenlist.filter(form="brown").filter(feats__Degree="Pos").filter(),
        TokenList([
            {"form": "brown", "feats": Token([('Degree', 'Pos')])},
        ])
    )
    self.assertEqual(
        tokenlist.filter(form="brown").filter(feats__Degree="Pos").filter(id=0),
        TokenList([])
    )

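# TokenList.filter() matches tokens by keyword arguments: a double underscore in the
# keyword descends into nested Token values (feats__Degree="Pos" matches a token whose
# 'feats' contains Degree=Pos), unknown keys simply match nothing, and because
# filter() returns another TokenList the calls can be chained, as the two tests above
# show.
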
def test_parse_line_fewer_columns(self):
    line = "1\tThe\tthe\tDET\tDT"
    self.assertEqual(parse_line(line, fields=DEFAULT_FIELDS), Token([
        ('id', 1),
        ('form', 'The'),
        ('lemma', 'the'),
        ('upos', 'DET'),
        ('xpos', 'DT'),
    ]))

def test_parse_custom_fieldparsers(self):
    line = "1\t2"
    custom_fieldparsers = {
        "id": lambda line, i: line[i] * 5,
    }
    self.assertEqual(
        parse_line(line, fields=["id"], field_parsers=custom_fieldparsers),
        Token([
            ('id', "11111"),
        ])
    )

def test_parse_CoNLL2009_1(self):
    data = dedent("""\
        #\tid\tform\tlemma\tplemma\tpos\tppos\tfeats\tpfeats\thead\tphead\tdeprel\tpdeprel\tfillpred\tpred\tapreds
        1\tZ\tz\tz\tR\tR\tSubPOS=R|Cas=2\tSubPOS=R|Cas=2\t10\t10\tAuxP\tAuxP\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_
        2\ttéto\ttento\ttento\tP\tP\tSubPOS=D|Gen=F|Num=S|Cas=2\tSubPOS=D|Gen=F|Num=S|Cas=2\t3\t3\tAtr\tAtr\tY\ttento\t_\tRSTR\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_
        3\tknihy\tkniha\tkniha\tN\tN\tSubPOS=N|Gen=F|Num=S|Cas=2|Neg=A\tSubPOS=N|Gen=F|Num=S|Cas=2|Neg=A\t1\t1\tAdv\tAdv\tY\tkniha\t_\t_\t_\t_\t_\t_\t_\tDIR1\t_\t_\t_\t_\t_\t_\t_\t_
    """)
    sentences = parse(
        data,
        fields=('id', 'form', 'lemma', 'plemma', 'pos', 'ppos', 'feats', 'pfeats',
                'head', 'phead', 'deprel', 'pdeprel', 'fillpred', 'pred', 'apreds'),
        field_parsers={
            "pfeats": lambda line, i: parse_dict_value(line[i]),
            "phead": lambda line, i: parse_int_value(line[i]),
            "apreds": lambda line, i: [
                apred_field if apred_field != "_" else None
                for apred_field in line[i:len(line)]
            ],
        },
    )
    self.assertEqual(
        sentences[0][2],
        Token([
            ('id', 3),
            ('form', 'knihy'),
            ('lemma', 'kniha'),
            ('plemma', 'kniha'),
            ('pos', 'N'),
            ('ppos', 'N'),
            ('feats', Token([('SubPOS', 'N'), ('Gen', 'F'), ('Num', 'S'), ('Cas', '2'), ('Neg', 'A')])),
            ('pfeats', Token([('SubPOS', 'N'), ('Gen', 'F'), ('Num', 'S'), ('Cas', '2'), ('Neg', 'A')])),
            ('head', 1),
            ('phead', 1),
            ('deprel', 'Adv'),
            ('pdeprel', 'Adv'),
            ('fillpred', 'Y'),
            ('pred', 'kniha'),
            ('apreds', [
                None, None, None, None, None, None, None, 'DIR1',
                None, None, None, None, None, None, None, None,
            ]),
        ])
    )

def test_parse_dict_value(self):
    self.assertEqual(
        parse_dict_value("key1"),
        Token([("key1", "")])
    )
    self.assertEqual(
        parse_dict_value("key1=val1"),
        Token([("key1", "val1")])
    )
    self.assertEqual(
        parse_dict_value("key1=val1|key2=val2"),
        Token([("key1", "val1"), ("key2", "val2")])
    )
    self.assertEqual(
        parse_dict_value("key1=val1|key2|key3=val3"),
        Token([("key1", "val1"), ("key2", ""), ("key3", "val3")])
    )
    self.assertEqual(
        parse_dict_value("key1=val1|key1=val2"),
        Token([("key1", "val2")])
    )
    self.assertEqual(
        parse_dict_value("key1=_|_|_=val1"),
        Token([("key1", None)])
    )
    self.assertEqual(parse_dict_value(""), None)
    self.assertEqual(parse_dict_value("_"), None)

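# parse_dict_value() semantics, as asserted above: a bare key gets an empty-string
# value, a repeated key keeps its last value, a value of "_" becomes None, keys named
# "_" are dropped, and an input of "" or "_" parses to None.
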
def test_custom_field_parsers(self):
    data = dedent("""\
        1\tbackwards\tline
        2\tparis\tsirap
    """)
    fields = ("id", "backwards")

    # A field parser that takes all remaining fields, reverses their letters and joins them
    def parse_backwards(value):
        return " ".join([part[::-1] for part in value])

    # This overrides the default parsers, so the id is parsed as a string
    field_parsers = {
        "id": lambda line, i: line[i],
        "backwards": lambda line, i: parse_backwards(line[i:len(line)])
    }
    tokens, _ = parse_token_and_metadata(data, fields=fields, field_parsers=field_parsers)
    self.assertEqual(tokens, [
        Token([("id", '1'), ("backwards", "sdrawkcab enil")]),
        Token([("id", '2'), ("backwards", "sirap paris")]),
    ])

def test_parse_fieldparsers_alias_two_ways(self):
    line = "1\t2"
    custom_fieldparsers = {
        "xpos": lambda line, i: line[i] * 5,
        "upostag": lambda line, i: line[i] * 5,
    }
    self.assertEqual(
        parse_line(line, fields=["xpostag", "upos"], field_parsers=custom_fieldparsers),
        Token([
            ('xpostag', "11111"),
            ('upos', "22222"),
        ])
    )

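# Field name aliases: as test_parse_fieldparsers_alias_two_ways shows, a custom parser
# registered under 'xpos' also applies to the 'xpostag' field and one registered under
# 'upostag' also applies to 'upos', so either spelling works when overriding parsers.
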