def test_no_root_nodes(self):
    tokenlist = TokenList([
        OrderedDict([('id', 1), ('form', 'To'), ('head', 1)]),
        OrderedDict([('id', 2), ('form', 'appear'), ('head', 2)]),
    ])
    with self.assertRaises(ParseException):
        tokenlist.to_tree()
def test_nested_filtering(self):
    tokenlist = TokenList([
        {"form": "The", "feats": Token([('Definite', 'Def'), ('PronType', 'Art')])},
        {"form": "quick", "feats": Token([('Degree', 'Pos')])},
        {"form": "brown", "feats": Token([('Degree', 'Pos')])},
        {"form": "fox", "feats": Token([('Number', 'Sing')])},
    ])
    self.assertEqual(
        tokenlist.filter(feats__Degree="Pos").filter(form="brown"),
        TokenList([
            {"form": "brown", "feats": Token([('Degree', 'Pos')])},
        ])
    )
    self.assertEqual(
        tokenlist.filter(form="brown").filter(feats__Degree="Pos"),
        TokenList([
            {"form": "brown", "feats": Token([('Degree', 'Pos')])},
        ])
    )
    self.assertEqual(
        tokenlist.filter(form="brown").filter(feats__Degree="Pos").filter(),
        TokenList([
            {"form": "brown", "feats": Token([('Degree', 'Pos')])},
        ])
    )
    self.assertEqual(
        tokenlist.filter(form="brown").filter(feats__Degree="Pos").filter(id=0),
        TokenList([])
    )
def test_multiple_root_nodes(self):
    tokenlist = TokenList([
        Token([('id', 1), ('form', 'To'), ('head', 0)]),
        Token([('id', 2), ('form', 'appear'), ('head', 1)]),
        Token([('id', 4), ('form', 'EMNLP'), ('head', 0)]),
        Token([('id', 5), ('form', '2014'), ('head', 4)]),
        Token([('id', 6), ('form', 'Yay!'), ('head', 0)]),
    ])
    tree = TokenTree(
        token=Token([("id", 0), ("form", "_"), ("deprel", "root")]),
        children=[
            TokenTree(
                token=Token([("id", 1), ("form", "To"), ("head", 0)]),
                children=[TokenTree(
                    token=Token([("id", 2), ("form", "appear"), ("head", 1)]),
                    children=[]
                )]
            ),
            TokenTree(
                token=Token([("id", 4), ("form", "EMNLP"), ("head", 0)]),
                children=[TokenTree(
                    token=Token([("id", 5), ("form", "2014"), ("head", 4)]),
                    children=[]
                )]
            ),
            TokenTree(
                token=Token([("id", 6), ("form", "Yay!"), ("head", 0)]),
                children=[]
            ),
        ]
    )
    self.assertTreeEqual(tokenlist.to_tree(), tree)
def test_multiple_root_nodes(self):
    tokenlist = TokenList([
        OrderedDict([('id', 1), ('form', 'To'), ('head', 0)]),
        OrderedDict([('id', 2), ('form', 'appear'), ('head', 1)]),
        OrderedDict([('id', 4), ('form', 'EMNLP'), ('head', 0)]),
        OrderedDict([('id', 5), ('form', '2014'), ('head', 4)]),
    ])
    with self.assertRaises(ParseException):
        tokenlist.to_tree()
def test_extend_with_dict_list(self):
    tokenlist = TokenList([{"id": 1}])
    tokenlist.extend([{"id": 2}, {"id": 3}])
    self.assertEqual(tokenlist, TokenList([{"id": 1}, {"id": 2}, {"id": 3}]))
def test_parse_tree_and_serialize(self):
    from tests.fixtures import TESTCASES

    for testcase in TESTCASES:
        data = parse(testcase)
        testcase_without_range_and_elided = TokenList(
            [token for token in data[0] if isinstance(token["id"], int)],
            metadata=data[0].metadata)
        self.assertEqual(
            parse_tree(testcase)[0].serialize(),
            testcase_without_range_and_elided.serialize())
def test_clear(self):
    tokenlist = TokenList([{"id": 1}, {"id": 2}, {"id": 3}], {"meta": "data"})
    tokenlist.clear()
    self.assertEqual(len(tokenlist.tokens), 0)
    self.assertEqual(tokenlist.metadata, None)
def test_copy(self):
    tokenlist1 = TokenList([{"id": 1}, {"id": 2}, {"id": 3}], {"meta": "data"})
    tokenlist2 = tokenlist1.copy()
    self.assertIsNot(tokenlist1, tokenlist2)
    self.assertEqual(tokenlist1, tokenlist2)
def test_simple_tree(self):
    tokenlist = TokenList([
        Token([("id", 2), ("form", "dog"), ("head", 0)]),
        Token([("id", 1), ("form", "a"), ("head", 2)]),
    ])
    tree = TokenTree(
        token=Token([("id", 2), ("form", "dog"), ("head", 0)]),
        children=[
            TokenTree(token=Token([("id", 1), ("form", "a"), ("head", 2)]), children=[])
        ])
    self.assertTreeEqual(tokenlist.to_tree(), tree)
def test_to_tree(self):
    tokenlist = TokenList([
        OrderedDict([("id", 2), ("form", "dog"), ("head", 0)]),
        OrderedDict([("id", 1), ("form", "a"), ("head", 2)]),
    ])
    tree = TokenTree(
        token=OrderedDict([("id", 2), ("form", "dog"), ("head", 0)]),
        children=[TokenTree(
            token=OrderedDict([("id", 1), ("form", "a"), ("head", 2)]),
            children=[]
        )]
    )
    self.assertEqual(tokenlist.to_tree(), tree)
def test_removes_negative_nodes(self):
    tokenlist = TokenList([
        Token([("id", 2), ("form", "dog"), ("head", 0)]),
        Token([("id", 1), ("form", "a"), ("head", 2)]),
        Token([("id", 3), ("form", "😍"), ("head", -1)]),
    ])
    tree = TokenTree(
        token=Token([("id", 2), ("form", "dog"), ("head", 0)]),
        children=[
            TokenTree(token=Token([("id", 1), ("form", "a"), ("head", 2)]), children=[])
        ])
    self.assertTreeEqual(tokenlist.to_tree(), tree)
def test_remove(self):
    tokenlist = TokenList([{"id": 1}, {"id": 2}])
    tokenlist.remove(Token({"id": 1}))
    self.assertEqual(tokenlist, TokenList([{"id": 2}]))
    tokenlist.remove({"id": 2})
    self.assertEqual(tokenlist, TokenList())
def test_insert(self):
    tokenlist = TokenList()
    tokenlist.insert(0, Token({"id": 1}))
    self.assertEqual(tokenlist, TokenList([{"id": 1}]))
    tokenlist.insert(1, {"id": 2})
    self.assertEqual(tokenlist, TokenList([{"id": 1}, {"id": 2}]))
    self.assertEqual(type(tokenlist[1]), Token)
def test_append(self):
    tokenlist = TokenList()
    tokenlist.append(Token({"id": 1}))
    self.assertEqual(tokenlist, TokenList([{"id": 1}]))
    tokenlist.append({"id": 2})
    self.assertEqual(tokenlist, TokenList([{"id": 1}, {"id": 2}]))
    self.assertEqual(type(tokenlist[1]), Token)
def compute_mst(distance_matrix: torch.Tensor, tokens: TokenList, ignore_punct=True) -> TokenTree:
    """Grow a minimum spanning tree over the tokens with Prim's algorithm,
    using pairwise distances from distance_matrix (indexed by token id - 1)."""
    open_set = {id_wrap(x) for x in tokens.copy()}
    closed_set = set()

    if ignore_punct:
        open_set -= {x for x in open_set if is_punctuation(x.obj['form'])}

    treenodes = {}
    root = None
    while open_set:
        # Seed the tree with an arbitrary token as root.
        if not closed_set:
            token = open_set.pop().obj
            treenodes[token['id']] = root = TokenTree(token, [])
            closed_set.add(id_wrap(token))
            continue

        # Find the cheapest edge connecting a remaining token to the tree.
        grow_node_from = None
        grow_node_to = None
        grow_dist = np.inf
        for onode_wrap in open_set:
            onode = onode_wrap.obj
            for cnode_wrap in closed_set:
                cnode = cnode_wrap.obj
                dist = distance_matrix[onode['id'] - 1, cnode['id'] - 1]
                if dist < grow_dist:
                    grow_dist = dist
                    grow_node_from = cnode
                    grow_node_to = onode

        # Attach the new token under its closest node already in the tree.
        treenodes[grow_node_to['id']] = node = TokenTree(grow_node_to, [])
        treenodes[grow_node_from['id']].children.append(node)
        closed_set.add(id_wrap(grow_node_to))
        open_set.remove(id_wrap(grow_node_to))

    return root
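
# Usage sketch for compute_mst, not part of the module: builds an MST over a
# two-token sentence from a made-up symmetric distance matrix whose row/column
# i corresponds to token id i + 1. Assumes Token, TokenList, and the module's
# id_wrap helper are in scope.
def _example_compute_mst():
    import torch

    sentence = TokenList([
        Token([('id', 1), ('form', 'a')]),
        Token([('id', 2), ('form', 'dog')]),
    ])
    distances = torch.tensor([
        [0.0, 0.3],
        [0.3, 0.0],
    ])
    tree = compute_mst(distances, sentence, ignore_punct=False)
    print(tree.token['form'])  # form of whichever token was seeded as root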
def test_extend_tokenlist_no_metadata_with_list(self):
    tokenlist1 = TokenList([{"id": 1}, {"id": 2}, {"id": 3}])
    tokenlist2 = [{"id": 4}, {"id": 5}, {"id": 6}]
    tokenlist1.extend(tokenlist2)
    tokenlist3 = TokenList([
        {"id": 1}, {"id": 2}, {"id": 3},
        {"id": 4}, {"id": 5}, {"id": 6},
    ])
    self.assertEqual(tokenlist1, tokenlist3)
def parse_single(data, fields=None, field_parsers=None):
    '''Parse a file containing a single CoNLL-U sentence and return it as a
    one-element list of TokenLists.'''
    raw = data.read()
    return [
        TokenList(*parse_token_and_metadata(
            raw, fields=fields, field_parsers=field_parsers))
    ]
def test_metadata(self):
    data = dedent("""\
        # data = meta
        # meta = data
        1\tdog
    """)
    tokenlist = TokenList(*parse_token_and_metadata(data))
    self.assertEqual(serialize(tokenlist), data)
def parse(data, fields=None, field_parsers=None):
    '''Parse a file containing one or more CoNLL-U sentences and return a list
    of TokenLists, one per blank-line-separated sentence. For files known to
    contain a single sentence, parse_single above can be used instead.'''
    return [
        TokenList(*parse_token_and_metadata(
            sentence, fields=fields, field_parsers=field_parsers))
        for sentence in data.read().split("\n\n")
        if sentence
    ]
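
# Usage sketch for parse (the file name is hypothetical): both helpers expect
# an opened file and return TokenLists carrying tokens plus sentence metadata.
def _example_parse():
    with open("sentences.conllu", encoding="utf-8") as f:
        sentences = parse(f)
    for sentence in sentences:
        print(sentence.metadata, len(sentence))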
def test_and_filtering(self):
    tokenlist = TokenList([
        {"id": 1, "form": "a", "field": "x"},
        {"id": 2, "form": "dog", "field": "x"},
        {"id": 3, "form": "dog", "field": "y"},
    ])
    self.assertEqual(
        tokenlist.filter(field="x", id=2),
        TokenList([
            {"id": 2, "form": "dog", "field": "x"},
        ]))
    self.assertEqual(tokenlist.filter(field="x", id=3), TokenList([]))
def test_extend_tokenlist_and_merge_metadata(self):
    tokenlist4 = TokenList([{"id": 1}, {"id": 2}, {"id": 3}], {"meta1": "data1"})
    tokenlist5 = TokenList([{"id": 4}, {"id": 5}, {"id": 6}], {"meta2": "data2"})
    tokenlist4.extend(tokenlist5)
    tokenlist6 = TokenList(
        [{"id": 1}, {"id": 2}, {"id": 3}, {"id": 4}, {"id": 5}, {"id": 6}],
        {"meta1": "data1", "meta2": "data2"})
    self.assertEqual(tokenlist4, tokenlist6)
def parse_incr(in_file, fields=None, field_parsers=None, metadata_parsers=None):
    if not fields:
        fields = parse_conllu_plus_fields(in_file, metadata_parsers=metadata_parsers)

    for sentence in parse_sentences(in_file):
        yield TokenList(*parse_token_and_metadata(
            sentence,
            fields=fields,
            field_parsers=field_parsers,
            metadata_parsers=metadata_parsers))
def test_eq(self):
    metadata = {"meta": "data"}

    tokenlist1 = TokenList([{"id": 1}])
    tokenlist1.metadata = metadata

    tokenlist2 = TokenList([{"id": 1}])
    self.assertNotEqual(tokenlist1, tokenlist2)

    tokenlist2.metadata = metadata
    self.assertEqual(tokenlist1, tokenlist2)
def test_eq(self):
    self.assertEqual(TokenList([{"id": 1}]), TokenList([{"id": 1}]))
    self.assertNotEqual(
        TokenList([{"id": 1}], metadata={"meta": "data"}),
        TokenList([{"id": 1}]))
    self.assertEqual(
        TokenList([{"id": 1}], metadata={"meta": "data"}),
        TokenList([{"id": 1}], metadata={"meta": "data"}))
    self.assertEqual(TokenList([{"id": 1}]), [{"id": 1}])
def parse_incr(in_file, fields=None, field_parsers=None, metadata_parsers=None):
    if not hasattr(in_file, 'read'):
        raise FileNotFoundError("Invalid file, 'parse_incr' needs an opened file as input")

    if not fields:
        fields = parse_conllu_plus_fields(in_file, metadata_parsers=metadata_parsers)

    for sentence in parse_sentences(in_file):
        yield TokenList(*parse_token_and_metadata(
            sentence,
            fields=fields,
            field_parsers=field_parsers,
            metadata_parsers=metadata_parsers))
def test_lambda_deep_filtering(self):
    tokenlist = TokenList([
        Token({'id': (1, '-', 2), 'feats': None}),
        Token({'id': 1, 'feats': {'Case': 'Nom', 'Number': 'Sing'}}),
        Token({'id': 2, 'feats': {'Mood': 'Ind', 'Number': 'Sing'}}),
    ])
    self.assertEqual(
        tokenlist.filter(feats__Mood=lambda x: x == 'Ind'),
        TokenList([
            Token({'id': 2, 'feats': {'Mood': 'Ind', 'Number': 'Sing'}}),
        ]))
    self.assertEqual(
        tokenlist.filter(feats__Number=lambda x: x == 'Sing'),
        TokenList([
            Token({'id': 1, 'feats': {'Case': 'Nom', 'Number': 'Sing'}}),
            Token({'id': 2, 'feats': {'Mood': 'Ind', 'Number': 'Sing'}}),
        ]))
def test_basic_filtering(self):
    tokenlist = TokenList([
        {"id": 1, "form": "a", "field": "x"},
        {"id": 2, "form": "dog", "field": "x"},
    ])
    self.assertEqual(tokenlist.filter(id=0), TokenList([]))
    self.assertEqual(
        tokenlist.filter(id=1),
        TokenList([{"id": 1, "form": "a", "field": "x"}]))
    self.assertEqual(tokenlist.filter(), tokenlist)
    self.assertEqual(tokenlist.filter(field="x"), tokenlist)
def parse_incr(
    in_file: T.TextIO,
    fields: T.Optional[T.Sequence[str]] = None,
    field_parsers: T.Optional[T.Dict[str, _FieldParserType]] = None,
    metadata_parsers: T.Optional[T.Dict[str, _MetadataParserType]] = None
) -> T.Iterator[TokenList]:
    if not hasattr(in_file, 'read'):
        raise FileNotFoundError("Invalid file, 'parse_incr' needs an opened file as input")

    if not fields:
        fields = parse_conllu_plus_fields(in_file, metadata_parsers=metadata_parsers)

    for sentence in parse_sentences(in_file):
        yield TokenList(*parse_token_and_metadata(
            sentence,
            fields=fields,
            field_parsers=field_parsers,
            metadata_parsers=metadata_parsers))
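
# Usage sketch for parse_incr (the file name is hypothetical): it streams one
# TokenList at a time, so a large treebank never has to fit in memory at once.
def _example_parse_incr():
    with open("corpus.conllu", encoding="utf-8") as f:
        for tokenlist in parse_incr(f):
            print(tokenlist.metadata.get("sent_id"), len(tokenlist))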
def parse_token_and_metadata(
    data: str,
    fields: T.Optional[T.Sequence[str]] = None,
    field_parsers: T.Optional[T.Dict[str, _FieldParserType]] = None,
    metadata_parsers: T.Optional[T.Dict[str, _MetadataParserType]] = None
) -> TokenList:
    if not data:
        raise ParseException("Can't create TokenList, no data sent to constructor.")

    fields = fields or DEFAULT_FIELDS

    # Custom field parsers that don't cover every field fall back to the defaults.
    if not field_parsers:
        field_parsers = DEFAULT_FIELD_PARSERS.copy()
    elif sorted(field_parsers.keys()) != sorted(fields):
        new_field_parsers = DEFAULT_FIELD_PARSERS.copy()
        new_field_parsers.update(field_parsers)
        field_parsers = new_field_parsers

    tokens = []
    metadata = Metadata()

    for line in data.split('\n'):
        line = line.strip()

        if not line:
            continue

        # Comment lines carry sentence-level metadata; everything else is a token.
        if line.startswith('#'):
            pairs = parse_comment_line(line, metadata_parsers=metadata_parsers)
            for key, value in pairs:
                metadata[key] = value
        else:
            tokens.append(parse_line(line, fields, field_parsers))

    return TokenList(tokens, metadata, default_fields=fields)
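
# Usage sketch exercising the parser-merge logic above: override a single
# field parser and let the defaults fill in the rest. The upper-casing parser
# is purely illustrative; parsers receive the split line and the field's index.
def _example_custom_field_parser():
    sentence = parse_token_and_metadata(
        "# sent_id = 1\n1\tdog\n",
        fields=('id', 'form'),
        field_parsers={'form': lambda line, i: line[i].upper()})
    print(sentence[0]['form'])  # "DOG"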
def test_lambda_basic_filtering(self):
    tokenlist = TokenList([
        Token({'id': (1, '-', 2), 'form': "It's", 'lemma': '_', 'feats': None}),
        Token({'id': 1, 'form': 'It', 'lemma': 'it'}),
        Token({'id': 2, 'form': "'s", 'lemma': 'be'}),
    ])
    self.assertEqual(
        tokenlist.filter(id=lambda x: type(x) is int),
        TokenList([
            Token({'id': 1, 'form': 'It', 'lemma': 'it'}),
            Token({'id': 2, 'form': "'s", 'lemma': 'be'}),
        ])
    )
    self.assertEqual(
        tokenlist.filter(lemma=lambda x: x.startswith('b')),
        TokenList([
            Token({'id': 2, 'form': "'s", 'lemma': 'be'}),
        ])
    )