def test_from_dict(self):
    """D.from_dict() builds derivations equal to parsed UDF/UDX."""
    # plain UDF with a single lexical daughter
    udf = '(root (1 some-thing -1 -1 -1 ("a")))'
    data = {
        'entity': 'root',
        'daughters': [
            {'id': 1, 'entity': 'some-thing', 'form': 'a'}
        ]
    }
    assert D.from_dict(data) == D.from_string(udf)
    # UDX variant: head marker, lexical type, and token TFS info
    udf = (r'(root (1 ^some-thing@some-type -1 -1 -1 ("a b"'
           r' 2 "token [ +FORM \"a\" ]"'
           r' 3 "token [ +FORM \"b\" ]")))')
    data = {
        'entity': 'root',
        'daughters': [
            {'id': 1,
             'entity': 'some-thing',
             'type': 'some-type',
             'head': True,
             'form': 'a b',
             'tokens': [{'id': 2, 'tfs': r'token [ +FORM \"a\" ]'},
                        {'id': 3, 'tfs': r'token [ +FORM \"b\" ]'}]}
        ]
    }
    assert D.from_dict(data) == D.from_string(udf)
def test_str(self):
    """str() of a parsed derivation reproduces the original UDF string."""
    serialized = '(1 some-thing -1 -1 -1 ("token"))'
    assert str(D.from_string(serialized)) == serialized
    serialized = (
        r'(root (1 some-thing 0.4 0 5 (2 a-lex 0.8 0 1 '
        r'("a" 1 "token [ +FORM \"a\" ]")) '
        r'(3 bcd-lex 0.5 2 5 ("bcd" 2 "token [ +FORM \"bcd\" ]"))))')
    assert str(D.from_string(serialized)) == serialized
def test_eq(self):
    """Equality considers entities, tokens, spans, and daughter order,
    but ignores ids and scores."""
    lhs = D.from_string('(1 some-type -1 -1 -1 ("token"))')
    # identity
    rhs = D.from_string('(1 some-type -1 -1 -1 ("token"))')
    assert lhs == rhs
    # ids and scores don't matter
    rhs = D.from_string('(100 some-type 0.114 -1 -1 ("token"))')
    assert lhs == rhs
    # tokens matter
    rhs = D.from_string('(1 some-type -1 -1 -1 ("nekot"))')
    assert lhs != rhs
    # and type of rhs
    assert lhs != '(1 some-type -1 -1 -1 ("token"))'
    # and tokenization
    rhs = D.from_string('(1 some-type -1 2 7 ("token"))')
    assert lhs != rhs
    # and of course entities
    rhs = D.from_string('(1 epyt-emos -1 -1 -1 ("token"))')
    assert lhs != rhs
    # and number of children
    lhs = D.from_string('(1 x -1 -1 -1 (2 y -1 -1 -1 ("y")))')
    rhs = D.from_string(
        '(1 x -1 -1 -1 (2 y -1 -1 -1 ("y")) (3 z -1 -1 -1 ("z")))')
    assert lhs != rhs
    # and order of children
    lhs = D.from_string(
        '(1 x -1 -1 -1 (2 y -1 -1 -1 ("y")) (3 z -1 -1 -1 ("z")))')
    rhs = D.from_string(
        '(1 x -1 -1 -1 (3 z -1 -1 -1 ("z")) (2 y -1 -1 -1 ("y")))')
    assert lhs != rhs
def test_str(self):
    """str() round-trips a derivation back to its UDF string."""
    serialized = '(1 some-type -1 -1 -1 ("token"))'
    assert str(D.from_string(serialized)) == serialized
    serialized = (
        r'(root (1 some-type 0.4 0 5 (2 a-lex 0.8 0 1 '
        r'("a" 1 "token [ +FORM \"a\" ]")) '
        r'(3 bcd-lex 0.5 2 5 ("bcd" 2 "token [ +FORM \"bcd\" ]"))))')
    assert str(D.from_string(serialized)) == serialized
def test_to_udf(self):
    # to_udf() serializes back to UDF: indent=None gives one line,
    # indent=1 pretty-prints one node per line.
    # NOTE(review): whitespace inside the expected multi-line string
    # literals looks collapsed by reformatting — verify indent widths
    # against upstream before trusting exact values.
    s = '(1 some-type -1 -1 -1 ("token"))'
    assert D.from_string(s).to_udf(indent=None) == s
    assert D.from_string(s).to_udf(indent=1) == (
        '(1 some-type -1 -1 -1\n'
        ' ("token"))'
    )
    # nested derivation with one token TFS per terminal
    s = (r'(root (1 some-type 0.4 0 5 (2 a-lex 0.8 0 1 '
         r'("a" 3 "token [ +FORM \"a\" ]")) '
         r'(4 bcd-lex 0.5 2 5 ("bcd" 5 "token [ +FORM \"bcd\" ]"))))')
    assert D.from_string(s).to_udf(indent=1) == (
        '(root\n'
        ' (1 some-type 0.4 0 5\n'
        ' (2 a-lex 0.8 0 1\n'
        ' ("a"\n'
        ' 3 "token [ +FORM \\"a\\" ]"))\n'
        ' (4 bcd-lex 0.5 2 5\n'
        ' ("bcd"\n'
        ' 5 "token [ +FORM \\"bcd\\" ]"))))'
    )
    # multiple tokens under a single terminal
    s = (r'(root (1 some-type 0.4 0 5 (2 a-lex 0.8 0 1 '
         r'("a b" 3 "token [ +FORM \"a\" ]" 4 "token [ +FORM \"b\" ]"))))')
    assert D.from_string(s).to_udf(indent=1) == (
        '(root\n'
        ' (1 some-type 0.4 0 5\n'
        ' (2 a-lex 0.8 0 1\n'
        ' ("a b"\n'
        ' 3 "token [ +FORM \\"a\\" ]"\n'
        ' 4 "token [ +FORM \\"b\\" ]"))))'
    )
def test_from_dict(self):
    """D.from_dict() output equals the equivalent parsed UDF string."""
    udf = '(root (1 some-type -1 -1 -1 ("a")))'
    data = {
        'entity': 'root',
        'daughters': [
            {'id': 1, 'entity': 'some-type', 'form': 'a'}
        ]
    }
    assert D.from_dict(data) == D.from_string(udf)
    # terminal carrying token TFS info
    udf = (
        r'(root (1 some-type -1 -1 -1 ("a b"'
        r' 2 "token [ +FORM \"a\" ]"'
        r' 3 "token [ +FORM \"b\" ]")))'
    )
    data = {
        'entity': 'root',
        'daughters': [
            {'id': 1,
             'entity': 'some-type',
             'form': 'a b',
             'tokens': [{'id': 2, 'tfs': r'token [ +FORM \"a\" ]'},
                        {'id': 3, 'tfs': r'token [ +FORM \"b\" ]'}]}
        ]
    }
    assert D.from_dict(data) == D.from_string(udf)
def test_eq(self):
    """Derivation equality semantics: structure matters, ids/scores do not."""
    x = D.from_string('(1 some-type -1 -1 -1 ("token"))')
    # identity
    y = D.from_string('(1 some-type -1 -1 -1 ("token"))')
    assert x == y
    # ids and scores don't matter
    y = D.from_string('(100 some-type 0.114 -1 -1 ("token"))')
    assert x == y
    # tokens matter
    y = D.from_string('(1 some-type -1 -1 -1 ("nekot"))')
    assert x != y
    # and type of rhs
    assert x != '(1 some-type -1 -1 -1 ("token"))'
    # and tokenization
    y = D.from_string('(1 some-type -1 2 7 ("token"))')
    assert x != y
    # and of course entities
    y = D.from_string('(1 epyt-emos -1 -1 -1 ("token"))')
    assert x != y
    # and number of children
    x = D.from_string('(1 x -1 -1 -1 (2 y -1 -1 -1 ("y")))')
    y = D.from_string(
        '(1 x -1 -1 -1 (2 y -1 -1 -1 ("y")) (3 z -1 -1 -1 ("z")))')
    assert x != y
    # and order of children
    x = D.from_string(
        '(1 x -1 -1 -1 (2 y -1 -1 -1 ("y")) (3 z -1 -1 -1 ("z")))')
    y = D.from_string(
        '(1 x -1 -1 -1 (3 z -1 -1 -1 ("z")) (2 y -1 -1 -1 ("y")))')
    assert x != y
def test_lexical_type(self):
    """lexical_type() reports the UDX lexical type of a preterminal.

    NOTE: this returns None for standard UDF or non-preterminals.
    Fixed: compare to None with ``is`` (PEP 8 E711) instead of ``==``.
    """
    a = D.from_string('(root (1 a-type -1 -1 -1 ("a"))'
                      ' (2 b-type -1 -1 -1 ("b")))')
    # standard UDF carries no lexical-type info anywhere
    assert a.lexical_type() is None
    assert a.daughters[0].lexical_type() is None
    assert a.daughters[1].lexical_type() is None
    a = D.from_string('(root (1 a-type@a-type_le -1 -1 -1 ("a"))'
                      ' (2 b-type@b-type_le -1 -1 -1 ("b")))')
    # UDX: only the preterminal nodes report their lexical type
    assert a.lexical_type() is None
    assert a.daughters[0].lexical_type() == 'a-type_le'
    assert a.daughters[1].lexical_type() == 'b-type_le'
def derivation(self):
    """
    Deserialize and return a Derivation object for UDF- or
    JSON-formatted derivation data; otherwise return the original
    value (including None when absent).
    """
    data = self.get('derivation')
    if isinstance(data, dict):
        return Derivation.from_dict(data)
    if isinstance(data, stringtypes):
        return Derivation.from_string(data)
    return data
def test_basic_entity(self):
    """basic_entity() strips any @type suffix; works for UDF and UDX."""
    tree = D.from_string('(root (1 a-type -1 -1 -1 ("a"))'
                         ' (2 b-type -1 -1 -1 ("b")))')
    assert tree.basic_entity() == 'root'
    assert tree.daughters[0].basic_entity() == 'a-type'
    assert tree.daughters[1].basic_entity() == 'b-type'
    # UDX: entity keeps the full label; basic_entity() drops the @type
    tree = D.from_string('(root (1 a-type@a-type_le -1 -1 -1 ("a"))'
                         ' (2 b-type@b-type_le -1 -1 -1 ("b")))')
    assert tree.basic_entity() == 'root'
    assert tree.daughters[0].entity == 'a-type@a-type_le'
    assert tree.daughters[0].basic_entity() == 'a-type'
    assert tree.daughters[1].entity == 'b-type@b-type_le'
    assert tree.daughters[1].basic_entity() == 'b-type'
def test_terminals(self):
    """terminals() yields terminal nodes in surface order."""
    drv = D.from_string('(root (1 some-thing -1 -1 -1'
                        ' (2 a-thing -1 -1 -1 ("a"))'
                        ' (3 b-thing -1 -1 -1 ("b"))))')
    forms = [node.form for node in drv.terminals()]
    assert forms == ['a', 'b']
    # UDX derivation with multi-token terminals
    drv = D.from_string('(root'
                        ' (1 some-thing@some-type 0.4 0 5'
                        ' (2 a-lex@a-type 0.8 0 1'
                        ' ("a b"'
                        ' 3 "token [ +FORM \\"a\\" ]"'
                        ' 4 "token [ +FORM \\"b\\" ]"))'
                        ' (5 b-lex@b-type 0.9 1 2'
                        ' ("b"'
                        ' 6 "token [ +FORM \\"b\\" ]"))))')
    forms = [node.form for node in drv.terminals()]
    assert forms == ['a b', 'b']
def test_is_head(self):
    """is_head() reflects the UDX head marker ``^``.

    NOTE: is_head() is undefined for standard UDF without the
    head marker ^.
    Fixed: compare booleans with ``is`` (PEP 8 E712) instead of ``==``.
    """
    a = D.from_string('(root (1 some-type -1 -1 -1 ("a"))'
                      ' (2 ^some-type -1 -1 -1 ("b")))')
    assert a.daughters[0].is_head() is False
    assert a.daughters[1].is_head() is True
def test_entity(self):
    """The entity attribute gives the node label for UDF and UDX input."""
    drv = D.from_string('(root (1 some-thing -1 -1 -1'
                        ' (2 a-thing -1 -1 -1 ("a"))'
                        ' (3 b-thing -1 -1 -1 ("b"))))')
    assert drv.entity == 'root'
    child = drv.daughters[0]
    assert child.entity == 'some-thing'
    assert child.daughters[0].entity == 'a-thing'
    assert child.daughters[1].entity == 'b-thing'
    # UDX @types do not appear in the entity value
    drv = D.from_string('(root (1 some-thing@some-type -1 -1 -1'
                        ' (2 a-thing@a-type -1 -1 -1 ("a"))'
                        ' (3 b-thing@b-type -1 -1 -1 ("b"))))')
    assert drv.entity == 'root'
    child = drv.daughters[0]
    assert child.entity == 'some-thing'
    assert child.daughters[0].entity == 'a-thing'
    assert child.daughters[1].entity == 'b-thing'
def test_type(self):
    """The type attribute holds the UDX @-type, or None for plain UDF.

    Fixed: compare to None with ``is`` (PEP 8 E711) instead of ``==``.
    """
    a = D.from_string('(root (1 some-thing -1 -1 -1'
                      ' (2 a-thing -1 -1 -1 ("a"))'
                      ' (3 b-thing -1 -1 -1 ("b"))))')
    assert a.type is None
    node = a.daughters[0]
    assert node.type is None
    assert node.daughters[0].type is None
    assert node.daughters[1].type is None
    a = D.from_string('(root (1 some-thing@some-type -1 -1 -1'
                      ' (2 a-thing@a-type -1 -1 -1 ("a"))'
                      ' (3 b-thing@b-type -1 -1 -1 ("b"))))')
    # the root node never has a type, even in UDX
    assert a.type is None
    node = a.daughters[0]
    assert node.type == 'some-type'
    assert node.daughters[0].type == 'a-type'
    assert node.daughters[1].type == 'b-type'
def test_lexical_type(self):
    """lexical_type() gives the preterminal's lexical type (UDX only).

    NOTE: this returns None for standard UDF or non-preterminals,
    and calling it emits a DeprecationWarning.
    Fixed: compare to None with ``is`` (PEP 8 E711) instead of ``==``.
    """
    a = D.from_string('(root (1 some-thing -1 -1 -1'
                      ' (2 a-thing -1 -1 -1 ("a"))'
                      ' (3 b-thing -1 -1 -1 ("b"))))')
    with pytest.warns(DeprecationWarning):
        assert a.lexical_type() is None
    node = a.daughters[0]
    assert node.daughters[0].lexical_type() is None
    assert node.daughters[1].lexical_type() is None
    a = D.from_string('(root (1 some-thing -1 -1 -1'
                      ' (2 a-thing@a-type_le -1 -1 -1 ("a"))'
                      ' (3 b-thing@b-type_le -1 -1 -1 ("b"))))')
    with pytest.warns(DeprecationWarning):
        assert a.lexical_type() is None
    node = a.daughters[0]
    assert node.daughters[0].lexical_type() == 'a-type_le'
    assert node.daughters[1].lexical_type() == 'b-type_le'
def preprocess(inp, derivation):
    """
    Reconstruct a surface sentence from input text *inp* and a UDF
    derivation string.

    Forms for native lexical entries are recovered from the +FROM/+TO
    character offsets embedded in each token's TFS; generic entries
    fall back to the lemma taken from the trace path.  Punctuation is
    then re-attached based on substrings of the trace name.
    """
    derivation = Derivation.from_string(derivation)
    tokens = get_tokens(derivation)
    # NOTE(review): `traces` is computed but never used below
    traces = [i[1] for i in tokens]
    sent = []
    for token, trace in tokens:
        # third-from-last path component of the trace names the entry
        lemma = trace.split('/')[-3]
        #native entry
        if not lemma.startswith('generic'):
            # character span of this token in the original input text
            to = int(re.search(r'\+TO .*?\\"(\d+)\\"', token.tfs).group(1))
            fro = int(re.search(r'\+FROM .*?\\"(\d+)\\"', token.tfs).group(1))
            form = inp[fro:to]
        else:
            form = lemma
        #add punctuation
        # NOTE(review): these are plain substring tests applied in
        # order, so broader names trigger more than one branch (e.g.
        # 'comma-rp' also matches 'comma', 'qmark-bang' also matches
        # 'qmark') and the effects stack.  Also, 'dqright'/'dqleft'
        # attach a *single* quote — possibly a deliberate
        # normalization; confirm against the intended output.
        if 'comma' in trace:
            form = '%s,' % form
        if 'asterisk_' in trace:
            form = '%s*' % form
        if 'asterisk-pre' in trace:
            form = '*%s' % form
        if 'threedot' in trace:
            form = '%s...' % form
        if 'hyphen' in trace:
            form = '%s-' % form
        if 'sqright' in trace:
            form = '%s\'' % form
        if 'sqleft' in trace:
            form = '\'%s' % form
        if 'dqright' in trace:
            form = '%s\'' % form
        if 'dqleft' in trace:
            form = '\'%s' % form
        if 'rparen' in trace:
            form = '%s)' % form
        if 'lparen' in trace:
            form = '(%s' % form
        if 'comma-rp' in trace:
            form = '%s,)' % form
        if 'bang' in trace:
            form = '%s!' % form
        if 'qmark' in trace:
            form = '%s?' % form
        if 'qmark-bang' in trace:
            form = '%s?!' % form
        if 'period' in trace:
            form = '%s.' % form
        #fix compounds
        # keep only the part after the first hyphen (unless the form
        # ends with a hyphen, e.g. one just appended above)
        if '-' in form and form[-1] != '-':
            form = form.split('-')[1]
        sent.append(form)
    return ' '.join(sent)
def test_basic_entity(self):
    """basic_entity() drops any @type suffix; same result for UDF/UDX.

    The first call on each tree is checked for its DeprecationWarning.
    """
    drv = D.from_string('(root (1 some-thing -1 -1 -1'
                        ' (2 a-thing -1 -1 -1 ("a"))'
                        ' (3 b-thing -1 -1 -1 ("b"))))')
    with pytest.warns(DeprecationWarning):
        assert drv.basic_entity() == 'root'
    sub = drv.daughters[0]
    assert sub.daughters[0].basic_entity() == 'a-thing'
    assert sub.daughters[1].basic_entity() == 'b-thing'
    drv = D.from_string('(root (1 some-thing -1 -1 -1'
                        ' (2 a-thing@a-type_le -1 -1 -1 ("a"))'
                        ' (3 b-thing@b-type_le -1 -1 -1 ("b"))))')
    with pytest.warns(DeprecationWarning):
        assert drv.basic_entity() == 'root'
    sub = drv.daughters[0]
    assert sub.basic_entity() == 'some-thing'
    assert sub.daughters[0].basic_entity() == 'a-thing'
    assert sub.daughters[1].basic_entity() == 'b-thing'
def test_to_udx(self):
    # to_udx() serializes with UDX info preserved (@types, token TFSes);
    # indent=None gives one line, indent=1 pretty-prints.
    # NOTE(review): whitespace inside the expected multi-line string
    # literals looks collapsed by reformatting — verify indent widths
    # against upstream before trusting exact values.
    s = '(1 some-thing -1 -1 -1 ("token"))'
    assert D.from_string(s).to_udx(indent=None) == s
    s = (r'(root (1 some-thing@some-type 0.4 0 5 '
         r'(2 a-lex@a-type 0.8 0 1 '
         r'("a b" 3 "token [ +FORM \"a\" ]" 4 "token [ +FORM \"b\" ]")) '
         r'(5 b-lex@b-type 0.9 1 2 '
         r'("b" 6 "token [ +FORM \"b\" ]"))))')
    assert D.from_string(s).to_udx(indent=1) == (
        '(root\n'
        ' (1 some-thing@some-type 0.4 0 5\n'
        ' (2 a-lex@a-type 0.8 0 1\n'
        ' ("a b"\n'
        ' 3 "token [ +FORM \\"a\\" ]"\n'
        ' 4 "token [ +FORM \\"b\\" ]"))\n'
        ' (5 b-lex@b-type 0.9 1 2\n'
        ' ("b"\n'
        ' 6 "token [ +FORM \\"b\\" ]"))))'
    )
def test_is_head(self):
    """is_head() with and without an explicit ``^`` head marker.

    NOTE: is_head() is undefined for nodes with multiple
    siblings, none of which are marked head (e.g. in plain UDF).
    Fixed: identity comparisons for None/True/False (PEP 8 E711/E712).
    """
    a = D.from_string('(root (1 some-thing -1 -1 -1'
                      ' (2 some-thing -1 -1 -1 ("a"))'
                      ' (3 some-thing -1 -1 -1 ("b"))))')
    # only children are trivially heads; unmarked siblings give None
    assert a.is_head() is True
    node = a.daughters[0]
    assert node.is_head() is True
    assert node.daughters[0].is_head() is None
    assert node.daughters[1].is_head() is None
    # if one sibling is marked, all become decidable
    a = D.from_string('(root (1 some-thing -1 -1 -1'
                      ' (2 some-thing -1 -1 -1 ("a"))'
                      ' (3 ^some-thing -1 -1 -1 ("b"))))')
    assert a.is_head() is True
    node = a.daughters[0]
    assert node.is_head() is True
    assert node.daughters[0].is_head() is False
    assert node.daughters[1].is_head() is True
def test_to_udf(self):
    # to_udf() always emits plain UDF, stripping any UDX @type info;
    # indent=None gives one line, indent=1 pretty-prints.
    # NOTE(review): whitespace inside the expected multi-line string
    # literals looks collapsed by reformatting — verify indent widths
    # against upstream before trusting exact values.
    s = '(1 some-thing -1 -1 -1 ("token"))'
    assert D.from_string(s).to_udf(indent=None) == s
    assert D.from_string(s).to_udf(indent=1) == (
        '(1 some-thing -1 -1 -1\n'
        ' ("token"))'
    )
    s = (r'(root (1 some-thing 0.4 0 5 (2 a-lex 0.8 0 1 '
         r'("a" 3 "token [ +FORM \"a\" ]")) '
         r'(4 bcd-lex 0.5 2 5 ("bcd" 5 "token [ +FORM \"bcd\" ]"))))')
    assert D.from_string(s).to_udf(indent=1) == (
        '(root\n'
        ' (1 some-thing 0.4 0 5\n'
        ' (2 a-lex 0.8 0 1\n'
        ' ("a"\n'
        ' 3 "token [ +FORM \\"a\\" ]"))\n'
        ' (4 bcd-lex 0.5 2 5\n'
        ' ("bcd"\n'
        ' 5 "token [ +FORM \\"bcd\\" ]"))))'
    )
    # multiple tokens under one terminal
    s = (r'(root (1 some-thing 0.4 0 5 (2 a-lex 0.8 0 1 '
         r'("a b" 3 "token [ +FORM \"a\" ]" 4 "token [ +FORM \"b\" ]"))))')
    assert D.from_string(s).to_udf(indent=1) == (
        '(root\n'
        ' (1 some-thing 0.4 0 5\n'
        ' (2 a-lex 0.8 0 1\n'
        ' ("a b"\n'
        ' 3 "token [ +FORM \\"a\\" ]"\n'
        ' 4 "token [ +FORM \\"b\\" ]"))))'
    )
    # UDX input: @types are dropped in the UDF output
    s = (r'(root (1 some-thing@some-type 0.4 0 5 (2 a-lex@a-type 0.8 0 1 '
         r'("a b" 3 "token [ +FORM \"a\" ]" 4 "token [ +FORM \"b\" ]"))))')
    assert D.from_string(s).to_udf(indent=1) == (
        '(root\n'
        ' (1 some-thing 0.4 0 5\n'
        ' (2 a-lex 0.8 0 1\n'
        ' ("a b"\n'
        ' 3 "token [ +FORM \\"a\\" ]"\n'
        ' 4 "token [ +FORM \\"b\\" ]"))))'
    )
def test_to_dict(self):
    # to_dict() serializes a node to a plain dict; the `fields`
    # argument restricts which keys appear.
    s = '(1 some-type -1 -1 -1 ("token"))'
    assert D.from_string(s).to_dict() == {
        'id': 1,
        'entity': 'some-type',
        'score': -1.0,
        'start': -1,
        'end': -1,
        'form': 'token'
    }
    fields = ('id', 'entity', 'score')
    # daughters and form are always shown
    assert D.from_string(s).to_dict(fields=fields) == {
        'id': 1,
        'entity': 'some-type',
        'score': -1.0,
        'form': 'token'
    }
    # terminal with token TFS info
    s = (
        r'(1 a-lex -1 -1 -1 ("a b" 2 "token [ +FORM \"a\" ]"'
        r' 3 "token [ +FORM \"b\" ]"))'
    )
    assert D.from_string(s).to_dict() == {
        'id': 1,
        'entity': 'a-lex',
        'score': -1.0,
        'start': -1,
        'end': -1,
        'form': 'a b',
        'tokens': [
            {'id': 2, 'tfs': r'token [ +FORM \"a\" ]'},
            {'id': 3, 'tfs': r'token [ +FORM \"b\" ]'}
        ]
    }
    # tokens are omitted when not among the requested fields
    assert D.from_string(s).to_dict(fields=fields) == {
        'id': 1,
        'entity': 'a-lex',
        'score': -1.0,
        'form': 'a b'
    }
def parse_spans(span_lines, derivation_str):
    """
    Return (cfrom, cto) character spans for each terminal of a derivation.

    Args:
        span_lines: newline-separated lines of the form
            ``(<id>, <n>, <n>, <<from>:<to>> ...``; lines that do not
            match are skipped.
        derivation_str: a UDF derivation string.

    Returns:
        A list with one (start, end) pair per terminal, spanning from
        the first token's start to the last token's end.
    """
    regex = re.compile(r"\((\d+), \d+, \d+, <(\d+):(\d+)>")
    c_spans = {}
    for line in span_lines.split("\n"):
        m = regex.search(line)
        if not m:
            continue
        key, start, end = m.groups()
        c_spans[int(key)] = (int(start), int(end))
    derivation = Derivation.from_string(derivation_str)  # type: Derivation
    # a terminal's span runs from its first token to its last token
    return [(c_spans[i.tokens[0].id][0], c_spans[i.tokens[-1].id][1])
            for i in derivation.terminals()]
def prof_entries(prof, typemap, lexmap, table='result', cols=('derivation', 'mrs')):
    """
    Yield lexical-entry tuples from a [incr tsdb()] profile.

    For each derivation in *table*, every lexical entry found by
    _derivation_les() whose type is in *typemap* yields one tuple
    ``(lename, supertype, orth, pred, None)``; (type, orth) pairs are
    deduplicated across the whole profile.

    Args:
        prof: path to the profile directory.
        typemap: maps a lexical type to a list of supertypes.
        lexmap: fallback mapping from entity name to lexical type.
        table: profile table to read (default: 'result').
        cols: columns to select (default: ('derivation', 'mrs')).
    """
    p = itsdb.ItsdbProfile(prof)
    seen = set()
    for derivation, mrs in p.select(table, cols):
        d = Derivation.from_string(derivation)
        for entity, typ, form in _derivation_les(d):
            if typ is None:
                # UDF without @types: fall back to the lexical map
                typ = lexmap.get(entity)
            orth = ', '.join('"{}"'.format(part) for part in form)
            if (typ, orth) not in seen and typ in typemap:
                supertype = typemap[typ][0]  # more than 1?
                lename = '+'.join(form) + '-' + supertype
                pred = None
                # removed a stray debug print() that duplicated the
                # yielded tuple on stdout
                yield (lename, supertype, orth, pred, None)
                seen.add((typ, orth))
def test_fromstring(self):
    # from_string() rejects malformed UDF and parses well-formed input
    # into node/terminal/token objects (T and Tk are test helpers).
    # NOTE(review): the "newlines in tree" inputs appear to have had
    # their internal newlines collapsed by reformatting.
    with pytest.raises(ValueError):
        D.from_string('')
    # root with no children
    with pytest.raises(ValueError):
        D.from_string('(some-root)')
    # does not start with `(` or end with `)`
    with pytest.raises(ValueError):
        D.from_string(' (1 some-type -1 -1 -1 ("token"))')
    with pytest.raises(ValueError):
        D.from_string(' (1 some-type -1 -1 -1 ("token")) ')
    # uneven parens
    with pytest.raises(ValueError):
        D.from_string('(1 some-type -1 -1 -1 ("token")')
    # ok
    t = D.from_string('(1 some-type -1 -1 -1 ("token"))')
    assert t.id == 1
    assert t.entity == 'some-type'
    assert t.score == -1.0
    assert t.start == -1
    assert t.end == -1
    assert t.daughters == [T('token')]
    # newlines in tree
    t = D.from_string('''(1 some-type -1 -1 -1 ("token"))''')
    assert t.id == 1
    assert t.entity == 'some-type'
    assert t.score == -1.0
    assert t.start == -1
    assert t.end == -1
    assert t.daughters == [T('token')]
    # LKB-style terminals
    t = D.from_string('''(1 some-type -1 -1 -1 ("to ken" 1 2))''')
    assert t.id == 1
    assert t.entity == 'some-type'
    assert t.score == -1.0
    assert t.start == -1
    assert t.end == -1
    assert t.daughters == [T('to ken')]  # start/end ignored
    # TFS-style terminals
    t = D.from_string(r'''(1 some-type -1 -1 -1 ("to ken" 2 "token [ +FORM \"to\" ]" 3 "token [ +FORM \"ken\" ]"))''')
    assert t.id == 1
    assert t.entity == 'some-type'
    assert t.score == -1.0
    assert t.start == -1
    assert t.end == -1
    assert t.daughters == [
        T('to ken', [Tk(2, r'token [ +FORM \"to\" ]'),
                     Tk(3, r'token [ +FORM \"ken\" ]')])
    ]
    # longer example
    t = D.from_string(r'''(root (1 some-type 0.4 0 5 (2 a-lex 0.8 0 1 ("a" 1 "token [ +FORM \"a\" ]")) (3 bcd-lex 0.5 2 5 ("bcd" 2 "token [ +FORM \"bcd\" ]"))) )''')
    assert t.entity == 'root'
    assert len(t.daughters) == 1
    top = t.daughters[0]
    assert top.id == 1
    assert top.entity == 'some-type'
    assert top.score == 0.4
    assert top.start == 0
    assert top.end == 5
    assert len(top.daughters) == 2
    lex = top.daughters[0]
    assert lex.id == 2
    assert lex.entity == 'a-lex'
    assert lex.score == 0.8
    assert lex.start == 0
    assert lex.end == 1
    assert lex.daughters == [T('a', [Tk(1, r'token [ +FORM \"a\" ]')])]
    lex = top.daughters[1]
    assert lex.id == 3
    assert lex.entity == 'bcd-lex'
    assert lex.score == 0.5
    assert lex.start == 2
    assert lex.end == 5
    assert lex.daughters == [T('bcd', [Tk(2, r'token [ +FORM \"bcd\" ]')])]
def test_to_dict(self):
    # to_dict() includes UDX info (type, head) and token TFSes;
    # `fields` limits the keys (daughters and form always appear).
    s = '(1 some-thing -1 -1 -1 ("token"))'
    assert D.from_string(s).to_dict() == {
        'id': 1,
        'entity': 'some-thing',
        'score': -1.0,
        'start': -1,
        'end': -1,
        'form': 'token'
    }
    fields = ('id', 'entity', 'score')
    # daughters and form are always shown
    assert D.from_string(s).to_dict(fields=fields) == {
        'id': 1,
        'entity': 'some-thing',
        'score': -1.0,
        'form': 'token'
    }
    # UDX derivation: entities carry @types and one daughter is ^head
    s = (r'(root (0 top@top-rule -1 -1 -1'
         r' (1 a-lex@a-type -1 -1 -1 ("a b" 2 "token [ +FORM \"a\" ]"'
         r' 3 "token [ +FORM \"b\" ]"))'
         r' (4 ^c-lex@c-type -1 -1 -1 ("c" 5 "token [ +FORM \"c\" ]"))))')
    assert D.from_string(s).to_dict() == {
        'entity': 'root',
        'daughters': [
            {
                'id': 0,
                'entity': 'top',
                'type': 'top-rule',
                'score': -1.0,
                'start': -1,
                'end': -1,
                'daughters': [
                    {
                        'id': 1,
                        'entity': 'a-lex',
                        'type': 'a-type',
                        'score': -1.0,
                        'start': -1,
                        'end': -1,
                        'form': 'a b',
                        'tokens': [
                            {'id': 2, 'tfs': r'token [ +FORM \"a\" ]'},
                            {'id': 3, 'tfs': r'token [ +FORM \"b\" ]'}
                        ]
                    },
                    {
                        'id': 4,
                        'entity': 'c-lex',
                        'type': 'c-type',
                        'head': True,
                        'score': -1.0,
                        'start': -1,
                        'end': -1,
                        'form': 'c',
                        'tokens': [
                            {'id': 5, 'tfs': r'token [ +FORM \"c\" ]'}
                        ]
                    }
                ]
            }
        ]
    }
    # restricted fields drop type/head/start/end/tokens
    assert D.from_string(s).to_dict(fields=fields) == {
        'entity': 'root',
        'daughters': [
            {
                'id': 0,
                'entity': 'top',
                'score': -1.0,
                'daughters': [
                    {
                        'id': 1,
                        'entity': 'a-lex',
                        'score': -1.0,
                        'form': 'a b'
                    },
                    {
                        'id': 4,
                        'entity': 'c-lex',
                        'score': -1.0,
                        'form': 'c'
                    }
                ]
            }
        ]
    }
def test_is_root(self):
    """is_root() is True only for the special root node, not daughters.

    Fixed: compare booleans with ``is`` (PEP 8 E712) instead of ``==``.
    """
    a = D.from_string('(1 some-thing -1 -1 -1 ("token"))')
    assert a.is_root() is False
    a = D.from_string('(root (1 some-thing -1 -1 -1 ("token")))')
    assert a.is_root() is True
    assert a.daughters[0].is_root() is False
def test_fromstring(self):
    # from_string() validation and parsing; in this version terminals
    # are kept as raw tuples of strings rather than objects.
    # NOTE(review): the "newlines in tree" input appears to have had
    # its internal newlines collapsed by reformatting.
    with pytest.raises(ValueError):
        D.from_string('')
    # root with no children
    with pytest.raises(ValueError):
        D.from_string('(some-root)')
    # does not start with `(` or end with `)`
    with pytest.raises(ValueError):
        D.from_string(' (1 some-type -1 -1 -1 ("token"))')
    with pytest.raises(ValueError):
        D.from_string(' (1 some-type -1 -1 -1 ("token")) ')
    # uneven parens
    with pytest.raises(ValueError):
        D.from_string('(1 some-type -1 -1 -1 ("token")')
    # ok
    t = D.from_string('(1 some-type -1 -1 -1 ("token"))')
    assert t.id == 1
    assert t.entity == 'some-type'
    assert t.score == -1.0
    assert t.start == -1
    assert t.end == -1
    assert t.daughters == [('"token"',)]
    # newlines in tree
    t = D.from_string('''(1 some-type -1 -1 -1 ("token"))''')
    assert t.id == 1
    assert t.entity == 'some-type'
    assert t.score == -1.0
    assert t.start == -1
    assert t.end == -1
    assert t.daughters == [('"token"',)]
    # longer example
    t = D.from_string(r'''(root (1 some-type 0.4 0 5 (2 a-lex 0.8 0 1 ("a" 1 "token [ +FORM \"a\" ]")) (3 bcd-lex 0.5 2 5 ("bcd" 2 "token [ +FORM \"bcd\" ]"))) )''')
    assert t.entity == 'root'
    assert len(t.daughters) == 1
    top = t.daughters[0]
    assert top.id == 1
    assert top.entity == 'some-type'
    assert top.score == 0.4
    assert top.start == 0
    assert top.end == 5
    assert len(top.daughters) == 2
    lex = top.daughters[0]
    assert lex.id == 2
    assert lex.entity == 'a-lex'
    assert lex.score == 0.8
    assert lex.start == 0
    assert lex.end == 1
    assert lex.daughters == [('"a"', '1', r'"token [ +FORM \"a\" ]"')]
    lex = top.daughters[1]
    assert lex.id == 3
    assert lex.entity == 'bcd-lex'
    assert lex.score == 0.5
    assert lex.start == 2
    assert lex.end == 5
    assert lex.daughters == [('"bcd"', '2', r'"token [ +FORM \"bcd\" ]"')]
def test_fromstring(self):
    # from_string() validation and parsing; terminal daughters are raw
    # string tuples in this version.
    # NOTE(review): the "newlines in tree" input appears to have had
    # its internal newlines collapsed by reformatting.
    with pytest.raises(ValueError):
        D.from_string('')
    # root with no children
    with pytest.raises(ValueError):
        D.from_string('(some-root)')
    # does not start with `(` or end with `)`
    with pytest.raises(ValueError):
        D.from_string(' (1 some-type -1 -1 -1 ("token"))')
    with pytest.raises(ValueError):
        D.from_string(' (1 some-type -1 -1 -1 ("token")) ')
    # uneven parens
    with pytest.raises(ValueError):
        D.from_string('(1 some-type -1 -1 -1 ("token")')
    # ok
    t = D.from_string('(1 some-type -1 -1 -1 ("token"))')
    assert t.id == 1
    assert t.entity == 'some-type'
    assert t.score == -1.0
    assert t.start == -1
    assert t.end == -1
    assert t.daughters == [('"token"', )]
    # newlines in tree
    t = D.from_string('''(1 some-type -1 -1 -1 ("token"))''')
    assert t.id == 1
    assert t.entity == 'some-type'
    assert t.score == -1.0
    assert t.start == -1
    assert t.end == -1
    assert t.daughters == [('"token"', )]
    # longer example
    t = D.from_string(r'''(root (1 some-type 0.4 0 5 (2 a-lex 0.8 0 1 ("a" 1 "token [ +FORM \"a\" ]")) (3 bcd-lex 0.5 2 5 ("bcd" 2 "token [ +FORM \"bcd\" ]"))) )''')
    assert t.entity == 'root'
    assert len(t.daughters) == 1
    top = t.daughters[0]
    assert top.id == 1
    assert top.entity == 'some-type'
    assert top.score == 0.4
    assert top.start == 0
    assert top.end == 5
    assert len(top.daughters) == 2
    lex = top.daughters[0]
    assert lex.id == 2
    assert lex.entity == 'a-lex'
    assert lex.score == 0.8
    assert lex.start == 0
    assert lex.end == 1
    assert lex.daughters == [('"a"', '1', r'"token [ +FORM \"a\" ]"')]
    lex = top.daughters[1]
    assert lex.id == 3
    assert lex.entity == 'bcd-lex'
    assert lex.score == 0.5
    assert lex.start == 2
    assert lex.end == 5
    assert lex.daughters == [('"bcd"', '2', r'"token [ +FORM \"bcd\" ]"')]
def test_is_root(self):
    """is_root() is True only for the special root node, not daughters.

    Fixed: compare booleans with ``is`` (PEP 8 E712) instead of ``==``.
    """
    a = D.from_string('(1 some-type -1 -1 -1 ("token"))')
    assert a.is_root() is False
    a = D.from_string('(root (1 some-type -1 -1 -1 ("token")))')
    assert a.is_root() is True
    assert a.daughters[0].is_root() is False
def test_ParseResult():
    # ParseResult lazily deserializes mrs/dmrs/eds/derivation values,
    # accepting either serialized strings or dict representations;
    # raw values remain accessible via item access.
    r = ParseResult()
    assert len(r) == 0
    assert r.mrs() is None
    assert r.dmrs() is None
    assert r.eds() is None
    assert r.derivation() is None
    mrs_s = '[ TOP: h0 RELS: < ["_rain_v_1_rel" LBL: h1 ARG0: e2 ] > HCONS: < h0 qeq h1 > ]'
    mrs_d = {
        'top': 'h0',
        'relations': [
            {
                'predicate': '_rain_v_1',
                'label': 'h1',
                'arguments': {'ARG0': 'e2'}
            }
        ],
        'constraints': [
            {'relation': 'qeq', 'high': 'h0', 'low': 'h1'}
        ]
    }
    mrs = simplemrs.loads_one(mrs_s)
    r = ParseResult(mrs=mrs_s)
    assert len(r) == 1
    assert r['mrs'] == mrs_s
    assert r.mrs() == mrs
    r = ParseResult(mrs=mrs_d)
    assert len(r) == 1
    assert r['mrs'] == mrs_d
    assert r.mrs() == mrs
    r = ParseResult(mrs=mrs_d)
    assert len(r) == 1
    assert r['mrs'] == mrs_d
    assert r.mrs() == mrs
    # r = ParseResult(mrs='nonsense')
    # assert r['mrs'] == 'nonsense'
    # with pytest.raises(XmrsDeserializationError):
    #     r.mrs()
    dmrs_d = {
        'nodes': [
            {'nodeid': 10000,
             'predicate': '_rain_v_1',
             'sortinfo': {'cvarsort': 'e'}}
        ],
        'links': [
            {'from': 0, 'to': 10000, 'rargname': None, 'post': 'H'}
        ]
    }
    dmrs = Dmrs.from_dict(dmrs_d)
    r = ParseResult(dmrs=dmrs_d)
    assert len(r) == 1
    assert r['dmrs'] == dmrs_d
    assert r.dmrs() == dmrs
    # r = ParseResult(dmrs='nonsense')
    # assert len(r) == 1
    # assert r['dmrs'] == 'nonsense'
    # with pytest.raises(XmrsDeserializationError):
    #     r.dmrs()
    eds_d = {
        'top': 'e2',
        'nodes': {
            'e2': {
                'label': '_rain_v_1',
                'lnk': {'from': 3, 'to': 9},
                'edges': {}
            }
        }
    }
    eds_s = '{e2: e2:_rain_v_1<3:9>[]}'
    eds = Eds.from_dict(eds_d)
    r = ParseResult(eds=eds_s)
    assert len(r) == 1
    assert r['eds'] == eds_s
    assert r.eds() == eds
    r = ParseResult(eds=eds_d)
    assert len(r) == 1
    assert r['eds'] == eds_d
    assert r.eds() == eds
    # r = ParseResult(eds='nonsense')
    # assert len(r) == 1
    # assert r['eds'] == 'nonsense'
    # with pytest.raises(XmrsDeserializationError):
    #     r.eds()
    # several changes were made to the below for compatibility:
    # - removed head annotation (on W_PERIOD_PLR)
    # - removed type info
    # - removed from/to info
    # - added start/end
    # - escaped quotes
    # - capitalized entity names
    deriv_s = '(189 SB-HD_MC_C 0.228699 0 2 (37 it 0.401245 0 1 ("it" 34 "token [ +FORM \\"it\\" +FROM #1=\\"0\\" +TO \\"2\\" ]")) (188 W_PERIOD_PLR -0.113641 1 2 (187 V_PST_OLR 0 1 2 (56 rain_v1 0 1 2 ("rained." 32 "token [ +FORM \\"rained.\\" +FROM #1=\\"3\\" +TO \\"10\\" ]")))))'
    deriv_d = {
        "id": 189, "entity": "SB-HD_MC_C", "label": "S",
        "score": 0.228699, "start": 0, "end": 2,
        "daughters": [  # , "type": "subjh_mc_rule"
            {"id": 37, "entity": "it", "score": 0.401245,
             "start": 0, "end": 1, "form": "it",
             "tokens": [  # , "type": "n_-_pr-it-x_le" , "from": 0, "to": 2
                 {"id": 34,
                  "tfs": "token [ +FORM \\\"it\\\" +FROM #1=\\\"0\\\" +TO \\\"2\\\" ]"}]},
            # , "from": 0, "to": 2
            {"id": 188, "entity": "W_PERIOD_PLR", "score": -0.113641,
             "start": 1, "end": 2,
             "daughters": [  # , "type": "punctuation_period_rule"
                 {"id": 187, "entity": "V_PST_OLR", "score": 0,
                  "start": 1, "end": 2,
                  "daughters": [  # , "type": "v_pst_inflrule"
                      {"id": 56, "entity": "rain_v1", "score": 0,
                       "start": 1, "end": 2, "form": "rained.",
                       "tokens": [  # , "type": "v_-_it_le", "from": 3, "to": 10
                           {"id": 32,
                            "tfs": "token [ +FORM \\\"rained.\\\" +FROM #1=\\\"3\\\" +TO \\\"10\\\" ]"}]}]}]}]
        # , "from": 3, "to": 10
    }
    deriv = Derivation.from_dict(deriv_d)
    r = ParseResult(derivation=deriv_s)
    assert len(r) == 1
    assert r['derivation'] == deriv_s
    assert r.derivation() == deriv
    r = ParseResult(derivation=deriv_d)
    assert len(r) == 1
    assert r['derivation'] == deriv_d
    assert r.derivation() == deriv
def test_ParseResult():
    # ParseResult deserializes mrs/dmrs/eds/derivation on demand from
    # either a string or a dict; raw values stay available by key.
    r = ParseResult()
    assert len(r) == 0
    assert r.mrs() is None
    assert r.dmrs() is None
    assert r.eds() is None
    assert r.derivation() is None
    mrs_s = '[ TOP: h0 RELS: < ["_rain_v_1_rel" LBL: h1 ARG0: e2 ] > HCONS: < h0 qeq h1 > ]'
    mrs_d = {
        'top': 'h0',
        'relations': [{
            'predicate': '_rain_v_1',
            'label': 'h1',
            'arguments': {
                'ARG0': 'e2'
            }
        }],
        'constraints': [{
            'relation': 'qeq',
            'high': 'h0',
            'low': 'h1'
        }]
    }
    mrs = simplemrs.loads_one(mrs_s)
    r = ParseResult(mrs=mrs_s)
    assert len(r) == 1
    assert r['mrs'] == mrs_s
    assert r.mrs() == mrs
    r = ParseResult(mrs=mrs_d)
    assert len(r) == 1
    assert r['mrs'] == mrs_d
    assert r.mrs() == mrs
    r = ParseResult(mrs=mrs_d)
    assert len(r) == 1
    assert r['mrs'] == mrs_d
    assert r.mrs() == mrs
    # r = ParseResult(mrs='nonsense')
    # assert r['mrs'] == 'nonsense'
    # with pytest.raises(XmrsDeserializationError):
    #     r.mrs()
    dmrs_d = {
        'nodes': [{
            'nodeid': 10000,
            'predicate': '_rain_v_1',
            'sortinfo': {
                'cvarsort': 'e'
            }
        }],
        'links': [{
            'from': 0,
            'to': 10000,
            'rargname': None,
            'post': 'H'
        }]
    }
    dmrs = Dmrs.from_dict(dmrs_d)
    r = ParseResult(dmrs=dmrs_d)
    assert len(r) == 1
    assert r['dmrs'] == dmrs_d
    assert r.dmrs() == dmrs
    # r = ParseResult(dmrs='nonsense')
    # assert len(r) == 1
    # assert r['dmrs'] == 'nonsense'
    # with pytest.raises(XmrsDeserializationError):
    #     r.dmrs()
    eds_d = {
        'top': 'e2',
        'nodes': {
            'e2': {
                'label': '_rain_v_1',
                'lnk': {
                    'from': 3,
                    'to': 9
                },
                'edges': {}
            }
        }
    }
    eds_s = '{e2: e2:_rain_v_1<3:9>[]}'
    eds = Eds.from_dict(eds_d)
    r = ParseResult(eds=eds_s)
    assert len(r) == 1
    assert r['eds'] == eds_s
    assert r.eds() == eds
    r = ParseResult(eds=eds_d)
    assert len(r) == 1
    assert r['eds'] == eds_d
    assert r.eds() == eds
    # r = ParseResult(eds='nonsense')
    # assert len(r) == 1
    # assert r['eds'] == 'nonsense'
    # with pytest.raises(XmrsDeserializationError):
    #     r.eds()
    # several changes were made to the below for compatibility:
    # - removed head annotation (on W_PERIOD_PLR)
    # - removed type info
    # - removed from/to info
    # - added start/end
    # - escaped quotes
    # - capitalized entity names
    deriv_s = '(189 SB-HD_MC_C 0.228699 0 2 (37 it 0.401245 0 1 ("it" 34 "token [ +FORM \\"it\\" +FROM #1=\\"0\\" +TO \\"2\\" ]")) (188 W_PERIOD_PLR -0.113641 1 2 (187 V_PST_OLR 0 1 2 (56 rain_v1 0 1 2 ("rained." 32 "token [ +FORM \\"rained.\\" +FROM #1=\\"3\\" +TO \\"10\\" ]")))))'
    deriv_d = {
        "id": 189,
        "entity": "SB-HD_MC_C",
        "label": "S",
        "score": 0.228699,
        "start": 0,
        "end": 2,
        "daughters": [  # , "type": "subjh_mc_rule"
            {
                "id": 37,
                "entity": "it",
                "score": 0.401245,
                "start": 0,
                "end": 1,
                "form": "it",
                "tokens": [  # , "type": "n_-_pr-it-x_le" , "from": 0, "to": 2
                    {
                        "id": 34,
                        "tfs": "token [ +FORM \\\"it\\\" +FROM #1=\\\"0\\\" +TO \\\"2\\\" ]"
                    }
                ]
            },
            # , "from": 0, "to": 2
            {
                "id": 188,
                "entity": "W_PERIOD_PLR",
                "score": -0.113641,
                "start": 1,
                "end": 2,
                "daughters": [  # , "type": "punctuation_period_rule"
                    {
                        "id": 187,
                        "entity": "V_PST_OLR",
                        "score": 0,
                        "start": 1,
                        "end": 2,
                        "daughters": [  # , "type": "v_pst_inflrule"
                            {
                                "id": 56,
                                "entity": "rain_v1",
                                "score": 0,
                                "start": 1,
                                "end": 2,
                                "form": "rained.",
                                "tokens": [  # , "type": "v_-_it_le", "from": 3, "to": 10
                                    {
                                        "id": 32,
                                        "tfs": "token [ +FORM \\\"rained.\\\" +FROM #1=\\\"3\\\" +TO \\\"10\\\" ]"
                                    }
                                ]
                            }
                        ]
                    }
                ]
            }
        ]
        # , "from": 3, "to": 10
    }
    deriv = Derivation.from_dict(deriv_d)
    r = ParseResult(derivation=deriv_s)
    assert len(r) == 1
    assert r['derivation'] == deriv_s
    assert r.derivation() == deriv
    r = ParseResult(derivation=deriv_d)
    assert len(r) == 1
    assert r['derivation'] == deriv_d
    assert r.derivation() == deriv
def test_fromstring(self):
    # from_string() rejects malformed UDF and parses well-formed input
    # into node/terminal/token objects (T and Tk are test helpers).
    # NOTE(review): the "newlines in tree" inputs appear to have had
    # their internal newlines collapsed by reformatting.
    with pytest.raises(ValueError):
        D.from_string('')
    # root with no children
    with pytest.raises(ValueError):
        D.from_string('(some-root)')
    # does not start with `(` or end with `)`
    with pytest.raises(ValueError):
        D.from_string(' (1 some-thing -1 -1 -1 ("token"))')
    with pytest.raises(ValueError):
        D.from_string(' (1 some-thing -1 -1 -1 ("token")) ')
    # uneven parens
    with pytest.raises(ValueError):
        D.from_string('(1 some-thing -1 -1 -1 ("token")')
    # ok
    t = D.from_string('(1 some-thing -1 -1 -1 ("token"))')
    assert t.id == 1
    assert t.entity == 'some-thing'
    assert t.score == -1.0
    assert t.start == -1
    assert t.end == -1
    assert t.daughters == [T('token')]
    # newlines in tree
    t = D.from_string('''(1 some-thing -1 -1 -1 ("token"))''')
    assert t.id == 1
    assert t.entity == 'some-thing'
    assert t.score == -1.0
    assert t.start == -1
    assert t.end == -1
    assert t.daughters == [T('token')]
    # LKB-style terminals
    t = D.from_string('''(1 some-thing -1 -1 -1 ("to ken" 1 2))''')
    assert t.id == 1
    assert t.entity == 'some-thing'
    assert t.score == -1.0
    assert t.start == -1
    assert t.end == -1
    assert t.daughters == [T('to ken')]  # start/end ignored
    # TFS-style terminals
    t = D.from_string(r'''(1 some-thing -1 -1 -1 ("to ken" 2 "token [ +FORM \"to\" ]" 3 "token [ +FORM \"ken\" ]"))''')
    assert t.id == 1
    assert t.entity == 'some-thing'
    assert t.score == -1.0
    assert t.start == -1
    assert t.end == -1
    assert t.daughters == [
        T('to ken', [Tk(2, r'token [ +FORM \"to\" ]'),
                     Tk(3, r'token [ +FORM \"ken\" ]')])
    ]
    # longer example
    t = D.from_string(r'''(root (1 some-thing 0.4 0 5 (2 a-lex 0.8 0 1 ("a" 1 "token [ +FORM \"a\" ]")) (3 bcd-lex 0.5 2 5 ("bcd" 2 "token [ +FORM \"bcd\" ]"))) )''')
    assert t.entity == 'root'
    assert len(t.daughters) == 1
    top = t.daughters[0]
    assert top.id == 1
    assert top.entity == 'some-thing'
    assert top.score == 0.4
    assert top.start == 0
    assert top.end == 5
    assert len(top.daughters) == 2
    lex = top.daughters[0]
    assert lex.id == 2
    assert lex.entity == 'a-lex'
    assert lex.score == 0.8
    assert lex.start == 0
    assert lex.end == 1
    assert lex.daughters == [T('a', [Tk(1, r'token [ +FORM \"a\" ]')])]
    lex = top.daughters[1]
    assert lex.id == 3
    assert lex.entity == 'bcd-lex'
    assert lex.score == 0.5
    assert lex.start == 2
    assert lex.end == 5
    assert lex.daughters == [T('bcd', [Tk(2, r'token [ +FORM \"bcd\" ]')])]