Example #1
0
 def test_eq(self):
     """Equality ignores ids and scores; everything else must match."""
     parse = D.from_string
     plain = '(1 some-type -1 -1 -1 ("token"))'
     base = parse(plain)
     # a second parse of the same source is equal
     assert base == parse(plain)
     # differing ids and scores are ignored
     assert base == parse('(100 some-type 0.114 -1 -1 ("token"))')
     # a different terminal form is a difference
     assert base != parse('(1 some-type -1 -1 -1 ("nekot"))')
     # the source string itself is not a derivation
     assert base != plain
     # start/end positions take part in the comparison
     assert base != parse('(1 some-type -1 2 7 ("token"))')
     # so does the entity name
     assert base != parse('(1 epyt-emos -1 -1 -1 ("token"))')
     # a node with one daughter differs from a node with two
     single = parse('(1 x -1 -1 -1 (2 y -1 -1 -1 ("y")))')
     pair = parse(
         '(1 x -1 -1 -1 (2 y -1 -1 -1 ("y")) (3 z -1 -1 -1 ("z")))')
     assert single != pair
     # the same daughters in another order differ as well
     swapped = parse(
         '(1 x -1 -1 -1 (3 z -1 -1 -1 ("z")) (2 y -1 -1 -1 ("y")))')
     assert pair != swapped
Example #2
0
 def test_str(self):
     """str() of a parsed derivation reproduces the source text."""
     flat = '(1 some-type -1 -1 -1 ("token"))'
     nested = (r'(root (1 some-type 0.4 0 5 (2 a-lex 0.8 0 1 '
               r'("a" 1 "token [ +FORM \"a\" ]")) '
               r'(3 bcd-lex 0.5 2 5 ("bcd" 2 "token [ +FORM \"bcd\" ]"))))')
     # both a bare preterminal and a rooted tree must round-trip
     for source in (flat, nested):
         assert str(D.from_string(source)) == source
Example #3
0
 def test_str(self):
     """Parsing a UDF string and calling str() gives the string back."""
     sources = [
         # a bare preterminal
         '(1 some-thing -1 -1 -1 ("token"))',
         # a rooted tree whose terminals carry token structures
         (r'(root (1 some-thing 0.4 0 5 (2 a-lex 0.8 0 1 '
          r'("a" 1 "token [ +FORM \"a\" ]")) '
          r'(3 bcd-lex 0.5 2 5 ("bcd" 2 "token [ +FORM \"bcd\" ]"))))'),
     ]
     for udf in sources:
         tree = D.from_string(udf)
         assert str(tree) == udf
Example #4
0
 def test_from_dict(self):
     """from_dict() builds the same tree that from_string() parses."""
     # preterminal described only by id, entity and form
     leaf = {'id': 1, 'entity': 'some-type', 'form': 'a'}
     tree = {'entity': 'root', 'daughters': [leaf]}
     expected = D.from_string('(root (1 some-type -1 -1 -1 ("a")))')
     assert D.from_dict(tree) == expected
     # preterminal whose terminal carries two token structures
     tokens = [
         {'id': 2, 'tfs': r'token [ +FORM \"a\" ]'},
         {'id': 3, 'tfs': r'token [ +FORM \"b\" ]'},
     ]
     leaf = {
         'id': 1,
         'entity': 'some-type',
         'form': 'a b',
         'tokens': tokens,
     }
     tree = {'entity': 'root', 'daughters': [leaf]}
     expected = D.from_string(
         r'(root (1 some-type -1 -1 -1 ("a b"'
         r' 2 "token [ +FORM \"a\" ]"'
         r' 3 "token [ +FORM \"b\" ]")))')
     assert D.from_dict(tree) == expected
Example #5
0
 def test_from_dict(self):
     """from_dict() agrees with from_string(), including UDX fields."""
     # minimal preterminal: only id, entity and form are given
     minimal = {
         'entity': 'root',
         'daughters': [{'id': 1, 'entity': 'some-thing', 'form': 'a'}],
     }
     parsed = D.from_string('(root (1 some-thing -1 -1 -1 ("a")))')
     assert D.from_dict(minimal) == parsed
     # 'head' and 'type' pair up with the ^ and @type markers below
     token_a = {'id': 2, 'tfs': r'token [ +FORM \"a\" ]'}
     token_b = {'id': 3, 'tfs': r'token [ +FORM \"b\" ]'}
     preterminal = {
         'id': 1,
         'entity': 'some-thing',
         'type': 'some-type',
         'head': True,
         'form': 'a b',
         'tokens': [token_a, token_b],
     }
     annotated = {'entity': 'root', 'daughters': [preterminal]}
     parsed = D.from_string(
         r'(root (1 ^some-thing@some-type -1 -1 -1 ("a b"'
         r' 2 "token [ +FORM \"a\" ]"'
         r' 3 "token [ +FORM \"b\" ]")))')
     assert D.from_dict(annotated) == parsed
Example #6
0
 def test_to_udf(self):
     """to_udf() serializes on one line or with one-space indentation."""
     parse = D.from_string
     # single preterminal: flat and indented output
     flat = '(1 some-type -1 -1 -1 ("token"))'
     assert parse(flat).to_udf(indent=None) == flat
     assert parse(flat).to_udf(indent=1) == '\n'.join([
         '(1 some-type -1 -1 -1',
         ' ("token"))',
     ])
     # rooted tree with two preterminals, one token each
     two_lex = (r'(root (1 some-type 0.4 0 5 (2 a-lex 0.8 0 1 '
                r'("a" 3 "token [ +FORM \"a\" ]")) '
                r'(4 bcd-lex 0.5 2 5 ("bcd" 5 "token [ +FORM \"bcd\" ]"))))')
     assert parse(two_lex).to_udf(indent=1) == '\n'.join([
         '(root',
         ' (1 some-type 0.4 0 5',
         '  (2 a-lex 0.8 0 1',
         '   ("a"',
         r'    3 "token [ +FORM \"a\" ]"))',
         '  (4 bcd-lex 0.5 2 5',
         '   ("bcd"',
         r'    5 "token [ +FORM \"bcd\" ]"))))',
     ])
     # one preterminal whose terminal has two tokens: one token per line
     two_tok = (r'(root (1 some-type 0.4 0 5 (2 a-lex 0.8 0 1 '
                r'("a b" 3 "token [ +FORM \"a\" ]" 4 "token [ +FORM \"b\" ]"))))')
     assert parse(two_tok).to_udf(indent=1) == '\n'.join([
         '(root',
         ' (1 some-type 0.4 0 5',
         '  (2 a-lex 0.8 0 1',
         '   ("a b"',
         r'    3 "token [ +FORM \"a\" ]"',
         r'    4 "token [ +FORM \"b\" ]"))))',
     ])
Example #7
0
 def test_eq(self):
     """Check which parts of a derivation take part in ``==``."""
     reference = D.from_string('(1 some-type -1 -1 -1 ("token"))')
     # sources that must compare equal to the reference
     for same in (
             '(1 some-type -1 -1 -1 ("token"))',       # identical source
             '(100 some-type 0.114 -1 -1 ("token"))',  # other id and score
     ):
         assert reference == D.from_string(same)
     # sources that must compare unequal to the reference
     for changed in (
             '(1 some-type -1 -1 -1 ("nekot"))',  # other token
             '(1 some-type -1 2 7 ("token"))',    # other start/end
             '(1 epyt-emos -1 -1 -1 ("token"))',  # other entity
     ):
         assert reference != D.from_string(changed)
     # a right-hand side that is not a derivation is unequal too
     assert reference != '(1 some-type -1 -1 -1 ("token"))'
     # the number of children matters
     one_child = D.from_string('(1 x -1 -1 -1 (2 y -1 -1 -1 ("y")))')
     two_children = D.from_string(
         '(1 x -1 -1 -1 (2 y -1 -1 -1 ("y")) (3 z -1 -1 -1 ("z")))')
     assert one_child != two_children
     # and so does their order
     reordered = D.from_string(
         '(1 x -1 -1 -1 (3 z -1 -1 -1 ("z")) (2 y -1 -1 -1 ("y")))')
     assert two_children != reordered
Example #8
0
 def test_lexical_type(self):
     """lexical_type() gives the ``@`` type of a preterminal, else None."""
     # NOTE: this returns None for standard UDF or non-preterminals
     a = D.from_string('(root (1 a-type -1 -1 -1 ("a"))'
                       '      (2 b-type -1 -1 -1 ("b")))')
     # identity comparison: only the None singleton itself may pass
     assert a.lexical_type() is None
     assert a.daughters[0].lexical_type() is None
     assert a.daughters[1].lexical_type() is None
     a = D.from_string('(root (1 a-type@a-type_le -1 -1 -1 ("a"))'
                       '      (2 b-type@b-type_le -1 -1 -1 ("b")))')
     assert a.lexical_type() is None
     assert a.daughters[0].lexical_type() == 'a-type_le'
     assert a.daughters[1].lexical_type() == 'b-type_le'
Example #9
0
 def test_lexical_type(self):
     """lexical_type() gives the ``@`` type of a preterminal, else None."""
     # NOTE: this returns None for standard UDF or non-preterminals
     a = D.from_string('(root (1 a-type -1 -1 -1 ("a"))'
                       '      (2 b-type -1 -1 -1 ("b")))')
     # identity comparison: only the None singleton itself may pass
     assert a.lexical_type() is None
     assert a.daughters[0].lexical_type() is None
     assert a.daughters[1].lexical_type() is None
     a = D.from_string('(root (1 a-type@a-type_le -1 -1 -1 ("a"))'
                       '      (2 b-type@b-type_le -1 -1 -1 ("b")))')
     assert a.lexical_type() is None
     assert a.daughters[0].lexical_type() == 'a-type_le'
     assert a.daughters[1].lexical_type() == 'b-type_le'
Example #10
0
 def test_basic_entity(self):
     """basic_entity() gives the entity without any ``@type`` suffix."""
     # standard UDF: entities carry no suffix to begin with
     udf = D.from_string('(root (1 a-type -1 -1 -1 ("a"))'
                         '      (2 b-type -1 -1 -1 ("b")))')
     assert udf.basic_entity() == 'root'
     left, right = udf.daughters[0], udf.daughters[1]
     assert left.basic_entity() == 'a-type'
     assert right.basic_entity() == 'b-type'
     # UDX: .entity keeps the suffix, basic_entity() drops it
     udx = D.from_string('(root (1 a-type@a-type_le -1 -1 -1 ("a"))'
                         '      (2 b-type@b-type_le -1 -1 -1 ("b")))')
     assert udx.basic_entity() == 'root'
     left, right = udx.daughters[0], udx.daughters[1]
     assert left.entity == 'a-type@a-type_le'
     assert left.basic_entity() == 'a-type'
     assert right.entity == 'b-type@b-type_le'
     assert right.basic_entity() == 'b-type'
Example #11
0
 def test_basic_entity(self):
     """basic_entity() gives the entity without any ``@type`` suffix."""
     # standard UDF: entities carry no suffix to begin with
     udf = D.from_string('(root (1 a-type -1 -1 -1 ("a"))'
                         '      (2 b-type -1 -1 -1 ("b")))')
     assert udf.basic_entity() == 'root'
     left, right = udf.daughters[0], udf.daughters[1]
     assert left.basic_entity() == 'a-type'
     assert right.basic_entity() == 'b-type'
     # UDX: .entity keeps the suffix, basic_entity() drops it
     udx = D.from_string('(root (1 a-type@a-type_le -1 -1 -1 ("a"))'
                         '      (2 b-type@b-type_le -1 -1 -1 ("b")))')
     assert udx.basic_entity() == 'root'
     left, right = udx.daughters[0], udx.daughters[1]
     assert left.entity == 'a-type@a-type_le'
     assert left.basic_entity() == 'a-type'
     assert right.entity == 'b-type@b-type_le'
     assert right.basic_entity() == 'b-type'
Example #12
0
 def test_terminals(self):
     """terminals() gives the terminal nodes in left-to-right order."""
     # plain UDF with two single-word terminals
     udf = D.from_string('(root (1 some-thing -1 -1 -1'
                         '  (2 a-thing -1 -1 -1 ("a"))'
                         '  (3 b-thing -1 -1 -1 ("b"))))')
     forms = [terminal.form for terminal in udf.terminals()]
     assert forms == ['a', 'b']
     # UDX where the first terminal spans two tokens but is one form
     udx = D.from_string(
         '(root'
         ' (1 some-thing@some-type 0.4 0 5'
         '  (2 a-lex@a-type 0.8 0 1'
         '   ("a b"'
         '    3 "token [ +FORM \\"a\\" ]"'
         '    4 "token [ +FORM \\"b\\" ]"))'
         '  (5 b-lex@b-type 0.9 1 2'
         '   ("b"'
         '    6 "token [ +FORM \\"b\\" ]"))))')
     forms = [terminal.form for terminal in udx.terminals()]
     assert forms == ['a b', 'b']
Example #13
0
 def test_is_head(self):
     """A ``^`` prefix on the entity marks the node as a head."""
     # NOTE: without the head marker ^ (standard UDF) the result of
     # is_head() is undefined
     tree = D.from_string('(root (1 some-type -1 -1 -1 ("a"))'
                          '      (2 ^some-type -1 -1 -1 ("b")))')
     unmarked = tree.daughters[0]
     marked = tree.daughters[1]
     assert unmarked.is_head() == False
     assert marked.is_head() == True
Example #14
0
 def test_is_head(self):
     """A ``^`` prefix on the entity marks the node as a head."""
     # NOTE: without the head marker ^ (standard UDF) the result of
     # is_head() is undefined
     tree = D.from_string('(root (1 some-type -1 -1 -1 ("a"))'
                          '      (2 ^some-type -1 -1 -1 ("b")))')
     unmarked = tree.daughters[0]
     marked = tree.daughters[1]
     assert unmarked.is_head() == False
     assert marked.is_head() == True
Example #15
0
 def test_type(self):
     """.type is the ``@`` suffix of a UDX entity and None without one."""
     # plain UDF: no node has a type
     a = D.from_string('(root (1 some-thing -1 -1 -1'
                       '  (2 a-thing -1 -1 -1 ("a"))'
                       '  (3 b-thing -1 -1 -1 ("b"))))')
     # identity comparison: only the None singleton itself may pass
     assert a.type is None
     node = a.daughters[0]
     assert node.type is None
     assert node.daughters[0].type is None
     assert node.daughters[1].type is None
     # UDX: every non-root node is annotated; the root has no type
     a = D.from_string('(root (1 some-thing@some-type -1 -1 -1'
                       '  (2 a-thing@a-type -1 -1 -1 ("a"))'
                       '  (3 b-thing@b-type -1 -1 -1 ("b"))))')
     assert a.type is None
     node = a.daughters[0]
     assert node.type == 'some-type'
     assert node.daughters[0].type == 'a-type'
     assert node.daughters[1].type == 'b-type'
Example #16
0
 def test_entity(self):
     """.entity is the bare entity name, with or without UDX types."""
     plain = ('(root (1 some-thing -1 -1 -1'
              '  (2 a-thing -1 -1 -1 ("a"))'
              '  (3 b-thing -1 -1 -1 ("b"))))')
     typed = ('(root (1 some-thing@some-type -1 -1 -1'
              '  (2 a-thing@a-type -1 -1 -1 ("a"))'
              '  (3 b-thing@b-type -1 -1 -1 ("b"))))')
     # the @type annotations must not leak into .entity, so both
     # sources are held to the same expectations
     for source in (plain, typed):
         tree = D.from_string(source)
         assert tree.entity == 'root'
         phrase = tree.daughters[0]
         assert phrase.entity == 'some-thing'
         assert phrase.daughters[0].entity == 'a-thing'
         assert phrase.daughters[1].entity == 'b-thing'
Example #17
0
# Punctuation markers searched for in a token's trace, in the order their
# templates are applied.  Each entry is (marker, template, shadowed_by):
# the template is applied when ``marker`` occurs in the trace and no marker
# listed in ``shadowed_by`` does.  Shadowing stops a compound marker such as
# 'comma-rp' or 'qmark-bang' from also firing the shorter markers it
# contains as substrings ('comma', 'bang', 'qmark'), which would otherwise
# attach the punctuation twice (e.g. "x,,)" or "x!??!").
# NOTE(review): 'dqright'/'dqleft' attach a single quote, exactly like
# 'sqright'/'sqleft'; kept as found -- confirm whether '"' was intended.
_PUNCTUATION_MARKERS = (
    ('comma', '%s,', ('comma-rp',)),
    ('asterisk_', '%s*', ()),
    ('asterisk-pre', '*%s', ()),
    ('threedot', '%s...', ()),
    ('hyphen', '%s-', ()),
    ('sqright', '%s\'', ()),
    ('sqleft', '\'%s', ()),
    ('dqright', '%s\'', ()),
    ('dqleft', '\'%s', ()),
    ('rparen', '%s)', ()),
    ('lparen', '(%s', ()),
    ('comma-rp', '%s,)', ()),
    ('bang', '%s!', ('qmark-bang',)),
    ('qmark', '%s?', ('qmark-bang',)),
    ('qmark-bang', '%s?!', ()),
    ('period', '%s.', ()),
)


def _add_punctuation(form, trace):
    """Return *form* with the punctuation named in *trace* attached."""
    for marker, template, shadowed_by in _PUNCTUATION_MARKERS:
        if marker not in trace:
            continue
        if any(longer in trace for longer in shadowed_by):
            # a longer marker containing this one supplies the punctuation
            continue
        form = template % form
    return form


def preprocess(inp, derivation):
    """
    Rebuild a space-separated sentence from a derivation string.

    The derivation is parsed and handed to ``get_tokens()``, which is
    expected to give ``(token, trace)`` pairs where ``trace`` is a
    '/'-separated string and ``token.tfs`` a token feature structure
    (assumed from how the values are used here -- confirm against
    ``get_tokens``).  For each pair the lemma is the third-from-last
    component of the trace:

    * if the lemma does not start with 'generic', the form is the slice
      of *inp* delimited by the +FROM/+TO values found in ``token.tfs``;
    * otherwise the form is the lemma itself with punctuation attached
      according to the markers found in the trace.

    A form containing a hyphen that is not its last character is then
    replaced by the part between its first and second hyphen.

    Args:
        inp: the original input string that +FROM/+TO index into.
        derivation: a derivation in string form.

    Returns:
        The reconstructed forms joined by single spaces.

    Raises:
        AttributeError: if +TO or +FROM cannot be found in the feature
            structure of a non-generic token.
    """
    derivation = Derivation.from_string(derivation)

    sent = []
    for token, trace in get_tokens(derivation):
        lemma = trace.split('/')[-3]

        if not lemma.startswith('generic'):
            # native entry: take the surface string straight from the input
            to = int(re.search(r'\+TO .*?\\"(\d+)\\"', token.tfs).group(1))
            fro = int(re.search(r'\+FROM .*?\\"(\d+)\\"', token.tfs).group(1))
            form = inp[fro:to]
        else:
            form = _add_punctuation(lemma, trace)

        # fix compounds: keep only the second hyphen-separated part,
        # unless the hyphen is trailing punctuation
        if '-' in form and form[-1] != '-':
            form = form.split('-')[1]

        sent.append(form)
    return ' '.join(sent)
Example #18
0
 def test_lexical_type(self):
     """lexical_type() warns as deprecated and gives the type or None."""
     # NOTE: this returns None for standard UDF or non-preterminals
     a = D.from_string('(root (1 some-thing -1 -1 -1'
                       '  (2 a-thing -1 -1 -1 ("a"))'
                       '  (3 b-thing -1 -1 -1 ("b"))))')
     with pytest.warns(DeprecationWarning):
         # identity comparison: only the None singleton itself may pass
         assert a.lexical_type() is None
         node = a.daughters[0]
         assert node.daughters[0].lexical_type() is None
         assert node.daughters[1].lexical_type() is None
     a = D.from_string('(root (1 some-thing -1 -1 -1'
                       '  (2 a-thing@a-type_le -1 -1 -1 ("a"))'
                       '  (3 b-thing@b-type_le -1 -1 -1 ("b"))))')
     with pytest.warns(DeprecationWarning):
         assert a.lexical_type() is None
         node = a.daughters[0]
         assert node.daughters[0].lexical_type() == 'a-type_le'
         assert node.daughters[1].lexical_type() == 'b-type_le'
Example #19
0
 def test_basic_entity(self):
     """basic_entity() warns as deprecated and gives the bare entity."""
     # standard UDF input
     udf = D.from_string('(root (1 some-thing -1 -1 -1'
                         '  (2 a-thing -1 -1 -1 ("a"))'
                         '  (3 b-thing -1 -1 -1 ("b"))))')
     with pytest.warns(DeprecationWarning):
         assert udf.basic_entity() == 'root'
         phrase = udf.daughters[0]
         left, right = phrase.daughters[0], phrase.daughters[1]
         assert left.basic_entity() == 'a-thing'
         assert right.basic_entity() == 'b-thing'
     # UDX input: the @type suffixes are not part of the basic entity
     udx = D.from_string('(root (1 some-thing -1 -1 -1'
                         '  (2 a-thing@a-type_le -1 -1 -1 ("a"))'
                         '  (3 b-thing@b-type_le -1 -1 -1 ("b"))))')
     with pytest.warns(DeprecationWarning):
         assert udx.basic_entity() == 'root'
         phrase = udx.daughters[0]
         assert phrase.basic_entity() == 'some-thing'
         left, right = phrase.daughters[0], phrase.daughters[1]
         assert left.basic_entity() == 'a-thing'
         assert right.basic_entity() == 'b-thing'
Example #20
0
 def test_to_udx(self):
     """to_udx() keeps ``@type`` annotations in its output."""
     # without annotations the flat output equals the source
     flat = '(1 some-thing -1 -1 -1 ("token"))'
     assert D.from_string(flat).to_udx(indent=None) == flat
     # annotated tree, serialized with one space per nesting level
     annotated = (
         r'(root (1 some-thing@some-type 0.4 0 5 '
         r'(2 a-lex@a-type 0.8 0 1 '
         r'("a b" 3 "token [ +FORM \"a\" ]" 4 "token [ +FORM \"b\" ]")) '
         r'(5 b-lex@b-type 0.9 1 2 '
         r'("b" 6 "token [ +FORM \"b\" ]"))))')
     expected = '\n'.join([
         '(root',
         ' (1 some-thing@some-type 0.4 0 5',
         '  (2 a-lex@a-type 0.8 0 1',
         '   ("a b"',
         r'    3 "token [ +FORM \"a\" ]"',
         r'    4 "token [ +FORM \"b\" ]"))',
         '  (5 b-lex@b-type 0.9 1 2',
         '   ("b"',
         r'    6 "token [ +FORM \"b\" ]"))))',
     ])
     assert D.from_string(annotated).to_udx(indent=1) == expected
Example #21
0
 def test_is_head(self):
     """is_head() is three-valued: True, False, or None if undecided."""
     # NOTE: is_head() is undefined for nodes with multiple
     # siblings, none of which are marked head (e.g. in plain UDF)
     a = D.from_string('(root (1 some-thing -1 -1 -1'
                       '  (2 some-thing -1 -1 -1 ("a"))'
                       '  (3 some-thing -1 -1 -1 ("b"))))')
     # identity checks keep the three outcomes apart; `==` would
     # also accept look-alikes such as 1 for True or 0 for False
     assert a.is_head() is True
     node = a.daughters[0]
     assert node.is_head() is True
     assert node.daughters[0].is_head() is None
     assert node.daughters[1].is_head() is None
     # if one sibling is marked, all become decidable
     a = D.from_string('(root (1 some-thing -1 -1 -1'
                       '  (2 some-thing -1 -1 -1 ("a"))'
                       '  (3 ^some-thing -1 -1 -1 ("b"))))')
     assert a.is_head() is True
     node = a.daughters[0]
     assert node.is_head() is True
     assert node.daughters[0].is_head() is False
     assert node.daughters[1].is_head() is True
Example #22
0
 def test_to_udf(self):
     """to_udf() emits plain UDF, dropping any ``@type`` annotations."""
     # single preterminal: flat and indented output
     flat = '(1 some-thing -1 -1 -1 ("token"))'
     assert D.from_string(flat).to_udf(indent=None) == flat
     assert D.from_string(flat).to_udf(indent=1) == (
         '(1 some-thing -1 -1 -1\n'
         ' ("token"))'
     )
     # expected output for a terminal with two tokens; shared by the
     # plain and the annotated source below
     two_tokens_out = (
         '(root\n'
         ' (1 some-thing 0.4 0 5\n'
         '  (2 a-lex 0.8 0 1\n'
         '   ("a b"\n'
         '    3 "token [ +FORM \\"a\\" ]"\n'
         '    4 "token [ +FORM \\"b\\" ]"))))'
     )
     cases = [
         # two preterminals with one token each
         ((r'(root (1 some-thing 0.4 0 5 (2 a-lex 0.8 0 1 '
           r'("a" 3 "token [ +FORM \"a\" ]")) '
           r'(4 bcd-lex 0.5 2 5 ("bcd" 5 "token [ +FORM \"bcd\" ]"))))'),
          ('(root\n'
           ' (1 some-thing 0.4 0 5\n'
           '  (2 a-lex 0.8 0 1\n'
           '   ("a"\n'
           '    3 "token [ +FORM \\"a\\" ]"))\n'
           '  (4 bcd-lex 0.5 2 5\n'
           '   ("bcd"\n'
           '    5 "token [ +FORM \\"bcd\\" ]"))))')),
         # one preterminal with two tokens
         ((r'(root (1 some-thing 0.4 0 5 (2 a-lex 0.8 0 1 '
           r'("a b" 3 "token [ +FORM \"a\" ]" 4 "token [ +FORM \"b\" ]"))))'),
          two_tokens_out),
         # the same tree with UDX annotations, which UDF output omits
         ((r'(root (1 some-thing@some-type 0.4 0 5 (2 a-lex@a-type 0.8 0 1 '
           r'("a b" 3 "token [ +FORM \"a\" ]" 4 "token [ +FORM \"b\" ]"))))'),
          two_tokens_out),
     ]
     for source, indented in cases:
         assert D.from_string(source).to_udf(indent=1) == indented
Example #23
0
 def derivation(self):
     """
     Return the 'derivation' value, deserialized when possible.

     A dict value is passed to Derivation.from_dict() and a string
     value to Derivation.from_string(); any other value, including
     a missing one (None), is returned unchanged.
     """
     raw = self.get('derivation')
     if isinstance(raw, dict):
         return Derivation.from_dict(raw)
     if raw is not None and isinstance(raw, stringtypes):
         return Derivation.from_string(raw)
     return raw
Example #24
0
 def derivation(self):
     """
     Return the 'derivation' value, deserialized when possible.

     A dict value is passed to Derivation.from_dict() and a string
     value to Derivation.from_string(); any other value, including
     a missing one (None), is returned unchanged.
     """
     raw = self.get('derivation')
     if isinstance(raw, dict):
         return Derivation.from_dict(raw)
     if raw is not None and isinstance(raw, stringtypes):
         return Derivation.from_string(raw)
     return raw
Example #25
0
 def test_to_dict(self):
     """to_dict() mirrors the node fields, optionally restricted."""
     wanted = ('id', 'entity', 'score')
     # preterminal without token information
     tree = '(1 some-type -1 -1 -1 ("token"))'
     assert D.from_string(tree).to_dict() == dict(
         id=1, entity='some-type', score=-1.0, start=-1, end=-1,
         form='token')
     # `form` is emitted even though it is not among the wanted fields
     assert D.from_string(tree).to_dict(fields=wanted) == dict(
         id=1, entity='some-type', score=-1.0, form='token')
     # preterminal whose terminal carries two token structures
     tree = (r'(1 a-lex -1 -1 -1 ("a b" 2 "token [ +FORM \"a\" ]"'
             r' 3 "token [ +FORM \"b\" ]"))')
     token_dicts = [
         {'id': 2, 'tfs': r'token [ +FORM \"a\" ]'},
         {'id': 3, 'tfs': r'token [ +FORM \"b\" ]'},
     ]
     assert D.from_string(tree).to_dict() == dict(
         id=1, entity='a-lex', score=-1.0, start=-1, end=-1,
         form='a b', tokens=token_dicts)
     # restricting the fields drops the tokens but keeps the form
     assert D.from_string(tree).to_dict(fields=wanted) == dict(
         id=1, entity='a-lex', score=-1.0, form='a b')
Example #26
0
def parse_spans(span_lines, derivation_str):
    """
    Return one ``(start, end)`` pair per terminal of a derivation.

    Every line of *span_lines* containing ``(<id>, <n>, <n>, <s:e>``
    contributes a mapping from the integer id to ``(s, e)``; a later
    line overrides an earlier one with the same id.  Each terminal of
    the derivation parsed from *derivation_str* is then given the start
    of its first token's span and the end of its last token's span.
    """
    pattern = re.compile(r"\((\d+), \d+, \d+, <(\d+):(\d+)>")
    found = (pattern.search(row) for row in span_lines.split("\n"))
    spans_by_id = {
        int(hit.group(1)): (int(hit.group(2)), int(hit.group(3)))
        for hit in found
        if hit
    }

    tree = Derivation.from_string(derivation_str)  # type: Derivation
    result = []
    for terminal in tree.terminals():
        first_token, last_token = terminal.tokens[0], terminal.tokens[-1]
        begin = spans_by_id[first_token.id][0]
        finish = spans_by_id[last_token.id][1]
        result.append((begin, finish))
    return result
Example #27
0
def prof_entries(prof,
                 typemap,
                 lexmap,
                 table='result',
                 cols=('derivation', 'mrs')):
    """
    Yield lexical-entry tuples for the derivations stored in a profile.

    Each selected derivation is parsed and its ``(entity, type, form)``
    triples are taken from ``_derivation_les()``; a missing type is
    looked up in *lexmap* by entity.  For every new ``(type, orth)``
    combination whose type is a key of *typemap*, the tuple
    ``(lename, supertype, orth, None, None)`` is printed and yielded,
    where ``supertype`` is the first item of ``typemap[type]``.
    """
    profile = itsdb.ItsdbProfile(prof)
    emitted = set()
    for drv_string, _mrs in profile.select(table, cols):
        tree = Derivation.from_string(drv_string)
        for entity, le_type, form in _derivation_les(tree):
            if le_type is None:
                le_type = lexmap.get(entity)
            orth = ', '.join('"{}"'.format(part) for part in form)
            key = (le_type, orth)
            if key in emitted or le_type not in typemap:
                continue
            supertype = typemap[le_type][0]  # more than 1?
            lename = '+'.join(form) + '-' + supertype
            entry = (lename, supertype, orth, None, None)
            print(*entry)
            yield entry
            emitted.add(key)
Example #28
0
 def test_fromstring(self):
     """from_string() rejects malformed text and parses UDF trees."""
     def check_fields(node, id_, entity, score, start, end):
         # compare the five leading fields of a non-root node
         assert node.id == id_
         assert node.entity == entity
         assert node.score == score
         assert node.start == start
         assert node.end == end

     # malformed input: empty text, a root with no children, text
     # that does not start with `(` / end with `)`, uneven parens
     for bad in (
             '',
             '(some-root)',
             ' (1 some-type -1 -1 -1 ("token"))',
             ' (1 some-type -1 -1 -1 ("token")) ',
             '(1 some-type -1 -1 -1 ("token")',
     ):
         with pytest.raises(ValueError):
             D.from_string(bad)
     # a single well-formed preterminal
     node = D.from_string('(1 some-type -1 -1 -1 ("token"))')
     check_fields(node, 1, 'some-type', -1.0, -1, -1)
     assert node.daughters == [T('token')]
     # a newline inside the tree is accepted
     node = D.from_string('''(1 some-type -1 -1 -1
                             ("token"))''')
     check_fields(node, 1, 'some-type', -1.0, -1, -1)
     assert node.daughters == [T('token')]
     # LKB-style terminal: the trailing start/end numbers are ignored
     node = D.from_string('''(1 some-type -1 -1 -1
                             ("to ken" 1 2))''')
     check_fields(node, 1, 'some-type', -1.0, -1, -1)
     assert node.daughters == [T('to ken')]
     # TFS-style terminal: id/feature-structure pairs become tokens
     node = D.from_string(r'''(1 some-type -1 -1 -1
                             ("to ken" 2 "token [ +FORM \"to\" ]"
                                       3 "token [ +FORM \"ken\" ]"))''')
     check_fields(node, 1, 'some-type', -1.0, -1, -1)
     assert node.daughters == [
         T('to ken', [Tk(2, r'token [ +FORM \"to\" ]'),
                      Tk(3, r'token [ +FORM \"ken\" ]')])
     ]
     # a rooted tree with one phrase over two preterminals
     root = D.from_string(r'''(root
         (1 some-type 0.4 0 5
             (2 a-lex 0.8 0 1
                 ("a" 1 "token [ +FORM \"a\" ]"))
             (3 bcd-lex 0.5 2 5
                 ("bcd" 2 "token [ +FORM \"bcd\" ]")))
     )''')
     assert root.entity == 'root'
     assert len(root.daughters) == 1
     top = root.daughters[0]
     check_fields(top, 1, 'some-type', 0.4, 0, 5)
     assert len(top.daughters) == 2
     first_lex = top.daughters[0]
     check_fields(first_lex, 2, 'a-lex', 0.8, 0, 1)
     assert first_lex.daughters == [
         T('a', [Tk(1, r'token [ +FORM \"a\" ]')])]
     second_lex = top.daughters[1]
     check_fields(second_lex, 3, 'bcd-lex', 0.5, 2, 5)
     assert second_lex.daughters == [
         T('bcd', [Tk(2, r'token [ +FORM \"bcd\" ]')])]
Example #29
0
 def test_fromstring(self):
     """from_string() rejects malformed text and parses UDF trees."""
     def check_fields(node, id_, entity, score, start, end):
         # compare the five leading fields of a non-root node
         assert node.id == id_
         assert node.entity == entity
         assert node.score == score
         assert node.start == start
         assert node.end == end

     # malformed input: empty text, a root with no children, text
     # that does not start with `(` / end with `)`, uneven parens
     for bad in (
             '',
             '(some-root)',
             ' (1 some-type -1 -1 -1 ("token"))',
             ' (1 some-type -1 -1 -1 ("token")) ',
             '(1 some-type -1 -1 -1 ("token")',
     ):
         with pytest.raises(ValueError):
             D.from_string(bad)
     # a single well-formed preterminal; the terminal is a raw tuple
     node = D.from_string('(1 some-type -1 -1 -1 ("token"))')
     check_fields(node, 1, 'some-type', -1.0, -1, -1)
     assert node.daughters == [('"token"',)]
     # a newline inside the tree is accepted
     node = D.from_string('''(1 some-type -1 -1 -1
                             ("token"))''')
     check_fields(node, 1, 'some-type', -1.0, -1, -1)
     assert node.daughters == [('"token"',)]
     # a rooted tree with one phrase over two preterminals
     root = D.from_string(r'''(root
         (1 some-type 0.4 0 5
             (2 a-lex 0.8 0 1
                 ("a" 1 "token [ +FORM \"a\" ]"))
             (3 bcd-lex 0.5 2 5
                 ("bcd" 2 "token [ +FORM \"bcd\" ]")))
     )''')
     assert root.entity == 'root'
     assert len(root.daughters) == 1
     top = root.daughters[0]
     check_fields(top, 1, 'some-type', 0.4, 0, 5)
     assert len(top.daughters) == 2
     first_lex = top.daughters[0]
     check_fields(first_lex, 2, 'a-lex', 0.8, 0, 1)
     assert first_lex.daughters == [
         ('"a"', '1', r'"token [ +FORM \"a\" ]"')]
     second_lex = top.daughters[1]
     check_fields(second_lex, 3, 'bcd-lex', 0.5, 2, 5)
     assert second_lex.daughters == [
         ('"bcd"', '2', r'"token [ +FORM \"bcd\" ]"')]
Example #30
0
 def test_fromstring(self):
     """from_string() rejects malformed text and parses UDF trees."""
     def check_fields(node, id_, entity, score, start, end):
         # compare the five leading fields of a non-root node
         assert node.id == id_
         assert node.entity == entity
         assert node.score == score
         assert node.start == start
         assert node.end == end

     # malformed input: empty text, a root with no children, text
     # that does not start with `(` / end with `)`, uneven parens
     for bad in (
             '',
             '(some-root)',
             ' (1 some-type -1 -1 -1 ("token"))',
             ' (1 some-type -1 -1 -1 ("token")) ',
             '(1 some-type -1 -1 -1 ("token")',
     ):
         with pytest.raises(ValueError):
             D.from_string(bad)
     # a single well-formed preterminal; the terminal is a raw tuple
     node = D.from_string('(1 some-type -1 -1 -1 ("token"))')
     check_fields(node, 1, 'some-type', -1.0, -1, -1)
     assert node.daughters == [('"token"',)]
     # a newline inside the tree is accepted
     node = D.from_string('''(1 some-type -1 -1 -1
                             ("token"))''')
     check_fields(node, 1, 'some-type', -1.0, -1, -1)
     assert node.daughters == [('"token"',)]
     # a rooted tree with one phrase over two preterminals
     root = D.from_string(r'''(root
         (1 some-type 0.4 0 5
             (2 a-lex 0.8 0 1
                 ("a" 1 "token [ +FORM \"a\" ]"))
             (3 bcd-lex 0.5 2 5
                 ("bcd" 2 "token [ +FORM \"bcd\" ]")))
     )''')
     assert root.entity == 'root'
     assert len(root.daughters) == 1
     top = root.daughters[0]
     check_fields(top, 1, 'some-type', 0.4, 0, 5)
     assert len(top.daughters) == 2
     first_lex = top.daughters[0]
     check_fields(first_lex, 2, 'a-lex', 0.8, 0, 1)
     assert first_lex.daughters == [
         ('"a"', '1', r'"token [ +FORM \"a\" ]"')]
     second_lex = top.daughters[1]
     check_fields(second_lex, 3, 'bcd-lex', 0.5, 2, 5)
     assert second_lex.daughters == [
         ('"bcd"', '2', r'"token [ +FORM \"bcd\" ]"')]
Example #31
0
 def test_is_root(self):
     """Only the node of a ``(root ...)`` wrapper reports is_root()."""
     # a bare node with an id is not a root
     bare = D.from_string('(1 some-type -1 -1 -1 ("token"))')
     assert bare.is_root() == False
     # the wrapper is a root, the node inside it is not
     wrapped = D.from_string('(root (1 some-type -1 -1 -1 ("token")))')
     assert wrapped.is_root() == True
     inner = wrapped.daughters[0]
     assert inner.is_root() == False
Example #32
0
 def test_is_root(self):
     """Only the node of a ``(root ...)`` wrapper reports is_root()."""
     # a bare node with an id is not a root
     bare = D.from_string('(1 some-thing -1 -1 -1 ("token"))')
     assert bare.is_root() == False
     # the wrapper is a root, the node inside it is not
     wrapped = D.from_string('(root (1 some-thing -1 -1 -1 ("token")))')
     assert wrapped.is_root() == True
     inner = wrapped.daughters[0]
     assert inner.is_root() == False
Example #33
0
 def test_fromstring(self):
     """from_string() rejects malformed text and parses UDF trees."""
     def check_fields(node, id_, entity, score, start, end):
         # compare the five leading fields of a non-root node
         assert node.id == id_
         assert node.entity == entity
         assert node.score == score
         assert node.start == start
         assert node.end == end

     # malformed input: empty text, a root with no children, text
     # that does not start with `(` / end with `)`, uneven parens
     for bad in (
             '',
             '(some-root)',
             ' (1 some-thing -1 -1 -1 ("token"))',
             ' (1 some-thing -1 -1 -1 ("token")) ',
             '(1 some-thing -1 -1 -1 ("token")',
     ):
         with pytest.raises(ValueError):
             D.from_string(bad)
     # a single well-formed preterminal
     node = D.from_string('(1 some-thing -1 -1 -1 ("token"))')
     check_fields(node, 1, 'some-thing', -1.0, -1, -1)
     assert node.daughters == [T('token')]
     # a newline inside the tree is accepted
     node = D.from_string('''(1 some-thing -1 -1 -1
                             ("token"))''')
     check_fields(node, 1, 'some-thing', -1.0, -1, -1)
     assert node.daughters == [T('token')]
     # LKB-style terminal: the trailing start/end numbers are ignored
     node = D.from_string('''(1 some-thing -1 -1 -1
                             ("to ken" 1 2))''')
     check_fields(node, 1, 'some-thing', -1.0, -1, -1)
     assert node.daughters == [T('to ken')]
     # TFS-style terminal: id/feature-structure pairs become tokens
     node = D.from_string(r'''(1 some-thing -1 -1 -1
                             ("to ken" 2 "token [ +FORM \"to\" ]"
                                       3 "token [ +FORM \"ken\" ]"))''')
     check_fields(node, 1, 'some-thing', -1.0, -1, -1)
     assert node.daughters == [
         T('to ken', [Tk(2, r'token [ +FORM \"to\" ]'),
                      Tk(3, r'token [ +FORM \"ken\" ]')])
     ]
     # a rooted tree with one phrase over two preterminals
     root = D.from_string(r'''(root
         (1 some-thing 0.4 0 5
             (2 a-lex 0.8 0 1
                 ("a" 1 "token [ +FORM \"a\" ]"))
             (3 bcd-lex 0.5 2 5
                 ("bcd" 2 "token [ +FORM \"bcd\" ]")))
     )''')
     assert root.entity == 'root'
     assert len(root.daughters) == 1
     top = root.daughters[0]
     check_fields(top, 1, 'some-thing', 0.4, 0, 5)
     assert len(top.daughters) == 2
     first_lex = top.daughters[0]
     check_fields(first_lex, 2, 'a-lex', 0.8, 0, 1)
     assert first_lex.daughters == [
         T('a', [Tk(1, r'token [ +FORM \"a\" ]')])]
     second_lex = top.daughters[1]
     check_fields(second_lex, 3, 'bcd-lex', 0.5, 2, 5)
     assert second_lex.daughters == [
         T('bcd', [Tk(2, r'token [ +FORM \"bcd\" ]')])]
Example #34
0
    def test_to_dict(self):
        """Verify ``to_dict()`` for a single node and for a rooted tree.

        Each input string is parsed and serialized twice: once with the
        default arguments and once restricted to
        ``fields=('id', 'entity', 'score')``.
        """
        subset = ('id', 'entity', 'score')

        # Case 1: one node sitting directly on a bare terminal.
        flat = '(1 some-thing -1 -1 -1 ("token"))'
        flat_full = {
            'id': 1,
            'entity': 'some-thing',
            'score': -1.0,
            'start': -1,
            'end': -1,
            'form': 'token',
        }
        assert D.from_string(flat).to_dict() == flat_full

        # 'form' is expected even though it is not among the requested fields
        flat_reduced = {
            'id': 1,
            'entity': 'some-thing',
            'score': -1.0,
            'form': 'token',
        }
        assert D.from_string(flat).to_dict(fields=subset) == flat_reduced

        # Case 2: a root over one rule node with two lexical daughters.
        # The expected output splits `x@y` labels into 'entity' and 'type',
        # and the `^`-prefixed node is expected to carry 'head': True.
        nested = ''.join([
            r'(root (0 top@top-rule -1 -1 -1',
            r' (1 a-lex@a-type -1 -1 -1 ("a b" 2 "token [ +FORM \"a\" ]"',
            r'  3 "token [ +FORM \"b\" ]"))',
            r' (4 ^c-lex@c-type -1 -1 -1 ("c" 5 "token [ +FORM \"c\" ]"))))',
        ])

        a_lex_full = {
            'id': 1,
            'entity': 'a-lex',
            'type': 'a-type',
            'score': -1.0,
            'start': -1,
            'end': -1,
            'form': 'a b',
            'tokens': [
                {'id': 2, 'tfs': r'token [ +FORM \"a\" ]'},
                {'id': 3, 'tfs': r'token [ +FORM \"b\" ]'},
            ],
        }
        c_lex_full = {
            'id': 4,
            'entity': 'c-lex',
            'type': 'c-type',
            'head': True,
            'score': -1.0,
            'start': -1,
            'end': -1,
            'form': 'c',
            'tokens': [
                {'id': 5, 'tfs': r'token [ +FORM \"c\" ]'},
            ],
        }
        top_full = {
            'id': 0,
            'entity': 'top',
            'type': 'top-rule',
            'score': -1.0,
            'start': -1,
            'end': -1,
            'daughters': [a_lex_full, c_lex_full],
        }
        nested_full = {'entity': 'root', 'daughters': [top_full]}
        assert D.from_string(nested).to_dict() == nested_full

        # With the field subset, only id/entity/score are expected besides
        # the structural 'daughters' and 'form' keys.
        a_lex_reduced = {
            'id': 1,
            'entity': 'a-lex',
            'score': -1.0,
            'form': 'a b',
        }
        c_lex_reduced = {
            'id': 4,
            'entity': 'c-lex',
            'score': -1.0,
            'form': 'c',
        }
        top_reduced = {
            'id': 0,
            'entity': 'top',
            'score': -1.0,
            'daughters': [a_lex_reduced, c_lex_reduced],
        }
        nested_reduced = {'entity': 'root', 'daughters': [top_reduced]}
        assert D.from_string(nested).to_dict(fields=subset) == nested_reduced