def find_entities_spans(self, question):
        """Locate candidate entity spans in *question* via its dependency parse.

        Returns a list of strings, one per candidate entity subtree, each
        built from that subtree's tokens joined in sentence order.
        """
        question_word = self._extract_question_word(question)
        syntax_parse = self._get_synt_parse(question)
        root = parse_tree(syntax_parse)[0]

        # When the question word is itself the parse root, descend into its
        # nominal subject so candidates hang off the content head instead.
        if root.token['form'] == question_word:
            nsubj = next(
                (child for child in root.children
                 if child.token['deprel'] == 'nsubj'),
                None,
            )
            if nsubj is not None:
                root = nsubj

        # Collect ordered tokens from every child subtree that neither
        # contains the question word nor is flagged as ignorable.
        entity_tokens = []
        for child in root.children:
            subtree = SynTree(child)
            if subtree.has_token(question_word) or subtree.ignored():
                continue
            entity_tokens.append(subtree.get_ordered_tokens())

        # Each token is an (index, form, ...) tuple: sort by index, join forms.
        return [
            ' '.join(token[1] for token in sorted(tokens, key=lambda t: t[0]))
            for tokens in entity_tokens
        ]
Example #2
0
    def test_parse_tree(self):
        """End-to-end check of parse_tree() on the 'quick brown fox' fixture:
        root token fields, text() representation, children, metadata,
        serialization round-trip and print_tree() output.
        """
        # The fixture contains exactly one CoNLL-U sentence -> one tree.
        sentences = parse_tree(data)
        self.assertEqual(len(sentences), 1)

        # The tree root is the main verb 'jumps' (head == 0, deprel == root).
        root = sentences[0]
        self.assertEqual(text(root), "TokenTree<token={id=5, form=jumps}, children=[...]>")

        # Field-by-field check of the root token's parsed CoNLL-U columns.
        self.assertEqual(
            root.token,
            OrderedDict([
                ('id', 5),
                ('form', 'jumps'),
                ('lemma', 'jump'),
                ('upostag', 'VERB'),
                ('xpostag', 'VBZ'),
                ('feats', OrderedDict([
                    ("Mood", "Ind"),
                    ("Number", "Sing"),
                    ("Person", "3"),
                    ("Tense", "Pres"),
                    ("VerbForm", "Fin"),
                ])),
                ('head', 0),
                ('deprel', 'root'),
                ('deps', None),
                ('misc', None)
            ])
        )

        # Direct dependents of the root: fox (nsubj), dog (nmod), '.' (punct).
        self.assertEqual(
            [text(child) for child in root.children],
            [
                "TokenTree<token={id=4, form=fox}, children=[...]>",
                "TokenTree<token={id=9, form=dog}, children=[...]>",
                "TokenTree<token={id=10, form=.}, children=None>",
            ]
        )

        # Sentence metadata is taken from the '# text = ...' comment line.
        self.assertEqual(
            root.metadata["text"],
            "The quick brown fox jumps over the lazy dog."
        )

        # Serializing the tree reproduces the original CoNLL-U input.
        self.assertEqual(root.serialize(), data)

        # print_tree() writes an indented, human-readable dump to stdout.
        self.assertEqual(
            capture_print(root.print_tree),
            dedent("""\
                (deprel:root) form:jumps lemma:jump upostag:VERB [5]
                    (deprel:nsubj) form:fox lemma:fox upostag:NOUN [4]
                        (deprel:det) form:The lemma:the upostag:DET [1]
                        (deprel:amod) form:quick lemma:quick upostag:ADJ [2]
                        (deprel:amod) form:brown lemma:brown upostag:ADJ [3]
                    (deprel:nmod) form:dog lemma:dog upostag:NOUN [9]
                        (deprel:case) form:over lemma:over upostag:ADP [6]
                        (deprel:det) form:the lemma:the upostag:DET [7]
                        (deprel:amod) form:lazy lemma:lazy upostag:ADJ [8]
                    (deprel:punct) form:. lemma:. upostag:PUNCT [10]
            """)
        )
Example #3
0
def case_fox():
    """Fixture: a four-token noun phrase headed by 'fox'.

    Returns (TokenSNGram, expected), where *expected* records the
    sn-gram's length, string form, pattern-element repr and profiles.
    """

    data = """
# text = The quick brown fox
1   The     the    DET    DT   Definite=Def|PronType=Art   4   det     _   _
2   quick   quick  ADJ    JJ   Degree=Pos                  4   amod    _   _
3   brown   brown  ADJ    JJ   Degree=Pos                  4   amod    _   _
4   fox     fox    NOUN   NN   Number=Sing                 0   nsubj   _   _

"""

    sngram = TokenSNGram(conllu.parse_tree(data)[0])
    expected = {
        "length": 4,
        "str": "fox [The, quick, brown]",
        "repr": [
            PatternElement('fox', 'form', 4),
            SNGram.LEFT_BRACKET,
            PatternElement('The', 'form', 1),
            SNGram.COMMA,
            PatternElement('quick', 'form', 2),
            SNGram.COMMA,
            PatternElement('brown', 'form', 3),
            SNGram.RIGHT_BRACKET,
        ],
        "profiles": {"form [ form , form , form ]"},
    }
    return sngram, expected
Example #4
0
def case_apples():
    """Fixture: a coordinated noun list ('apples, pears, oranges, and bananas').

    Returns (TokenSNGram, expected), where *expected* records the
    sn-gram's length, string form, pattern-element repr and profiles.
    """

    data = """
# text = apples, pears, oranges, and bananas.
1   apples   apple  NOUN    NN   Number=Plur                  0   obj    _   _
2   ,     ,    PUNCT   ,   _                 3   punct   _   _
3   pears     pear    NOUN   NN   Number=Plur                 1   conj   _   _
4   ,     ,    PUNCT   ,   _    5   punct   _   _
5   oranges     orange    NOUN   NN   Number=Plur                 1   conj   _   _
6   ,     ,    PUNCT   ,   _                 8   punct   _   _
7   and   and   SCONJ   CC  _   8   cc    _   _
8   bananas    banana   NOUN    NN   Number=Plur                           1   conj    _   _

"""
    sngram = TokenSNGram(conllu.parse_tree(data)[0])
    expected = {
        "length": 8,
        "str": "apples [pears,, oranges,, bananas [,, and]]",
        "repr": [
            PatternElement('apples', 'form', 1),
            SNGram.LEFT_BRACKET,
            PatternElement('pears', 'form', 3),
            PatternElement(',', 'form', 2),
            SNGram.COMMA,
            PatternElement('oranges', 'form', 5),
            PatternElement(',', 'form', 4),
            SNGram.COMMA,
            PatternElement('bananas', 'form', 8),
            SNGram.LEFT_BRACKET,
            PatternElement(',', 'form', 6),
            SNGram.COMMA,
            PatternElement('and', 'form', 7),
            SNGram.RIGHT_BRACKET,
            SNGram.RIGHT_BRACKET,
        ],
        "profiles": {"form [ form form , form form , form [ form , form ] ]"},
    }
    return sngram, expected
Example #5
0
 def __call__(self, text):
     """Run the full NLP pipeline on *text* and return its dependency trees.

     Tokenizes the text, tags and parses each sentence, serializes the
     result to CoNLL-U, and parses that into TokenTree objects.
     """
     parsed_sentences = self.tokenize(text)
     for sentence in parsed_sentences:
         self.tag(sentence)
         self.parse(sentence)
     serialized = self.write(parsed_sentences, "conllu")
     return parse_tree(serialized)
Example #6
0
def case_sidorov2():
    """Fixture: a Spanish clause with only form/head columns filled in.

    Returns (TokenSNGram, expected), where *expected* records the
    sn-gram's length, string form, pattern-element repr and profiles.
    """

    data = """
# text = y le di un par de vueltas de_mala_gana
1   y              _  _  _  _  0  _  _  _
2   le             _  _  _  _  3  _  _  _
3   di             _  _  _  _  1  _  _  _
4   par            _  _  _  _  3  _  _  _
5   de_mala_gana   _  _  _  _  3  _  _  _

"""
    sngram = TokenSNGram(conllu.parse_tree(data)[0])
    expected = {
        "length": 5,
        "str": "y di [le, par, de_mala_gana]",
        "repr": [
            PatternElement('y', 'form', 1),
            PatternElement('di', 'form', 3),
            SNGram.LEFT_BRACKET,
            PatternElement('le', 'form', 2),
            SNGram.COMMA,
            PatternElement('par', 'form', 4),
            SNGram.COMMA,
            PatternElement('de_mala_gana', 'form', 5),
            SNGram.RIGHT_BRACKET,
        ],
        "profiles": {"form form [ form , form , form ]"},
    }
    return sngram, expected
Example #7
0
def case_changed_special():
    """Fixture: the 'fox' phrase rendered with non-default bracket/comma symbols.

    Returns (TokenSNGram, expected), where the sn-gram is built with
    '(' / ')' as brackets and '_' as the separator.
    """

    data = """
# text = The quick brown fox
1   The     the    DET    DT   Definite=Def|PronType=Art   4   det     _   _
2   quick   quick  ADJ    JJ   Degree=Pos                  4   amod    _   _
3   brown   brown  ADJ    JJ   Degree=Pos                  4   amod    _   _
4   fox     fox    NOUN   NN   Number=Sing                 0   nsubj   _   _

"""

    sngram = TokenSNGram(
        conllu.parse_tree(data)[0],
        left_bracket="(",
        right_bracket=")",
        comma="_",
    )
    expected = {
        "length": 4,
        "str": "fox (The_ quick_ brown)",
        "repr": [
            PatternElement('fox', 'form', 4),
            "(",
            PatternElement('The', 'form', 1),
            "_",
            PatternElement('quick', 'form', 2),
            "_",
            PatternElement('brown', 'form', 3),
            ")",
        ],
        "profiles": {"form ( form _ form _ form )"},
    }
    return sngram, expected
Example #8
0
def load_data_file(file_name, tree=False):
    """Read a CoNLL-U file and parse it.

    Returns the parsed token lists; when *tree* is true, returns a
    (token_lists, token_trees) pair instead.
    """
    with open(file_name) as handle:
        contents = handle.read()

    if not tree:
        return parse(contents)
    return (parse(contents), parse_tree(contents))
Example #9
0
    def test_parse_tree_and_serialize(self):
        """parse_tree().serialize() round-trips every fixture, modulo
        range/elided tokens (tree parsing keeps only integer-id tokens)."""
        from tests.fixtures import TESTCASES

        for testcase in TESTCASES:
            parsed = parse(testcase)
            # Expected: the flat token list with range/elided ids dropped.
            simple_tokens = TokenList(
                [tok for tok in parsed[0] if isinstance(tok["id"], int)])
            serialized = parse_tree(testcase)[0].serialize()
            self.assertEqual(serialized, simple_tokens)
Example #10
0
def process_conllu(inp):
    """Walk the first sentence's tree depth-first, mapping form -> deprel.

    Returns (data, consts): *data* maps each node's form to its deprel
    (later nodes with the same form overwrite earlier ones); *consts*
    is the list of visited nodes in traversal order.
    """
    root = parse_tree(inp)[0]
    deprel_by_form = {}
    visited = []
    for node in depth_first(root):
        visited.append(node)
        deprel_by_form[node['form']] = node['deprel']
    return deprel_by_form, visited
Example #11
0
def read_data(filename):
    """Read a CoNLL-U file and parse it both flat and as trees.

    :param filename: path to the CoNLL-U file
    :returns: (data, tree) — *data* is the list of parsed sentences
        (data[i] holds the dependency relations of the i-th sentence),
        *tree* is the corresponding list of dependency trees.
    """
    # Use a context manager so the file is closed even if read() raises;
    # the original open/read/close sequence leaked the handle on error.
    with open(filename, 'r') as f:
        text = f.read()
    data = parse(text)
    tree = parse_tree(text)
    return (data, tree)
Example #12
0
def parse_tree_conll(path_to_file: str) -> list:
    """
    Read a CoNLL file and return a list of sentences as TokenTree
    objects (arborised hierarchical structures).

    :param path_to_file: path to the CoNLL file
    :returns: a list of sentences as TokenTree objects
    """
    conll_text = load_conll(path_to_file)
    return parse_tree(conll_text)
Example #13
0
def process_conllu(inp):
    """Walk the first sentence's tree depth-first and map form -> deprel.

    Later nodes with the same form overwrite earlier entries.
    """
    root = parse_tree(inp)[0]
    form_to_deprel = {}
    for node in depth_first(root):
        token = node.token
        form_to_deprel[token['form']] = token['deprel']
    return form_to_deprel
Example #14
0
 def analyze(self, themes, filename, encoding='utf8'):
     """Train the model on *filename*, then evaluate each of its lines
     against *themes*.

     :param themes: theme specification, validated by __checkerThemes__
     :param filename: path to the input text file (processed line by
         line — presumably one sentence per line; TODO confirm)
     :param encoding: text encoding of the file (default 'utf8')
     :raises RuntimeError: if the UDPipe pipeline reports an error
     """
     self.__checkerThemes__(themes)
     print('Updating model with text... ', end='')
     self.__srem__.trainFile(filename, encoding=encoding)
     print('[OK]')
     print('Parsing sentences... ', end='')
     with open(filename, 'r', encoding=encoding) as file:
         # NOTE(review): `index` is never used in the loop body.
         for index, line in enumerate(file, start=1):
             # Run the line through the UDPipe pipeline; the shared error
             # object must be checked after every call.
             processed_conllu = self.__pipeline__.process(line, self.__uderror__)
             if self.__uderror__.occurred():
                 raise RuntimeError('UDPipe error: ' + self.__uderror__.message)
             # Only the first parsed tree of the line is evaluated.
             sentence_root = parse_tree(processed_conllu)[0]
             self.__evalTreeSentence__(themes, sentence_root)
     print('[OK]')
Example #15
0
def test_deptree():
    """Tree-kernel similarity on dependency trees parsed from CoNLL-U."""
    data = """# text = the cat chases the mouse
1   The     the    DET    DT   Definite=Def|PronType=Art   4   det     _   _
4   cat     cat    NOUN   NN   Number=Sing                 5   nsubj   _   _
5   chases   chase   VERB   VBZ  Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin   0   root    _   _
7   the     the    DET    DT   Definite=Def|PronType=Art   9   det     _   _
9   mouse    mouse    NOUN   NN   Number=Sing                 5   dobj    _   SpaceAfter=No

# text = the cat sleeps
1   The     the    DET    DT   Definite=Def|PronType=Art   4   det     _   _
4   cat     cat    NOUN   NN   Number=Sing                 5   nsubj   _   _
5   sleeps   sleep   VERB   VBZ  Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin   0   root    _   _
"""
    trees = conllu.parse_tree(data)

    # Kernel configured with the dependency-tree label/children accessors;
    # the second sentence compared with itself scores 3.0.
    labelled_kernel = Kernel(label=DT.label, children=DT.children)
    assert labelled_kernel(trees[1], trees[1]) == 3.0

    # A default Kernel on the equivalent deprel-shaped nltk tree agrees
    # with the labelled kernel on the first sentence.
    default_kernel = Kernel()
    deprel_tree = Tree.fromstring("(root (nsubj det) (dobj det))")
    assert default_kernel(deprel_tree, deprel_tree) == labelled_kernel(trees[0], trees[0])
Example #16
0
def case_jumps():
    """Fixture: the full 'quick brown fox' sentence headed by 'jumps'.

    Returns (TokenSNGram, expected), where *expected* records the
    sn-gram's length, string form, pattern-element repr and profiles.
    """

    data = """
# text = The quick brown fox jumps over the lazy dog.
1   The     the    DET    DT   Definite=Def|PronType=Art   4   det     _   _
2   quick   quick  ADJ    JJ   Degree=Pos                  4   amod    _   _
3   brown   brown  ADJ    JJ   Degree=Pos                  4   amod    _   _
4   fox     fox    NOUN   NN   Number=Sing                 5   nsubj   _   _
5   jumps   jump   VERB   VBZ  Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin   0   root    _   _
6   over    over   ADP    IN   _                           9   case    _   _
7   the     the    DET    DT   Definite=Def|PronType=Art   9   det     _   _
8   lazy    lazy   ADJ    JJ   Degree=Pos                  9   amod    _   _
9   dog     dog    NOUN   NN   Number=Sing                 5   nmod    _   SpaceAfter=No
10  .       .      PUNCT  .    _                           5   punct   _   _

"""
    sngram = TokenSNGram(conllu.parse_tree(data)[0])
    expected = {
        "length": 10,
        "str": "jumps [fox [The, quick, brown], dog [over, the, lazy], .]",
        "repr": [
            PatternElement('jumps', 'form', 5),
            SNGram.LEFT_BRACKET,
            PatternElement('fox', 'form', 4),
            SNGram.LEFT_BRACKET,
            PatternElement('The', 'form', 1),
            SNGram.COMMA,
            PatternElement('quick', 'form', 2),
            SNGram.COMMA,
            PatternElement('brown', 'form', 3),
            SNGram.RIGHT_BRACKET,
            SNGram.COMMA,
            PatternElement('dog', 'form', 9),
            SNGram.LEFT_BRACKET,
            PatternElement('over', 'form', 6),
            SNGram.COMMA,
            PatternElement('the', 'form', 7),
            SNGram.COMMA,
            PatternElement('lazy', 'form', 8),
            SNGram.RIGHT_BRACKET,
            SNGram.COMMA,
            PatternElement('.', 'form', 10),
            SNGram.RIGHT_BRACKET,
        ],
        "profiles": {
            "form [ form [ form , form , form ] , form [ form , form , form ] , form ]"
        },
    }
    return sngram, expected
Example #17
0
def case_dog():
    """Fixture: the prepositional phrase 'over the lazy dog' headed by 'dog'.

    Returns (TokenSNGram, expected), where *expected* records the
    sn-gram's length, string form, pattern-element repr and profiles.
    """

    data = """
# text = over the lazy dog
6   over    over   ADP    IN   _                           9   case    _   _
7   the     the    DET    DT   Definite=Def|PronType=Art   9   det     _   _
8   lazy    lazy   ADJ    JJ   Degree=Pos                  9   amod    _   _
9   dog     dog    NOUN   NN   Number=Sing                 0   nmod    _   SpaceAfter=No

"""
    sngram = TokenSNGram(conllu.parse_tree(data)[0])
    expected = {
        "length": 4,
        "str": "dog [over, the, lazy]",
        "repr": [
            PatternElement('dog', 'form', 9),
            SNGram.LEFT_BRACKET,
            PatternElement('over', 'form', 6),
            SNGram.COMMA,
            PatternElement('the', 'form', 7),
            SNGram.COMMA,
            PatternElement('lazy', 'form', 8),
            SNGram.RIGHT_BRACKET,
        ],
        "profiles": {"form [ form , form , form ]"},
    }
    return sngram, expected
Example #18
0
# quick

# Filter tokens by a morphological feature (double-underscore reaches
# into the 'feats' sub-dictionary).
sentence.filter(feats__Degree="Pos")
# TokenList<quick, brown, lazy>

# Sentence-level metadata (parsed from the CoNLL-U comment lines).
sentence.metadata

### Turn a TokenList back into CoNLL-U
sentence.serialize()  # The format is not desirable

### Turn a Tokenlist into a TokenTree
sentence.to_tree()

### Use parse_tree() to parse into a list of dependency trees
from conllu import parse_tree
sentences = parse_tree(data)
sentences

# Incremental variant: yields one tree at a time from an open file.
from conllu import parse_tree_incr
for tokentree in parse_tree_incr(data_file):
    print(tokentree)

# Root node of the first parsed tree.
root = sentences[0]
root

# Pretty-print the tree to stdout.
root.print_tree()

# The root's own token.
root.token

# Its direct dependents in the tree.
children = root.children
children
Example #19
0
 def test_parse_tree_incr(self):
     """Incremental tree parsing over a stream matches parse_tree on the
     full text."""
     batch_result = parse_tree(data)
     incremental_result = list(parse_tree_incr(StringIO(data)))
     self.assertEqual(batch_result, incremental_result)
Example #20
0
 def test_parse_tree_incr(self):
     """Incremental tree parsing from a file matches parse_tree on the
     full text."""
     batch_result = parse_tree(data)
     incremental_result = list(parse_tree_incr(string_to_file(data)))
     self.assertEqual(batch_result, incremental_result)