Example #1
0
    def prune_tree(cls, tree, begin_index, end_index):
        """
        Build a pruned copy of ``tree`` that only contains the leaves from
        ``begin_index`` to ``end_index`` (both inclusive).

        The copy keeps the root, the chain of ancestors of the first leaf,
        and every node visited while walking in leaf order from the first
        leaf to the last one. ``tree`` itself is not modified.

        Parameters:
            - tree: ParentedTree to prune (old NLTK API exposing ``.node``)
            - begin_index: leaf index of the first leaf to keep
            - end_index: leaf index of the last leaf to keep; assumed to be
                >= begin_index (the walk only moves rightwards)
        Returns:
            a new ParentedTree
        """

        begin_path = tree.leaf_treeposition(begin_index)
        end_path = tree.leaf_treeposition(end_index)

        # Pre-terminal (direct parent of the last leaf): the walk stops here.
        end_node = tree[end_path[:-1]]

        new_tree = ParentedTree('(' + tree.node + ')')

        # Replicate the ancestors of the first leaf, root excluded, so that
        # both cursors end up on the pre-terminal of the first leaf.
        current_new = new_tree
        current_old = tree
        for step in begin_path[:-1]:
            child = current_old[step]
            if not isinstance(child, (str, unicode)):
                current_new.insert(0, ParentedTree('(' + child.node + ')'))
                current_new = current_new[0]
                current_old = child

        # Compare by identity: tree equality is structural, so ``!=`` would
        # stop at the first pre-terminal that merely looks like the end one
        # (same tag and same word) instead of the one at ``end_index``.
        while current_old is not end_node:
            first_child = current_old[0]
            if not isinstance(first_child, (str, unicode)):
                # Inner node: descend to its left-most child.
                current_old = first_child
                current_new.insert(0,
                                   ParentedTree('(' + current_old.node + ')'))
                current_new = current_new[0]
            else:
                # Pre-terminal: copy its leaf ...
                current_new.insert(0, first_child)
                # ... climb while we are the last child of our parent ...
                while len(current_old.parent()
                          ) == current_old.parent_index() + 1:
                    current_old = current_old.parent()
                    current_new = current_new.parent()

                # ... then step to the right sibling in both trees.
                current_old = current_old.parent()[current_old.parent_index() +
                                                   1]
                current_new.parent().insert(
                    current_new.parent_index() + 1,
                    ParentedTree('(' + current_old.node + ')'))

                current_new = current_new.parent()[current_new.parent_index() +
                                                   1]
        # Copy the last leaf.
        current_new.insert(0, current_old[0])
        return new_tree
Example #2
0
    def __init__(self,
                 id_sentence,
                 basic_dependencies=None,
                 collapsed_dependencies=None,
                 parsetree='',
                 text=''):
        """
        Set up a sentence from its parser output.

        Parameters:
            - id_sentence: sentence identifier, must be an int
            - basic_dependencies: None or a list, wrapped in a
                DependencyGraph
            - collapsed_dependencies: None or a list, wrapped in a
                DependencyGraph
            - parsetree: parse tree source; kept raw in ``_parsetree`` and
                parsed into a ParentedTree in ``parsetree``
            - text: must be a list when truthy; it is only validated here,
                not stored
        """
        from nltk import ParentedTree

        assert type(id_sentence) is int, 'Wrong id type'
        for dependencies, message in (
                (basic_dependencies, 'Basic dependencies type'),
                (collapsed_dependencies, 'Collapsed dependencies type')):
            assert dependencies is None or type(dependencies) is list, message
        if text:
            assert type(text) is list, 'Wrong text type'

        # Parser output
        self.id_sentence = id_sentence
        self.basic_dependencies = DependencyGraph(basic_dependencies)
        self.collapsed_dependencies = DependencyGraph(collapsed_dependencies)
        self._parsetree = parsetree
        self.parsetree = ParentedTree(parsetree)

        # Filled in later by other code
        self.words = []
        self.coreference_mentions = []
        self.coreference_representatives = []

        # Links to other sentences
        self.previous = None
        self.next = None
        self._connected_sentences = None
Example #3
0
def sentence_to_tree(sentence):
    """
    Turn raw sentence text into a flat tree of tagged words.

    The text is tokenized and POS-tagged; every (word, tag) pair becomes a
    one-leaf subtree labelled with the tag, directly under a root labelled
    'S'.

    Args:
        sentence: text of a sentence
    Return:
        sentence tree
    """
    assert isinstance(sentence, basestring)

    tagged_tokens = pos_tag(word_tokenize(sentence))
    return ParentedTree(
        'S', [ParentedTree(tag, [word]) for word, tag in tagged_tokens])
def compute_gender(attributes):
    """ Compute the gender of a mention.

    Args:
        attributes (dict(str, object)): Attributes of the mention, must contain
            values for "type", "head", "head_index" and, if the mention is a
            pronoun, "citation_form". Named mentions ("NAM") additionally
            need "tokens" and "ner". When the first element of "head" is a
            ParentedTree, "head" is replaced in place by an empty list.

    Returns:
        str: the gender of the mention -- one of UNKNOWN, MALE, FEMALE,
            NEUTRAL and PLURAL.
    """
    gender = "NEUTRAL"
    head_index = attributes["head_index"]
    gender_data = external_data.GenderData.get_instance()

    head = attributes["head"]
    if head != [] and type(head[0]) is ParentedTree:
        # NOTE(review): the previous code reset "head" to [] and then looped
        # over that same (now empty) list to collect the leaves of the
        # trees, so the loop never ran and "head" always ended up empty.
        # That effective behaviour is kept here; the intended tree-to-words
        # conversion still has to be specified and implemented.
        attributes["head"] = []

    if compute_number(attributes) == "PLURAL":
        gender = "PLURAL"
    elif attributes["type"] == "PRO":
        citation_form = attributes["citation_form"]
        if citation_form == "he":
            gender = "MALE"
        elif citation_form == "she":
            gender = "FEMALE"
        elif citation_form == "it":
            gender = "NEUTRAL"
        elif citation_form in ["you", "we", "they"]:
            gender = "PLURAL"
    elif attributes["type"] == "NAM":
        first_token = attributes["tokens"][0].lower()
        if re.match(r"^mr(\.)?$", first_token):
            gender = "MALE"
        elif re.match(r"^(miss|ms|mrs)(\.)?$", first_token):
            gender = "FEMALE"
        elif not re.match(r"(PERSON|NONE)", attributes["ner"][head_index]):
            gender = "NEUTRAL"
        else:
            # Look the mention up once and reuse the result.
            looked_up = gender_data.look_up(attributes)
            if looked_up:
                gender = looked_up
    elif attributes["type"] == "NOM":
        # Re-read "head": it may have been reset above.
        head = attributes["head"]
        # Only heads made of text tokens can be joined and looked up.
        if head != [] and type(head[0]) == type(u'qwe'):
            looked_up = (__wordnet_lookup_gender(" ".join(head))
                         or gender_data.look_up(attributes))
            if looked_up:
                gender = looked_up

    if gender == "NEUTRAL" and compute_semantic_class(attributes) == "PERSON":
        gender = "UNKNOWN"

    return gender
Example #5
0
def terms_inference(sentences, terms_trie):
    """
    Mark term occurrences in (tokenized and tagged) sentences.

    Each sentence is scanned left to right. Wherever the longest term
    matching at the current token has a positive length, its tokens are
    grouped under a 'TERM' node; every other token is attached directly
    to the sentence root.

    Args:
        sentences: shallow-parsed text
        terms_trie: trie of terms
    Return:
        list of shallow parse trees with inferred terms,
        dictionary mapping each term to the list of its 'TERM' nodes
    """
    trees = []
    occurrences = defaultdict(list)

    for sentence in sentences:
        root = ParentedTree('S', [])
        cursor = 0

        while cursor < len(sentence):
            label, length = _longest_matching_term(
                sentence, cursor, terms_trie)

            if not length > 0:
                # no term starts at the current position
                _append_word_token(root, sentence[cursor])
                cursor += 1
                continue

            # a term covers the next `length` tokens
            node = ParentedTree('TERM', [])
            term = name_to_term(label)
            node.term = term
            occurrences[term].append(node)

            for token in sentence[cursor:cursor + length]:
                _append_word_token(node, token)
            root.append(node)
            cursor += length

        trees.append(root)

    return trees, occurrences
Example #6
0
    def traverse(graph, node):
        """
        Recursively build the ParentedTree rooted at ``node``.

        Children that list ``node`` among their "ellipsed_parents" are left
        out. Terminal children become one-leaf trees (tag over text); the
        others are expanded recursively.
        """
        child_ids = [int(c) for c in graph[node]["children"]]
        subtrees = []
        for child_id in child_ids:
            skipped_under = [
                int(p) for p in graph[child_id]["ellipsed_parents"]
            ]
            if node in skipped_under:
                # ellipsed under this parent: not part of the tree
                continue
            if graph[child_id]["terminal"] == "yes":
                subtree = ParentedTree(graph[child_id]["tag"],
                                       [graph[child_id]["text"]])
            else:
                subtree = traverse(graph, child_id)
            subtrees.append(subtree)

        return ParentedTree(graph[node]["tag"], subtrees)
Example #7
0
def norm_negation(node):
    """
    Rewrite negation compounds below ``node`` in place.

    A child that is a ParentedTree labelled 'compound', whose first child
    is labelled 'functor' and holds a token whose ``val`` is a negation
    symbol, is replaced by a 'unop' tree made of a NOT token and the first
    argument of the compound. The function then recurses into every child
    (into the replacement when one was made). Non-tree nodes are ignored.
    """
    if not isinstance(node, Tree):
        return

    for position, child in enumerate(node):
        is_negation = (
            isinstance(child, ParentedTree)
            and child.label() == 'compound'
            and child[0].label() == 'functor'
            and child[0][0].val in ['\\+', 'not'])

        if is_negation:
            functor_token = child[0][0]
            operand = child[1][0]
            if isinstance(operand, ParentedTree):
                # Detach the operand from the old compound so it can be
                # placed under the new tree (presumably a ParentedTree
                # rejects children that still have a parent).
                operand._parent = None
            child = ParentedTree(
                'unop', [Token('NOT', '\\+', functor_token.pos), operand])
            node[position] = child

        norm_negation(child)
Example #8
0
def _append_word_token(node, token):
    word, pos_tag = token
    node.append(ParentedTree(pos_tag, [word]))
Example #9
0
def traverse_graph_start_end_extra_node(graph):
    """
    Convert a single graph to a phrase-structure tree, 
    encoding ellipsis by wrapping the start and ending nodes of the ellipsis edge with extra nodes.

    Example: (CLX (CL (NGend0 (PRP They)) (VG (VBDend1 were) (VBG drinking)) (NG (NN tea))) (CL (CONJG (CCstart0 (CC and))) (VGstart1 (VG (VBG eating)) (NG (NN scons))))

    Args:
        graph: mapping from integer node ids to node records; the records
            are read through the keys "children", "ellipsed_parents",
            "terminal", "tag" and "text". Node 0 is used as the root.

    Returns:
        a plain ``Tree`` (not a ParentedTree) in which every constituent
        whose label carries a start/end marker is wrapped in an extra node.
    """

    # get tree with starting node tags

    def traverse(graph, node):
        # Build the ParentedTree rooted at `node`. An ellipsed child is
        # represented by an empty node labelled with its ellipsis tag.

        children = [int(c) for c in graph[node]["children"]]
        tagged_children = []
        for child in children:
            ellipsed_parents = [
                int(p) for p in graph[child]["ellipsed_parents"]
            ]
            # if the child is explicit
            if node not in ellipsed_parents:
                if graph[child]["terminal"] == "yes":
                    tagged_children.append(
                        ParentedTree(graph[child]["tag"],
                                     [graph[child]["text"]]))
                else:
                    tagged_children.append(traverse(graph, child))
            # if the child is ellipsed
            else:
                ellipsis_tag = get_ellipsis_tag_from_graph(graph, child)
                tagged_children.append(ParentedTree(ellipsis_tag, []))

        tree = ParentedTree(graph[node]["tag"], tagged_children)

        return tree

    tree = traverse(graph, 0)

    # get ending node tags
    # The leaf positions are computed once; recomputing them for every
    # candidate position made this filter quadratic.
    leaf_positions = set(tree.treepositions("leaves"))
    positions = [
        pos for pos in tree.treepositions() if pos not in leaf_positions
    ]
    end_tags = []
    ellipsis_id = 0  # assign an id to each ellipsis start and end nodes
    for pos in positions:
        if tree[pos].label().startswith("start"):
            ellipsis_tag = tree[pos].label().split("start")[-1]
            tree[pos].set_label("start" + str(ellipsis_id))
            end_location = get_ellipsis_location(tree, ellipsis_tag)
            end_tag = "end" + str(ellipsis_id)
            end_tags.append((end_location, end_tag))
            ellipsis_id += 1

    # insert ending node tags
    # NOTE(review): `index` is the enumeration index of the subtree, not a
    # child position; the insert appends whenever it exceeds the number of
    # children. The marker is merged into its parent's label and deleted
    # further down. Kept exactly as it was.
    for index, st in enumerate(tree.subtrees()):
        for end_location, end_tag in end_tags:
            if st.treeposition() == end_location:
                st.insert(index, ParentedTree(end_tag, []))

    # The tree changed, so positions (and leaves) are recomputed.
    leaf_positions = set(tree.treepositions("leaves"))
    positions = [
        pos for pos in tree.treepositions() if pos not in leaf_positions
    ]
    # Walk backwards so that deleting a node never shifts a position that
    # still has to be visited.
    rev_positions = list(reversed(positions))
    for pos_i, pos in enumerate(rev_positions):
        # append start tag to the previous node
        if tree[pos].label().startswith("start"):
            prev_pos_i = pos_i + 1
            prev_pos = rev_positions[prev_pos_i]
            tree[prev_pos].set_label(tree[prev_pos].label() +
                                     tree[pos].label())
            del tree[pos]
        # append end tag to the parent of the current node
        elif tree[pos].label().startswith("end"):
            parent_pos = tree[pos].parent().treeposition()
            tree[parent_pos].set_label(tree[parent_pos].label() +
                                       tree[pos].label())
            del tree[pos]

    # wrap each constituent that has end or start tags with extra nodes

    def add_extra_nodes(tree):
        # Rebuild `tree` as a plain Tree. A child whose label contains a
        # start/end marker becomes (full_label (bare_label children...));
        # its own children are copied as they are, without recursion.
        children = []
        for subtree in tree:
            if isinstance(subtree, str):
                children.append(subtree)
            else:
                splits = re.split("(start|end)", subtree.label())
                const_tag = splits[0]
                ellipsis_tag = "".join(splits[1:])
                if len(ellipsis_tag) > 0:
                    children.append(
                        Tree(subtree.label(),
                             [Tree(const_tag, list(subtree))]))
                else:
                    children.append(add_extra_nodes(subtree))

        return Tree(tree.label(), children)

    tree = add_extra_nodes(tree)

    return tree
Example #10
0
def traverse_graph_end(graph):
    """
    Convert a single graph to a phrase-structure tree, 
    encoding ellipsis by appending a tag to the ending node of the ellipsis edge.

    Example: (CLX (CL (NGendCC0 (PRP They)) (VG (VBDendVG1 were) (VBG drinking)) (NG (NN tea))) (CL (CONJG (CC and)) (VG (VBG eating)) (NG (NN scons))))

    Args:
        graph: mapping from integer node ids to node records; the records
            are read through the keys "children", "ellipsed_parents",
            "terminal", "tag" and "text". Node 0 is used as the root.

    Returns:
        the ParentedTree built from the graph, with the start markers
        removed and the end markers merged into their parents' labels.
    """

    # get tree with starting node tags

    def traverse(graph, node):
        # Build the ParentedTree rooted at `node`. An ellipsed child is
        # represented by an empty node labelled with its ellipsis tag.

        children = [int(c) for c in graph[node]["children"]]
        tagged_children = []
        for child in children:
            ellipsed_parents = [
                int(p) for p in graph[child]["ellipsed_parents"]
            ]
            # if the child is explicit
            if node not in ellipsed_parents:
                if graph[child]["terminal"] == "yes":
                    tagged_children.append(
                        ParentedTree(graph[child]["tag"],
                                     [graph[child]["text"]]))
                else:
                    tagged_children.append(traverse(graph, child))
            # if the child is ellipsed
            else:
                ellipsis_tag = get_ellipsis_tag_from_graph(graph, child)
                tagged_children.append(ParentedTree(ellipsis_tag, []))

        tree = ParentedTree(graph[node]["tag"], tagged_children)

        return tree

    tree = traverse(graph, 0)

    # get ending node tags
    # The leaf positions are computed once; recomputing them for every
    # candidate position made this filter quadratic.
    leaf_positions = set(tree.treepositions("leaves"))
    positions = [
        pos for pos in tree.treepositions() if pos not in leaf_positions
    ]
    end_tags = []
    for pos_i, pos in enumerate(positions):
        if tree[pos].label().startswith("start"):
            ellipsis_tag = tree[pos].label().split("start")[-1]
            end_location = get_ellipsis_location(tree, ellipsis_tag)
            # Step back over consecutive start markers to the closest
            # preceding node that is not one.
            start_location = pos_i
            while tree[positions[start_location]].label().startswith("start"):
                start_location -= 1
            end_tag = get_ellipsis_tag_from_tree(tree,
                                                 positions[start_location])
            end_tags.append((end_location, end_tag))

    # insert ending node tags
    # NOTE(review): `index` is the enumeration index of the subtree, not a
    # child position; the insert appends whenever it exceeds the number of
    # children. The marker is merged into its parent's label and deleted
    # further down. Kept exactly as it was.
    for index, st in enumerate(tree.subtrees()):
        for end_location, end_tag in end_tags:
            if st.treeposition() == end_location:
                st.insert(index, ParentedTree(end_tag, []))

    # delete starting node tags
    # The subtrees are listed first and visited backwards, so a deletion
    # never shifts the position of a subtree still to be visited.
    for st in reversed(list(tree.subtrees())):
        if st.label().startswith("start"):
            del tree[st.treeposition()]

    # The tree changed, so positions (and leaves) are recomputed.
    leaf_positions = set(tree.treepositions("leaves"))
    positions = [
        pos for pos in tree.treepositions() if pos not in leaf_positions
    ]
    for pos in reversed(positions):
        # append ending node tag to the parent of the current node
        if tree[pos].label().startswith("end"):
            parent_pos = tree[pos].parent().treeposition()
            tree[parent_pos].set_label(tree[parent_pos].label() +
                                       tree[pos].label())
            del tree[pos]

    return tree
Example #11
0
def getHead(syntac_sen):
    """
    Return the head word of a parsed sentence.

    Starting from the first child of the parse tree, the function descends
    one level per iteration until it reaches a pre-terminal (height 2),
    choosing the child to follow with the rules in ``head_trace_rule`` and
    three special cases (marked "non-trivial rules" below).

    Args:
        syntac_sen: object whose ``text`` attribute is handed to
            ParentedTree (presumably a bracketed parse string).

    Returns:
        the first leaf of the node finally selected.

    Raises:
        KeyError: if a visited node label has no entry in
            ``head_trace_rule``.
    """
    t = ParentedTree(syntac_sen.text)

    target = t[0]

    while target.height() != 2:
        ### non-trivial rules: no.1
        # Under SBARQ, follow the first multi-child wh-phrase.
        flag = 0
        parent = target
        if target.node == "SBARQ":
            for ts in target:
                if ts.node in ["WHNP", "WHPP", "WHADJP", "WHADVP"
                               ] and len(ts) > 1:

                    target = ts
                    flag = 1
                    break
        ###
        if not flag:
            rules = head_trace_rule[target.node]
            # Reset on every pass: otherwise an empty rule list (or an
            # unrecognised rule kind) left the name unbound on the first
            # pass, or reused the node found on a previous pass.
            newTarget = ""
            for rule in rules:
                if rule[0] == "L":
                    newTarget = LookByL(target, rule[1:])
                elif rule[0] == "R":
                    newTarget = LookByR(target, rule[1:])
                elif rule[0] == "LBP":
                    newTarget = LookByLBP(target, rule[1:])
                elif rule[0] == "RBP":
                    newTarget = LookByRBP(target, rule[1:])
                if newTarget != "":
                    break
            if newTarget == "":
                # No rule matched: fall back to the first child.
                target = target[0]
            else:
                target = newTarget

        ### non-trivial rules: no.2:
        # Possessive inside the wh-phrase: the head is the noun that
        # precedes the POS token.
        if flag:
            leafPos = getLeafPOS(target)
            m = re.search(r'(NN|NNS)_(\d+) POS_', leafPos)
            if m is not None:
                lvs = target.leaves()
                print(m.groups())
                target = ParentedTree("(" + m.group(1) + " " +
                                      lvs[int(m.group(2))] + ")")

        ### non-trivial rules: no.3
        # Heads such as "name" or "kind" are replaced by a PP, which is
        # then descended into on the following iterations.

        if target.height() == 2 and target.leaves()[0] in [
                "name", "kind", "type", "genre", "group", "part"
        ]:
            print(parent)
            for k in parent:
                if k.node == "PP":
                    target = k
                    break
            pr = parent.right_sibling()
            # The previous loop iterated over pr's children but tested pr
            # itself, and failed when there was no right sibling at all.
            # This is the same decision written directly: a non-empty PP
            # right sibling wins.
            if pr is not None and len(pr) > 0 and pr.node == "PP":
                target = pr

    return target.leaves()[0]
Example #12
0
    def get_pruned_tree_path(self,
                             index_1_beg,
                             index_1_end,
                             index_2_beg,
                             index_2_end,
                             in_between_children=False):
        """
        Get the path in the syntactic tree
        between two extents.
        The particular purpose of the method in the task
        is to find the minimum tree that connects between two events,
        removing the POS and LEMMA of single token entity,
        removing internal structure of multiple token entity
        (consider the multiple token entity as one node in the tree)
        removing branches and leaves in between two entities
        Parameters:
            - index_1_beg, index_1_end: begin and end of the first entity,
                index_1_end is exclusive
            - index_2_beg, index_2_end: begin and end of the second entity,
                index_2_end is exclusive
            - in_between_children: a flag whether to include the first level of children
                of the common ancestor of two entities
        Returns:
            a new ParentedTree rooted at a copy of the lowest common
            ancestor of the two entities; it holds labelled nodes only,
            no leaves. ``self.tree`` is not modified.
        """
        # Make the first entity the left-most one.
        if index_1_beg >= index_2_end:
            index_1_beg, index_1_end, index_2_beg, index_2_end = (
                index_2_beg, index_2_end, index_1_beg, index_1_end)

        def _node_position(beg, end):
            # Position of the node covering leaves [beg, end). For a span of
            # at most one leaf the position points at the leaf itself, so
            # its last step is dropped to get the enclosing node.
            position = self.tree.treeposition_spanning_leaves(beg, end)
            if end - beg > 1:
                return position
            return position[:-1]

        lca_1_index = _node_position(index_1_beg, index_1_end)
        lca_2_index = _node_position(index_2_beg, index_2_end)
        lca_index = _node_position(index_1_beg, index_2_end)

        lca = self.tree[lca_index]
        new_tree = ParentedTree('(' + lca.node + ')')

        depth = len(lca_index)
        # An entity has a branch of its own only if its node lies strictly
        # below the common ancestor.
        has_first_branch = depth < len(lca_1_index)
        has_second_branch = depth < len(lca_2_index)

        # Branch of the first entity: a chain of labelled nodes hanging
        # from the root.
        current_pointer = new_tree
        tempo_lca = lca
        for i in xrange(depth, len(lca_1_index)):
            tempo_lca = tempo_lca[lca_1_index[i]]
            if not isinstance(tempo_lca, (str, unicode)):
                current_pointer.insert(
                    0, ParentedTree('(' + tempo_lca.node + ')'))
                current_pointer = current_pointer[0]

        # Insert the first level of children of lca lying strictly between
        # the two branches.
        if in_between_children and has_first_branch and has_second_branch:
            for i in xrange(lca_1_index[depth] + 1, lca_2_index[depth]):
                new_tree.insert(i, ParentedTree('(' + lca[i].node + ')'))

        # Branch of the second entity.
        current_pointer = new_tree
        tempo_lca = lca
        first_time = True
        for i in xrange(depth, len(lca_2_index)):
            tempo_lca = tempo_lca[lca_2_index[i]]
            if isinstance(tempo_lca, (str, unicode)):
                continue
            if first_time:
                if not has_first_branch:
                    # No left child, hence no in-between children either:
                    # the second branch is the only child of the root.
                    # Without this, the default path inserted at index 1
                    # and then failed reading index 1 of a one-element
                    # tree.
                    children_index_of_2nd_branch = 0
                elif in_between_children:
                    # First branch plus the in-between children precede it.
                    children_index_of_2nd_branch = (
                        lca_2_index[depth] - lca_1_index[depth])
                else:
                    # Right after the first branch.
                    children_index_of_2nd_branch = 1
                current_pointer.insert(
                    children_index_of_2nd_branch,
                    ParentedTree('(' + tempo_lca.node + ')'))
                current_pointer = current_pointer[
                    children_index_of_2nd_branch]
                first_time = False
            else:
                current_pointer.insert(
                    0, ParentedTree('(' + tempo_lca.node + ')'))
                current_pointer = current_pointer[0]
        return new_tree