Ejemplo n.º 1
0
def _preprosess(root: ParentedTree):
    """
    Preprocesses the lexical tree: strips the functional suffix from every syntactic tag
    and replaces each token with its 1-based position in the sentence.

    The input tree is not modified; all changes are applied to a deep copy.

    :param root: the root of the lexical tree
    :return: a tuple of the processed tree and the sequence of (tag, token) pairs,
             listed in left-to-right leaf order
    """
    processed = root.copy(deep=True)

    # i.e. [('NR', '上海'), ('NR', '浦东'), ('NN', '开发'), ('CC', '与'), ...]
    sequences = []

    def _visit(node, next_index):
        """Process ``node`` and return the index to assign to the next leaf."""
        # keep only the part of the tag before the first '-'
        raw_label = node.label()
        if '-' in raw_label:
            node.set_label(raw_label.split('-')[0])

        if _is_leaf(node):
            # record (tag, token) before the token is overwritten by its index
            sequences.append((node.label(), node[0]))
            node[0] = next_index
            return next_index + 1

        for child in node:
            next_index = _visit(child, next_index)
        return next_index

    _visit(processed, 1)

    return processed, sequences
Ejemplo n.º 2
0
    def __mark_heads(self, root: ParentedTree):
        """
        Marks the head of each phrase.

        Works on a deep copy of ``root``. Every node label is rewritten to
        ``'<head index>|<tag>'``: for a leaf the index is its own child value
        (``tree[0]``), for a phrase it is the index of the child selected as head.

        The head of a phrase is selected with ``self.head_rules[tag]``, an ordered
        collection of rules, each with a ``'direction'`` (``'r'`` scans the children
        right-to-left, anything else left-to-right) and a ``'tags'`` priority list:

        * a rule with empty ``'tags'`` selects the first child in scan direction;
        * otherwise the first tag of ``'tags'`` that occurs among the children wins,
          and the first child (in scan direction) carrying it becomes the head;
        * if the tag has no rules at all, or none of its rules matches, the last
          child is used as the head.

        :param root: a preprocessed phrase tree.
        :return: a phrase tree with head labels
        """
        marked_root = root.copy(deep=True)

        def _find_head(label, children):
            """Return the head index chosen by the head rules, or None if no rule applies."""
            if label not in self.head_rules:
                return None

            for rule in self.head_rules[label]:
                ordered = children[::-1] if rule['direction'] == 'r' else children

                # this is the last rule, just select the first or last one as the head
                if not rule['tags']:
                    return ordered[0][0]

                for tag in rule['tags']:
                    for index, child_tag in ordered:
                        if child_tag == tag:
                            return index
            return None

        def _mark(tree):
            label = tree.label()

            if _is_leaf(tree):
                tree.set_label('{}|{}'.format(tree[0], label))
                return

            # children first: their labels must already carry a head index
            for subtree in tree:
                _mark(subtree)

            # [index, tag] per child; maxsplit=1 keeps a '|' inside a tag intact
            children = [t.label().split('|', 1) for t in tree]

            head = _find_head(label, children)
            if head is None:
                # The tag is not covered by the head rules, or none of its rules
                # matched: select the last child. Leaving the node unmarked would
                # break the '<index>|<tag>' format its parent relies on.
                head = children[-1][0]

            tree.set_label('{}|{}'.format(head, label))

        _mark(marked_root)
        return marked_root
Ejemplo n.º 3
0
def generate_subtrees(simplified_sentences, full_tree):
    """
    Build one pruned copy of ``full_tree`` per simplified sentence.

    Each sentence is a sequence aligned with the leaves of ``full_tree``; a falsy
    entry means the leaf at that position is removed. Removing a leaf deletes its
    parent node, and if that node has no siblings the deletion moves up to the
    highest ancestor that is still an only child, so no childless node is left
    behind.

    Sentences are skipped (no subtree is produced) when

    * nothing has to be removed, or
    * a removal would reach the root, i.e. the whole tree would be deleted.

    The root label of every produced copy is suffixed with ``--extra<n>``, where
    ``n`` is the position of the sentence in ``simplified_sentences``.

    :param simplified_sentences: iterable of word sequences aligned with the leaves
    :param full_tree: the tree to prune; it is converted to a ParentedTree and
                      never modified
    :return: list of pruned trees converted with ``BoTree``
    """
    parented_tree = ParentedTree(0, []).convert(full_tree)
    subtrees = []
    for n, sent in enumerate(simplified_sentences):
        # leaf indices to delete, right-to-left so that earlier indices stay valid
        to_del = list(
            reversed([num for num, word in enumerate(sent) if not word]))
        if not to_del:
            continue

        new_tree = parented_tree.copy(deep=True)
        new_tree.set_label(f"{new_tree.label()}--extra{n}")

        whole_tree_removed = False
        for num in to_del:
            # start at the parent of the leaf
            target = new_tree.leaf_treeposition(num)[:-1]
            # go up while the node is an only child, so that deleting it does not
            # leave an empty parent behind; stop at the root (position ``()``),
            # which has no siblings and would otherwise be examined forever
            while target and not (new_tree[target].left_sibling()
                                  or new_tree[target].right_sibling()):
                target = target[:-1]

            if not target:
                # the deletion would take the root itself
                whole_tree_removed = True
                break

            del new_tree[target]

        if whole_tree_removed:
            continue

        subtrees.append(BoTree(0, []).convert(new_tree))

    return subtrees