def _preprosess(root: ParentedTree):
    """
    Normalizes a lexical tree and extracts its (tag, token) sequence.

    The work is done on a deep copy of ``root``; the argument itself is left
    untouched. Nodes are visited depth-first in child order:

    * every label containing ``'-'`` is cut down to the part before the first
      ``'-'`` (a label that starts with ``'-'`` therefore becomes ``''``);
    * for every node accepted by ``_is_leaf``, the first child (the token) is
      recorded and then overwritten with the 1-based position of that leaf.

    :param root: the root of the lexical tree
    :return: a tuple of the processed tree copy and a list of (tag, token)
    """
    processed: ParentedTree = root.copy(deep=True)

    # e.g. [('NR', '上海'), ('NR', '浦东'), ('NN', '开发'), ('CC', '与'), ...]
    pairs = []

    def _visit(node: ParentedTree):
        # strip the functional suffix from tags such as 'NP-SBJ'
        tag = node.label()
        if '-' in tag:
            node.set_label(tag.split('-')[0])

        if _is_leaf(node):
            pairs.append((node.label(), node[0]))  # (tag, token)
            # the leaf just recorded is number len(pairs), counting from 1
            node[0] = len(pairs)
            return

        for child in node:
            _visit(child)

    _visit(processed)
    return processed, pairs
def __mark_heads(self, root: ParentedTree):
    """
    Marks the head of each phrase.

    Works on a deep copy of ``root`` and rewrites every node label to
    ``'<head>|<label>'``. For a node accepted by ``_is_leaf``, ``<head>`` is
    the node's first child (presumably the token index written by the
    preprocessing step -- confirm against the caller). For a phrase node,
    ``<head>`` is copied from the child selected by ``self.head_rules``.

    ``self.head_rules`` is read as a mapping from a phrase label to a sequence
    of rules that are tried in order. Each rule is a mapping with two keys:

    * ``'direction'``: ``'r'`` scans the children right-to-left, any other
      value scans them left-to-right;
    * ``'tags'``: candidate child tags in priority order; an empty value
      selects the first child in scan order unconditionally.

    If the label has no entry in ``self.head_rules``, or none of its rules
    matches any child, the last child is used as the head, so every visited
    node always ends up with a ``'<head>|'`` prefix.

    :param root: a preprocessed phrase tree; it is copied, not modified.
    :return: a phrase tree with head labels
    :raises IndexError: if a phrase node has no children
    """
    marked: ParentedTree = root.copy(deep=True)

    def _select_head(children, rules):
        """Return the head of the first child matched by ``rules``, else None."""
        for rule in rules:
            ordered = children[::-1] if rule['direction'] == 'r' else children
            tags = rule['tags']
            # a rule without tags is a catch-all: take the first child in scan order
            if not tags:
                return ordered[0][0]
            # tags are tried in priority order; within a tag the scan order wins
            for tag in tags:
                for head, child_tag in ordered:
                    if child_tag == tag:
                        return head
        return None

    def _annotate(node: ParentedTree):
        label = node.label()
        if _is_leaf(node):
            node.set_label('{}|{}'.format(node[0], label))
            return

        # children first, so that their labels already carry a head
        for child in node:
            _annotate(child)

        # [head, tag] of every child; split once so a '|' inside a tag is kept
        children = [child.label().split('|', 1) for child in node]

        head = None
        if label in self.head_rules:
            head = _select_head(children, self.head_rules[label])
        # uncovered label, or no rule matched: fall back to the last child
        if head is None:
            head = children[-1][0]
        node.set_label('{}|{}'.format(head, label))

    _annotate(marked)
    return marked
def generate_subtrees(simplified_sentences, full_tree):
    """
    Builds one pruned copy of ``full_tree`` for each simplified sentence.

    Each sentence is read as a sequence aligned with the leaves of
    ``full_tree``: a falsy entry at position ``i`` means that leaf ``i`` is
    removed. Removing a leaf deletes its parent node and, going upwards,
    every ancestor that would be left without siblings.

    A sentence is skipped (no subtree is produced for it) when

    * it has no falsy entry, i.e. nothing would be removed, or
    * the removal would reach the root, i.e. nothing would be left.

    The root label of every produced tree gets the suffix ``--extra<n>``,
    where ``<n>`` is the position of the sentence in ``simplified_sentences``.

    :param simplified_sentences: iterable of leaf-aligned sequences
    :param full_tree: the tree to prune; it is converted, not modified
    :return: a list of ``BoTree`` objects, one per sentence that was not skipped
    """
    parented_tree = ParentedTree(0, []).convert(full_tree)
    subtrees = []
    for n, sent in enumerate(simplified_sentences):
        removable = [num for num, word in enumerate(sent) if not word]
        if not removable:
            continue

        # copy only once we know there is something to prune
        new_tree = parented_tree.copy(deep=True)
        new_tree.set_label(f"{new_tree.label()}--extra{n}")

        # delete from the right so the indices of the remaining leaves stay valid
        for num in reversed(removable):
            # start at the node directly above the leaf
            position = new_tree.leaf_treeposition(num)[:-1]
            # go up until the node has a sibling; stop at the root (empty
            # position), which has none and can be neither climbed nor deleted
            while position:
                node = new_tree[position]
                if node.left_sibling() or node.right_sibling():
                    break
                position = position[:-1]
            if not position:
                # the whole tree would be removed: nothing left to emit
                break
            del new_tree[position]
        else:
            # reached only when every removal succeeded (no ``break`` above)
            subtrees.append(BoTree(0, []).convert(new_tree))
    return subtrees