def __iterate(tree: ParentedTree, index: int = 1):
    """Walk ``tree`` depth-first and yield ``(tag, token)`` for every leaf.

    The tree is modified in place while it is walked:

    * A label containing ``'-'`` is cut back to the text before the first
      ``'-'`` (``NP-SBJ`` becomes ``NP``).  Labels that *start* with ``'-'``
      (for example ``-NONE-`` or ``-LRB-``) are left untouched, because
      stripping them would leave an empty tag.
    * For every node that ``_is_leaf`` accepts, the first child (the token)
      is replaced by a running number that starts at ``index``.

    Args:
        tree: The (sub)tree to process; it is mutated in place.
        index: The number given to the first leaf found under ``tree``.

    Yields:
        ``(tag, token)`` tuples in left-to-right order, where ``tag`` is the
        cleaned label of the leaf node and ``token`` its original first child.

    Note:
        The token of a leaf is overwritten only when the generator is
        resumed after yielding that leaf, so the generator has to be
        consumed completely for every token to be replaced.
    """
    # Clean the tags which contain '-', but never reduce a tag to ''.
    label = tree.label()
    if '-' in label:
        stripped = label.split('-')[0]
        if stripped:
            tree.set_label(stripped)

    if _is_leaf(tree):
        yield tree.label(), tree[0]  # (tag, token)
        # Replace the token with its index number.
        tree[0] = index
        return

    for subtree in tree:
        for item in __iterate(subtree, index):
            yield item
            # One more leaf has been emitted below this node, so the next
            # subtree has to start numbering one position later.
            index += 1
def __iterate(tree: ParentedTree):
    """Annotate every node under ``tree`` with the index of its head.

    Each label is rewritten in place from ``label`` to ``'<index>|<label>'``.
    For a node accepted by ``_is_leaf`` the index is its first child; for any
    other node it is the index already stored on the child picked as head.

    The head child is chosen with ``self.head_rules[label]``, a sequence of
    rules that are tried in order.  Each rule has:

    * ``'direction'``: ``'r'`` scans the children from right to left, any
      other value scans them from left to right;
    * ``'tags'``: tags tried in order, the first child carrying the first
      matching tag wins.  An empty ``'tags'`` selects the first child in
      scan direction unconditionally.

    If the label has no entry in ``self.head_rules``, or none of its rules
    selects a child, the last child is used as the head so that every node
    always ends up annotated.

    Args:
        tree: The (sub)tree to annotate; it is mutated in place.
    """
    label = tree.label()
    if _is_leaf(tree):
        tree.set_label('{}|{}'.format(tree[0], label))
        return

    # Children first: their labels must already read 'index|tag'.
    for subtree in tree:
        __iterate(subtree)

    # The children do not change while the rules are evaluated, so split
    # their labels only once.  maxsplit=1 keeps a '|' inside a tag intact.
    sub_labels = [child.label().split('|', 1) for child in tree]

    if label in self.head_rules:
        for rule in self.head_rules[label]:
            if rule['direction'] == 'r':
                candidates = sub_labels[::-1]  # scan right to left
            else:
                candidates = sub_labels

            # Catch-all rule: just select the first child in scan direction.
            if not rule['tags']:
                tree.set_label('{}|{}'.format(candidates[0][0], label))
                return

            for tag in rule['tags']:
                for child_index, child_tag in candidates:
                    if child_tag == tag:
                        tree.set_label('{}|{}'.format(child_index, label))
                        return

    # Either the tag is not covered by the head rules or no rule matched:
    # just select the last child as the head.
    tree.set_label('{}|{}'.format(sub_labels[-1][0], label))