def __iterate(tree: ParentedTree, index: int = 1):
    """Yield ``(tag, token)`` for each leaf and replace the token by its index.

    The tree is walked depth-first, left to right.  Every visited node whose
    label contains ``'-'`` has the label cut at the first ``'-'``.  Labels that
    *start* with ``'-'`` (e.g. ``-LRB-``, ``-RRB-``, ``-NONE-``) are left
    untouched, because cutting them would produce an empty label.

    For a leaf node (as decided by ``_is_leaf``, defined outside this block)
    the pair ``(label, tree[0])`` is yielded; once the consumer resumes the
    generator, ``tree[0]`` is overwritten with the running index.

    Args:
        tree: Node to process; labels and leaf tokens are modified in place.
        index: Number assigned to the first leaf found below ``tree``.

    Yields:
        ``(tag, token)`` tuples in left-to-right leaf order.
    """
    # clean the tags which contain '-'
    label = tree.label()
    if '-' in label:
        stem = label.split('-')[0]
        # An empty stem means the label begins with '-'; keep it as is.
        if stem:
            tree.set_label(stem)

    if _is_leaf(tree):
        yield tree.label(), tree[0]  # (tag, token)
        tree[0] = index  # replace the token with its index number
        return

    for subtree in tree:
        for item in __iterate(subtree, index):
            yield item
            # One leaf was numbered per yielded item, so the next leaf
            # (in this or a following subtree) gets the next number.
            index += 1
def __iterate(tree: ParentedTree):
    """Prefix every node label with the index of its head, in place.

    After the call each label in ``tree`` has the form ``'<index>|<label>'``.
    For a leaf node (as decided by ``_is_leaf``, defined outside this block)
    the index is ``tree[0]``.  For any other node the children are processed
    first and the index is copied from the child chosen as head:

    * if the label has no entry in ``self.head_rules``, the last child;
    * otherwise the rules for the label are tried in order.  Each rule is a
      mapping with a ``'direction'`` (``'r'`` scans the children right to
      left, anything else left to right) and a ``'tags'`` sequence.  A rule
      with no tags selects the first child in scan order.  Otherwise the
      first tag that occurs among the children selects the first child in
      scan order carrying that tag.

    If no rule selects a head, the label is left unchanged.

    NOTE(review): ``self`` is not a parameter here; it is resolved from the
    enclosing scope, so this function is presumably nested inside a method.
    """
    label = tree.label()

    if _is_leaf(tree):
        tree.set_label('{}|{}'.format(tree[0], label))
        return

    for subtree in tree:
        __iterate(subtree)

    # just select the last one as the head if the tag is not covered by the
    # head rules
    if label not in self.head_rules:
        head = tree[-1].label().split('|')[0]
        tree.set_label('{}|{}'.format(head, label))
        return

    # Child labels do not change while the rules are tried, so parse them
    # once.  maxsplit=1 keeps a child label that itself contains '|' in one
    # piece instead of breaking the (index, tag) unpacking below.
    left_to_right = [child.label().split('|', 1) for child in tree]
    right_to_left = left_to_right[::-1]

    for rule in self.head_rules[label]:
        candidates = right_to_left if rule['direction'] == 'r' else left_to_right

        # this is the last rule, just select the first or last one as the head
        if not rule['tags']:
            tree.set_label('{}|{}'.format(candidates[0][0], label))
            return

        for tag in rule['tags']:
            # Indices come from str.split and are never None, so None can
            # safely mark "no child carries this tag".
            head = next(
                (idx for idx, child_tag in candidates if child_tag == tag),
                None,
            )
            if head is not None:
                tree.set_label('{}|{}'.format(head, label))
                return
def _find_special_nodes(self, node: ParentedTree):
    """Classify ``node`` and record the positions of qualifying nodes.

    A node qualifies when its label is in ``self.look_for`` and neither the
    node itself nor any node below it has a label in ``self.avoid``.  The
    tree position of every qualifying node is added to
    ``self.special_labels``.

    IMP assumption: the input is a tree, so visited nodes do not have to be
    tracked (each node has only one parent).

    Args:
        node: A tree node, or a string leaf.

    Returns:
        ``0`` if the node or any descendant has a label in ``self.avoid``;
        ``1`` if the node qualifies (its position has been recorded);
        ``2`` otherwise, including for string leaves.
    """
    # isinstance (rather than an exact type check) also accepts str
    # subclasses as leaves.
    if isinstance(node, str):
        return 2

    label = node.label()

    # Every child is visited, with no short-circuiting, because the
    # recursion records qualifying descendants as a side effect even when
    # this node ends up returning 0.
    child_codes = {self._find_special_nodes(child) for child in node}

    if label in self.avoid or 0 in child_codes:
        return 0

    if label in self.look_for:
        self.special_labels.add(node.treeposition())
        return 1

    return 2
def __iterate(tree: ParentedTree):
    """Yield ``(index, head_index)`` pairs for ``tree`` and everything below.

    The index of a node is the part of its label before the first ``'|'``.
    A node without a parent yields ``(index, 0)`` first.  Then, unless the
    node is a leaf (as decided by ``_is_leaf``, defined outside this block),
    each child whose index differs from this node's index yields
    ``(child_index, node_index)``, followed by the pairs produced by that
    child.
    """
    # index carried by the current node
    own_index = tree.label().split('|')[0]

    # the root has no parent: emit an index -> 0 relation for it
    if not tree.parent():
        yield own_index, 0

    if _is_leaf(tree):
        return

    for child in tree:
        child_index = child.label().split('|')[0]
        if child_index != own_index:
            yield child_index, own_index
        yield from __iterate(child)
def traverse_and_store(self, tree: ParentedTree, parse_tree_stored: List[Dict]):
    """Append a phrase record for ``tree`` and for each of its sub-trees.

    Each leaf string is split on ``'_'``: the first piece is used as the
    word and the last piece is parsed as an integer index.
    NOTE(review): leaves presumably look like ``'<word>_<index>'`` - confirm
    against the caller.

    A record is appended for a node when its height is greater than
    ``self.TREE_HEIGHT`` and its number of leaves is less than
    ``self.NGRAM_LIMIT``.  Children that are ``ParentedTree`` instances
    (subclasses included) are visited recursively whether or not a record
    was stored for their parent.

    Args:
        tree: Node to inspect.
        parse_tree_stored: List that receives the records; extended in place.

    Returns:
        The same ``parse_tree_stored`` list that was passed in.

    Raises:
        ValueError: If the last ``'_'``-separated piece of a leaf is not an
            integer.
    """
    label = tree.label()

    # Collect the leaves once and reuse them for both words and indices.
    leaves = tree.leaves()
    words = [leaf.split('_')[0] for leaf in leaves]
    indices = [int(leaf.split('_')[-1]) for leaf in leaves]
    ngram_info = len(words)

    if tree.height() > self.TREE_HEIGHT and ngram_info < self.NGRAM_LIMIT:
        parse_tree_stored.append({
            'phrase_label': label,
            'phrase': " ".join(words),
            'ngram': ngram_info,
            'indices': indices,
        })

    for subtree in tree:
        # Non-tree children (leaf strings) are skipped.
        if isinstance(subtree, ParentedTree):
            self.traverse_and_store(tree=subtree,
                                    parse_tree_stored=parse_tree_stored)

    return parse_tree_stored
def IsAnArray(tree:ParentedTree):
    """Return True when the label of ``tree`` is exactly ``'FiguresArray'``."""
    node_label = tree.label()
    return node_label == 'FiguresArray'
def IsConcatenated(tree:ParentedTree):
    """Return True when the label of ``tree`` is exactly ``'Concatenated'``."""
    node_label = tree.label()
    return node_label == 'Concatenated'