def add_node(tree, span, label, position=0, in_place=True):
    '''Introduce a new node with the given label and span into the tree.

    Position indicates what to do when a node already exists with the same
    span.  Zero indicates above any current nodes, one indicates beneath the
    first, and so on.

    The operation is applied to the root of `tree`; when `in_place` is false
    the root is cloned first and the clone is modified instead.

    Returns a pair.  On success it is (True, (root, new_node)).  On failure it
    is (False, message), where message describes why the node could not be
    added; failures detected here leave the tree unmodified.
    '''
    tree = tree.root()
    if not in_place:
        tree = tree.clone()

    # Find the node(s) that should be within the new span
    nodes = tree.get_spanning_nodes(*span)

    # Do not operate on the root node, work with its children instead
    if nodes and nodes[0].parent is None:
        nodes = nodes[0].subtrees[:]

    # Without any nodes there is nothing to wrap, report it rather than
    # failing with an IndexError further down
    if not nodes:
        return (False,
                "No nodes found within the span ({} - {})".format(*span))

    # Walk down the chain of single nodes to reach the requested depth
    for _ in range(position):
        # Going deeper is only possible through a single node that has
        # at least one child
        if len(nodes) > 1 or not nodes[0].subtrees:
            return (False, "Position {} is too deep".format(position))
        nodes[0] = nodes[0].subtrees[0]
    # Order by span, so nodes[0] marks where the new node is inserted
    nodes.sort(key=lambda x: x.span)

    # Check that all of the nodes are at the same level
    parent = nodes[0].parent
    for node in nodes:
        if parent != node.parent:
            return (False,
                    "The span ({} - {}) would cross brackets".format(*span))

    # Create the node, placing it where the first covered node currently is
    nnode = pstree.PSTree(None, label, span, parent)
    insert_at = parent.subtrees.index(nodes[0])
    parent.subtrees.insert(insert_at, nnode)

    # Move the subtrees beneath the new node
    for node in nodes:
        node.parent.subtrees.remove(node)
        nnode.subtrees.append(node)
        node.parent = nnode

    return (True, (tree, nnode))
def construct_node(node, tree, ner_raw_data, head_raw_data, text_raw_data,
                   character_to_index, word_to_index, pos_to_index,
                   index_to_lexicon, pos_count, ne_count, pos_ne_count,
                   lexicon_hits, span_to_node):
    '''Fill in `node` from `tree` and recursively build nodes for its children.

    Attributes set on `node`: pos_index, word_split, word_index, head_split,
    head_index, ne, span and lexicon_hit.

    Inputs that are read:
      ner_raw_data        mapping from span to entity label; spans that are
                          missing are labelled "NONE"
      head_raw_data       mapping from (span, label) to a sequence whose item
                          at index 1 is the head word; consulted when `tree`
                          has no `head` attribute
      text_raw_data       sequence of tokens, sliced with the span
      character_to_index, word_to_index, pos_to_index
                          lookup tables from character / word / label to index
      index_to_lexicon    mapping from an index to a lexicon supporting `in`;
                          each index is used as a position in a list with
                          len(index_to_lexicon) entries

    Inputs that are updated in place:
      pos_count, ne_count, pos_ne_count   counters keyed by label
      lexicon_hits        list, item 0 is incremented when the constituent
                          is found in at least one lexicon
      span_to_node        mapping from span to the node built for it
      tree                when it has more than two children, its `subtrees`
                          list is replaced by a binarized pair

    Returns the number of nodes constructed for this subtree, including
    `node` itself.
    '''
    pos = tree.label
    word = tree.word
    span = tree.span
    if hasattr(tree, "head"):
        head = tree.head
    else:
        head = head_raw_data[(span, pos)][1]
    ne = ner_raw_data.get(span, "NONE")
    constituent = " ".join(text_raw_data[span[0]:span[1]]).lower()

    # Process pos info
    node.pos_index = pos_to_index[pos]
    pos_count[pos] += 1

    # Process word info, nodes without a word get an empty split and index -1
    if word:
        node.word_split = [character_to_index[character] for character in word]
        node.word_index = word_to_index[word]
    else:
        node.word_split = []
        node.word_index = -1

    # Process head info
    node.head_split = [character_to_index[character] for character in head]
    node.head_index = word_to_index[head]

    # Process ne info
    node.ne = ne
    if ne != "NONE":
        # An entity is counted once per span, so it is skipped when the
        # parent covers exactly the same span
        if not node.parent or node.parent.span != span:
            ne_count[ne] += 1
        # NOTE(review): counted for every entity-labelled node, including
        # ones sharing a span with their parent - confirm this nesting
        # against the original layout
        pos_ne_count[pos] += 1

    # Process span info
    node.span = span
    span_to_node[span] = node

    # Process lexicon info, one flag per lexicon
    node.lexicon_hit = [0] * len(index_to_lexicon)
    hits = 0
    for index, lexicon in index_to_lexicon.items():
        if constituent in lexicon:
            node.lexicon_hit[index] = 1
            hits = 1
    lexicon_hits[0] += hits

    # Binarize children
    children = tree.subtrees
    if len(children) > 2:
        last_child = children[-1]
        last_child_head = head_raw_data[
            (last_child.span, last_child.label)][1]
        # When the last child does not carry this constituent's head, group
        # everything to its left, otherwise group everything after the
        # first child
        last_differs = last_child_head != head
        grouped = children[:-1] if last_differs else children[1:]
        new_span = (grouped[0].span[0], grouped[-1].span[1])
        new_tree = pstree.PSTree(label=pos, span=new_span, subtrees=grouped)
        # The intermediate tree keeps the head, so it is not looked up in
        # head_raw_data when it is processed
        new_tree.head = head
        if last_differs:
            tree.subtrees = [new_tree, last_child]
        else:
            tree.subtrees = [children[0], new_tree]

    # Process children
    nodes = 1
    for subtree in tree.subtrees:
        child = Node()
        node.add_child(child)
        nodes += construct_node(
            child, subtree, ner_raw_data, head_raw_data, text_raw_data,
            character_to_index, word_to_index, pos_to_index,
            index_to_lexicon, pos_count, ne_count, pos_ne_count,
            lexicon_hits, span_to_node)
    return nodes