def pennconverter_find_heads(tree, head_map=None): if head_map is None: head_map = {} tree = treebanks.remove_coindexation(tree, False) for subtree in tree.subtrees: pennconverter_find_heads(subtree, head_map) if log: print "Head for", tree.span, tree.label # A word is it's own head if tree.word is not None: head = (tree.span, tree.word, tree.label) add_head(head_map, tree, head) return head_map # First handle conjunctions coord = pennconverter_is_coord(tree) if coord: if not add_if_match(tree, {'CC', 'CONJP'}, head_map, True): if not add_if_match(tree, {',', ':'}, head_map, True): add_head(head_map, tree, get_head(head_map, tree.subtrees[-1])) return head_map # If the label for this node is not in the table we are either at the bottom, # at an NP, or have an error base_label = treebanks.split_label_type_and_function(tree.label)[0] if base_label not in pennconverter_mapping_table: if base_label in ['NP', 'NML']: collins_NP(tree, head_map) elif base_label in ['PP', 'WHPP']: pennconverter_PP(tree, head_map) else: add_head(head_map, tree, get_head(head_map, tree.subtrees[-1])) return head_map # Look through and take the first/last occurrence that matches info = pennconverter_mapping_table[base_label] for label in info[1]: for i in xrange(len(tree.subtrees)): if info[0] == 'right': i = len(tree.subtrees) - i - 1 subtree = tree.subtrees[i] if isinstance(label, str): if subtree.label == label: add_head(head_map, tree, get_head(head_map, subtree)) return head_map else: if re.match(label, subtree.label) is not None: add_head(head_map, tree, get_head(head_map, subtree)) return head_map # Final fallback if info[0] == 'left': add_head(head_map, tree, get_head(head_map, tree.subtrees[0])) else: add_head(head_map, tree, get_head(head_map, tree.subtrees[-1])) return head_map
def get_signature(head_map, tree):
    """Return the key under which ``tree`` is stored in ``head_map``.

    Three keys are tried in order, and the first one present in
    ``head_map`` is returned:
      1. (span, label)
      2. (wordspan, label with coindexation removed)
      3. (wordspan, first element of the split label)

    Returns None when none of the keys is present.
    """
    def candidates():
        # Generated lazily so the later, costlier keys are only built when
        # the earlier ones miss.
        yield (tree.span, tree.label)
        yield (tree.wordspan,
               treebanks.remove_coindexation(tree, False).label)
        yield (tree.wordspan,
               treebanks.split_label_type_and_function(tree.label)[0])

    for key in candidates():
        if key in head_map:
            return key
    return None
def get_signature(head_map, tree):
    """Return the key under which ``tree`` is stored in ``head_map``.

    Keys are tried in this order, returning the first that is present:
      1. (span, label)
      2. (wordspan, label with coindexation removed)
      3. (wordspan, label passed through ``without_func``)

    Returns None when no key is present.
    """
    # Each key is wrapped in a callable so it is only computed if every
    # earlier key was missing from head_map.
    key_builders = (
        lambda: (tree.span, tree.label),
        lambda: (tree.wordspan,
                 treebanks.remove_coindexation(tree, False).label),
        lambda: (tree.wordspan, without_func(tree.label)),
    )
    for build in key_builders:
        key = build()
        if key in head_map:
            return key
    return None
def get_head(head_map, tree, amend_for_trace=False):
    """Look up the head recorded for ``tree`` in ``head_map``.

    Keys are tried in order and the value of the first one present is
    returned:
      1. (span, label) - skipped when ``amend_for_trace`` is true
      2. (wordspan, label with coindexation removed)
      3. (wordspan, first element of the split label)

    Returns None when no key is present.
    """
    def candidates():
        # Lazy, so later keys are only built once the earlier ones miss.
        if not amend_for_trace:
            yield (tree.span, tree.label)
        yield (tree.wordspan,
               treebanks.remove_coindexation(tree, False).label)
        yield (tree.wordspan,
               treebanks.split_label_type_and_function(tree.label)[0])

    for key in candidates():
        if key in head_map:
            return head_map[key]
    return None
def get_head(head_map, tree, amend_for_trace=False):
    """Look up the head recorded for ``tree`` in ``head_map``.

    Keys are tried in this order, returning the value stored under the
    first one that is present:
      1. (span, label) - skipped when ``amend_for_trace`` is true
      2. (wordspan, label with coindexation removed)
      3. (wordspan, label passed through ``without_func``)

    Returns None when no key is present.
    """
    # Keys are wrapped in callables so each is only computed after all
    # earlier keys have missed.
    key_builders = []
    if not amend_for_trace:
        key_builders.append(lambda: (tree.span, tree.label))
    key_builders.append(
        lambda: (tree.wordspan,
                 treebanks.remove_coindexation(tree, False).label))
    key_builders.append(
        lambda: (tree.wordspan, without_func(tree.label)))

    for build in key_builders:
        key = build()
        if key in head_map:
            return head_map[key]
    return None
def text_tree(tree, single_line=True, show_traces=False, depth=0, dense=True,
              newline=True, match_ptb=False, prev_and=False):
    """Render ``tree`` as a bracketed string.

    Arguments:
      tree - node to render; uses its label, word, subtrees and (only when
             match_ptb is set) parent attributes
      single_line - put everything on one line, children separated by a
             single space
      show_traces - when false, traces and coindexation are removed before
             rendering; recursive calls always pass True, so the removal
             happens once at the top
      depth - nesting level of this node, used for indentation in
             multi-line output
      dense - in multi-line output, keep nodes that carry a word on the
             current line where possible; also puts a space before the
             closing bracket of a node whose last child carries a word
      newline - recursion state: forces this node onto a new line in
             multi-line output
      match_ptb - render a ROOT label as "( ", indent with one space per
             level instead of tabs, do not break the line before children
             of ROOT, and start a new line for bracket tokens, 'and', and
             the word following 'and'
      prev_and - recursion state: the previous sibling's word was 'and'

    A node with neither a word nor children is rendered as "(LABEL)".
    """
    if not show_traces:
        tree = treebanks.remove_traces(tree, False)
        tree = treebanks.remove_coindexation(tree, False)

    # Work out what separates this node from whatever precedes it.
    ans = ''
    if not single_line and depth > 0:
        starts_line = (
            newline or (not dense) or tree.word is None or (
                match_ptb and (prev_and or tree.word in {
                    'and', '-RRB-', '-RCB-', '-RSB-',
                    '-LRB-', '-LCB-', '-LSB-'
                })))
        if starts_line:
            if match_ptb:
                # Children of ROOT stay on the same line as "( ".
                if tree.parent is None or tree.parent.label != 'ROOT':
                    ans = '\n' + depth * ' '
            else:
                ans = '\n' + depth * '\t'
        else:
            ans = ' '

    if match_ptb and tree.label == 'ROOT':
        ans += "( "
    else:
        ans += '(' + tree.label

    if tree.word is not None:
        ans += ' ' + tree.word
        newline = True
    else:
        newline = False

    prev_and = False
    for subtree in tree.subtrees:
        if single_line:
            ans += ' '
        ans += text_tree(subtree, single_line, True, depth + 1, dense,
                         newline, match_ptb, prev_and)
        prev_and = (subtree.word == 'and')
        # A non-terminal sibling pushes the next sibling onto a new line.
        newline = subtree.word is None

    # Dense output puts a space before the bracket that closes a node whose
    # last child carries a word.  The emptiness check keeps a childless,
    # wordless node from raising IndexError on subtrees[-1].
    if (tree.word is None and dense and tree.subtrees and
            tree.subtrees[-1].word is not None and ans[-1] != ' '):
        ans += ' '
    ans += ')'
    return ans
def find_heads(tree, style, head_map=None): mapping = pennconverter_mapping_table if style == 'collins': mapping = collins_mapping_table elif 'jkk' in style: mapping = jkk_mapping_table if head_map is None: head_map = {} tree = treebanks.remove_coindexation(tree, False) for subtree in tree.subtrees: find_heads(subtree, style, head_map) if log: print "Head for", tree.span, tree.label # A word is it's own head if tree.word is not None: head = (tree.span, tree.word, tree.label) add_head(head_map, tree, head) return head_map # First handle conjunctions coord = pennconverter_is_coord(tree) if coord and style == 'pennconverter': if not add_if_match(tree, {'CC', 'CONJP'}, head_map, True): if not add_if_match(tree, {',', ':'}, head_map, True): add_head(head_map, tree, get_head(head_map, tree.subtrees[-1])) return head_map collins_coord = False for subtree in tree.subtrees: if subtree.label.startswith('CC'): if len(subtree.label) > 2: collins_coord = True elif 'jkk' in style and len(tree.subtrees) > 2: collins_coord = True if len(tree.subtrees) > 2 and without_func(tree.subtrees[0].label) == without_func(tree.label) and tree.subtrees[1].label == 'CC': collins_coord = True flat_NP = False if tree.label == "NP": flat_NP = True for subtree in tree.subtrees: if len(subtree.subtrees) != 0: flat_NP = False if (not flat_NP) and collins_coord and (style == 'collins' or 'jkk' in style): if log: print "doing coord special case" # Options: # 0 - First non-punct (collins) # 1 - First conjunction # 2 - First non-punct non-conjunction # 3 - Last non-punct # 4 - Last conjunction # 5 - Last non-punct non-conjunction for i in xrange(len(tree.subtrees)): subtree = tree.subtrees[i] if style == 'collins' or style == 'jkk' or style[-1] == '0': if not subtree.is_punct(): if log: print "Match backup" add_head(head_map, tree, get_head(head_map, subtree)) return head_map elif style[-1] == '1': if subtree.is_conjunction(): add_head(head_map, tree, get_head(head_map, subtree)) return head_map elif 
style[-1] == '2': if not (subtree.is_conjunction() or subtree.is_punct()): add_head(head_map, tree, get_head(head_map, subtree)) return head_map subtree = tree.subtrees[len(tree.subtrees) - i - 1] if style[-1] == '3': if not subtree.is_punct(): add_head(head_map, tree, get_head(head_map, subtree)) return head_map elif style[-1] == '4': if subtree.is_conjunction(): add_head(head_map, tree, get_head(head_map, subtree)) return head_map elif style[-1] == '5': if not (subtree.is_conjunction() or subtree.is_punct()): add_head(head_map, tree, get_head(head_map, subtree)) return head_map if log: print "coord special case didn't find a head" # If the label for this node is not in the table we are either at the bottom, # at an NP, or have an error base_label = without_func(tree.label) if base_label not in mapping or base_label in ['NP', 'NML']: if base_label in ['NP', 'NML']: if log: print "doing collins NP" collins_NP(tree, head_map) elif base_label in ['PP', 'WHPP']: pennconverter_PP(tree, head_map) else: add_head(head_map, tree, get_head(head_map, tree.subtrees[-1])) return head_map # Look through and take the first/last occurrence that matches info = mapping[base_label] for label in info[1]: for i in xrange(len(tree.subtrees)): if info[0] == 'right': i = len(tree.subtrees) - i - 1 subtree = tree.subtrees[i] if isinstance(label, str): ### if subtree.label == label: ### if log: print "Match add 1" ### add_head(head_map, tree, get_head(head_map, subtree)) ### return head_map if label in special_cases: if subtree.word in special_cases[label]: if log: print "Match add 1a" add_head(head_map, tree, get_head(head_map, subtree)) return head_map elif without_func(subtree.label) == label: if 'aux' not in info[1] or subtree.word not in special_cases['aux']: if log: print "Match add 1" add_head(head_map, tree, get_head(head_map, subtree)) return head_map else: if re.match(label, without_func(subtree.label)) is not None: if log: print "Match add 2" add_head(head_map, tree, 
get_head(head_map, subtree)) return head_map # Fallback, no punct for i in xrange(len(tree.subtrees)): if info[0] == 'right': i = len(tree.subtrees) - i - 1 subtree = tree.subtrees[i] if not subtree.is_punct(): if not (len(subtree.label) > 2 and subtree.label.startswith('CC')): if log: print "Match backup" add_head(head_map, tree, get_head(head_map, subtree)) return head_map # Final fallback if info[0] == 'left': if log: print "Fallback add 1" add_head(head_map, tree, get_head(head_map, tree.subtrees[0])) else: if log: print "Fallback add 2" add_head(head_map, tree, get_head(head_map, tree.subtrees[-1])) return head_map