コード例 #1
0
ファイル: head_finder.py プロジェクト: jkkummerfeld/nlp-util
def pennconverter_find_heads(tree, head_map=None):
  if head_map is None:
    head_map = {}
    tree = treebanks.remove_coindexation(tree, False)
  for subtree in tree.subtrees:
    pennconverter_find_heads(subtree, head_map)

  if log: print "Head for", tree.span, tree.label

  # A word is it's own head
  if tree.word is not None:
    head = (tree.span, tree.word, tree.label)
    add_head(head_map, tree, head)
    return head_map

  # First handle conjunctions
  coord = pennconverter_is_coord(tree)
  if coord:
    if not add_if_match(tree, {'CC', 'CONJP'}, head_map, True):
      if not add_if_match(tree, {',', ':'}, head_map, True):
        add_head(head_map, tree, get_head(head_map, tree.subtrees[-1]))
    return head_map
  
  # If the label for this node is not in the table we are either at the bottom,
  # at an NP, or have an error
  base_label = treebanks.split_label_type_and_function(tree.label)[0]
  if base_label not in pennconverter_mapping_table:
    if base_label in ['NP', 'NML']:
      collins_NP(tree, head_map)
    elif base_label in ['PP', 'WHPP']:
      pennconverter_PP(tree, head_map)
    else:
      add_head(head_map, tree, get_head(head_map, tree.subtrees[-1]))
    return head_map
  
  # Look through and take the first/last occurrence that matches
  info = pennconverter_mapping_table[base_label]
  for label in info[1]:
    for i in xrange(len(tree.subtrees)):
      if info[0] == 'right':
        i = len(tree.subtrees) - i - 1
      subtree = tree.subtrees[i]
      if isinstance(label, str):
        if subtree.label == label:
          add_head(head_map, tree, get_head(head_map, subtree))
          return head_map
      else:
        if re.match(label, subtree.label) is not None:
          add_head(head_map, tree, get_head(head_map, subtree))
          return head_map

  # Final fallback
  if info[0] == 'left':
    add_head(head_map, tree, get_head(head_map, tree.subtrees[0]))
  else:
    add_head(head_map, tree, get_head(head_map, tree.subtrees[-1]))

  return head_map
コード例 #2
0
ファイル: head_finder.py プロジェクト: jkkummerfeld/nlp-util
def get_signature(head_map, tree):
  """Return the key under which `tree` is stored in `head_map`, or None.

  Three keys are tried in order, each built only if the previous one is
  missing from the map:
    1. (tree.span, tree.label)
    2. (tree.wordspan, label of treebanks.remove_coindexation(tree, False))
    3. (tree.wordspan, first element of
        treebanks.split_label_type_and_function(tree.label))
  """
  key = (tree.span, tree.label)
  if key not in head_map:
    key = (tree.wordspan, treebanks.remove_coindexation(tree, False).label)
  if key not in head_map:
    key = (tree.wordspan, treebanks.split_label_type_and_function(tree.label)[0])
  return key if key in head_map else None
コード例 #3
0
def get_signature(head_map, tree):
  """Find which of the known key forms for `tree` is present in `head_map`.

  Candidate keys, most specific first:
    - (tree.span, tree.label)
    - (tree.wordspan, label of treebanks.remove_coindexation(tree, False))
    - (tree.wordspan, without_func(tree.label))

  Returns the first candidate found in the map, or None if none is present.
  """
  # Thunks keep the later, more expensive keys from being built unless the
  # earlier ones miss.
  candidates = (
    lambda: (tree.span, tree.label),
    lambda: (tree.wordspan, treebanks.remove_coindexation(tree, False).label),
    lambda: (tree.wordspan, without_func(tree.label)),
  )
  for build in candidates:
    key = build()
    if key in head_map:
      return key
  return None
コード例 #4
0
ファイル: head_finder.py プロジェクト: jkkummerfeld/nlp-util
def get_head(head_map, tree, amend_for_trace=False):
  """Look up the head stored for `tree` in `head_map`.

  Keys are tried from most to least specific and the value of the first
  key present in the map is returned (None if no key is present):
    1. (tree.span, tree.label) -- skipped when `amend_for_trace` is true
    2. (tree.wordspan, label of treebanks.remove_coindexation(tree, False))
    3. (tree.wordspan, first element of
        treebanks.split_label_type_and_function(tree.label))
  """
  def candidate_keys():
    # Generated lazily so later keys are only built after earlier misses.
    if not amend_for_trace:
      yield (tree.span, tree.label)
    yield (tree.wordspan, treebanks.remove_coindexation(tree, False).label)
    yield (tree.wordspan, treebanks.split_label_type_and_function(tree.label)[0])

  for key in candidate_keys():
    if key in head_map:
      return head_map[key]
  return None
コード例 #5
0
def get_head(head_map, tree, amend_for_trace=False):
  """Return the head recorded for `tree`, or None when nothing is recorded.

  The lookup falls back through progressively less specific keys:
  the exact (span, label) pair (only when `amend_for_trace` is false), then
  (wordspan, label after treebanks.remove_coindexation), then
  (wordspan, without_func(label)).
  """
  if not amend_for_trace:
    exact_key = (tree.span, tree.label)
    if exact_key in head_map:
      return head_map[exact_key]

  unindexed_key = (tree.wordspan,
                   treebanks.remove_coindexation(tree, False).label)
  if unindexed_key in head_map:
    return head_map[unindexed_key]

  bare_key = (tree.wordspan, without_func(tree.label))
  return head_map[bare_key] if bare_key in head_map else None
コード例 #6
0
def text_tree(tree,
              single_line=True,
              show_traces=False,
              depth=0,
              dense=True,
              newline=True,
              match_ptb=False,
              prev_and=False):
    """Render `tree` as a bracketed string.

    Parameters:
        tree: node exposing `label`, `word`, `subtrees` (and `parent` when
            `match_ptb` is used with multi-line output).
        single_line: when true, everything is emitted on one line with a
            single space before each child.
        show_traces: when false, the tree is first passed through
            treebanks.remove_traces() and treebanks.remove_coindexation().
            Recursive calls always pass True, so this happens once.
        depth: nesting level of `tree`; controls indentation.
        dense: in multi-line mode, keep a word-bearing node on the current
            line when the previous sibling also carried a word; also adds a
            space before the closing bracket after a word-bearing last child.
        newline: internal flag passed down by the parent.
        match_ptb: use two-space indents, print ROOT as "( ", do not break
            the line for children of ROOT, and force a line break before
            'and' / bracket tokens and after 'and'.
        prev_and: internal flag, true when the previous sibling's word was
            'and'.

    Returns:
        The rendered string.
    """
    if not show_traces:
        tree = treebanks.remove_traces(tree, False)
        tree = treebanks.remove_coindexation(tree, False)

    # Words that always start a new line in match_ptb mode.
    ptb_break_words = {
        'and', '-RRB-', '-RCB-', '-RSB-', '-LRB-', '-LCB-', '-LSB-'
    }

    ans = ''
    if not single_line and depth > 0:
        starts_new_line = (
            newline or (not dense) or tree.word is None or
            (match_ptb and (prev_and or tree.word in ptb_break_words)))
        if not starts_new_line:
            ans = ' '
        elif not match_ptb:
            ans = '\n' + depth * '\t'
        elif tree.parent is None or tree.parent.label != 'ROOT':
            # Direct children of ROOT stay on ROOT's line in match_ptb mode.
            ans = '\n' + depth * '  '

    if match_ptb and tree.label == 'ROOT':
        ans += "( "
    else:
        ans += '(' + tree.label

    if tree.word is not None:
        ans += ' ' + tree.word
        newline = True
    else:
        newline = False

    prev_and = False
    for subtree in tree.subtrees:
        if single_line:
            ans += ' '
        # Traces were already handled above, so children are rendered as-is.
        ans += text_tree(subtree, single_line, True, depth + 1, dense, newline,
                         match_ptb, prev_and)
        prev_and = (subtree.word == 'and')
        newline = subtree.word is None

    # Checking tree.subtrees first avoids an IndexError on a node that has
    # neither a word nor children.
    if (tree.word is None and dense and tree.subtrees and
            tree.subtrees[-1].word is not None and ans[-1] != ' '):
        ans += ' '
    ans += ')'
    return ans
コード例 #7
0
def find_heads(tree, style, head_map=None):
  """Record a head for `tree` and every node below it, using `style` rules.

  style selects the head-rule table:
    - 'collins'               -> collins_mapping_table
    - any string with 'jkk'   -> jkk_mapping_table
    - anything else           -> pennconverter_mapping_table
  For 'jkk' styles the last character ('0'-'5') picks the coordination
  strategy listed in the "Options" comment below; plain 'jkk' and 'collins'
  behave like option 0.

  head_map is the dict being filled in (via add_head).  When it is None,
  i.e. on the outermost call, a new dict is created and the tree is first
  passed through treebanks.remove_coindexation().

  Returns head_map.
  """
  mapping = pennconverter_mapping_table
  if style == 'collins':
    mapping = collins_mapping_table
  elif 'jkk' in style:
    mapping = jkk_mapping_table
  if head_map is None:
    head_map = {}
    tree = treebanks.remove_coindexation(tree, False)
  # Children first: every rule below looks up heads of the subtrees.
  for subtree in tree.subtrees:
    find_heads(subtree, style, head_map)

  if log: print "Head for", tree.span, tree.label

  # A word is its own head
  if tree.word is not None:
    head = (tree.span, tree.word, tree.label)
    add_head(head_map, tree, head)
    return head_map

  # First handle conjunctions
  # (pennconverter style only: conjunction, then ',' / ':', then last child)
  coord = pennconverter_is_coord(tree)
  if coord and style == 'pennconverter':
    if not add_if_match(tree, {'CC', 'CONJP'}, head_map, True):
      if not add_if_match(tree, {',', ':'}, head_map, True):
        add_head(head_map, tree, get_head(head_map, tree.subtrees[-1]))
    return head_map
  
  # Decide whether this node counts as a coordination for collins/jkk:
  #  - a child whose label starts with 'CC' and is longer than plain 'CC', or
  #  - (jkk only) a plain 'CC' child when there are more than two children, or
  #  - the pattern "X CC ..." under a parent with the same base label X.
  collins_coord = False
  for subtree in tree.subtrees:
    if subtree.label.startswith('CC'):
      if len(subtree.label) > 2:
        collins_coord = True
      elif 'jkk' in style and len(tree.subtrees) > 2:
        collins_coord = True
  if len(tree.subtrees) > 2 and without_func(tree.subtrees[0].label) == without_func(tree.label) and tree.subtrees[1].label == 'CC':
    collins_coord = True
  # An NP whose children all have no subtrees of their own is "flat" and is
  # excluded from the coordination special case.
  flat_NP = False
  if tree.label == "NP":
    flat_NP = True
    for subtree in tree.subtrees:
      if len(subtree.subtrees) != 0:
        flat_NP = False
  if (not flat_NP) and collins_coord and (style == 'collins' or 'jkk' in style):
    if log: print "doing coord special case"
    # Options:
    # 0 - First non-punct (collins)
    # 1 - First conjunction
    # 2 - First non-punct non-conjunction
    # 3 - Last non-punct
    # 4 - Last conjunction
    # 5 - Last non-punct non-conjunction
    for i in xrange(len(tree.subtrees)):
      # Options 0-2 scan from the left: test the i-th child.
      subtree = tree.subtrees[i]
      if style == 'collins' or style == 'jkk' or style[-1] == '0':
        if not subtree.is_punct():
          if log: print "Match backup"
          add_head(head_map, tree, get_head(head_map, subtree))
          return head_map
      elif style[-1] == '1':
        if subtree.is_conjunction():
          add_head(head_map, tree, get_head(head_map, subtree))
          return head_map
      elif style[-1] == '2':
        if not (subtree.is_conjunction() or subtree.is_punct()):
          add_head(head_map, tree, get_head(head_map, subtree))
          return head_map
      # Options 3-5 scan from the right: test the mirrored child.
      subtree = tree.subtrees[len(tree.subtrees) - i - 1]
      if style[-1] == '3':
        if not subtree.is_punct():
          add_head(head_map, tree, get_head(head_map, subtree))
          return head_map
      elif style[-1] == '4':
        if subtree.is_conjunction():
          add_head(head_map, tree, get_head(head_map, subtree))
          return head_map
      elif style[-1] == '5':
        if not (subtree.is_conjunction() or subtree.is_punct()):
          add_head(head_map, tree, get_head(head_map, subtree))
          return head_map
    # No child qualified; fall through to the regular rules below.
    if log: print "coord special case didn't find a head"

  # If the label for this node is not in the table we are either at the bottom,
  # at an NP, or have an error
  # (NP / NML always take this branch, even when the table has an entry.)
  base_label = without_func(tree.label)
  if base_label not in mapping or base_label in ['NP', 'NML']:
    if base_label in ['NP', 'NML']:
      if log: print "doing collins NP"
      collins_NP(tree, head_map)
    elif base_label in ['PP', 'WHPP']:
      pennconverter_PP(tree, head_map)
    else:
      add_head(head_map, tree, get_head(head_map, tree.subtrees[-1]))
    return head_map
  
  # Look through and take the first/last occurrence that matches
  # info[0] is the scan direction ('left' / 'right'); info[1] is the list of
  # candidates in priority order, each either a string or a regex for re.match.
  info = mapping[base_label]
  for label in info[1]:
    for i in xrange(len(tree.subtrees)):
      if info[0] == 'right':
        i = len(tree.subtrees) - i - 1
      subtree = tree.subtrees[i]
      if isinstance(label, str):
        # Disabled earlier version (exact label comparison), kept for reference:
###        if subtree.label == label:
###          if log: print "Match add 1"
###          add_head(head_map, tree, get_head(head_map, subtree))
###          return head_map
        # A candidate naming a special_cases entry matches on the child's
        # word rather than on its label.
        if label in special_cases:
          if subtree.word in special_cases[label]:
            if log: print "Match add 1a"
            add_head(head_map, tree, get_head(head_map, subtree))
            return head_map
        elif without_func(subtree.label) == label:
          # When 'aux' is among this rule's candidates, words listed in
          # special_cases['aux'] may only match through the 'aux' candidate.
          if 'aux' not in info[1] or subtree.word not in special_cases['aux']:
            if log: print "Match add 1"
            add_head(head_map, tree, get_head(head_map, subtree))
            return head_map
      else:
        if re.match(label, without_func(subtree.label)) is not None:
          if log: print "Match add 2"
          add_head(head_map, tree, get_head(head_map, subtree))
          return head_map

  # Fallback, no punct
  # (also skips children whose label starts with 'CC' and is longer than 'CC')
  for i in xrange(len(tree.subtrees)):
    if info[0] == 'right':
      i = len(tree.subtrees) - i - 1
    subtree = tree.subtrees[i]
    if not subtree.is_punct():
      if not (len(subtree.label) > 2 and subtree.label.startswith('CC')):
        if log: print "Match backup"
        add_head(head_map, tree, get_head(head_map, subtree))
        return head_map

  # Final fallback
  # (first child for 'left' rules, last child otherwise)
  if info[0] == 'left':
    if log: print "Fallback add 1"
    add_head(head_map, tree, get_head(head_map, tree.subtrees[0]))
  else:
    if log: print "Fallback add 2"
    add_head(head_map, tree, get_head(head_map, tree.subtrees[-1]))
  return head_map