Example #1
0
def get_sentence(
    begin_char_offsets,
    end_char_offsets,
    words,
    lemmas,
    poses,
    dependencies,
    ners,
    dep_format_parser=dep_graph_parser_parenthesis,
):
    """Build a list of Word objects representing a sentence.

    Thin convenience wrapper around unpack_words: the parallel per-token
    lists are packed into one dict under fixed keys and forwarded.

    Args:
        begin_char_offsets: beginning character offset of each word in
            the sentence
        end_char_offsets: end character offset of each word in the
            sentence
        words: the words of the sentence
        lemmas: the lemmas of the words in the sentence
        poses: the POS tags of the words in the sentence
        dependencies: the dependency path edges for the sentence
        ners: the NER tags of the words in the sentence
        dep_format_parser: function taking a single element of
            `dependencies` (a dependency path edge) and returning a
            3-tuple (parent_index, label, child_index) for that edge;
            see dep_graph_parser_parenthesis and
            dep_graph_parser_triplet for examples.
    """
    # Pack the parallel lists under the key names unpack_words expects.
    sentence = {
        "lemma": lemmas,
        "words": words,
        "ner": ners,
        "pos": poses,
        "dep_graph": dependencies,
        "ch_of_beg": begin_char_offsets,
        "ch_of_end": end_char_offsets,
    }
    return unpack_words(
        sentence,
        character_offset_begin="ch_of_beg",
        character_offset_end="ch_of_end",
        lemma="lemma",
        pos="pos",
        ner="ner",
        words="words",
        dep_graph="dep_graph",
        dep_graph_parser=dep_format_parser,
    )
def get_recurrent_features_new(row):
  """Extract dependency-path features for one mention pair from a TSV row.

  Python 2 code: relies on the deprecated ``string`` module functions and
  on ``zip``/``map`` returning lists.

  ``row`` is a tab-separated line.  Fields used here (by index):
  0/1: word start/end of mention 1, 2: mention-1 id, 3/4: word start/end
  of mention 2, 5: mention-2 id, 6: word array, 7: lemma array,
  8: dependency graph with escaped tabs/newlines.  Any trailing field
  (len(line) >= 10) is taken as the relation label.

  Returns:
      [m1_id, m2_id, m1_type, m2_type, path, link], with the relation
      label appended when present.
  """
  #print row
  line = row.strip().split('\t')
  # Un-escape the serialized dependency graph (tabs/newlines were escaped
  # so the whole graph could live inside one TSV field).
  dep_graph_str = string.replace(line[8], '\\t', '\t')
  dep_graph_str = string.replace(dep_graph_str, '\\n', '\n')
  #dep_graph_str = string.replace(dep_graph_str, '\\\'', '\'')
  lemma_str = line[7]
  # Drop the first/last character of the word-array field — presumably the
  # surrounding brackets/quotes of an array literal; TODO confirm format.
  words_str = line[6][1:-1]
  # Protect the quoted-comma separator so a literal ',' word survives the
  # split(",") below; it is restored in the loop further down.
  words_str = string.replace(words_str, "\",\"", "~^~")
  # skip sentences with empty dependency graphs
  #if dep_graph_str == "":
  #  return ""
  #types = [line[9], line[13]]
  types = [None, None]  # mention types are not provided in this row format
  starts = [line[0], line[3]]
  ends = [line[1], line[4]]
  lemma = lemma_str.split(ARR_DELIM)
  dep_graph = dep_graph_str.split("\n")
  #PATTERN = re.compile(r'''((?:"[^"]*")+)''')
  #words = PATTERN.split(words_str[1:-1])[1::2]
  words = words_str.split(",")
  # Restore protected comma tokens back to ','.
  for i,word in enumerate(words):
	if word == "~^~":
		words[i] = ','
  mention_ids = [line[2], line[5]]
  # NOTE(review): each entry is a list *containing* a slice, i.e. doubly
  # nested ([[...]]) — confirm downstream consumers expect this shape.
  mention_words = [[words[int(starts[0]): int(ends[0])]],[words[int(starts[1]):int(ends[1])]]]
  # create a list of mentions
  mentions = zip(mention_ids, mention_words, types, starts, ends)
  mentions = map(lambda x: {"mention_id" : x[0], "word" : x[1], "type" : x[2], "start" : int(x[3]), "end" : int(x[4])}, mentions)
  relation = None
  # Optional trailing column holds the relation label.
  if len(line) >= 10:
	relation = line[len(line)-1].strip()
  #now we get the path from both mentions to the root
  # get a list of Word object
  obj = {}
  obj['lemma'] = lemma
  obj['words'] = words
  obj['dep_graph'] = dep_graph
  word_obj_list = ddlib.unpack_words(obj, lemma='lemma', words='words', dep_graph='dep_graph', dep_graph_parser=dep_format_parser)

  m1 = mentions[0]
  m2 = mentions[1]
  #print row
  #if m1["mention_id"] != m2["mention_id"]:
  # Path between the last word of each mention (end offsets are exclusive,
  # hence the -1).
  link, path = ddlib.dep_path_between_words_new(word_obj_list, int(ends[0])-1, int(ends[1])-1)
  feat = [m1["mention_id"], m2["mention_id"], m1["type"], m2["type"], path, link]
  if relation is not None:
	feat.append(relation)
  return feat
Example #3
0
def get_sentence(begin_char_offsets,
                 end_char_offsets,
                 words,
                 lemmas,
                 poses,
                 dependencies,
                 ners,
                 dep_format_parser=dep_graph_parser_parenthesis):
    """Build a list of Word objects representing a sentence.

    Convenience front-end for unpack_words: gathers the parallel
    per-token lists into a single dict and delegates.

    Args:
        begin_char_offsets: beginning character offset of each word in
            the sentence
        end_char_offsets: end character offset of each word in the
            sentence
        words: the words of the sentence
        lemmas: the lemmas of the words in the sentence
        poses: the POS tags of the words in the sentence
        dependencies: the dependency path edges for the sentence
        ners: the NER tags of the words in the sentence
        dep_format_parser: function taking one element of `dependencies`
            (a dependency path edge) and returning a 3-tuple
            (parent_index, label, child_index) for the edge; see
            dep_graph_parser_parenthesis and dep_graph_parser_triplet
            for examples.
    """
    # Key names below are the ones unpack_words is told to look up.
    payload = {
        'lemma': lemmas,
        'words': words,
        'ner': ners,
        'pos': poses,
        'dep_graph': dependencies,
        'ch_of_beg': begin_char_offsets,
        'ch_of_end': end_char_offsets,
    }
    word_objects = unpack_words(payload,
                                character_offset_begin='ch_of_beg',
                                character_offset_end='ch_of_end',
                                lemma='lemma',
                                pos='pos',
                                ner='ner',
                                words='words',
                                dep_graph='dep_graph',
                                dep_graph_parser=dep_format_parser)
    return word_objects
def get_recurrent_features(row):
  """Extract a dependency-path feature for a mention pair from a TSV row.

  Python 2 code: relies on the deprecated ``string`` module functions and
  list-returning ``zip``/``map``.

  ``row`` is a tab-separated line.  Fields used (by index): 1: escaped
  dependency graph, 2: word array, 3: lemma array, 7/11: mention ids,
  9/13: mention types, 14/15 and 16/17: word start/end offsets of the
  two mentions, 18: relation label when len(line) == 21.

  Iterates over all ordered mention pairs and, for the first pair with a
  non-empty dependency path, returns
  [m1_words, m2_words, m1_type, m2_type, path] (plus the relation label,
  stripped of its first and last character, when present).  Falls back to
  the last pair visited with an empty path string.
  """
  line = row.strip().split('\t')
  # Un-escape the serialized dependency graph (tabs/newlines were escaped
  # so the whole graph fits in one TSV field).
  dep_graph_str = string.replace(line[1], '\\t', '\t')
  dep_graph_str = string.replace(dep_graph_str, '\\n', '\n')
  #dep_graph_str = string.replace(dep_graph_str, '\\\'', '\'')
  lemma_str = line[3]
  words_str = line[2]
  # Protect the quoted-comma separator so literal ',' words survive the
  # split(",") below; restored in the loop further down.
  words_str = string.replace(words_str, "\",\"", "~^~")
  # skip sentences with empty dependency graphs
  #if dep_graph_str == "":
  #  return ""
  types = [line[9], line[13]]
  starts = [line[14], line[16]]
  ends = [line[15], line[17]]
  lemma = lemma_str.split(ARR_DELIM)
  dep_graph = dep_graph_str.split("\n")
  #PATTERN = re.compile(r'''((?:"[^"]*")+)''')
  #words = PATTERN.split(words_str[1:-1])[1::2]
  words = words_str.split(",")
  # Restore protected comma tokens back to ','.
  for i,word in enumerate(words):
	if word == "~^~":
		words[i] = ','
  mention_ids = [line[7], line[11]]
  # NOTE(review): each entry is a list *containing* a slice, i.e. doubly
  # nested ([[...]]) — confirm downstream consumers expect this shape.
  mention_words = [[words[int(starts[0]): int(ends[0])]],[words[int(starts[1]):int(ends[1])]]]
  # create a list of mentions
  mentions = zip(mention_ids, mention_words, types, starts, ends)
  mentions = map(lambda x: {"mention_id" : x[0], "word" : x[1], "type" : x[2], "start" : int(x[3]), "end" : int(x[4])}, mentions)

  relation = None
  # A 21-column row carries the relation label at index 18.
  if len(line) == 21:
	relation = line[18]
  # get a list of Word objects
  obj = {}
  obj['lemma'] = lemma
  obj['words'] = words
  obj['dep_graph'] = dep_graph
  word_obj_list = ddlib.unpack_words(obj, lemma='lemma', words='words', dep_graph='dep_graph', dep_graph_parser=dep_format_parser)
  # at this point we have a list of the mentions in this sentence

  # go through all pairs of mentions
  for m1 in mentions:
    start1 = m1["start"]
    end1 = m1["end"]

    #if m1["type"] not in ["PERSON", "ORGANIZATION"]:
    #  continue

    for m2 in mentions:
      #if m1["mention_id"] == m2["mention_id"]:
        #continue

      start2 = m2["start"]
      end2 = m2["end"]

      # Path between the last word of each mention (end offsets are
      # exclusive, hence the -1).
      edges = ddlib.dep_path_between_words(word_obj_list, end1 - 1, end2 - 1)
      #print edges
      if len(edges) > 0:
        num_roots = 0 # the number of root nodes
        num_left = 0 # the number of edges to the left of the root
        num_right = 0 # the number of edges to the right of the root
        left_path = "" # the dependency path to the left of the root
        right_path = "" # the dependency path to the right of the root

        # find the index of the switch from up to down
        switch_direction_index = -1
        for i in range(len(edges)):
          if not edges[i].is_bottom_up:
            switch_direction_index = i
            break
        
        # iterate through the edge list
        for i in range(len(edges)):
          curr_edge = edges[i]

          # count the number of roots; if there are more than 1 root then our dependency
          # path is disconnected
          if curr_edge.label == 'ROOT':
            num_roots += 1

          # going from the left to the root
          if curr_edge.is_bottom_up:
            num_left += 1

            # if this is the edge pointing to the root (word2 is the root)
            if i == switch_direction_index - 1:
              left_path = left_path + ("--" + curr_edge.label + "->")
              # NOTE(review): `root` is only bound on this branch; the
              # num_right/num_left/num_roots guards below appear to cover
              # every other case, but confirm no path reaches the final
              # else with `root` unbound.
              root = curr_edge.word2.lemma.lower()
	      #root = curr_edge.word2.word
            # this edge does not point to the root
            else:
              # if we are at the last edge, don't include the word (part of the mention)
              if i == len(edges) - 1:
                left_path = left_path + ("--" + curr_edge.label + "->")
              else:
                left_path = left_path + ("--" + curr_edge.label + "->" + curr_edge.word2.lemma.lower())
	        #left_path = left_path + ("--" + curr_edge.label + "->" + curr_edge.word2.word)

          # going from the root to the right
          else:
            num_right += 1

            # the first edge to the right of the root
            if i == switch_direction_index:
              right_path = right_path + "<-" + curr_edge.label + "--"
	      #right_path = right_path + "<-" + curr_edge.label + "--"

            # this edge does not point from the root
            else:
              # if we are at the first edge, don't include the word (part of the mention)
              if i == 0:
                right_path = right_path + ("<-" + curr_edge.label + "--")
              else:
                # word1 is the parent for right to left
                right_path = right_path + (curr_edge.word1.lemma.lower() + "<-" + curr_edge.label + "--")
		#right_path = right_path + (curr_edge.word1.word + "<-" + curr_edge.label + "--")

        # if the root is at the end or at the beginning (direction was all up or all down)
        if num_right == 0:
          root = "|SAMEPATH"
        elif num_left == 0:
          root = "SAMEPATH|"

        # if the edges have a disconnect
        elif num_roots > 1:
          root = "|NONEROOT|"

        # this is a normal tree with a connected root in the middle
        else:
          root = "|" + root + "|"

        path = left_path + root + right_path

        feat = [m1["word"], m2["word"], m1["type"], m2["type"], path]
        # make sure each of the strings we will output is encoded as utf-8
	if relation is not None:
		feat.append(relation[1:-1])
	return feat
  # No pair produced a non-empty path: emit the last-visited pair with an
  # empty path (m1/m2 are the Python 2 leaked loop variables).
  return [m1["word"], m2["word"], m1["type"], m2["type"], ""]