def dependency_to_constituency(word_list, pos_list, head_list, relation_list, index):
    """Recursively convert a dependency parse into a binary constituency tree.

    head_list[index] holds the indices of the dependents of the token at
    index; each dependent is attached one at a time, so every attachment
    creates a new binary internal node labeled with its dependency relation.
    """
    # Base case: a leaf node for the token itself.
    leaf = Node()
    leaf.word = word_list[index]
    leaf.pos = pos_list[index]
    leaf.span = (index, index + 1)
    leaf.head = leaf.word
    root = leaf
    # Attach each dependent's subtree, growing the span left or right.
    for child_index in head_list[index]:
        child_root = dependency_to_constituency(
            word_list, pos_list, head_list, relation_list, child_index)
        new_root = Node()
        new_root.word = None
        new_root.pos = relation_list[child_index]
        if child_index < index:
            # Left dependent: its subtree precedes the current tree.
            new_root.span = (child_root.span[0], root.span[1])
            new_root.add_child(child_root)
            new_root.add_child(root)
        else:
            # Right dependent: its subtree follows the current tree.
            new_root.span = (root.span[0], child_root.span[1])
            new_root.add_child(root)
            new_root.add_child(child_root)
        # The head word propagates up from the head token's leaf.
        new_root.head = root.head
        root = new_root
    return root
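# The sketch below is hypothetical: the real Node class is defined elsewhere
# in the project, so this minimal version only mirrors the attributes and
# methods the two functions in this file rely on.
class Node:
    def __init__(self, family=0):
        self.family = family      # assumed: 0 for parse nodes, 1 for dense nodes
        self.word = None
        self.pos = None
        self.span = None          # (start, end) token indices, end exclusive
        self.head = None
        self.child_list = []

    def add_child(self, child):
        self.child_list.append(child)


# Usage sketch: convert the dependency parse of "dogs bark loudly" (root
# "bark" at index 1 with dependents "dogs" and "loudly") into a binary
# constituency tree. Note that head_list maps each token to the indices of
# its dependents, not to its head.
word_list = ["dogs", "bark", "loudly"]
pos_list = ["NNS", "VBP", "RB"]
head_list = [[], [0, 2], []]
relation_list = ["nsubj", "root", "advmod"]

tree = dependency_to_constituency(word_list, pos_list, head_list,
                                  relation_list, 1)
assert tree.span == (0, 3)   # the root covers the whole sentence
assert tree.head == "bark"   # the head word propagated up from its leaf
assert tree.pos == "advmod"  # labeled by the last relation attached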
def create_dense_nodes(ner_raw_data, text_raw_data, pos_to_index, lexicon_list,
                       pos_count, ne_count, pos_ne_count, lexicon_hits,
                       span_to_node):
    """Create nodes for short spans (up to trigrams) missing from the parses.

    Each new node gets POS "NONE", is linked to its two maximal overlapping
    sub-spans as children, and is checked against the NE annotations and the
    lexicons.
    """
    node_list = []
    max_dense_span = 3

    # Start from bigrams, since all unigrams are already covered by parses.
    for span_length in range(2, 1 + max_dense_span):
        for span_start in range(0, 1 + len(text_raw_data) - span_length):
            span = (span_start, span_start + span_length)
            if span in span_to_node:
                continue
            pos = "NONE"
            ne = ner_raw_data[span] if span in ner_raw_data else "NONE"
            constituent = " ".join(text_raw_data[span[0]:span[1]]).lower()

            # span, child
            # TODO: sibling
            node = Node(family=1)
            node_list.append(node)
            node.span = span
            node.span_length = span_length
            span_to_node[span] = node
            # The two maximal overlapping sub-spans serve as children; they
            # exist because shorter spans are created in earlier iterations.
            node.child_list = [span_to_node[(span[0], span[1] - 1)],
                               span_to_node[(span[0] + 1, span[1])]]

            # word, head, pos
            node.pos_index = pos_to_index[pos]
            pos_count[pos] += 1
            node.word_split = []
            node.word_index = -1
            node.head_split = []
            node.head_index = -1

            # ne
            node.ne = ne
            if ne != "NONE":
                ne_count[ne] += 1
                pos_ne_count[pos] += 1

            # lexicon: a binary hit vector over all lexicons
            node.lexicon_hit = [0] * len(lexicon_list)
            hits = 0
            for index, lexicon in enumerate(lexicon_list):
                if constituent in lexicon:
                    lexicon[constituent] += 1
                    node.lexicon_hit[index] = 1
                    hits = 1
            lexicon_hits[0] += hits

    return node_list
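# Usage sketch for create_dense_nodes, reusing the hypothetical Node class
# above. The counter types and the pre-populated unigram nodes are assumptions
# about the surrounding pipeline, which is not shown in this file.
from collections import Counter

text_raw_data = ["new", "york", "city"]
ner_raw_data = {(0, 2): "GPE"}                # span -> NE label
pos_to_index = {"NONE": 0}                    # dense nodes all get POS "NONE"
lexicon_list = [Counter({"new york": 0})]     # one lexicon of known phrases
pos_count, ne_count, pos_ne_count = Counter(), Counter(), Counter()
lexicon_hits = [0]

# The parses are assumed to have produced one node per unigram already;
# create_dense_nodes only fills in the missing short spans on top of them.
span_to_node = {}
for i in range(len(text_raw_data)):
    unigram = Node()
    unigram.span = (i, i + 1)
    span_to_node[unigram.span] = unigram

dense = create_dense_nodes(ner_raw_data, text_raw_data, pos_to_index,
                           lexicon_list, pos_count, ne_count, pos_ne_count,
                           lexicon_hits, span_to_node)
# dense holds nodes for spans (0, 2), (1, 3), and (0, 3); the (0, 2) node
# carries ne == "GPE", and lexicon_hits[0] == 1 because "new york" matched.
assert len(dense) == 3
assert span_to_node[(0, 2)].ne == "GPE"
assert lexicon_hits[0] == 1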