def get_tree_data_new(raw_data, character_to_index, word_to_index, pos_to_index, lexicon_list):
    # Reads CoNLL-2012 *gold* parses (unlike get_tree_data(), which reads "auto").
    log("get_tree_data()...")
    """ Get tree structured data from CoNLL 2012 Stores into Node data structure """
    tree_pyramid_list = []  # one (root_node, additional_node_list) pair per sentence
    ner_list = []           # per-sentence {(start, end): ne_label} dicts
    word_count = 0
    pos_count = defaultdict(lambda: 0)     # POS tag -> frequency
    ne_count = defaultdict(lambda: 0)      # NE label -> frequency
    pos_ne_count = defaultdict(lambda: 0)  # POS tag -> NE-labeled node count
    lexicon_hits = [0]  # boxed int so construct_node() can mutate it in place
    for document in raw_data["gold"]:
        for part in raw_data["gold"][document]:
            # Regroup NER annotations keyed (sentence, start, end) into
            # per-sentence dicts keyed (start, end).
            ner_raw_data = defaultdict(lambda: {})
            for k, v in raw_data["gold"][document][part]["ner"].items():
                ner_raw_data[k[0]][(k[1], k[2])] = v
            for index, parse in enumerate(
                    raw_data["gold"][document][part]["parses"]):
                text_raw_data = raw_data["gold"][document][part]["text"][index]
                word_count += len(text_raw_data)
                # Skip sentences without a real parse.
                if parse.subtrees[0].label == "NOPARSE":
                    continue
                head_raw_data = raw_data["gold"][document][part]["heads"][
                    index]
                root_node = Node()
                span_to_node = {}
                nodes = construct_node(root_node, parse, ner_raw_data[index],
                                       head_raw_data, text_raw_data,
                                       character_to_index, word_to_index,
                                       pos_to_index, lexicon_list, pos_count,
                                       ne_count, pos_ne_count, lexicon_hits,
                                       span_to_node, False)
                root_node.nodes = nodes
                root_node.text_raw_data = text_raw_data  # YOLO: ad-hoc extra field
                # Dense (pyramid) nodes are disabled in this variant; the
                # original call is kept below as a dead string literal.
                additional_node_list = []
                """
                additional_node_list = create_dense_nodes(
                    ner_raw_data[index], text_raw_data, pos_to_index, lexicon_list,
                    pos_count, ne_count, pos_ne_count, lexicon_hits, span_to_node)
                """
                tree_pyramid_list.append((root_node, additional_node_list))
                ner_list.append(ner_raw_data[index])
    log(" %d sentences\n" % len(tree_pyramid_list))
    return (tree_pyramid_list, ner_list, word_count, pos_count, ne_count,
            pos_ne_count, lexicon_hits[0])
def get_tree_data(raw_data, character_to_index, word_to_index, pos_to_index, lexicon_list):
    """Get tree structured data from CoNLL 2012; stores into Node data structure.

    Reads "auto" parses/heads/text but gold NER labels. Returns a tuple of
    (tree_pyramid_list, ner_list, word_count, pos_count, ne_count,
    pos_ne_count, lexicon_hits).
    """
    log("get_tree_data()...")
    tree_pyramid_list = []  # one (root_node, additional_node_list) pair per sentence
    ner_list = []           # per-sentence {(start, end): ne_label} dicts
    word_count = 0
    pos_count = defaultdict(lambda: 0)
    ne_count = defaultdict(lambda: 0)
    pos_ne_count = defaultdict(lambda: 0)
    lexicon_hits = [0]  # boxed int so construct_node() can mutate it in place
    for document in raw_data["auto"]:
        for part in raw_data["auto"][document]:
            # Regroup NER annotations keyed (sentence, start, end) into
            # per-sentence dicts keyed (start, end).
            ner_raw_data = defaultdict(lambda: {})
            # BUG FIX: dict.iteritems() is Python 2 only and raises
            # AttributeError on Python 3; the rest of this file uses .items().
            for k, v in raw_data["gold"][document][part]["ner"].items():
                ner_raw_data[k[0]][(k[1], k[2])] = v
            for index, parse in enumerate(raw_data["auto"][document][part]["parses"]):
                text_raw_data = raw_data["auto"][document][part]["text"][index]
                word_count += len(text_raw_data)
                # Skip sentences without a real parse.
                if parse.subtrees[0].label == "NOPARSE":
                    continue
                head_raw_data = raw_data["auto"][document][part]["heads"][index]
                root_node = Node()
                span_to_node = {}
                nodes = construct_node(
                    root_node, parse, ner_raw_data[index], head_raw_data,
                    text_raw_data, character_to_index, word_to_index,
                    pos_to_index, lexicon_list, pos_count, ne_count,
                    pos_ne_count, lexicon_hits, span_to_node, False)
                root_node.nodes = nodes
                root_node.text_raw_data = text_raw_data  # YOLO: ad-hoc extra field
                # Dense (pyramid) nodes are disabled in this variant; the
                # original call is kept below as a dead string literal.
                additional_node_list = []
                """
                additional_node_list = create_dense_nodes(
                    ner_raw_data[index], text_raw_data, pos_to_index, lexicon_list,
                    pos_count, ne_count, pos_ne_count, lexicon_hits, span_to_node)
                """
                tree_pyramid_list.append((root_node, additional_node_list))
                ner_list.append(ner_raw_data[index])
    log(" %d sentences\n" % len(tree_pyramid_list))
    return (tree_pyramid_list, ner_list, word_count, pos_count, ne_count,
            pos_ne_count, lexicon_hits[0])
def construct_node(node, tree, ner_raw_data, head_raw_data, text_raw_data, character_to_index, word_to_index, pos_to_index, pos_count, ne_count, pos_ne_count):
    """Recursively populate *node* (and fresh children) from parse *tree*.

    Mutates pos_count/ne_count/pos_ne_count as it walks; binarizes n-ary
    productions in place on *tree*. Returns the number of nodes created
    (including *node* itself).
    """
    pos = tree.label
    word = tree.word
    span = tree.span
    # Prefer a head already attached to the tree (set during binarization
    # below); otherwise look it up in the precomputed head table.
    head = tree.head if hasattr(tree, "head") else head_raw_data[(span, pos)][1]
    ne = ner_raw_data[span] if span in ner_raw_data else "NONE"
    # Process pos info
    node.pos = pos
    node.pos_index = pos_to_index[pos]
    pos_count[pos] += 1
    # Process word info (leaf nodes only; internal nodes have word=None)
    node.word_split = [character_to_index[character] for character in word] if word else []
    node.word_index = word_to_index[word] if word else -1
    # Process head info
    node.head_split = [character_to_index[character] for character in head]
    node.head_index = word_to_index[head]
    # Process ne info; only count at the topmost node of a same-span chain
    # so unary chains are not double-counted.
    node.ne = ne
    if not node.parent or node.parent.span != span:
        ne_count[ne] += 1
        if ne != "NONE":
            pos_ne_count[pos] += 1
    # Process span info
    node.span = span
    # Binarize children: fold all but one side child into a new intermediate
    # node, keeping the subtree that contains the head word intact.
    if len(tree.subtrees) > 2:
        side_child_pos = tree.subtrees[-1].label
        side_child_span = tree.subtrees[-1].span
        side_child_head = head_raw_data[(side_child_span, side_child_pos)][1]
        if side_child_head != head:
            # Last child is not the head child: merge everything before it.
            sub_subtrees = tree.subtrees[:-1]
        else:
            # Last child is the head child: merge everything after the first.
            sub_subtrees = tree.subtrees[1:]
        new_span = (sub_subtrees[0].span[0], sub_subtrees[-1].span[1])
        new_tree = PSTree(label=pos, span=new_span, subtrees=sub_subtrees)
        new_tree.head = head
        if side_child_head != head:
            tree.subtrees = [new_tree, tree.subtrees[-1]]
        else:
            tree.subtrees = [tree.subtrees[0], new_tree]
    # Process children
    nodes = 1
    for subtree in tree.subtrees:
        child = Node()
        node.add_child(child)
        child_nodes = construct_node(child, subtree, ner_raw_data,
                                     head_raw_data, text_raw_data,
                                     character_to_index, word_to_index,
                                     pos_to_index, pos_count, ne_count,
                                     pos_ne_count)
        nodes += child_nodes
    return nodes
def get_tree_data(raw_data, character_to_index, word_to_index, pos_to_index):
    """Get tree structured data from CoNLL 2012; stores into Node data structure.

    Reads "auto" parses/heads/text but gold NER labels. Returns
    (tree_list, ner_list, word_count, pos_count, ne_count, pos_ne_count).
    """
    log("get_tree_data()...")
    tree_list = []  # one root Node per parsed sentence
    ner_list = []   # per-sentence {(start, end): ne_label} dicts
    word_count = 0
    pos_count = defaultdict(lambda: 0)
    ne_count = defaultdict(lambda: 0)
    pos_ne_count = defaultdict(lambda: 0)
    for document in raw_data["auto"]:
        for part in raw_data["auto"][document]:
            # Regroup NER annotations keyed (sentence, start, end) into
            # per-sentence dicts keyed (start, end).
            ner_raw_data = defaultdict(lambda: {})
            # BUG FIX: dict.iteritems() is Python 2 only and raises
            # AttributeError on Python 3; the rest of this file uses .items().
            for k, v in raw_data["gold"][document][part]["ner"].items():
                ner_raw_data[k[0]][(k[1], k[2])] = v
            for index, parse in enumerate(
                    raw_data["auto"][document][part]["parses"]):
                text_raw_data = raw_data["auto"][document][part]["text"][index]
                word_count += len(text_raw_data)
                # Skip sentences without a real parse.
                if parse.subtrees[0].label == "NOPARSE":
                    continue
                head_raw_data = raw_data["auto"][document][part]["heads"][index]
                root_node = Node()
                nodes = construct_node(root_node, parse, ner_raw_data[index],
                                       head_raw_data, text_raw_data,
                                       character_to_index, word_to_index,
                                       pos_to_index, pos_count, ne_count,
                                       pos_ne_count)
                root_node.nodes = nodes
                tree_list.append(root_node)
                ner_list.append(ner_raw_data[index])
    log(" %d sentences\n" % len(tree_list))
    return tree_list, ner_list, word_count, pos_count, ne_count, pos_ne_count
def get_tree_data(sentence_list, parse_list, ner_list, character_to_index, word_to_index, pos_to_index, index_to_lexicon):
    log("get_tree_data()...")
    """ Get tree structured data from CoNLL-2003 Stores into Node data structure """
    tree_pyramid_list = []  # (root Node, dense-node list) per sentence
    word_count = 0
    pos_count = defaultdict(lambda: 0)
    ne_count = defaultdict(lambda: 0)
    pos_ne_count = defaultdict(lambda: 0)
    lexicon_hits = [0]  # boxed int mutated by the helpers below
    for i, parse in enumerate(parse_list):
        sentence = sentence_list[i]
        word_count += len(sentence)
        spans = {}
        # Head words are recovered from the parse via Collins head rules.
        heads = head_finder.collins_find_heads(parse)
        root = Node()
        root.nodes = construct_node(
            root, parse, ner_list[i], heads, sentence,
            character_to_index, word_to_index, pos_to_index, index_to_lexicon,
            pos_count, ne_count, pos_ne_count, lexicon_hits, spans)
        root.tokens = len(sentence)
        # Add pyramid nodes for short spans the parse does not cover.
        dense_nodes = create_dense_nodes(
            ner_list[i], sentence, pos_to_index, index_to_lexicon,
            pos_count, ne_count, pos_ne_count, lexicon_hits, spans)
        tree_pyramid_list.append((root, dense_nodes))
    log(" %d sentences\n" % len(tree_pyramid_list))
    return tree_pyramid_list, word_count, pos_count, ne_count, pos_ne_count, lexicon_hits[0]
def get_tree_data(raw_data, character_to_index, word_to_index, pos_to_index):
    """Get tree structured data from CoNLL 2012; stores into Node data structure.

    NOTE(review): this definition duplicates an earlier get_tree_data in this
    file and shadows it at import time — confirm which one is intended.
    Reads "auto" parses/heads/text but gold NER labels. Returns
    (tree_list, ner_list, word_count, pos_count, ne_count, pos_ne_count).
    """
    log("get_tree_data()...")
    tree_list = []  # one root Node per parsed sentence
    ner_list = []   # per-sentence {(start, end): ne_label} dicts
    word_count = 0
    pos_count = defaultdict(lambda: 0)
    ne_count = defaultdict(lambda: 0)
    pos_ne_count = defaultdict(lambda: 0)
    for document in raw_data["auto"]:
        for part in raw_data["auto"][document]:
            # Regroup NER annotations keyed (sentence, start, end) into
            # per-sentence dicts keyed (start, end).
            ner_raw_data = defaultdict(lambda: {})
            # BUG FIX: dict.iteritems() is Python 2 only and raises
            # AttributeError on Python 3; the rest of this file uses .items().
            for k, v in raw_data["gold"][document][part]["ner"].items():
                ner_raw_data[k[0]][(k[1], k[2])] = v
            for index, parse in enumerate(raw_data["auto"][document][part]["parses"]):
                text_raw_data = raw_data["auto"][document][part]["text"][index]
                word_count += len(text_raw_data)
                # Skip sentences without a real parse.
                if parse.subtrees[0].label == "NOPARSE":
                    continue
                head_raw_data = raw_data["auto"][document][part]["heads"][index]
                root_node = Node()
                nodes = construct_node(
                    root_node, parse, ner_raw_data[index], head_raw_data,
                    text_raw_data, character_to_index, word_to_index,
                    pos_to_index, pos_count, ne_count, pos_ne_count)
                root_node.nodes = nodes
                tree_list.append(root_node)
                ner_list.append(ner_raw_data[index])
    log(" %d sentences\n" % len(tree_list))
    return tree_list, ner_list, word_count, pos_count, ne_count, pos_ne_count
def get_tree_data(sentence_list, parse_list, ner_list, character_to_index, word_to_index, pos_to_index, index_to_lexicon):
    # NOTE(review): duplicates an earlier CoNLL-2003 get_tree_data in this
    # file and shadows it at import time — confirm which one is intended.
    log("get_tree_data()...")
    """ Get tree structured data from CoNLL-2003 Stores into Node data structure """
    tree_pyramid_list = []  # (root Node, dense-node list) per sentence
    word_count = 0
    pos_count = defaultdict(lambda: 0)     # POS tag -> frequency
    ne_count = defaultdict(lambda: 0)      # NE label -> frequency
    pos_ne_count = defaultdict(lambda: 0)  # POS tag -> NE-labeled node count
    lexicon_hits = [0]  # boxed int mutated by the helpers below
    for index, parse in enumerate(parse_list):
        text_raw_data = sentence_list[index]
        word_count += len(text_raw_data)
        span_to_node = {}
        # Head words are recovered from the parse via Collins head rules.
        head_raw_data = head_finder.collins_find_heads(parse)
        root_node = Node()
        nodes = construct_node(
            root_node, parse, ner_list[index], head_raw_data, text_raw_data,
            character_to_index, word_to_index, pos_to_index, index_to_lexicon,
            pos_count, ne_count, pos_ne_count, lexicon_hits, span_to_node)
        root_node.nodes = nodes
        root_node.tokens = len(text_raw_data)
        # Add pyramid nodes for short spans the parse does not cover.
        additional_node_list = create_dense_nodes(
            ner_list[index], text_raw_data, pos_to_index, index_to_lexicon,
            pos_count, ne_count, pos_ne_count, lexicon_hits, span_to_node)
        tree_pyramid_list.append((root_node, additional_node_list))
    log(" %d sentences\n" % len(tree_pyramid_list))
    return tree_pyramid_list, word_count, pos_count, ne_count, pos_ne_count, lexicon_hits[0]
def dependency_to_constituency(word_list, pos_list, head_list, relation_list, index):
    """Convert the dependency subtree rooted at token *index* into a binary
    constituency tree of Node objects and return its root.

    Each dependent is folded in one at a time: a fresh internal node labeled
    with the dependency relation takes the accumulated tree and the
    dependent's subtree as its two children, in surface order.
    """
    # Leaf node for the token itself; it is its own head word.
    current = Node()
    current.word = word_list[index]
    current.pos = pos_list[index]
    current.span = (index, index + 1)
    current.head = current.word
    for dep_index in head_list[index]:
        dep_tree = dependency_to_constituency(
            word_list, pos_list, head_list, relation_list, dep_index)
        wrapper = Node()
        wrapper.word = None
        wrapper.pos = relation_list[dep_index]  # relation label as pseudo-POS
        # Order children left-to-right by surface position.
        if dep_index < index:
            left, right = dep_tree, current
        else:
            left, right = current, dep_tree
        wrapper.span = (left.span[0], right.span[1])
        wrapper.add_child(left)
        wrapper.add_child(right)
        wrapper.head = current.head  # head word propagates upward
        current = wrapper
    return current
def create_dense_nodes(ner_raw_data, text_raw_data, pos_to_index, index_to_lexicon, pos_count, ne_count, pos_ne_count, lexicon_hits, span_to_node):
    """Create Node objects for spans of length 2..3 not covered by the parse.

    Each dense node links its two overlapping sub-spans as children, carries
    no word/head information (dummy "NONE" POS), and is tagged with NER and
    lexicon-membership features. Mutates pos_count, ne_count, pos_ne_count,
    lexicon_hits and span_to_node in place. Returns the new nodes.
    """
    node_list = []
    max_dense_span = 3
    sentence_length = len(text_raw_data)
    # Start from bigram, since all unigrams are already covered by parses
    for length in range(2, max_dense_span + 1):
        for start in range(sentence_length - length + 1):
            span = (start, start + length)
            if span in span_to_node:
                continue  # already produced by the constituency parse
            pos = "NONE"
            ne = ner_raw_data[span] if span in ner_raw_data else "NONE"
            constituent = " ".join(text_raw_data[start:start + length]).lower()

            # span, child
            # TODO: sibling
            node = Node()
            node_list.append(node)
            node.span = span
            span_to_node[span] = node
            # Children: the two sub-spans one token shorter (always present,
            # since shorter spans were filled in earlier passes or the parse).
            node.child_list = [
                span_to_node[(start, start + length - 1)],
                span_to_node[(start + 1, start + length)],
            ]

            # word, head, pos: dense nodes have no surface form.
            node.pos_index = pos_to_index[pos]
            pos_count[pos] += 1
            node.word_split = []
            node.word_index = -1
            node.head_split = []
            node.head_index = -1

            # ne
            node.ne = ne
            if ne != "NONE":
                ne_count[ne] += 1
                pos_ne_count[pos] += 1

            # lexicon-membership indicator features
            node.lexicon_hit = [0] * len(index_to_lexicon)
            hits = 0
            for lexicon_index, lexicon in index_to_lexicon.items():
                if constituent in lexicon:
                    node.lexicon_hit[lexicon_index] = 1
                    hits = 1
            lexicon_hits[0] += hits
    return node_list
def construct_node(node, tree, ner_raw_data, head_raw_data, text_raw_data, character_to_index, word_to_index, pos_to_index, index_to_lexicon, pos_count, ne_count, pos_ne_count, lexicon_hits, span_to_node):
    """Recursively populate *node* (and fresh children) from parse *tree*.

    Fills index/feature fields on each Node, records every span in
    span_to_node, updates the shared count dicts and lexicon_hits, and
    binarizes n-ary productions in place on *tree*. Returns the number of
    nodes created (including *node* itself).
    """
    pos = tree.label
    word = tree.word
    span = tree.span
    # Prefer a head already attached to the tree (set during binarization
    # below); otherwise look it up in the precomputed head table.
    head = tree.head if hasattr(tree, "head") else head_raw_data[(span, pos)][1]
    ne = ner_raw_data[span] if span in ner_raw_data else "NONE"
    constituent = " ".join(text_raw_data[span[0]:span[1]]).lower()
    # Process pos info
    node.pos_index = pos_to_index[pos]
    pos_count[pos] += 1
    # Process word info (leaf nodes only; internal nodes have word=None)
    node.word_split = [character_to_index[character] for character in word] if word else []
    node.word_index = word_to_index[word] if word else -1
    # Process head info
    node.head_split = [character_to_index[character] for character in head]
    #if head == "-LSB-": print(text_raw_data
    node.head_index = word_to_index[head]
    # Process ne info; only count at the topmost node of a same-span chain
    # so unary chains are not double-counted.
    node.ne = ne
    if ne != "NONE":
        if not node.parent or node.parent.span != span:
            ne_count[ne] += 1
            pos_ne_count[pos] += 1
    # Process span info
    node.span = span
    span_to_node[span] = node
    # Process lexicon info: one indicator per lexicon; lexicon_hits counts
    # nodes that matched at least one lexicon.
    node.lexicon_hit = [0] * len(index_to_lexicon)
    hits = 0
    for index, lexicon in index_to_lexicon.items():
        if constituent in lexicon:
            node.lexicon_hit[index] = 1
            hits = 1
    lexicon_hits[0] += hits
    # Binarize children: fold all but one side child into a new intermediate
    # node, keeping the subtree that contains the head word intact.
    if len(tree.subtrees) > 2:
        side_child_pos = tree.subtrees[-1].label
        side_child_span = tree.subtrees[-1].span
        side_child_head = head_raw_data[(side_child_span, side_child_pos)][1]
        if side_child_head != head:
            # Last child is not the head child: merge everything before it.
            sub_subtrees = tree.subtrees[:-1]
        else:
            # Last child is the head child: merge everything after the first.
            sub_subtrees = tree.subtrees[1:]
        new_span = (sub_subtrees[0].span[0], sub_subtrees[-1].span[1])
        new_tree = pstree.PSTree(label=pos, span=new_span, subtrees=sub_subtrees)
        new_tree.head = head
        if side_child_head != head:
            tree.subtrees = [new_tree, tree.subtrees[-1]]
        else:
            tree.subtrees = [tree.subtrees[0], new_tree]
    # Process children
    nodes = 1
    for subtree in tree.subtrees:
        child = Node()
        node.add_child(child)
        child_nodes = construct_node(child,
                                     subtree, ner_raw_data, head_raw_data,
                                     text_raw_data, character_to_index,
                                     word_to_index, pos_to_index,
                                     index_to_lexicon, pos_count, ne_count,
                                     pos_ne_count, lexicon_hits, span_to_node)
        nodes += child_nodes
    return nodes
def construct_node(node, tree, ner_raw_data, head_raw_data, text_raw_data, character_to_index, word_to_index, pos_to_index, lexicon_list, pos_count, ne_count, pos_ne_count, lexicon_hits, span_to_node, under_ne):
    """Recursively populate *node* (and fresh children) from parse *tree*.

    Variant that also propagates an *under_ne* flag (True once any ancestor
    span carried an NE label) and increments per-lexicon match counters.
    Binarizes n-ary productions in place on *tree*. Returns the number of
    nodes created (including *node* itself).
    """
    pos = tree.label
    word = tree.word
    span = tree.span
    # Prefer a head already attached to the tree (set during binarization
    # below); otherwise look it up in the precomputed head table.
    head = tree.head if hasattr(tree, "head") else head_raw_data[(span, pos)][1]
    ne = ner_raw_data[span] if span in ner_raw_data else "NONE"
    constituent = " ".join(text_raw_data[span[0]:span[1]]).lower()
    # Process pos info
    node.pos_index = pos_to_index[pos]
    pos_count[pos] += 1
    node.pos = pos  # YOLO: ad-hoc extra field
    # Process word info (leaf nodes only; internal nodes have word=None)
    node.word_split = [character_to_index[character] for character in word] if word else []
    node.word_index = word_to_index[word] if word else -1
    node.word = word if word else ""  # YOLO: ad-hoc extra field
    # Process head info
    node.head_split = [character_to_index[character] for character in head]
    node.head_index = word_to_index[head]
    node.head = head  # YOLO: ad-hoc extra field
    # Process ne info; only count at the topmost node of a same-span chain
    # so unary chains are not double-counted.
    node.under_ne = under_ne
    node.ne = ne
    if ne != "NONE":
        under_ne = True
        if not node.parent or node.parent.span != span:
            ne_count[ne] += 1
            pos_ne_count[pos] += 1
    # Dead Python-2 debug snippet, kept verbatim as a no-op string literal.
    """
    if hasattr(tree, "head"):
        print " ".join(text_raw_data)
        print " ".join(text_raw_data[span[0]:span[1]])
        print ne
        print node.parent.head
        raw_input()
    """
    # Process span info
    node.span = span
    node.span_length = span[1] - span[0]
    span_to_node[span] = node
    # Process lexicon info: one indicator per lexicon; matching lexicon
    # entries also have their own frequency counter bumped.
    node.lexicon_hit = [0] * len(lexicon_list)
    hits = 0
    for index, lexicon in enumerate(lexicon_list):
        if constituent in lexicon:
            lexicon[constituent] += 1
            node.lexicon_hit[index] = 1
            hits = 1
    lexicon_hits[0] += hits
    # Binarize children: fold all but one side child into a new intermediate
    # node, keeping the subtree that contains the head word intact.
    if len(tree.subtrees) > 2:
        side_child_pos = tree.subtrees[-1].label
        side_child_span = tree.subtrees[-1].span
        side_child_head = head_raw_data[(side_child_span, side_child_pos)][1]
        if side_child_head != head:
            # Last child is not the head child: merge everything before it.
            sub_subtrees = tree.subtrees[:-1]
        else:
            # Last child is the head child: merge everything after the first.
            sub_subtrees = tree.subtrees[1:]
        new_span = (sub_subtrees[0].span[0], sub_subtrees[-1].span[1])
        new_tree = PSTree(label=pos, span=new_span,
                          subtrees=sub_subtrees)
        new_tree.head = head
        if side_child_head != head:
            tree.subtrees = [new_tree, tree.subtrees[-1]]
        else:
            tree.subtrees = [tree.subtrees[0], new_tree]
    # Process children
    nodes = 1
    for subtree in tree.subtrees:
        child = Node()
        node.add_child(child)
        child_nodes = construct_node(child, subtree, ner_raw_data,
                                     head_raw_data, text_raw_data,
                                     character_to_index, word_to_index,
                                     pos_to_index, lexicon_list, pos_count,
                                     ne_count, pos_ne_count, lexicon_hits,
                                     span_to_node, under_ne)
        nodes += child_nodes
    return nodes
def dependency_to_constituency(word_list, pos_list, head_list, relation_list, index):
    """Convert the dependency subtree rooted at token *index* into a binary
    constituency tree of Node objects and return its root.

    NOTE(review): duplicates an earlier identical definition in this file.
    """
    # Leaf node for the token itself; it is its own head word.
    leaf = Node()
    leaf.word = word_list[index]
    leaf.pos = pos_list[index]
    leaf.span = (index, index+1)
    leaf.head = leaf.word
    root = leaf
    # Fold in each dependent (head_list[index] presumably lists the indices
    # of this token's dependents -- TODO confirm against the caller).
    for child_index in head_list[index]:
        child_root = dependency_to_constituency(word_list, pos_list, head_list, relation_list, child_index)
        new_root = Node()
        new_root.word = None
        new_root.pos = relation_list[child_index]  # relation label as pseudo-POS
        if child_index < index:
            # Left dependent: its subtree precedes the accumulated tree.
            new_root.span = (child_root.span[0], root.span[1])
            new_root.add_child(child_root)
            new_root.add_child(root)
        else:
            # Right dependent: the accumulated tree comes first.
            new_root.span = (root.span[0], child_root.span[1])
            new_root.add_child(root)
            new_root.add_child(child_root)
        new_root.head = root.head  # head word propagates upward
        root = new_root
    return root
def create_dense_nodes(ner_raw_data, text_raw_data, pos_to_index, lexicon_list, pos_count, ne_count, pos_ne_count, lexicon_hits, span_to_node):
    """Create Node objects for spans of length 2..max_dense_span not covered
    by the parse.

    Each dense node links its two overlapping sub-spans as children, carries
    no word/head information (dummy "NONE" POS), and is tagged with NER and
    lexicon-membership features. Mutates pos_count, ne_count, pos_ne_count,
    lexicon_hits and span_to_node in place. Returns the new nodes.
    """
    node_list = []
    max_dense_span = 3
    # Start from bigram, since all unigrams are already covered by parses
    for span_length in range(2, 1+max_dense_span):
        for span_start in range(0, 1+len(text_raw_data)-span_length):
            span = (span_start, span_start+span_length)
            if span in span_to_node:
                continue  # already produced by the constituency parse
            pos = "NONE"
            ne = ner_raw_data[span] if span in ner_raw_data else "NONE"
            constituent = " ".join(text_raw_data[span[0]:span[1]]).lower()
            # span, child
            # TODO: sibling
            node = Node(family=1)  # family=1 presumably marks dense nodes -- TODO confirm
            node_list.append(node)
            node.span = span
            node.span_length = span_length
            span_to_node[span] = node
            # Children: the two sub-spans one token shorter (always present,
            # since shorter spans were filled in earlier passes or the parse).
            node.child_list = [span_to_node[(span[0],span[1]-1)], span_to_node[(span[0]+1,span[1])]]
            # word, head, pos: dense nodes have no surface form.
            node.pos_index = pos_to_index[pos]
            pos_count[pos] += 1
            node.word_split = []
            node.word_index = -1
            node.head_split = []
            node.head_index = -1
            # ne
            node.ne = ne
            if ne != "NONE":
                ne_count[ne] += 1
                pos_ne_count[pos] += 1
            # lexicon indicator features; matching lexicon entries also have
            # their own frequency counter bumped.
            node.lexicon_hit = [0] * len(lexicon_list)
            hits = 0
            for index, lexicon in enumerate(lexicon_list):
                if constituent in lexicon:
                    lexicon[constituent] += 1
                    node.lexicon_hit[index] = 1
                    hits = 1
            lexicon_hits[0] += hits
    return node_list