Example 1
def dependency_to_constituency(word_list, pos_list, head_list, relation_list,
                               index):
    """Recursively convert the dependency subtree rooted at word `index`
    into a binary constituency tree."""
    # Build the leaf node for the head word itself.
    leaf = Node()
    leaf.word = word_list[index]
    leaf.pos = pos_list[index]
    leaf.span = (index, index + 1)
    leaf.head = leaf.word

    # Fold each dependent's subtree into the tree, one binary node at a
    # time; head_list[index] lists the indices of `index`'s dependents,
    # assumed ordered so that the merged spans stay contiguous.
    root = leaf
    for child_index in head_list[index]:
        child_root = dependency_to_constituency(word_list, pos_list, head_list,
                                                relation_list, child_index)
        # The new internal node is labeled with the child's dependency relation.
        new_root = Node()
        new_root.word = None
        new_root.pos = relation_list[child_index]
        if child_index < index:
            # Left dependent: its span precedes the current root's span.
            new_root.span = (child_root.span[0], root.span[1])
            new_root.add_child(child_root)
            new_root.add_child(root)
        else:
            # Right dependent: its span follows the current root's span.
            new_root.span = (root.span[0], child_root.span[1])
            new_root.add_child(root)
            new_root.add_child(child_root)
        # The lexical head propagates up from the dependency head.
        new_root.head = root.head
        root = new_root

    return root
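
The `Node` class these examples rely on is not included in the snippets. A minimal sketch that supports everything the functions touch (plain attribute assignment, an `add_child` method, and the `family` keyword seen in the next example), plus a toy call on a three-word sentence, might look like the following; the sentence and its dependency structure are invented for illustration:

class Node:
    """Minimal stand-in for the Node class the snippets assume."""
    def __init__(self, family=0):
        self.family = family
        self.word = None
        self.pos = None
        self.span = None
        self.head = None
        self.child_list = []

    def add_child(self, child):
        self.child_list.append(child)

# Toy sentence "the cat sleeps": "the" -> "cat" (det), "cat" -> "sleeps" (nsubj).
words = ["the", "cat", "sleeps"]
pos = ["DT", "NN", "VBZ"]
children = [[], [0], [1]]             # children[i] = dependents of word i
relations = ["det", "nsubj", "root"]  # relations[i] = relation of word i to its head

tree = dependency_to_constituency(words, pos, children, relations, 2)
print(tree.span, tree.pos, tree.head)  # (0, 3) nsubj sleeps

The root of the returned tree spans the whole sentence, carries the relation of the last dependent folded in, and inherits the lexical head of the dependency root.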
Example 2
def create_dense_nodes(ner_raw_data, text_raw_data, pos_to_index, lexicon_list,
                       pos_count, ne_count, pos_ne_count, lexicon_hits,
                       span_to_node):
    """Create nodes for short spans the parse tree does not cover,
    updating the count dictionaries in place."""
    node_list = []
    max_dense_span = 3
    # Start from bigrams, since all unigrams are already covered by parses.
    for span_length in range(2, 1 + max_dense_span):
        for span_start in range(0, 1 + len(text_raw_data) - span_length):
            span = (span_start, span_start + span_length)
            if span in span_to_node:
                continue
            pos = "NONE"  # dense nodes carry no POS label
            ne = ner_raw_data[span] if span in ner_raw_data else "NONE"
            constituent = " ".join(text_raw_data[span[0]:span[1]]).lower()

            # span, child: link to the two overlapping sub-spans, which
            # always exist because shorter spans are processed first.
            # TODO: sibling
            node = Node(family=1)
            node_list.append(node)
            node.span = span
            node.span_length = span_length
            span_to_node[span] = node
            node.child_list = [
                span_to_node[(span[0], span[1] - 1)],
                span_to_node[(span[0] + 1, span[1])]
            ]

            # word, head, pos: dense nodes have no word or head of their own.
            node.pos_index = pos_to_index[pos]
            pos_count[pos] += 1
            node.word_split = []
            node.word_index = -1
            node.head_split = []
            node.head_index = -1

            # ne: count the span if it matches a gold named entity.
            node.ne = ne
            if ne != "NONE":
                ne_count[ne] += 1
                pos_ne_count[pos] += 1

            # lexicon: mark every lexicon that contains this span's text,
            # counting at most one hit per span overall.
            node.lexicon_hit = [0] * len(lexicon_list)
            hits = 0
            for index, lexicon in enumerate(lexicon_list):
                if constituent in lexicon:
                    lexicon[constituent] += 1
                    node.lexicon_hit[index] = 1
                    hits = 1
            lexicon_hits[0] += hits

    return node_list
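
A usage sketch, assuming the minimal `Node` above: the call below creates dense nodes for every uncovered bigram and trigram of a five-token sentence. The sentence, the NE span, and the one-entry lexicon are invented for illustration, and `Counter` is used for the count dictionaries:

from collections import Counter

text = ["New", "York", "City", "is", "big"]
ner = {(0, 3): "GPE"}                   # invented gold NE span
pos_to_index = {"NONE": 0}
lexicon_list = [{"new york city": 0}]   # one toy lexicon: phrase -> hit count
pos_count, ne_count, pos_ne_count = Counter(), Counter(), Counter()
lexicon_hits = [0]
# Pretend the parse only produced unigram nodes.
span_to_node = {(i, i + 1): Node() for i in range(len(text))}

dense = create_dense_nodes(ner, text, pos_to_index, lexicon_list,
                           pos_count, ne_count, pos_ne_count,
                           lexicon_hits, span_to_node)
print(len(dense))    # 7: four bigrams plus three trigrams
print(ne_count)      # Counter({'GPE': 1})
print(lexicon_hits)  # [1]: "new york city" matched the toy lexicon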