Ejemplo n.º 1
0
def get_tree_data_new(raw_data, character_to_index, word_to_index,
                      pos_to_index, lexicon_list):
    log("get_tree_data()...")
    """ Get tree structured data from CoNLL 2012

    Stores into Node data structure
    """
    tree_pyramid_list = []
    ner_list = []
    word_count = 0
    pos_count = defaultdict(lambda: 0)
    ne_count = defaultdict(lambda: 0)
    pos_ne_count = defaultdict(lambda: 0)
    lexicon_hits = [0]

    for document in raw_data["gold"]:

        for part in raw_data["gold"][document]:
            ner_raw_data = defaultdict(lambda: {})

            for k, v in raw_data["gold"][document][part]["ner"].items():
                ner_raw_data[k[0]][(k[1], k[2])] = v

            for index, parse in enumerate(
                    raw_data["gold"][document][part]["parses"]):
                text_raw_data = raw_data["gold"][document][part]["text"][index]
                word_count += len(text_raw_data)

                if parse.subtrees[0].label == "NOPARSE": continue
                head_raw_data = raw_data["gold"][document][part]["heads"][
                    index]

                root_node = Node()
                span_to_node = {}
                nodes = construct_node(root_node, parse, ner_raw_data[index],
                                       head_raw_data, text_raw_data,
                                       character_to_index, word_to_index,
                                       pos_to_index, lexicon_list, pos_count,
                                       ne_count, pos_ne_count, lexicon_hits,
                                       span_to_node, False)
                root_node.nodes = nodes
                root_node.text_raw_data = text_raw_data  # YOLO

                additional_node_list = []
                """
                additional_node_list = create_dense_nodes(
                    ner_raw_data[index], text_raw_data,
                    pos_to_index, lexicon_list,
                    pos_count, ne_count, pos_ne_count, lexicon_hits, span_to_node)
                """
                tree_pyramid_list.append((root_node, additional_node_list))
                ner_list.append(ner_raw_data[index])

    log(" %d sentences\n" % len(tree_pyramid_list))
    return (tree_pyramid_list, ner_list, word_count, pos_count, ne_count,
            pos_ne_count, lexicon_hits[0])
Ejemplo n.º 2
0
def get_tree_data(raw_data, character_to_index, word_to_index, pos_to_index, lexicon_list):
    log("get_tree_data()...")
    """ Get tree structured data from CoNLL 2012
    
    Stores into Node data structure
    """
    tree_pyramid_list = []
    ner_list = []
    word_count = 0
    pos_count = defaultdict(lambda: 0)
    ne_count = defaultdict(lambda: 0)
    pos_ne_count = defaultdict(lambda: 0)
    lexicon_hits = [0]
    
    for document in raw_data["auto"]:
        for part in raw_data["auto"][document]:
            
            ner_raw_data = defaultdict(lambda: {})
            for k, v in raw_data["gold"][document][part]["ner"].iteritems():
                ner_raw_data[k[0]][(k[1], k[2])] = v
            
            for index, parse in enumerate(raw_data["auto"][document][part]["parses"]):
                text_raw_data = raw_data["auto"][document][part]["text"][index]
                word_count += len(text_raw_data)
                
                if parse.subtrees[0].label == "NOPARSE": continue
                head_raw_data = raw_data["auto"][document][part]["heads"][index]
                
                root_node = Node()
                span_to_node = {}
                nodes = construct_node(
                   root_node, parse, ner_raw_data[index], head_raw_data, text_raw_data,
                   character_to_index, word_to_index, pos_to_index, lexicon_list,
                   pos_count, ne_count, pos_ne_count, lexicon_hits, span_to_node, False)
                root_node.nodes = nodes
                root_node.text_raw_data = text_raw_data #YOLO
                
                additional_node_list = []
                """
                additional_node_list = create_dense_nodes(
                    ner_raw_data[index], text_raw_data,
                    pos_to_index, lexicon_list,
                    pos_count, ne_count, pos_ne_count, lexicon_hits, span_to_node)
                """
                tree_pyramid_list.append((root_node, additional_node_list))
                ner_list.append(ner_raw_data[index])
                
    log(" %d sentences\n" % len(tree_pyramid_list))
    return (tree_pyramid_list, ner_list, word_count, pos_count, ne_count, pos_ne_count,
        lexicon_hits[0])
Ejemplo n.º 3
0
def get_tree_data(raw_data, character_to_index, word_to_index, pos_to_index):
    log("get_tree_data()...")
    """ Get tree structured data from CoNLL 2012
    
    Stores into Node data structure
    """
    tree_list = []
    ner_list = []
    word_count = 0
    pos_count = defaultdict(lambda: 0)
    ne_count = defaultdict(lambda: 0)
    pos_ne_count = defaultdict(lambda: 0)

    for document in raw_data["auto"]:
        for part in raw_data["auto"][document]:

            ner_raw_data = defaultdict(lambda: {})
            for k, v in raw_data["gold"][document][part]["ner"].iteritems():
                ner_raw_data[k[0]][(k[1], k[2])] = v

            for index, parse in enumerate(
                    raw_data["auto"][document][part]["parses"]):
                text_raw_data = raw_data["auto"][document][part]["text"][index]
                word_count += len(text_raw_data)

                if parse.subtrees[0].label == "NOPARSE": continue
                head_raw_data = raw_data["auto"][document][part]["heads"][
                    index]

                root_node = Node()
                nodes = construct_node(root_node, parse, ner_raw_data[index],
                                       head_raw_data, text_raw_data,
                                       character_to_index, word_to_index,
                                       pos_to_index, pos_count, ne_count,
                                       pos_ne_count)
                root_node.nodes = nodes

                tree_list.append(root_node)
                ner_list.append(ner_raw_data[index])

    log(" %d sentences\n" % len(tree_list))
    return tree_list, ner_list, word_count, pos_count, ne_count, pos_ne_count
Ejemplo n.º 4
0
def get_tree_data(sentence_list, parse_list, ner_list, character_to_index,
                  word_to_index, pos_to_index, index_to_lexicon):
    log("get_tree_data()...")
    """ Get tree structured data from CoNLL-2003
    
    Stores into Node data structure
    """
    tree_pyramid_list = []
    word_count = 0
    pos_count = defaultdict(lambda: 0)
    ne_count = defaultdict(lambda: 0)
    pos_ne_count = defaultdict(lambda: 0)
    lexicon_hits = [0]

    for index, parse in enumerate(parse_list):
        text_raw_data = sentence_list[index]
        word_count += len(text_raw_data)
        span_to_node = {}
        head_raw_data = head_finder.collins_find_heads(parse)

        root_node = Node()
        nodes = construct_node(root_node, parse, ner_list[index],
                               head_raw_data, text_raw_data,
                               character_to_index, word_to_index, pos_to_index,
                               index_to_lexicon, pos_count, ne_count,
                               pos_ne_count, lexicon_hits, span_to_node)
        root_node.nodes = nodes
        root_node.tokens = len(text_raw_data)

        additional_node_list = create_dense_nodes(ner_list[index],
                                                  text_raw_data, pos_to_index,
                                                  index_to_lexicon, pos_count,
                                                  ne_count, pos_ne_count,
                                                  lexicon_hits, span_to_node)

        tree_pyramid_list.append((root_node, additional_node_list))

    log(" %d sentences\n" % len(tree_pyramid_list))
    return tree_pyramid_list, word_count, pos_count, ne_count, pos_ne_count, lexicon_hits[
        0]
Ejemplo n.º 5
0
def get_tree_data(raw_data, character_to_index, word_to_index, pos_to_index):
    log("get_tree_data()...")
    """ Get tree structured data from CoNLL 2012
    
    Stores into Node data structure
    """
    tree_list = []
    ner_list = []
    word_count = 0
    pos_count = defaultdict(lambda: 0)
    ne_count = defaultdict(lambda: 0)
    pos_ne_count = defaultdict(lambda: 0)
    
    for document in raw_data["auto"]:
        for part in raw_data["auto"][document]:
            
            ner_raw_data = defaultdict(lambda: {})
            for k, v in raw_data["gold"][document][part]["ner"].iteritems():
                ner_raw_data[k[0]][(k[1], k[2])] = v
            
            for index, parse in enumerate(raw_data["auto"][document][part]["parses"]):
                text_raw_data = raw_data["auto"][document][part]["text"][index]
                word_count += len(text_raw_data)
                
                if parse.subtrees[0].label == "NOPARSE": continue
                head_raw_data = raw_data["auto"][document][part]["heads"][index]
                
                root_node = Node()
                nodes = construct_node(
                   root_node, parse, ner_raw_data[index], head_raw_data, text_raw_data,
                   character_to_index, word_to_index, pos_to_index,
                   pos_count, ne_count, pos_ne_count)
                root_node.nodes = nodes
                
                tree_list.append(root_node)
                ner_list.append(ner_raw_data[index])
                
    log(" %d sentences\n" % len(tree_list))
    return tree_list, ner_list, word_count, pos_count, ne_count, pos_ne_count
Ejemplo n.º 6
0
def get_tree_data(sentence_list, parse_list, ner_list,
        character_to_index, word_to_index, pos_to_index, index_to_lexicon):
    log("get_tree_data()...")
    """ Get tree structured data from CoNLL-2003
    
    Stores into Node data structure
    """
    tree_pyramid_list = []
    word_count = 0
    pos_count = defaultdict(lambda: 0)
    ne_count = defaultdict(lambda: 0)
    pos_ne_count = defaultdict(lambda: 0)
    lexicon_hits = [0]
    
    for index, parse in enumerate(parse_list):
        text_raw_data = sentence_list[index]
        word_count += len(text_raw_data)
        span_to_node = {}
        head_raw_data = head_finder.collins_find_heads(parse)
        
        root_node = Node()
        nodes = construct_node(
           root_node, parse, ner_list[index], head_raw_data, text_raw_data,
           character_to_index, word_to_index, pos_to_index, index_to_lexicon,
           pos_count, ne_count, pos_ne_count, lexicon_hits, span_to_node)
        root_node.nodes = nodes
        root_node.tokens = len(text_raw_data)
        
        additional_node_list = create_dense_nodes(
            ner_list[index], text_raw_data,
            pos_to_index, index_to_lexicon,
            pos_count, ne_count, pos_ne_count, lexicon_hits, span_to_node)
        
        tree_pyramid_list.append((root_node, additional_node_list))
        
    log(" %d sentences\n" % len(tree_pyramid_list))
    return tree_pyramid_list, word_count, pos_count, ne_count, pos_ne_count, lexicon_hits[0]