Example #1: extract_clean_lexicon() reads the raw lexicons, counts how often each phrase is matched in the train/validate data, removes phrases with a low positive-match ratio, and writes the cleaned lexicons to the "clean" files named in lexicon_meta_list.
def extract_clean_lexicon():
    lexicon_list = []

    print "\nReading raw lexicons..."
    for meta in lexicon_meta_list:
        lexicon_list.append(
            read_list_file(meta["raw"], encoding=meta["encoding"])[1])

    print "-" * 50 + "\n   ne  phrases shortest\n" + "-" * 50
    for index, lexicon in enumerate(lexicon_list):
        for phrase in lexicon:
            # Initialise per-phrase counters; they are filled in while the
            # training parses are traversed and consumed by the ratio filter below
            lexicon[phrase] = [0., 0.]
        shortest_phrase = min(lexicon.iterkeys(),
                              key=lambda phrase: len(phrase))
        print "%5s %8d %s" % (lexicon_meta_list[index]["ne"], len(lexicon),
                              shortest_phrase)

    log("\nReading training data...")
    data_split_list = ["train", "validate"]
    sentence_data = {}
    ner_data = {}
    parse_data = {}
    for split in data_split_list:
        sentence_data[split], ner_data[split] = extract_ner(split)

        dependency_file = os.path.join(dataset, split_dependency[split])
        dependency_parse_list = dependency_utils.read_conllu(dependency_file)
        parse_data[split] = [
            dependency_utils.dependency_to_constituency(*parse)
            for parse in dependency_parse_list
        ]
    log(" done\n")

    log("\nCleaning lexicon by training data...")
    for split in data_split_list:
        for index, parse in enumerate(parse_data[split]):
            span_set = set()
            traverse_tree(parse, ner_data[split][index],
                          sentence_data[split][index], lexicon_list, span_set)
            traverse_pyramid(ner_data[split][index],
                             sentence_data[split][index], lexicon_list,
                             span_set)
    log(" done\n")

    print "-" * 50 + "\n   ne  phrases shortest\n" + "-" * 50
    for index, lexicon in enumerate(lexicon_list):
        # Drop phrases that were matched in the training data (count[0] > 0)
        # but whose ratio count[1] / count[0] stays below 10%
        for phrase, count in lexicon.items():
            if count[0] > 0 and count[1] / count[0] < 0.1:
                del lexicon[phrase]
        shortest_phrase = min(lexicon.iterkeys(),
                              key=lambda phrase: len(phrase))
        print "%5s %8d %s" % (lexicon_meta_list[index]["ne"], len(lexicon),
                              shortest_phrase)

    for index, lexicon in enumerate(lexicon_list):
        meta = lexicon_meta_list[index]
        with codecs.open(meta["clean"], "w", encoding=meta["encoding"]) as f:
            for phrase in sorted(lexicon.iterkeys()):
                f.write("%s\n" % phrase)
    return
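
The cleaning step keeps a phrase only if it was never matched (count[0] == 0) or if its positive-match ratio is at least 10%. Below is a minimal, self-contained sketch of that filter; the filter_lexicon helper and the toy counts are illustrative and not part of the original module.

def filter_lexicon(lexicon, min_ratio=0.1):
    # Keep phrases that were never matched, or whose ratio count[1]/count[0]
    # reaches min_ratio; this mirrors the deletion criterion used above.
    kept = {}
    for phrase, count in lexicon.items():
        if count[0] > 0 and count[1] / count[0] < min_ratio:
            continue
        kept[phrase] = count
    return kept

# Toy counts: "bank" is matched 20 times but confirmed only once, so it is dropped.
toy = {"bank": [20., 1.], "taipei 101": [3., 3.], "unseen phrase": [0., 0.]}
print(sorted(filter_lexicon(toy)))  # ['taipei 101', 'unseen phrase']

Only the thresholding logic is isolated here; in the module itself the counters are populated while traverse_tree and traverse_pyramid walk the training parses.
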
Example #2: prepare_dataset() extracts the NER annotations, writes each split's sentences as CoNLL-U skeletons, runs the SyntaxNet parser, converts the resulting dependency parses to constituency trees, and saves the NE, word, character, and POS inventories.
def prepare_dataset():
    ne_set = set()
    word_set = set()
    character_set = set()
    pos_set = set()

    for split in split_raw:
        sentence_list, ner_list = extract_ner(split)

        # Process raw NER annotations
        for ner in ner_list:
            for ne in ner.itervalues():
                ne_set.add(ne)

        # Process raw sentences and store them in CoNLL-U format
        sentence_file = os.path.join(dataset, split_sentence[split])
        with open(sentence_file, "w") as f:
            for sentence in sentence_list:
                f.write("#" + " ".join(sentence) + "\n")
                for i, word in enumerate(sentence):
                    f.write("%d\t" % (i + 1) + word + "\t_" * 8 + "\n")
                    word_set.add(word)
                    for character in word:
                        character_set.add(character)
                f.write("\n")

        # Generate dependency parses
        subprocess.call([parse_script, split], cwd=syntaxnet_path)

        # Transform dependency parses to constituency parses
        dependency_file = os.path.join(dataset, split_dependency[split])
        dependency_list = dependency_utils.read_conllu(dependency_file)
        for dependency_parse in dependency_list:
            constituency_parse = dependency_utils.dependency_to_constituency(
                *dependency_parse)
            extract_pos_from_tree(constituency_parse, pos_set)

    with open(ne_file, "w") as f:
        for ne in sorted(ne_set):
            f.write(ne + '\n')

    with open(word_file, "w") as f:
        for word in sorted(word_set):
            f.write(word + '\n')

    with open(character_file, "w") as f:
        for character in sorted(character_set):
            f.write(character + '\n')

    with open(pos_file, "w") as f:
        for pos in sorted(pos_set):
            f.write(pos + '\n')
    return
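
prepare_dataset writes each sentence as a bare CoNLL-U skeleton: a comment line with the raw text, then one 10-column row per token in which only the ID and FORM columns are filled and the remaining eight columns are "_" placeholders. A small illustration follows; the write_conllu_skeleton helper and the toy sentence are assumptions made for this sketch, not names from the module.

import sys

def write_conllu_skeleton(f, sentence):
    # Comment line holding the raw sentence, as in prepare_dataset() above
    f.write("#" + " ".join(sentence) + "\n")
    for i, word in enumerate(sentence):
        # ID and FORM, then eight "_" placeholders (LEMMA .. MISC) for a 10-column row
        f.write("%d\t" % (i + 1) + word + "\t_" * 8 + "\n")
    f.write("\n")  # a blank line terminates the sentence block

write_conllu_skeleton(sys.stdout, ["Taipei", "101", "opened", "in", "2004"])

Only the dependency file produced by the SyntaxNet call is read back later, via dependency_utils.read_conllu.
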
Example #3: read_dataset() loads all splits, builds a (tree, pyramid) structure for every sentence, prints corpus statistics, and returns the labelled data together with the vocabulary sizes.
def read_dataset(data_split_list=["train", "validate", "test"]):
    # Read all raw data
    sentence_data = {}
    ner_data = {}
    parse_data = {}
    for split in data_split_list:
        sentence_data[split], ner_data[split] = extract_ner(split)

        dependency_file = os.path.join(dataset, split_dependency[split])
        dependency_parse_list = dependency_utils.read_conllu(dependency_file)
        parse_data[split] = [
            dependency_utils.dependency_to_constituency(*parse)
            for parse in dependency_parse_list
        ]

    # Read lists of annotations
    character_list, character_to_index = read_list_file(character_file)
    word_list, word_to_index = read_list_file(word_file)
    pos_list, pos_to_index = read_list_file(pos_file)
    ne_list, ne_to_index = read_list_file(ne_file)

    pos_to_index["NONE"] = len(pos_to_index)

    # Read lexicon
    lexicon_list = []
    for meta in lexicon_meta_list:
        lexicon_list.append(
            read_list_file(meta["raw"], encoding=meta["encoding"])[1])
        # Alternatively, load the cleaned lexicons written by extract_clean_lexicon():
        # lexicon_list.append(read_list_file(meta["clean"], encoding=meta["encoding"])[1])

    # Re-initialise each lexicon as a phrase -> hit-count map; the counts are
    # written to tmp_<index>.txt after the trees are built
    for lexicon in lexicon_list:
        for phrase in lexicon:
            lexicon[phrase] = 0

    # Build a tree structure for each sentence
    data = {}
    word_count = {}
    pos_count = {}
    ne_count = {}
    pos_ne_count = {}
    lexicon_hits = {}
    for split in data_split_list:
        (tree_pyramid_list, word_count[split], pos_count[split],
         ne_count[split],
         pos_ne_count[split], lexicon_hits[split]) = get_tree_data(
             sentence_data[split], parse_data[split], ner_data[split],
             character_to_index, word_to_index, pos_to_index, lexicon_list)
        data[split] = {
            "tree_pyramid_list": tree_pyramid_list,
            "ner_list": ner_data[split]
        }

    for index, lexicon in enumerate(lexicon_list):
        with codecs.open("tmp_%d.txt" % index, "w", encoding="utf8") as f:
            for name, count in sorted(lexicon.iteritems(),
                                      key=lambda x: (-x[1], x[0])):
                if count == 0: break
                f.write("%9d %s\n" % (count, name))

    # Show statistics of each data split
    print "-" * 80
    print "%10s%10s%9s%9s%7s%12s%13s" % ("split", "sentence", "token", "node",
                                         "NE", "spanned_NE", "lexicon_hit")
    print "-" * 80
    for split in data_split_list:
        print "%10s%10d%9d%9d%7d%12d%13d" % (
            split, len(data[split]["tree_pyramid_list"]), word_count[split],
            sum(pos_count[split].itervalues()),
            sum(len(ner) for ner in data[split]["ner_list"]),
            sum(ne_count[split].itervalues()), lexicon_hits[split])

    # Show POS distribution
    total_pos_count = defaultdict(lambda: 0)
    for split in data_split_list:
        for pos in pos_count[split]:
            total_pos_count[pos] += pos_count[split][pos]
    nodes = sum(total_pos_count.itervalues())
    print "\nTotal %d nodes" % nodes
    print "-" * 80 + "\n   POS   count  ratio\n" + "-" * 80
    for pos, count in sorted(total_pos_count.iteritems(),
                             key=lambda x: x[1],
                             reverse=True)[:10]:
        print "%6s %7d %5.1f%%" % (pos, count, count * 100. / nodes)

    # Show NE distribution in [train, validate]
    total_ne_count = defaultdict(lambda: 0)
    for split in data_split_list:
        if split == "test": continue
        for ne in ne_count[split]:
            total_ne_count[ne] += ne_count[split][ne]
    nes = sum(total_ne_count.itervalues())
    print "\nTotal %d spanned named entities in [train, validate]" % nes
    print "-" * 80 + "\n          NE  count  ratio\n" + "-" * 80
    for ne, count in sorted(total_ne_count.iteritems(),
                            key=lambda x: x[1],
                            reverse=True):
        print "%12s %6d %5.1f%%" % (ne, count, count * 100. / nes)

    # Show POS-NE distribution in [train, validate]
    total_pos_ne_count = defaultdict(lambda: 0)
    for split in data_split_list:
        if split == "test": continue
        for pos in pos_ne_count[split]:
            total_pos_ne_count[pos] += pos_ne_count[split][pos]
    print "-" * 80 + "\n   POS     NE   total  ratio\n" + "-" * 80
    for pos, count in sorted(total_pos_ne_count.iteritems(),
                             key=lambda x: x[1],
                             reverse=True)[:10]:
        total = total_pos_count[pos]
        print "%6s %6d %7d %5.1f%%" % (pos, count, total, count * 100. / total)

    # Extend the NE-to-index mapping with a NONE label for non-entity nodes
    ne_to_index["NONE"] = len(ne_to_index)

    # Add label to nodes
    for split in data_split_list:
        for tree, pyramid in data[split]["tree_pyramid_list"]:
            label_tree_data(tree, pos_to_index, ne_to_index)
            for node in pyramid:
                node.y = ne_to_index[node.ne]

    return (data, word_list, ne_list, len(character_to_index),
            len(pos_to_index), len(ne_to_index), len(lexicon_list))
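
The POS, NE, and POS-NE tables above are all produced with the same pattern: merge the per-split count dictionaries into one defaultdict, then sort by frequency. A self-contained sketch of that aggregation with toy counts; per_split_count and its contents are illustrative, not data from the module.

from collections import defaultdict

# Toy per-split POS counts standing in for pos_count["train"] and pos_count["validate"]
per_split_count = {
    "train": {"NN": 120, "VB": 45, "JJ": 30},
    "validate": {"NN": 40, "VB": 15},
}

total_pos_count = defaultdict(int)
for split in per_split_count:
    for pos, count in per_split_count[split].items():
        total_pos_count[pos] += count

nodes = sum(total_pos_count.values())
print("Total %d nodes" % nodes)
for pos, count in sorted(total_pos_count.items(), key=lambda x: x[1], reverse=True):
    print("%6s %7d %5.1f%%" % (pos, count, count * 100. / nodes))

read_dataset itself relies on the Python 2 idioms defaultdict(lambda: 0) and iteritems(); the sketch uses version-neutral equivalents of the same aggregation.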