# Assumed module-level imports for this section; the project helpers
# (read_list_file, extract_ner, traverse_tree, traverse_pyramid, log,
# extract_pos_from_tree, get_tree_data, label_tree_data) and configuration
# globals (dataset, split_raw, split_sentence, split_dependency,
# lexicon_meta_list, parse_script, syntaxnet_path, ne_file, word_file,
# character_file, pos_file) are defined elsewhere in the module.
import os
import codecs
import subprocess
from collections import defaultdict

import dependency_utils


def extract_clean_lexicon():
    lexicon_list = []

    # Read raw lexicons
    print "\nReading raw lexicons..."
    for meta in lexicon_meta_list:
        lexicon_list.append(read_list_file(meta["raw"], encoding=meta["encoding"])[1])

    print "-" * 50 + "\n ne phrases shortest\n" + "-" * 50
    for index, lexicon in enumerate(lexicon_list):
        for phrase in lexicon:
            lexicon[phrase] = [0., 0.]
        shortest_phrase = min(lexicon.iterkeys(), key=lambda phrase: len(phrase))
        print "%5s %8d %s" % (lexicon_meta_list[index]["ne"], len(lexicon), shortest_phrase)

    # Read training data
    log("\nReading training data...")
    data_split_list = ["train", "validate"]
    sentence_data = {}
    ner_data = {}
    parse_data = {}
    for split in data_split_list:
        sentence_data[split], ner_data[split] = extract_ner(split)
        dependency_file = os.path.join(dataset, split_dependency[split])
        dependency_parse_list = dependency_utils.read_conllu(dependency_file)
        parse_data[split] = [dependency_utils.dependency_to_constituency(*parse)
                             for parse in dependency_parse_list]
    log(" done\n")

    # Accumulate per-phrase counts against the training spans
    log("\nCleaning lexicon by training data...")
    for split in data_split_list:
        for index, parse in enumerate(parse_data[split]):
            span_set = set()
            traverse_tree(parse, ner_data[split][index], sentence_data[split][index],
                          lexicon_list, span_set)
            traverse_pyramid(ner_data[split][index], sentence_data[split][index],
                             lexicon_list, span_set)
    log(" done\n")

    # Drop phrases whose hit ratio (count[1]/count[0]) in the training data is below 10%
    print "-" * 50 + "\n ne phrases shortest\n" + "-" * 50
    for index, lexicon in enumerate(lexicon_list):
        for phrase, count in lexicon.items():
            if count[0] > 0 and count[1] / count[0] < 0.1:
                del lexicon[phrase]
        shortest_phrase = min(lexicon.iterkeys(), key=lambda phrase: len(phrase))
        print "%5s %8d %s" % (lexicon_meta_list[index]["ne"], len(lexicon), shortest_phrase)

    # Write the cleaned lexicons
    for index, lexicon in enumerate(lexicon_list):
        meta = lexicon_meta_list[index]
        with codecs.open(meta["clean"], "w", encoding=meta["encoding"]) as f:
            for phrase in sorted(lexicon.iterkeys()):
                f.write("%s\n" % phrase)
    return
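
# For reference: the lookups above imply that each entry of lexicon_meta_list is a
# dict with "ne", "raw", "clean", and "encoding" keys. The values below are purely
# illustrative placeholders, not the project's actual configuration:
#
#     lexicon_meta_list = [
#         {
#             "ne": "PERSON",                       # named-entity type the lexicon covers
#             "raw": "lexicon/person_raw.txt",      # hypothetical raw phrase list
#             "clean": "lexicon/person_clean.txt",  # hypothetical output of extract_clean_lexicon()
#             "encoding": "utf8",                   # encoding passed to read_list_file()
#         },
#     ]
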
def prepare_dataset():
    ne_set = set()
    word_set = set()
    character_set = set()
    pos_set = set()

    for split in split_raw:
        sentence_list, ner_list = extract_ner(split)

        # Process raw NER
        for ner in ner_list:
            for ne in ner.itervalues():
                ne_set.add(ne)

        # Process raw sentences and store them in CoNLL-U format
        sentence_file = os.path.join(dataset, split_sentence[split])
        with open(sentence_file, "w") as f:
            for sentence in sentence_list:
                f.write("#" + " ".join(sentence) + "\n")
                for i, word in enumerate(sentence):
                    f.write("%d\t" % (i + 1) + word + "\t_" * 8 + "\n")
                    word_set.add(word)
                    for character in word:
                        character_set.add(character)
                f.write("\n")

        # Generate dependency parses
        subprocess.call([parse_script, split], cwd=syntaxnet_path)

        # Transform dependency parses to constituency parses
        dependency_file = os.path.join(dataset, split_dependency[split])
        dependency_list = dependency_utils.read_conllu(dependency_file)
        for dependency_parse in dependency_list:
            constituency_parse = dependency_utils.dependency_to_constituency(*dependency_parse)
            extract_pos_from_tree(constituency_parse, pos_set)

    with open(ne_file, "w") as f:
        for ne in sorted(ne_set):
            f.write(ne + '\n')
    with open(word_file, "w") as f:
        for word in sorted(word_set):
            f.write(word + '\n')
    with open(character_file, "w") as f:
        for character in sorted(character_set):
            f.write(character + '\n')
    with open(pos_file, "w") as f:
        for pos in sorted(pos_set):
            f.write(pos + '\n')
    return
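
# The sentence files written above use a minimal 10-column CoNLL-U layout: a "#"
# comment line holding the raw sentence, then one tab-separated line per token with
# only the index and form filled in. For a hypothetical sentence "John loves Mary"
# the loop produces (tabs shown as spaces here):
#
#     #John loves Mary
#     1   John    _   _   _   _   _   _   _   _
#     2   loves   _   _   _   _   _   _   _   _
#     3   Mary    _   _   _   _   _   _   _   _
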
def read_dataset(data_split_list=["train", "validate", "test"]):
    # Read all raw data
    sentence_data = {}
    ner_data = {}
    parse_data = {}
    for split in data_split_list:
        sentence_data[split], ner_data[split] = extract_ner(split)
        dependency_file = os.path.join(dataset, split_dependency[split])
        dependency_parse_list = dependency_utils.read_conllu(dependency_file)
        parse_data[split] = [dependency_utils.dependency_to_constituency(*parse)
                             for parse in dependency_parse_list]

    # Read lists of annotations
    character_list, character_to_index = read_list_file(character_file)
    word_list, word_to_index = read_list_file(word_file)
    pos_list, pos_to_index = read_list_file(pos_file)
    ne_list, ne_to_index = read_list_file(ne_file)
    pos_to_index["NONE"] = len(pos_to_index)

    # Read lexicon
    lexicon_list = []
    for meta in lexicon_meta_list:
        lexicon_list.append(read_list_file(meta["raw"], encoding=meta["encoding"])[1])
        #lexicon_list.append(read_list_file(meta["clean"], encoding=meta["encoding"])[1])
    for lexicon in lexicon_list:
        for phrase in lexicon:
            lexicon[phrase] = 0

    # Build a tree structure for each sentence
    data = {}
    word_count = {}
    pos_count = {}
    ne_count = {}
    pos_ne_count = {}
    lexicon_hits = {}
    for split in data_split_list:
        (tree_pyramid_list, word_count[split], pos_count[split], ne_count[split],
         pos_ne_count[split], lexicon_hits[split]) = get_tree_data(
            sentence_data[split], parse_data[split], ner_data[split],
            character_to_index, word_to_index, pos_to_index, lexicon_list)
        data[split] = {"tree_pyramid_list": tree_pyramid_list,
                       "ner_list": ner_data[split]}

    for index, lexicon in enumerate(lexicon_list):
        with codecs.open("tmp_%d.txt" % index, "w", encoding="utf8") as f:
            for name, count in sorted(lexicon.iteritems(), key=lambda x: (-x[1], x[0])):
                if count == 0:
                    break
                f.write("%9d %s\n" % (count, name))

    # Show statistics of each data split
    print "-" * 80
    print "%10s%10s%9s%9s%7s%12s%13s" % ("split", "sentence", "token", "node", "NE",
                                         "spanned_NE", "lexicon_hit")
    print "-" * 80
    for split in data_split_list:
        print "%10s%10d%9d%9d%7d%12d%13d" % (split,
            len(data[split]["tree_pyramid_list"]),
            word_count[split],
            sum(pos_count[split].itervalues()),
            sum(len(ner) for ner in data[split]["ner_list"]),
            sum(ne_count[split].itervalues()),
            lexicon_hits[split])

    # Show POS distribution
    total_pos_count = defaultdict(lambda: 0)
    for split in data_split_list:
        for pos in pos_count[split]:
            total_pos_count[pos] += pos_count[split][pos]
    nodes = sum(total_pos_count.itervalues())
    print "\nTotal %d nodes" % nodes
    print "-" * 80 + "\n POS count ratio\n" + "-" * 80
    for pos, count in sorted(total_pos_count.iteritems(), key=lambda x: x[1], reverse=True)[:10]:
        print "%6s %7d %5.1f%%" % (pos, count, count * 100. / nodes)

    # Show NE distribution in [train, validate]
    total_ne_count = defaultdict(lambda: 0)
    for split in data_split_list:
        if split == "test":
            continue
        for ne in ne_count[split]:
            total_ne_count[ne] += ne_count[split][ne]
    nes = sum(total_ne_count.itervalues())
    print "\nTotal %d spanned named entities in [train, validate]" % nes
    print "-" * 80 + "\n NE count ratio\n" + "-" * 80
    for ne, count in sorted(total_ne_count.iteritems(), key=lambda x: x[1], reverse=True):
        print "%12s %6d %5.1f%%" % (ne, count, count * 100. / nes)

    # Show POS-NE distribution in [train, validate]
    total_pos_ne_count = defaultdict(lambda: 0)
    for split in data_split_list:
        if split == "test":
            continue
        for pos in pos_ne_count[split]:
            total_pos_ne_count[pos] += pos_ne_count[split][pos]
    print "-" * 80 + "\n POS NE total ratio\n" + "-" * 80
    for pos, count in sorted(total_pos_ne_count.iteritems(), key=lambda x: x[1], reverse=True)[:10]:
        total = total_pos_count[pos]
        print "%6s %6d %7d %5.1f%%" % (pos, count, total, count * 100. / total)

    # Compute the mapping to labels
    ne_to_index["NONE"] = len(ne_to_index)

    # Add label to nodes
    for split in data_split_list:
        for tree, pyramid in data[split]["tree_pyramid_list"]:
            label_tree_data(tree, pos_to_index, ne_to_index)
            for node in pyramid:
                node.y = ne_to_index[node.ne]

    return (data, word_list, ne_list, len(character_to_index), len(pos_to_index),
            len(ne_to_index), len(lexicon_list))
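
# A minimal driver sketch, not part of the original module: the real entry point and
# any command-line handling are not shown in this section. It assumes the raw corpus,
# lexicon files, and SyntaxNet paths referenced by the module-level globals are in place.
if __name__ == "__main__":
    prepare_dataset()          # write CoNLL-U sentences, run the parser, dump vocabulary files
    extract_clean_lexicon()    # filter the raw lexicons against train/validate NE spans
    data, word_list, ne_list, n_char, n_pos, n_ne, n_lexicon = read_dataset()
    print "Loaded %d training sentences" % len(data["train"]["tree_pyramid_list"])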