class Data:
    """Alphabets, gazetteer, corpora and hyperparameters for one experiment.

    The object is filled in stages: ``build_alphabet`` / ``build_gaz_file`` /
    ``build_gaz_alphabet`` populate the vocabularies, the ``build_*_pretrain_emb``
    methods load embeddings, and ``generate_instance*`` freeze the alphabets and
    read the train/dev/test/raw splits.

    The code runs unchanged on Python 2 and Python 3: text is decoded/encoded
    only when it is not already in the interpreter's native string type.
    """

    # Split names accepted by generate_instance / generate_instance_with_gaz.
    _SPLITS = ("train", "dev", "test", "raw")

    def __init__(self):
        self.MAX_SENTENCE_LENGTH = 230
        self.MAX_WORD_LENGTH = -1
        self.number_normalized = False
        self.norm_word_emb = True
        self.norm_biword_emb = True
        self.norm_gaz_emb = False
        self.word_alphabet = Alphabet('word')
        self.biword_alphabet = Alphabet('biword')
        self.char_alphabet = Alphabet('character')
        # self.word_alphabet.add(START)
        # self.word_alphabet.add(UNKNOWN)
        # self.char_alphabet.add(START)
        # self.char_alphabet.add(UNKNOWN)
        # self.char_alphabet.add(PADDING)
        self.label_alphabet = Alphabet('label', True)
        self.gaz_lower = False
        self.gaz = Gazetteer(self.gaz_lower)
        self.gaz_alphabet = Alphabet('gaz')
        self.HP_fix_gaz_emb = False
        self.HP_use_gaz = True

        self.tagScheme = "BMES"
        self.char_features = "LSTM"

        self.train_texts = []
        self.dev_texts = []
        self.test_texts = []
        self.raw_texts = []

        self.train_Ids = []
        self.dev_Ids = []
        self.test_Ids = []
        self.raw_Ids = []

        self.use_bigram = False
        self.word_emb_dim = 50
        self.biword_emb_dim = 50
        self.char_emb_dim = 50
        self.gaz_emb_dim = 50
        self.gaz_dropout = 0.5
        self.pretrain_word_embedding = None
        self.pretrain_biword_embedding = None
        self.pretrain_gaz_embedding = None
        self.label_size = 0
        self.word_alphabet_size = 0
        self.biword_alphabet_size = 0
        self.char_alphabet_size = 0
        self.label_alphabet_size = 0

        # hyperparameters
        self.HP_iteration = 100
        self.HP_batch_size = 1
        self.HP_char_hidden_dim = 50
        self.HP_hidden_dim = 200
        self.HP_dropout = 0.5
        self.HP_lstm_layer = 1
        self.HP_bilstm = True
        self.HP_use_char = True
        self.HP_gpu = False
        self.HP_lr = 0.015
        self.HP_lr_decay = 0.05
        self.HP_clip = 5.0
        self.HP_momentum = 0

    # ------------------------------------------------------------------
    # private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _read_lines(path):
        """Return all lines of *path*, closing the file handle afterwards."""
        with open(path, 'r') as fin:
            return fin.readlines()

    @staticmethod
    def _to_text(raw):
        """Decode *raw* from UTF-8 when it is a byte string, else return it as is."""
        return raw.decode('utf-8') if isinstance(raw, bytes) else raw

    @staticmethod
    def _to_native(text):
        """Return *text* as the interpreter's native ``str`` (UTF-8 on Python 2)."""
        if isinstance(text, str):
            return text
        if isinstance(text, bytes):
            # Only reachable on Python 3, where bytes is not str.
            return text.decode('utf-8')
        return text.encode('utf-8')

    def _update_tag_scheme(self):
        """Set ``tagScheme`` from the labels currently in ``label_alphabet``.

        The scheme becomes "BMES" when both an "S-" and a "B-" label exist and
        "BIO" when only "B-" labels exist; otherwise it is left untouched.
        """
        has_s = False
        has_b = False
        for label, _ in self.label_alphabet.iteritems():
            upper = label.upper()
            if "S-" in upper:
                has_s = True
            elif "B-" in upper:
                has_b = True
        if has_b:
            self.tagScheme = "BMES" if has_s else "BIO"

    def _summary_rows(self):
        """Return the (caption, value) pairs reported by ``show_data_summary``."""
        return [
            ("Tag scheme", self.tagScheme),
            ("MAX SENTENCE LENGTH", self.MAX_SENTENCE_LENGTH),
            ("MAX WORD LENGTH", self.MAX_WORD_LENGTH),
            ("Number normalized", self.number_normalized),
            ("Use bigram", self.use_bigram),
            ("Word alphabet size", self.word_alphabet_size),
            ("Biword alphabet size", self.biword_alphabet_size),
            ("Char alphabet size", self.char_alphabet_size),
            ("Gaz alphabet size", self.gaz_alphabet.size()),
            ("Label alphabet size", self.label_alphabet_size),
            ("Word embedding size", self.word_emb_dim),
            ("Biword embedding size", self.biword_emb_dim),
            ("Char embedding size", self.char_emb_dim),
            ("Gaz embedding size", self.gaz_emb_dim),
            ("Norm word emb", self.norm_word_emb),
            ("Norm biword emb", self.norm_biword_emb),
            ("Norm gaz emb", self.norm_gaz_emb),
            ("Norm gaz dropout", self.gaz_dropout),
            ("Train instance number", len(self.train_texts)),
            ("Dev instance number", len(self.dev_texts)),
            ("Test instance number", len(self.test_texts)),
            ("Raw instance number", len(self.raw_texts)),
            ("Hyperpara iteration", self.HP_iteration),
            ("Hyperpara batch size", self.HP_batch_size),
            ("Hyperpara lr", self.HP_lr),
            ("Hyperpara lr_decay", self.HP_lr_decay),
            ("Hyperpara HP_clip", self.HP_clip),
            ("Hyperpara momentum", self.HP_momentum),
            ("Hyperpara hidden_dim", self.HP_hidden_dim),
            ("Hyperpara dropout", self.HP_dropout),
            ("Hyperpara lstm_layer", self.HP_lstm_layer),
            ("Hyperpara bilstm", self.HP_bilstm),
            ("Hyperpara GPU", self.HP_gpu),
            ("Hyperpara use_gaz", self.HP_use_gaz),
            ("Hyperpara fix gaz emb", self.HP_fix_gaz_emb),
            ("Hyperpara use_char", self.HP_use_char),
        ]

    def _store_instances(self, split, texts, ids):
        """Store *texts* / *ids* on ``self.<split>_texts`` / ``self.<split>_Ids``."""
        setattr(self, split + "_texts", texts)
        setattr(self, split + "_Ids", ids)

    def _texts_for(self, name):
        """Return the text list of split *name*; print an error and return [] if unknown."""
        if name == 'raw':
            return self.raw_texts
        if name == 'test':
            return self.test_texts
        if name == 'dev':
            return self.dev_texts
        if name == 'train':
            return self.train_texts
        print(
            "Error: illegal name during writing predict result, name should be within train/dev/test/raw !"
        )
        return []

    # ------------------------------------------------------------------
    # public API
    # ------------------------------------------------------------------
    def show_data_summary(self):
        """Print every configuration value and corpus size to stdout."""
        print("DATA SUMMARY START:")
        for row in self._summary_rows():
            print(" %s: %s" % row)
        if self.HP_use_char:
            print(" Char_features: %s" % (self.char_features,))
        print("DATA SUMMARY END.")
        sys.stdout.flush()

    def refresh_label_alphabet(self, input_file):
        """Rebuild ``label_alphabet`` from the last column of *input_file*.

        The alphabet is cleared, refilled, the tag scheme is re-detected and all
        alphabets are closed via ``fix_alphabet``.
        """
        old_size = self.label_alphabet_size
        self.label_alphabet.clear(True)
        for line in self._read_lines(input_file):
            if len(line) > 2:
                self.label_alphabet.add(line.strip().split()[-1])
        self.label_alphabet_size = self.label_alphabet.size()
        self._update_tag_scheme()
        self.fix_alphabet()
        print("Refresh label alphabet finished: old:%s -> new:%s" %
              (old_size, self.label_alphabet_size))

    def build_alphabet(self, input_file):
        """Add the words, bigrams, characters and labels of *input_file*.

        Each line longer than two characters is split on whitespace; the first
        token is the word and the last token the label. The bigram is the word
        joined with the next line's word, or with ``NULLKEY`` when the next line
        is a separator or the file ends.
        """
        in_lines = self._read_lines(input_file)
        last_index = len(in_lines) - 1
        for idx, line in enumerate(in_lines):
            if len(line) <= 2:
                continue
            pairs = line.strip().split()
            word = self._to_text(pairs[0])
            if self.number_normalized:
                word = normalize_word(word)
            # Labels are added in order of first appearance.
            self.label_alphabet.add(pairs[-1])
            self.word_alphabet.add(word)
            if idx < last_index and len(in_lines[idx + 1]) > 2:
                next_word = in_lines[idx + 1].strip().split()[0]
                biword = word + self._to_text(next_word)
            else:
                biword = word + NULLKEY
            self.biword_alphabet.add(biword)
            for char in word:
                self.char_alphabet.add(char)
        self.word_alphabet_size = self.word_alphabet.size()
        self.biword_alphabet_size = self.biword_alphabet.size()
        self.char_alphabet_size = self.char_alphabet.size()
        self.label_alphabet_size = self.label_alphabet.size()
        # Decide between BIO and BMES from the labels that were seen.
        self._update_tag_scheme()

    def build_gaz_file(self, gaz_file):
        """Insert the first token of every line of *gaz_file* into ``self.gaz``.

        Blank lines are skipped. A falsy *gaz_file* loads nothing.
        """
        if gaz_file:
            for line in self._read_lines(gaz_file):
                tokens = line.strip().split()
                if not tokens:
                    # A blank line used to raise IndexError before the
                    # emptiness check could run.
                    continue
                entry = self._to_text(tokens[0])
                if entry:
                    self.gaz.insert(entry, "one_source")
            print("Load gaz file:  %s  total size: %s" %
                  (gaz_file, self.gaz.size()))
        else:
            print("Gaz file is None, load nothing")

    def _add_gaz_matches(self, word_list):
        """Add every gazetteer match starting at any position of *word_list*."""
        for idx in range(len(word_list)):
            for entity in self.gaz.enumerateMatchList(word_list[idx:]):
                self.gaz_alphabet.add(entity)

    def build_gaz_alphabet(self, input_file):
        """Add to ``gaz_alphabet`` every gazetteer entry matched in *input_file*.

        Lines longer than three characters contribute their first token to the
        current sentence; any shorter line ends the sentence. The final sentence
        is processed even when the file does not end with a separator line.
        """
        word_list = []
        for line in self._read_lines(input_file):
            if len(line) > 3:
                word = self._to_text(line.split()[0])
                if self.number_normalized:
                    word = normalize_word(word)
                word_list.append(word)
            else:
                self._add_gaz_matches(word_list)
                word_list = []
        # Flush a trailing sentence that has no separator line after it.
        self._add_gaz_matches(word_list)
        print("gaz alphabet size: %s" % (self.gaz_alphabet.size(),))

    def fix_alphabet(self):
        """Close all five alphabets."""
        self.word_alphabet.close()
        self.biword_alphabet.close()
        self.char_alphabet.close()
        self.label_alphabet.close()
        self.gaz_alphabet.close()

    def build_word_pretrain_emb(self, emb_path):
        """Load word embeddings from *emb_path*; updates ``word_emb_dim``."""
        print("build word pretrain emb...")
        self.pretrain_word_embedding, self.word_emb_dim = build_pretrain_embedding(
            emb_path, self.word_alphabet, self.word_emb_dim,
            self.norm_word_emb)

    def build_radical_pretrain_emb(self, emb_path):
        """Load radical embeddings from *emb_path* into the word embedding slot."""
        print("build radical pretrain emb...")
        self.pretrain_word_embedding, self.word_emb_dim = build_radical_pretrain_embedding(
            emb_path, self.word_alphabet, self.word_emb_dim,
            self.norm_word_emb)

    def build_biword_pretrain_emb(self, emb_path):
        """Load bigram embeddings from *emb_path*; updates ``biword_emb_dim``."""
        print("build biword pretrain emb...")
        self.pretrain_biword_embedding, self.biword_emb_dim = build_pretrain_embedding(
            emb_path, self.biword_alphabet, self.biword_emb_dim,
            self.norm_biword_emb)

    def build_gaz_pretrain_emb(self, emb_path):
        """Load gazetteer embeddings from *emb_path*; updates ``gaz_emb_dim``."""
        print("build gaz pretrain emb...")
        self.pretrain_gaz_embedding, self.gaz_emb_dim = build_pretrain_embedding(
            emb_path, self.gaz_alphabet, self.gaz_emb_dim, self.norm_gaz_emb)

    def generate_instance(self, input_file, name):
        """Close the alphabets and read split *name* with ``read_seg_instance``.

        *name* must be one of train/dev/test/raw; any other value only prints an
        error message.
        """
        self.fix_alphabet()
        if name not in self._SPLITS:
            print(
                "Error: you can only generate train/dev/test instance! Illegal input:%s"
                % (name,))
            return
        texts, ids = read_seg_instance(
            input_file, self.word_alphabet, self.biword_alphabet,
            self.char_alphabet, self.label_alphabet, self.number_normalized,
            self.MAX_SENTENCE_LENGTH)
        self._store_instances(name, texts, ids)

    def generate_instance_with_gaz(self, input_file, name):
        """Close the alphabets and read split *name* with ``read_instance_with_gaz``.

        *name* must be one of train/dev/test/raw; any other value only prints an
        error message.
        """
        self.fix_alphabet()
        if name not in self._SPLITS:
            print(
                "Error: you can only generate train/dev/test instance! Illegal input:%s"
                % (name,))
            return
        texts, ids = read_instance_with_gaz(
            input_file, self.gaz, self.word_alphabet, self.biword_alphabet,
            self.char_alphabet, self.gaz_alphabet, self.label_alphabet,
            self.number_normalized, self.MAX_SENTENCE_LENGTH)
        self._store_instances(name, texts, ids)

    def write_decoded_results(self, output_file, predict_results, name):
        """Write "word label" lines for split *name* to *output_file*.

        Sentences are separated by a blank line. The output file is only opened
        once the inputs have been validated, so a failed call does not truncate
        an existing file.

        Raises:
            AssertionError: when the number of predicted sentences differs from
                the number of stored sentences (including an unknown *name*
                with non-empty predictions).
        """
        sent_num = len(predict_results)
        content_list = self._texts_for(name)
        assert (sent_num == len(content_list))
        with open(output_file, 'w') as fout:
            for idx in range(sent_num):
                # content_list[idx] is a list with [word, char, label]
                words = content_list[idx][0]
                for idy, tag in enumerate(predict_results[idx]):
                    fout.write(self._to_native(words[idy]) + " " + tag + '\n')
                fout.write('\n')
        print("Predict %s result has been written into file. %s" %
              (name, output_file))
class Data:
    """Word/char alphabets, corpora and hyperparameters for one experiment.

    ``build_alphabet`` / ``extend_word_char_alphabet`` populate the
    vocabularies, the ``build_*_pretrain_emb`` methods load embeddings, and
    ``generate_instance`` freezes the alphabets and reads a data split.
    """

    # Split names accepted by generate_instance.
    _SPLITS = ("train", "dev", "test", "raw")

    def __init__(self):
        self.MAX_SENTENCE_LENGTH = 250
        self.MAX_WORD_LENGTH = -1
        self.number_normalized = True
        self.norm_word_emb = False
        self.norm_char_emb = False
        self.word_alphabet = Alphabet('word')
        self.char_alphabet = Alphabet('character')
        # self.word_alphabet.add(START)
        # self.word_alphabet.add(UNKNOWN)
        # self.char_alphabet.add(START)
        # self.char_alphabet.add(UNKNOWN)
        # self.char_alphabet.add(PADDING)
        self.label_alphabet = Alphabet('label', True)
        self.tagScheme = "NoSeg"
        self.char_features = "LSTM"  # "LSTM"/"CNN"

        self.train_texts = []
        self.dev_texts = []
        self.test_texts = []
        self.raw_texts = []

        self.train_Ids = []
        self.dev_Ids = []
        self.test_Ids = []
        self.raw_Ids = []

        self.word_emb_dim = 50
        self.char_emb_dim = 30
        self.pretrain_word_embedding = None
        self.pretrain_char_embedding = None
        self.label_size = 0
        self.word_alphabet_size = 0
        self.char_alphabet_size = 0
        self.label_alphabet_size = 0

        # hyperparameters
        self.HP_iteration = 100
        self.HP_batch_size = 10
        self.HP_average_batch_loss = False
        self.HP_char_hidden_dim = 50
        self.HP_hidden_dim = 50
        self.HP_dropout = 0.5
        self.HP_lstm_layer = 1
        self.HP_bilstm = True
        self.HP_use_char = False
        self.HP_gpu = False
        self.HP_lr = 0.015
        self.HP_lr_decay = 0.05
        self.HP_clip = None
        self.HP_momentum = 0

    # ------------------------------------------------------------------
    # private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _read_lines(path):
        """Return all lines of *path*, closing the file handle afterwards."""
        with open(path, 'r') as fin:
            return fin.readlines()

    @staticmethod
    def _to_native(text):
        """Return *text* as the interpreter's native ``str`` (UTF-8 on Python 2)."""
        if isinstance(text, str):
            return text
        if isinstance(text, bytes):
            # Only reachable on Python 3, where bytes is not str.
            return text.decode('utf-8')
        return text.encode('utf-8')

    def _update_tag_scheme(self):
        """Set ``tagScheme`` from the labels currently in ``label_alphabet``.

        The scheme becomes "BMES" when both an "S-" and a "B-" label exist and
        "BIO" when only "B-" labels exist; otherwise it is left untouched.
        """
        has_s = False
        has_b = False
        for label, _ in self.label_alphabet.iteritems():
            upper = label.upper()
            if "S-" in upper:
                has_s = True
            elif "B-" in upper:
                has_b = True
        if has_b:
            self.tagScheme = "BMES" if has_s else "BIO"

    def _summary_rows(self):
        """Return the (caption, value) pairs reported by ``show_data_summary``."""
        return [
            ("Tag scheme", self.tagScheme),
            ("MAX SENTENCE LENGTH", self.MAX_SENTENCE_LENGTH),
            ("MAX WORD LENGTH", self.MAX_WORD_LENGTH),
            ("Number normalized", self.number_normalized),
            ("Word alphabet size", self.word_alphabet_size),
            ("Char alphabet size", self.char_alphabet_size),
            ("Label alphabet size", self.label_alphabet_size),
            ("Word embedding size", self.word_emb_dim),
            ("Char embedding size", self.char_emb_dim),
            ("Norm word emb", self.norm_word_emb),
            ("Norm char emb", self.norm_char_emb),
            ("Train instance number", len(self.train_texts)),
            ("Dev instance number", len(self.dev_texts)),
            ("Test instance number", len(self.test_texts)),
            ("Raw instance number", len(self.raw_texts)),
            ("Hyper iteration", self.HP_iteration),
            ("Hyper batch size", self.HP_batch_size),
            ("Hyper average batch", self.HP_average_batch_loss),
            ("Hyper lr", self.HP_lr),
            ("Hyper lr_decay", self.HP_lr_decay),
            ("Hyper HP_clip", self.HP_clip),
            ("Hyper momentum", self.HP_momentum),
            ("Hyper hidden_dim", self.HP_hidden_dim),
            ("Hyper dropout", self.HP_dropout),
            ("Hyper lstm_layer", self.HP_lstm_layer),
            ("Hyper bilstm", self.HP_bilstm),
            ("Hyper GPU", self.HP_gpu),
            ("Hyper use_char", self.HP_use_char),
        ]

    def _texts_for(self, name):
        """Return the text list of split *name*; print an error and return [] if unknown."""
        if name == 'raw':
            return self.raw_texts
        if name == 'test':
            return self.test_texts
        if name == 'dev':
            return self.dev_texts
        if name == 'train':
            return self.train_texts
        print(
            "Error: illegal name during writing predict result, name should be within train/dev/test/raw !"
        )
        return []

    # ------------------------------------------------------------------
    # public API
    # ------------------------------------------------------------------
    def show_data_summary(self):
        """Print every configuration value and corpus size to stdout."""
        print("DATA SUMMARY START:")
        for row in self._summary_rows():
            print(" %s: %s" % row)
        if self.HP_use_char:
            print(" Char_features: %s" % (self.char_features,))
        print("DATA SUMMARY END.")
        sys.stdout.flush()

    def refresh_label_alphabet(self, input_file):
        """Rebuild ``label_alphabet`` from the last column of *input_file*.

        The alphabet is cleared, refilled, the tag scheme is re-detected and all
        alphabets are closed via ``fix_alphabet``.
        """
        old_size = self.label_alphabet_size
        self.label_alphabet.clear(True)
        for line in self._read_lines(input_file):
            if len(line) > 2:
                self.label_alphabet.add(line.strip().split()[-1])
        self.label_alphabet_size = self.label_alphabet.size()
        self._update_tag_scheme()
        self.fix_alphabet()
        print("Refresh label alphabet finished: old:%s -> new:%s" %
              (old_size, self.label_alphabet_size))

    def extend_word_char_alphabet(self, input_file_list):
        """Add the first-column words (and their characters) of several files.

        Args:
            input_file_list: paths whose lines longer than two characters are
                split on whitespace; the first token is taken as the word.
        """
        old_word_size = self.word_alphabet_size
        old_char_size = self.char_alphabet_size
        for input_file in input_file_list:
            for line in self._read_lines(input_file):
                if len(line) <= 2:
                    continue
                word = line.strip().split()[0]
                if self.number_normalized:
                    word = normalize_word(word)
                self.word_alphabet.add(word)
                for char in word:
                    self.char_alphabet.add(char)
        self.word_alphabet_size = self.word_alphabet.size()
        self.char_alphabet_size = self.char_alphabet.size()
        print("Extend word/char alphabet finished!")
        print(" old word:%s -> new word:%s" % (old_word_size,
                                               self.word_alphabet_size))
        print(" old char:%s -> new char:%s" % (old_char_size,
                                               self.char_alphabet_size))
        for input_file in input_file_list:
            print(" from file:%s" % (input_file,))

    def build_alphabet(self, input_file):
        """Fill the alphabets from ``<input_file>.string.txt`` / ``.label.txt``.

        Line *k* of the label file holds comma-separated labels, one per
        character of line *k* of the string file. Each character is added to the
        word alphabet; the char alphabet only receives the placeholder "*".

        Raises:
            AssertionError: when a line pair has differing label/character counts.
        """
        in_lines_string = self._read_lines(input_file + ".string.txt")
        in_lines_label = self._read_lines(input_file + ".label.txt")
        for line_string, line_label in zip(in_lines_string, in_lines_label):
            print(line_label)
            print(line_string)
            # Strip only the newline: slicing off the last character would eat
            # real data on a final line that has no trailing newline.
            labels = line_label.rstrip('\n').split(',')
            chars = line_string.rstrip('\n')
            assert len(labels) == len(chars)
            for label, char in zip(labels, chars):
                self.label_alphabet.add(label)
                self.word_alphabet.add(char)
                self.char_alphabet.add("*")
        self.word_alphabet_size = self.word_alphabet.size()
        self.char_alphabet_size = self.char_alphabet.size()
        self.label_alphabet_size = self.label_alphabet.size()
        self._update_tag_scheme()

    def fix_alphabet(self):
        """Close the word, char and label alphabets."""
        self.word_alphabet.close()
        self.char_alphabet.close()
        self.label_alphabet.close()

    def build_word_pretrain_emb(self, emb_path):
        """Load word embeddings from *emb_path*; updates ``word_emb_dim``."""
        self.pretrain_word_embedding, self.word_emb_dim = build_pretrain_embedding(
            emb_path, self.word_alphabet, self.word_emb_dim,
            self.norm_word_emb)

    def build_char_pretrain_emb(self, emb_path):
        """Load char embeddings from *emb_path*; updates ``char_emb_dim``."""
        self.pretrain_char_embedding, self.char_emb_dim = build_pretrain_embedding(
            emb_path, self.char_alphabet, self.char_emb_dim,
            self.norm_char_emb)

    def generate_instance(self, input_file, name):
        """Close the alphabets and read split *name* with ``read_instance``.

        *name* must be one of train/dev/test/raw; any other value only prints an
        error message.
        """
        self.fix_alphabet()
        if name not in self._SPLITS:
            print(
                "Error: you can only generate train/dev/test instance! Illegal input:%s"
                % (name,))
            return
        texts, ids = read_instance(
            input_file, self.word_alphabet, self.char_alphabet,
            self.label_alphabet, self.number_normalized,
            self.MAX_SENTENCE_LENGTH)
        setattr(self, name + "_texts", texts)
        setattr(self, name + "_Ids", ids)

    def write_decoded_results(self, output_file, predict_results, name):
        """Write "word label" lines for split *name* to *output_file*.

        Sentences are separated by a blank line. The output file is only opened
        once the inputs have been validated, so a failed call does not truncate
        an existing file.

        Raises:
            AssertionError: when the number of predicted sentences differs from
                the number of stored sentences (including an unknown *name*
                with non-empty predictions).
        """
        sent_num = len(predict_results)
        content_list = self._texts_for(name)
        assert (sent_num == len(content_list))
        with open(output_file, 'w') as fout:
            for idx in range(sent_num):
                # content_list[idx] is a list with [word, char, label]
                words = content_list[idx][0]
                for idy, tag in enumerate(predict_results[idx]):
                    fout.write(self._to_native(words[idy]) + " " + tag + '\n')
                fout.write('\n')
        print("Predict %s result has been written into file. %s" %
              (name, output_file))
class Data:
    """Alphabets, gazetteer, corpora and hyperparameters for one experiment.

    Besides the alphabet/embedding/instance builders, this variant can return
    decoded entities as dictionaries (``write_decoded_results_back``) and turn
    annotated records into a tagged training file (``write_http_data``).

    The code runs unchanged on Python 2 and Python 3: text is decoded/encoded
    only when it is not already in the interpreter's native string type.
    """

    # Split names stored under their own <name>_texts / <name>_Ids attributes.
    _SPLITS = ("train", "dev", "test", "raw")

    def __init__(self):
        self.MAX_SENTENCE_LENGTH = 250
        self.MAX_WORD_LENGTH = -1
        self.number_normalized = True
        # self.punctuation_filter = True
        self.norm_word_emb = True
        self.norm_biword_emb = True
        self.norm_gaz_emb = False
        self.word_alphabet = Alphabet('word')
        self.biword_alphabet = Alphabet('biword')
        self.char_alphabet = Alphabet('character')
        # self.word_alphabet.add(START)
        # self.word_alphabet.add(UNKNOWN)
        # self.char_alphabet.add(START)
        # self.char_alphabet.add(UNKNOWN)
        # self.char_alphabet.add(PADDING)
        self.label_alphabet = Alphabet('label', True)
        self.gaz_lower = False
        self.gaz = Gazetteer(self.gaz_lower)
        self.gaz_alphabet = Alphabet('gaz')
        self.HP_fix_gaz_emb = False
        self.HP_use_gaz = True

        self.tagScheme = "NoSeg"
        self.char_features = "LSTM"

        self.train_texts = []
        self.dev_texts = []
        self.test_texts = []
        self.raw_texts = []

        self.train_Ids = []
        self.dev_Ids = []
        self.test_Ids = []
        self.raw_Ids = []

        self.use_bigram = True
        self.word_emb_dim = 50
        self.biword_emb_dim = 50
        self.char_emb_dim = 30
        self.gaz_emb_dim = 50
        self.gaz_dropout = 0.5
        self.pretrain_word_embedding = None
        self.pretrain_biword_embedding = None
        self.pretrain_gaz_embedding = None
        self.label_size = 0
        self.word_alphabet_size = 0
        self.biword_alphabet_size = 0
        self.char_alphabet_size = 0
        self.label_alphabet_size = 0

        # hyperparameters
        self.HP_iteration = 100
        self.HP_batch_size = 10
        self.HP_char_hidden_dim = 50
        self.HP_hidden_dim = 200
        self.HP_dropout = 0.5
        self.HP_lstm_layer = 1
        self.HP_bilstm = True
        self.HP_use_char = False
        self.HP_gpu = False
        self.HP_lr = 0.015
        self.HP_lr_decay = 0.05
        self.HP_clip = 5.0
        self.HP_momentum = 0

    # ------------------------------------------------------------------
    # private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _read_lines(path):
        """Return all lines of *path*, closing the file handle afterwards."""
        with open(path, 'r') as fin:
            return fin.readlines()

    @staticmethod
    def _to_text(raw):
        """Decode *raw* from UTF-8 when it is a byte string, else return it as is."""
        return raw.decode('utf-8') if isinstance(raw, bytes) else raw

    @staticmethod
    def _to_native(text):
        """Return *text* as the interpreter's native ``str`` (UTF-8 on Python 2)."""
        if isinstance(text, str):
            return text
        if isinstance(text, bytes):
            # Only reachable on Python 3, where bytes is not str.
            return text.decode('utf-8')
        return text.encode('utf-8')

    def _update_tag_scheme(self):
        """Set ``tagScheme`` from the labels currently in ``label_alphabet``.

        The scheme becomes "BMES" when both an "S-" and a "B-" label exist and
        "BIO" when only "B-" labels exist; otherwise it is left untouched.
        """
        has_s = False
        has_b = False
        for label, _ in self.label_alphabet.iteritems():
            upper = label.upper()
            if "S-" in upper:
                has_s = True
            elif "B-" in upper:
                has_b = True
        if has_b:
            self.tagScheme = "BMES" if has_s else "BIO"

    def _summary_rows(self):
        """Return the (caption, value) pairs reported by ``show_data_summary``."""
        return [
            ("Tag scheme", self.tagScheme),
            ("MAX SENTENCE LENGTH", self.MAX_SENTENCE_LENGTH),
            ("MAX WORD LENGTH", self.MAX_WORD_LENGTH),
            ("Number normalized", self.number_normalized),
            ("Use bigram", self.use_bigram),
            ("Word alphabet size", self.word_alphabet_size),
            ("Biword alphabet size", self.biword_alphabet_size),
            ("Char alphabet size", self.char_alphabet_size),
            ("Gaz alphabet size", self.gaz_alphabet.size()),
            ("Label alphabet size", self.label_alphabet_size),
            ("Word embedding size", self.word_emb_dim),
            ("Biword embedding size", self.biword_emb_dim),
            ("Char embedding size", self.char_emb_dim),
            ("Gaz embedding size", self.gaz_emb_dim),
            ("Norm word emb", self.norm_word_emb),
            ("Norm biword emb", self.norm_biword_emb),
            ("Norm gaz emb", self.norm_gaz_emb),
            ("Norm gaz dropout", self.gaz_dropout),
            ("Train instance number", len(self.train_texts)),
            ("Dev instance number", len(self.dev_texts)),
            ("Test instance number", len(self.test_texts)),
            ("Raw instance number", len(self.raw_texts)),
            ("Hyperpara iteration", self.HP_iteration),
            ("Hyperpara batch size", self.HP_batch_size),
            ("Hyperpara lr", self.HP_lr),
            ("Hyperpara lr_decay", self.HP_lr_decay),
            ("Hyperpara HP_clip", self.HP_clip),
            ("Hyperpara momentum", self.HP_momentum),
            ("Hyperpara hidden_dim", self.HP_hidden_dim),
            ("Hyperpara dropout", self.HP_dropout),
            ("Hyperpara lstm_layer", self.HP_lstm_layer),
            ("Hyperpara bilstm", self.HP_bilstm),
            ("Hyperpara GPU", self.HP_gpu),
            ("Hyperpara use_gaz", self.HP_use_gaz),
            ("Hyperpara fix gaz emb", self.HP_fix_gaz_emb),
            ("Hyperpara use_char", self.HP_use_char),
        ]

    def _store_instances(self, split, texts, ids):
        """Store *texts* / *ids* on ``self.<split>_texts`` / ``self.<split>_Ids``."""
        setattr(self, split + "_texts", texts)
        setattr(self, split + "_Ids", ids)

    def _texts_for(self, name):
        """Return the text list of split *name*; print an error and return [] if unknown."""
        if name == 'raw':
            return self.raw_texts
        if name == 'test':
            return self.test_texts
        if name == 'dev':
            return self.dev_texts
        if name == 'train':
            return self.train_texts
        print(
            "Error: illegal name during writing predict result, name should be within train/dev/test/raw !"
        )
        return []

    def _add_gaz_matches(self, word_list):
        """Add every gazetteer match starting at any position of *word_list*."""
        for idx in range(len(word_list)):
            for entity in self.gaz.enumerateMatchList(word_list[idx:]):
                self.gaz_alphabet.add(entity)

    # ------------------------------------------------------------------
    # public API
    # ------------------------------------------------------------------
    def show_data_summary(self):
        """Print every configuration value to stdout and mirror it to ``logger``."""
        addLogSectionMark("DATA SUMMARY")
        lines = [" %s: %s" % row for row in self._summary_rows()]
        print("DATA SUMMARY START:")
        for line in lines:
            print(line)
        # The "use_char" row used to be printed a second time here instead of
        # being logged; every row now goes to both sinks exactly once.
        for line in lines:
            logger.info(line)
        if self.HP_use_char:
            char_line = " Char_features: %s" % (self.char_features,)
            print(char_line)
            logger.info(char_line)
        print("DATA SUMMARY END.")
        sys.stdout.flush()

    def refresh_label_alphabet(self, input_file):
        """Rebuild ``label_alphabet`` from the last column of *input_file*.

        The alphabet is cleared, refilled, the tag scheme is re-detected and all
        alphabets are closed via ``fix_alphabet``.
        """
        old_size = self.label_alphabet_size
        self.label_alphabet.clear(True)
        for line in self._read_lines(input_file):
            if len(line) > 2:
                self.label_alphabet.add(line.strip().split()[-1])
        self.label_alphabet_size = self.label_alphabet.size()
        self._update_tag_scheme()
        self.fix_alphabet()
        print("Refresh label alphabet finished: old:%s -> new:%s" %
              (old_size, self.label_alphabet_size))

    def build_alphabet(self, input_file):
        """Add the words, bigrams, characters and labels of *input_file*.

        Each line longer than two characters is split on whitespace; the first
        token is the word and the last token the label. The bigram is the word
        joined with the next line's word, or with ``NULLKEY`` when the next line
        is a separator or the file ends.
        """
        in_lines = self._read_lines(input_file)
        last_index = len(in_lines) - 1
        for idx, line in enumerate(in_lines):
            if len(line) <= 2:
                continue
            pairs = line.strip().split()
            word = self._to_text(pairs[0])
            if self.number_normalized:
                word = normalize_word(word)
            self.label_alphabet.add(pairs[-1])
            self.word_alphabet.add(word)
            if idx < last_index and len(in_lines[idx + 1]) > 2:
                next_word = in_lines[idx + 1].strip().split()[0]
                biword = word + self._to_text(next_word)
            else:
                biword = word + NULLKEY
            self.biword_alphabet.add(biword)
            for char in word:
                self.char_alphabet.add(char)
        self.word_alphabet_size = self.word_alphabet.size()
        self.biword_alphabet_size = self.biword_alphabet.size()
        self.char_alphabet_size = self.char_alphabet.size()
        self.label_alphabet_size = self.label_alphabet.size()
        self._update_tag_scheme()

    def build_gaz_file(self, gaz_file):
        """Insert the first token of every line of *gaz_file* into ``self.gaz``.

        Blank lines are skipped. A falsy *gaz_file* loads nothing.
        """
        if gaz_file:
            for line in self._read_lines(gaz_file):
                tokens = line.strip().split()
                if not tokens:
                    # A blank line used to raise IndexError before the
                    # emptiness check could run.
                    continue
                entry = self._to_text(tokens[0])
                if entry:
                    self.gaz.insert(entry, "one_source")
            print("Load gaz file:  %s  total size: %s" %
                  (gaz_file, self.gaz.size()))
        else:
            print("Gaz file is None, load nothing")

    def build_gaz_alphabet(self, input_file):
        """Add to ``gaz_alphabet`` every gazetteer entry matched in *input_file*.

        Lines longer than three characters contribute their first token to the
        current sentence; any shorter line ends the sentence. The final sentence
        is processed even when the file does not end with a separator line.
        """
        word_list = []
        for line in self._read_lines(input_file):
            if len(line) > 3:
                word = self._to_text(line.split()[0])
                if self.number_normalized:
                    word = normalize_word(word)
                word_list.append(word)
            else:
                self._add_gaz_matches(word_list)
                word_list = []
        # Flush a trailing sentence that has no separator line after it.
        self._add_gaz_matches(word_list)
        print("gaz alphabet size: %s" % (self.gaz_alphabet.size(),))

    def fix_alphabet(self):
        """Close all five alphabets."""
        self.word_alphabet.close()
        self.biword_alphabet.close()
        self.char_alphabet.close()
        self.label_alphabet.close()
        self.gaz_alphabet.close()

    def build_word_pretrain_emb(self, emb_path):
        """Load word embeddings from *emb_path*; updates ``word_emb_dim``."""
        print("build word pretrain emb...")
        self.pretrain_word_embedding, self.word_emb_dim = build_pretrain_embedding(
            emb_path, self.word_alphabet, self.word_emb_dim,
            self.norm_word_emb)

    def build_biword_pretrain_emb(self, emb_path):
        """Load bigram embeddings from *emb_path*; updates ``biword_emb_dim``."""
        print("build biword pretrain emb...")
        self.pretrain_biword_embedding, self.biword_emb_dim = build_pretrain_embedding(
            emb_path, self.biword_alphabet, self.biword_emb_dim,
            self.norm_biword_emb)

    def build_gaz_pretrain_emb(self, emb_path):
        """Load gazetteer embeddings from *emb_path*; updates ``gaz_emb_dim``."""
        print("build gaz pretrain emb...")
        self.pretrain_gaz_embedding, self.gaz_emb_dim = build_pretrain_embedding(
            emb_path, self.gaz_alphabet, self.gaz_emb_dim, self.norm_gaz_emb)

    def generate_instance(self, input_file, name):
        """Close the alphabets and read split *name* with ``read_seg_instance``.

        *name* must be one of train/dev/test/raw; any other value only prints an
        error message.
        """
        self.fix_alphabet()
        if name not in self._SPLITS:
            print(
                "Error: you can only generate train/dev/test instance! Illegal input:%s"
                % (name,))
            return
        texts, ids = read_seg_instance(
            input_file, self.word_alphabet, self.biword_alphabet,
            self.char_alphabet, self.label_alphabet, self.number_normalized,
            self.MAX_SENTENCE_LENGTH)
        self._store_instances(name, texts, ids)

    def generate_instance_with_gaz(self, input_file, name):
        """Close the alphabets and read split *name* with gazetteer features.

        train/dev/test/raw are read with ``read_instance_with_gaz``; the extra
        name "sentence" is read with ``read_instance_with_gaz_text`` and stored
        in the raw slot. Any other value only prints an error message.
        """
        self.fix_alphabet()
        if name in self._SPLITS:
            reader, target = read_instance_with_gaz, name
        elif name == "sentence":
            reader, target = read_instance_with_gaz_text, "raw"
        else:
            print(
                "Error: you can only generate train/dev/test instance! Illegal input:%s"
                % (name,))
            return
        texts, ids = reader(
            input_file, self.gaz, self.word_alphabet, self.biword_alphabet,
            self.char_alphabet, self.gaz_alphabet, self.label_alphabet,
            self.number_normalized, self.MAX_SENTENCE_LENGTH)
        self._store_instances(target, texts, ids)

    def write_decoded_results(self, output_file, predict_results, name):
        """Write "word label" lines for split *name* to *output_file*.

        Sentences are separated by a blank line. The output file is only opened
        once the inputs have been validated, so a failed call does not truncate
        an existing file.

        Raises:
            AssertionError: when the number of predicted sentences differs from
                the number of stored sentences (including an unknown *name*
                with non-empty predictions).
        """
        sent_num = len(predict_results)
        content_list = self._texts_for(name)
        assert (sent_num == len(content_list))
        with open(output_file, 'w') as fout:
            for idx in range(sent_num):
                # content_list[idx] is a list with [word, char, label]
                words = content_list[idx][0]
                for idy, tag in enumerate(predict_results[idx]):
                    fout.write(self._to_native(words[idy]) + " " + tag + '\n')
                fout.write('\n')
        print("Predict %s result has been written into file. %s" %
              (name, output_file))

    def write_decoded_results_back(self, predict_results, name):
        """Print the decoded tokens of split *name* and return its entities.

        Returns:
            A list of dicts with the keys 'start', 'end' (token offsets within
            the sentence, as strings, end exclusive), 'value' (the entity text)
            and 'entity' (the label after the tag prefix). "S-" tags yield a
            one-token entity; "B-" ... "E-" spans are joined, with "I-" and
            "M-" tokens in between appended to the value.

        Raises:
            AssertionError: when the number of predicted sentences differs from
                the number of stored sentences.
        """
        sent_num = len(predict_results)
        content_list = self._texts_for(name)
        assert (sent_num == len(content_list))
        result = []
        for idx in range(sent_num):
            # content_list[idx] is a list with [word, char, label]
            words = content_list[idx][0]
            for idy, tag in enumerate(predict_results[idx]):
                print(self._to_native(words[idy]) + " " + tag + '\n')
        for idx in range(sent_num):
            words = content_list[idx][0]
            data = {'start': '', 'end': "", 'value': '', 'entity': ''}
            value = ''
            for idy, tag in enumerate(predict_results[idx]):
                pieces = tag.split('-')
                prefix = pieces[0]
                if prefix == 'S':
                    data['start'] = str(idy)
                    data['end'] = str(idy + 1)
                    data['value'] = self._to_native(words[idy])
                    data['entity'] = pieces[1]
                    result.append(data)
                    data = {'start': '', 'end': "", 'value': '', 'entity': ''}
                elif prefix == 'B':
                    data['start'] = str(idy)
                    # Start from scratch so an unterminated earlier span cannot
                    # leak its text into this entity.
                    value = self._to_native(words[idy])
                elif prefix == 'E':
                    value = value + self._to_native(words[idy])
                    data['end'] = str(idy + 1)
                    data['value'] = value
                    data['entity'] = pieces[1]
                    result.append(data)
                    data = {'start': '', 'end': "", 'value': '', 'entity': ''}
                    value = ''
                elif prefix in ('I', 'M'):
                    # "M-" is the inside tag of the BMES scheme.
                    value = value + self._to_native(words[idy])
        return result

    def write_http_data(self, output_file, inputData, name):
        """Write one fifth-based slice of *inputData* as a tagged token file.

        With ``k = ceil(len(inputData) / 5)``, "test" covers records [0, k),
        "dev" covers [k, 2k) and "train" covers [2k, len); any other *name*
        behaves like "test". Ranges are clamped to the available records.

        Each record needs a "text" sequence and an "entities" list of dicts with
        'start' (0-based), 'end' (exclusive) and 'entity'. Every character is
        written as "char TAG" using S-/B-/I-/E-/O tags, with a blank line after
        each record.
        """
        get_num = len(inputData)
        numOfParagram = int(math.ceil(get_num / 5.0))
        num_start_sentence = 0
        num_end_sentence = numOfParagram
        if name == "dev":
            num_start_sentence = numOfParagram
            num_end_sentence = numOfParagram * 2
        elif name == "train":
            num_start_sentence = numOfParagram * 2
            num_end_sentence = get_num
        # Small inputs can make the nominal slice run past the data.
        num_start_sentence = min(num_start_sentence, get_num)
        num_end_sentence = min(num_end_sentence, get_num)

        with open(output_file, 'w') as fout:
            for idx in range(num_start_sentence, num_end_sentence):
                text = inputData[idx]["text"]
                entities = inputData[idx]["entities"]
                inWord = False
                entity_name = ''
                # idText is the 1-based position of the current character.
                for idText, Text in enumerate(text, 1):
                    token = self._to_native(Text)
                    tagReady = False
                    for entity in entities:
                        if not inWord:
                            if (entity['start'] + 1 == entity['end']
                                    and entity['end'] == idText):
                                fout.write(token + " " + "S-" +
                                           self._to_native(entity['entity']) +
                                           '\n')
                                tagReady = True
                                break
                            if entity['start'] + 1 == idText:
                                entity_name = self._to_native(entity['entity'])
                                fout.write(token + " " + "B-" + entity_name +
                                           '\n')
                                tagReady = True
                                inWord = True
                                break
                        elif entity['end'] == idText:
                            fout.write(token + " " + "E-" + entity_name + '\n')
                            tagReady = True
                            inWord = False
                            break
                    if not tagReady:
                        if not inWord:
                            fout.write(token + " " + "O" + '\n')
                        else:
                            fout.write(token + " " + "I-" + entity_name + '\n')
                fout.write('\n')
        print("Predict input data has been written into file. %s" %
              (output_file,))