class Data:
    def __init__(self):
        self.MAX_SENTENCE_LENGTH = 250
        self.MAX_WORD_LENGTH = -1
        self.number_normalized = True
        self.norm_word_emb = True
        self.norm_biword_emb = True
        self.norm_gaz_emb = False
        self.word_alphabet = Alphabet('word')
        self.biword_alphabet = Alphabet('biword')
        self.char_alphabet = Alphabet('character')
        self.label_alphabet = Alphabet('label', True)
        # self.simi_alphabet = Alphabet('simi')  # alphabet for dictionary-similarity words
        self.gaz_lower = False
        self.gaz = Gazetteer(self.gaz_lower)
        self.gaz_alphabet = Alphabet('gaz')
        self.gaz_count = {}
        self.gaz_split = {}
        self.biword_count = {}
        self.HP_fix_gaz_emb = False
        self.HP_use_gaz = True
        self.HP_use_count = False
        self.tagScheme = "NoSeg"
        self.char_features = "LSTM"

        self.train_texts = []
        self.dev_texts = []
        self.test_texts = []
        self.raw_texts = []
        self.train_Ids = []
        self.dev_Ids = []
        self.test_Ids = []
        self.raw_Ids = []
        self.train_split_index = []
        self.dev_split_index = []

        self.use_bigram = True
        self.word_emb_dim = 200
        self.biword_emb_dim = 200
        self.char_emb_dim = 30
        self.gaz_emb_dim = 200
        self.gaz_dropout = 0.5
        self.pretrain_word_embedding = None
        self.pretrain_biword_embedding = None
        self.pretrain_gaz_embedding = None
        self.label_size = 0
        self.word_alphabet_size = 0
        self.biword_alphabet_size = 0
        self.char_alphabet_size = 0
        self.label_alphabet_size = 0

        ### dictionary-similarity parameters
        self.simi_dic_emb = None     # similarity embedding values
        self.simi_dic_dim = 10       # dimension of the similarity vector
        self.use_dictionary = False  # whether a dictionary is currently used
        self.simi_list = []          # similarity value for each character
        # self.use_gazcount = 'True'

        ### hyperparameters
        self.HP_iteration = 60
        self.HP_batch_size = 10
        self.HP_char_hidden_dim = 50
        self.HP_hidden_dim = 128
        self.HP_dropout = 0.5
        self.HP_lstm_layer = 1
        self.HP_bilstm = True
        self.HP_use_char = False
        self.HP_gpu = True
        self.HP_lr = 0.015
        self.HP_lr_decay = 0.05
        self.HP_clip = 5.0
        self.HP_momentum = 0
        self.HP_num_layer = 4

    def show_data_summary(self):
        print("DATA SUMMARY START:")
        print(" Tag scheme: %s" % (self.tagScheme))
        print(" MAX SENTENCE LENGTH: %s" % (self.MAX_SENTENCE_LENGTH))
        print(" MAX WORD LENGTH: %s" % (self.MAX_WORD_LENGTH))
        print(" Number normalized: %s" % (self.number_normalized))
        print(" Use bigram: %s" % (self.use_bigram))
        print(" Word alphabet size: %s" % (self.word_alphabet_size))
        print(" Biword alphabet size: %s" % (self.biword_alphabet_size))
        print(" Char alphabet size: %s" % (self.char_alphabet_size))
        print(" Gaz alphabet size: %s" % (self.gaz_alphabet.size()))
        print(" Label alphabet size: %s" % (self.label_alphabet_size))
        print(" Word embedding size: %s" % (self.word_emb_dim))
        print(" Biword embedding size: %s" % (self.biword_emb_dim))
        print(" Char embedding size: %s" % (self.char_emb_dim))
        print(" Gaz embedding size: %s" % (self.gaz_emb_dim))
        print(" Norm word emb: %s" % (self.norm_word_emb))
        print(" Norm biword emb: %s" % (self.norm_biword_emb))
        print(" Norm gaz emb: %s" % (self.norm_gaz_emb))
        print(" Norm gaz dropout: %s" % (self.gaz_dropout))
        print(" Train instance number: %s" % (len(self.train_texts)))
        print(" Dev instance number: %s" % (len(self.dev_texts)))
        print(" Test instance number: %s" % (len(self.test_texts)))
        print(" Raw instance number: %s" % (len(self.raw_texts)))
        print(" Hyperpara iteration: %s" % (self.HP_iteration))
        print(" Hyperpara batch size: %s" % (self.HP_batch_size))
        print(" Hyperpara lr: %s" % (self.HP_lr))
        print(" Hyperpara lr_decay: %s" % (self.HP_lr_decay))
        print(" Hyperpara HP_clip: %s" % (self.HP_clip))
        print(" Hyperpara momentum: %s" % (self.HP_momentum))
        print(" Hyperpara hidden_dim: %s" % (self.HP_hidden_dim))
        print(" Hyperpara dropout: %s" % (self.HP_dropout))
        print(" Hyperpara lstm_layer: %s" % (self.HP_lstm_layer))
        print(" Hyperpara bilstm: %s" % (self.HP_bilstm))
        print(" Hyperpara GPU: %s" % (self.HP_gpu))
        print(" Hyperpara use_gaz: %s" % (self.HP_use_gaz))
        print(" Hyperpara fix gaz emb: %s" % (self.HP_fix_gaz_emb))
        print(" Hyperpara use_char: %s" % (self.HP_use_char))
        if self.HP_use_char:
            print(" Char_features: %s" % (self.char_features))
        print("DATA SUMMARY END.")
        sys.stdout.flush()

    def refresh_label_alphabet(self, input_file):
        old_size = self.label_alphabet_size
        self.label_alphabet.clear(True)
        in_lines = open(input_file, 'r', encoding="utf-8").readlines()
        for line in in_lines:
            if len(line) > 2:
                pairs = line.strip().split()
                label = pairs[-1]
                self.label_alphabet.add(label)
        self.label_alphabet_size = self.label_alphabet.size()
        startS = False
        startB = False
        for label, _ in self.label_alphabet.iteritems():
            if "S-" in label.upper():
                startS = True
            elif "B-" in label.upper():
                startB = True
        if startB:
            if startS:
                self.tagScheme = "BMES"
            else:
                self.tagScheme = "BIO"
        self.fix_alphabet()
        print("Refresh label alphabet finished: old:%s -> new:%s" % (old_size, self.label_alphabet_size))

    def build_alphabet(self, input_file):
        in_lines = open(input_file, 'r', encoding="utf-8").readlines()
        seqlen = 0
        for idx in range(len(in_lines)):
            line = in_lines[idx]
            if len(line) > 2:
                pairs = line.strip().split()
                word = pairs[0]
                if self.number_normalized:
                    word = normalize_word(word)
                label = pairs[-1]
                self.label_alphabet.add(label)
                self.word_alphabet.add(word)
                if idx < len(in_lines) - 1 and len(in_lines[idx + 1]) > 2:
                    biword = word + in_lines[idx + 1].strip().split()[0]
                else:
                    biword = word + NULLKEY
                self.biword_alphabet.add(biword)
                # biword_index = self.biword_alphabet.get_index(biword)
                self.biword_count[biword] = self.biword_count.get(biword, 0) + 1
                for char in word:
                    self.char_alphabet.add(char)
                # length of the current sentence
                seqlen += 1
            else:
                # a blank line ends the sentence, so reset the length counter
                seqlen = 0
        # record the size of each alphabet
        self.word_alphabet_size = self.word_alphabet.size()
        self.biword_alphabet_size = self.biword_alphabet.size()
        self.char_alphabet_size = self.char_alphabet.size()
        self.label_alphabet_size = self.label_alphabet.size()
        startS = False
        startB = False
        for label, _ in self.label_alphabet.iteritems():
            if "S-" in label.upper():
                startS = True
            elif "B-" in label.upper():
                startB = True
        if startB:
            if startS:
                self.tagScheme = "BMES"
            else:
                self.tagScheme = "BIO"

    def build_gaz_file(self, gaz_file):
        ## build the gazetteer: read the word list from the gaz embedding file
        if gaz_file:
            fins = open(gaz_file, 'r', encoding="utf-8").readlines()
            for fin in fins:
                fin = fin.strip().split()[0]
                if fin:
                    self.gaz.insert(fin, "one_source")
            print("Load gaz file: ", gaz_file, " total size:", self.gaz.size())
        else:
            print("Gaz file is None, load nothing")

    # def build_dict_alphabet(
    def build_gaz_alphabet(self, input_file, count=False):
        in_lines = open(input_file, 'r', encoding="utf-8").readlines()
        word_list = []
        for line in in_lines:
            if len(line) > 3:
                word = line.split()[0]
                if self.number_normalized:
                    word = normalize_word(word)
                word_list.append(word)
            else:
                # word_list now holds every character of the current sentence
                w_length = len(word_list)
                entitys = []
                for idx in range(w_length):
                    matched_entity = self.gaz.enumerateMatchList(word_list[idx:])
                    entitys += matched_entity
                    for entity in matched_entity:
                        # print entity, self.gaz.searchId(entity), self.gaz.searchType(entity)
                        self.gaz_alphabet.add(entity)
                        index = self.gaz_alphabet.get_index(entity)
                        ## initialize gaz count; .get(index, 0) returns 0 when the key is absent
                        self.gaz_count[index] = self.gaz_count.get(index, 0)
                if count:
                    entitys.sort(key=lambda x: -len(x))
                    while entitys:
                        longest = entitys[0]
                        longest_index = self.gaz_alphabet.get_index(longest)
                        # increment the count of the longest match
                        self.gaz_count[longest_index] = self.gaz_count.get(longest_index, 0) + 1
                        # remove every entity covered by the longest match
                        gazlen = len(longest)
                        for i in range(gazlen):
                            for j in range(i + 1, gazlen + 1):
                                covering_gaz = longest[i:j]
                                if covering_gaz in entitys:
                                    entitys.remove(covering_gaz)
                                    # print('remove:', covering_gaz)
                word_list = []
        print("gaz alphabet size:", self.gaz_alphabet.size())

    def fix_alphabet(self):
        self.word_alphabet.close()
        self.biword_alphabet.close()
        self.char_alphabet.close()
        self.label_alphabet.close()
        self.gaz_alphabet.close()

    def build_word_pretrain_emb(self, emb_path):
        print("build word pretrain emb...")
        self.pretrain_word_embedding, self.word_emb_dim = build_pretrain_embedding(
            emb_path, self.word_alphabet, self.word_emb_dim, self.norm_word_emb)

    def build_biword_pretrain_emb(self, emb_path):
        print("build biword pretrain emb...")
        self.pretrain_biword_embedding, self.biword_emb_dim = build_pretrain_embedding(
            emb_path, self.biword_alphabet, self.biword_emb_dim, self.norm_biword_emb)

    def build_gaz_pretrain_emb(self, emb_path):
        print("build gaz pretrain emb...")
        self.pretrain_gaz_embedding, self.gaz_emb_dim = build_pretrain_embedding(
            emb_path, self.gaz_alphabet, self.gaz_emb_dim, self.norm_gaz_emb)

    def generate_instance_with_gaz(self, input_file, name):
        self.fix_alphabet()
        if name == "train":
            self.train_texts, self.train_Ids = read_instance_with_gaz(
                self.HP_num_layer, input_file, self.gaz, self.word_alphabet,
                self.biword_alphabet, self.biword_count, self.char_alphabet,
                self.gaz_alphabet, self.gaz_count, self.gaz_split,
                self.label_alphabet, self.number_normalized,
                self.MAX_SENTENCE_LENGTH)
        elif name == "dev":
            self.dev_texts, self.dev_Ids = read_instance_with_gaz(
                self.HP_num_layer, input_file, self.gaz, self.word_alphabet,
                self.biword_alphabet, self.biword_count, self.char_alphabet,
                self.gaz_alphabet, self.gaz_count, self.gaz_split,
                self.label_alphabet, self.number_normalized,
                self.MAX_SENTENCE_LENGTH)
        elif name == "test":
            self.test_texts, self.test_Ids = read_instance_with_gaz(
                self.HP_num_layer, input_file, self.gaz, self.word_alphabet,
                self.biword_alphabet, self.biword_count, self.char_alphabet,
                self.gaz_alphabet, self.gaz_count, self.gaz_split,
                self.label_alphabet, self.number_normalized,
                self.MAX_SENTENCE_LENGTH)
        elif name == "raw":
            self.raw_texts, self.raw_Ids = read_instance_with_gaz(
                self.HP_num_layer, input_file, self.gaz, self.word_alphabet,
                self.biword_alphabet, self.biword_count, self.char_alphabet,
                self.gaz_alphabet, self.gaz_count, self.gaz_split,
                self.label_alphabet, self.number_normalized,
                self.MAX_SENTENCE_LENGTH)
        else:
            print("Error: you can only generate train/dev/test/raw instance! Illegal input:%s" % (name))

    def write_decoded_results(self, output_file, predict_results, name):
        fout = open(output_file, 'w', encoding="utf-8")
        sent_num = len(predict_results)
        content_list = []
        if name == 'raw':
            content_list = self.raw_texts
        elif name == 'test':
            content_list = self.test_texts
        elif name == 'dev':
            content_list = self.dev_texts
        elif name == 'train':
            content_list = self.train_texts
        else:
            print("Error: illegal name during writing predict result, name should be within train/dev/test/raw !")
        assert (sent_num == len(content_list))
        for idx in range(sent_num):
            sent_length = len(predict_results[idx])
            for idy in range(sent_length):
                ## content_list[idx] is a list with [word, char, label]
                fout.write(content_list[idx][0][idy] + " " + predict_results[idx][idy] + '\n')
            fout.write('\n')
        fout.close()
        print("Predict %s result has been written into file: %s" % (name, output_file))
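# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original file): a minimal driver
# for the Data class above, showing the expected call order. All file paths
# here are hypothetical placeholders.
# ---------------------------------------------------------------------------
def _demo_gaz_count_pipeline():
    data = Data()
    # grow the char/word/label alphabets from every tagged split
    for path in ("data/train.char.bmes", "data/dev.char.bmes", "data/test.char.bmes"):
        data.build_alphabet(path)
    # load the external lexicon, then collect matched words (optionally with counts)
    data.build_gaz_file("data/lexicon_words.txt")
    for path in ("data/train.char.bmes", "data/dev.char.bmes", "data/test.char.bmes"):
        data.build_gaz_alphabet(path, count=data.HP_use_count)
    data.fix_alphabet()
    # attach pretrained embeddings (optional)
    data.build_word_pretrain_emb("data/char_emb.vec")
    data.build_gaz_pretrain_emb("data/word_emb.vec")
    # convert each split to id sequences
    for path, name in (("data/train.char.bmes", "train"),
                       ("data/dev.char.bmes", "dev"),
                       ("data/test.char.bmes", "test")):
        data.generate_instance_with_gaz(path, name)
    data.show_data_summary()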
class Data:
    def __init__(self):
        self.MAX_SENTENCE_LENGTH = 250
        self.MAX_WORD_LENGTH = -1
        self.number_normalized = True
        self.norm_word_emb = False
        self.norm_char_emb = False
        self.norm_trans_emb = False
        self.word_alphabet = Alphabet('word')
        self.char_alphabet = Alphabet('character')
        self.translation_alphabet = Alphabet('translation')
        self.translation_id_format = {}
        self.feature_name = []
        self.feature_alphabets = []
        self.feature_num = len(self.feature_alphabets)
        self.feat_config = None
        self.label_alphabet = Alphabet('label', True)
        self.tagScheme = "NoSeg"  ## BMES/BIO
        self.seg = True

        ### I/O
        self.train_dir = None
        self.dev_dir = None
        self.test_dir = None
        self.raw_dir = None
        self.trans_dir = None
        self.decode_dir = None
        self.dset_dir = None        ## data vocabulary related file
        self.model_dir = None       ## model save file
        self.load_model_dir = None  ## model load file

        self.word_emb_dir = None
        self.char_emb_dir = None
        self.trans_embed_dir = None
        self.feature_emb_dirs = []

        self.train_texts = []
        self.dev_texts = []
        self.test_texts = []
        self.raw_texts = []
        self.train_Ids = []
        self.dev_Ids = []
        self.test_Ids = []
        self.raw_Ids = []

        self.pretrain_word_embedding = None
        self.pretrain_char_embedding = None
        self.pretrain_trans_embedding = None
        self.pretrain_feature_embeddings = []

        self.label_size = 0
        self.word_alphabet_size = 0
        self.char_alphabet_size = 0
        self.label_alphabet_size = 0
        self.trans_alphabet_size = 0
        self.feature_alphabet_sizes = []
        self.feature_emb_dims = []
        self.norm_feature_embs = []
        self.word_emb_dim = 50
        self.char_emb_dim = 30
        self.trans_emb_dim = 100

        ### Networks
        self.word_feature_extractor = "LSTM"  ## "LSTM"/"CNN"/"GRU"/
        self.use_char = True
        self.char_seq_feature = "CNN"  ## "LSTM"/"CNN"/"GRU"/None
        self.use_trans = True
        self.use_crf = True
        self.nbest = None

        ## Training
        self.average_batch_loss = False
        self.optimizer = "SGD"  ## "SGD"/"AdaGrad"/"AdaDelta"/"RMSProp"/"Adam"
        self.status = "train"

        ### Hyperparameters
        self.HP_cnn_layer = 4
        self.HP_iteration = 100
        self.HP_batch_size = 10
        self.HP_char_hidden_dim = 50
        self.HP_trans_hidden_dim = 50
        self.HP_hidden_dim = 200
        self.HP_dropout = 0.5
        self.HP_lstm_layer = 1
        self.HP_bilstm = True
        self.HP_gpu = False
        self.HP_lr = 0.015
        self.HP_lr_decay = 0.05
        self.HP_clip = None
        self.HP_momentum = 0
        self.HP_l2 = 1e-8

    def show_data_summary(self):
        print("++" * 50)
        print("DATA SUMMARY START:")
        print(" I/O:")
        print(" Tag scheme: %s" % (self.tagScheme))
        print(" MAX SENTENCE LENGTH: %s" % (self.MAX_SENTENCE_LENGTH))
        print(" MAX WORD LENGTH: %s" % (self.MAX_WORD_LENGTH))
        print(" Number normalized: %s" % (self.number_normalized))
        print(" Word alphabet size: %s" % (self.word_alphabet_size))
        print(" Char alphabet size: %s" % (self.char_alphabet_size))
        print(" Label alphabet size: %s" % (self.label_alphabet_size))
        print(" Trans alphabet size: %s" % (self.trans_alphabet_size))
        print(" Word embedding dir: %s" % (self.word_emb_dir))
        print(" Char embedding dir: %s" % (self.char_emb_dir))
        print(" Tran embedding dir: %s" % (self.trans_embed_dir))
        print(" Word embedding size: %s" % (self.word_emb_dim))
        print(" Char embedding size: %s" % (self.char_emb_dim))
        print(" Tran embedding size: %s" % (self.trans_emb_dim))
        print(" Norm word emb: %s" % (self.norm_word_emb))
        print(" Norm char emb: %s" % (self.norm_char_emb))
        print(" Norm tran emb: %s" % (self.norm_trans_emb))
        print(" Train file directory: %s" % (self.train_dir))
        print(" Dev file directory: %s" % (self.dev_dir))
        print(" Test file directory: %s" % (self.test_dir))
        print(" Raw file directory: %s" % (self.raw_dir))
        print(" Dset file directory: %s" % (self.dset_dir))
        print(" Model file directory: %s" % (self.model_dir))
        print(" Loadmodel directory: %s" % (self.load_model_dir))
        print(" Decode file directory: %s" % (self.decode_dir))
        print(" Train instance number: %s" % (len(self.train_texts)))
        print(" Dev instance number: %s" % (len(self.dev_texts)))
        print(" Test instance number: %s" % (len(self.test_texts)))
        print(" Raw instance number: %s" % (len(self.raw_texts)))
        print(" FEATURE num: %s" % (self.feature_num))
        for idx in range(self.feature_num):
            print(" Fe: %s alphabet size: %s" % (self.feature_alphabets[idx].name, self.feature_alphabet_sizes[idx]))
            print(" Fe: %s embedding dir: %s" % (self.feature_alphabets[idx].name, self.feature_emb_dirs[idx]))
            print(" Fe: %s embedding size: %s" % (self.feature_alphabets[idx].name, self.feature_emb_dims[idx]))
            print(" Fe: %s norm emb: %s" % (self.feature_alphabets[idx].name, self.norm_feature_embs[idx]))
        print(" " + "++" * 20)
        print(" Model Network:")
        print(" Model use_crf: %s" % (self.use_crf))
        print(" Model word extractor: %s" % (self.word_feature_extractor))
        print(" Model use_char: %s" % (self.use_char))
        if self.use_char:
            print(" Model char_seq_feature: %s" % (self.char_seq_feature))
            print(" Model char_hidden_dim: %s" % (self.HP_char_hidden_dim))
        if self.use_trans:
            print(" Model trans_hidden_dim: %s" % (self.HP_trans_hidden_dim))
        print(" " + "++" * 20)
        print(" Training:")
        print(" Optimizer: %s" % (self.optimizer))
        print(" Iteration: %s" % (self.HP_iteration))
        print(" BatchSize: %s" % (self.HP_batch_size))
        print(" Average batch loss: %s" % (self.average_batch_loss))
        print(" " + "++" * 20)
        print(" Hyperparameters:")
        print(" Hyper lr: %s" % (self.HP_lr))
        print(" Hyper lr_decay: %s" % (self.HP_lr_decay))
        print(" Hyper HP_clip: %s" % (self.HP_clip))
        print(" Hyper momentum: %s" % (self.HP_momentum))
        print(" Hyper l2: %s" % (self.HP_l2))
        print(" Hyper hidden_dim: %s" % (self.HP_hidden_dim))
        print(" Hyper dropout: %s" % (self.HP_dropout))
        print(" Hyper lstm_layer: %s" % (self.HP_lstm_layer))
        print(" Hyper bilstm: %s" % (self.HP_bilstm))
        print(" Hyper GPU: %s" % (self.HP_gpu))
        print("DATA SUMMARY END.")
        print("++" * 50)
        sys.stdout.flush()

    def initial_feature_alphabets(self):
        items = open(self.train_dir, 'r', encoding="utf-8").readline().strip('\n').split()
        print(items)
        total_column = len(items)
        if total_column > 2:
            for idx in range(1, total_column - 1):
                feature_prefix = items[idx].split(']', 1)[0] + "]"
                print("feature_prefix:{}".format(feature_prefix))
                self.feature_alphabets.append(Alphabet(feature_prefix))
                self.feature_name.append(feature_prefix)
                print("Find feature: ", feature_prefix)
        self.feature_num = len(self.feature_alphabets)
        self.pretrain_feature_embeddings = [None] * self.feature_num
        self.feature_emb_dims = [20] * self.feature_num
        self.feature_emb_dirs = [None] * self.feature_num
        self.norm_feature_embs = [False] * self.feature_num
        self.feature_alphabet_sizes = [0] * self.feature_num
        if self.feat_config:
            for idx in range(self.feature_num):
                if self.feature_name[idx] in self.feat_config:
                    self.feature_emb_dims[idx] = self.feat_config[self.feature_name[idx]]['emb_size']
                    self.feature_emb_dirs[idx] = self.feat_config[self.feature_name[idx]]['emb_dir']
                    self.norm_feature_embs[idx] = self.feat_config[self.feature_name[idx]]['emb_norm']
        # exit(0)

    def build_alphabet(self, input_file):
        print("Build alphabet......")
        in_lines = open(input_file, 'r', encoding="utf-8").readlines()
        for line in in_lines:
            if len(line) > 2:
                pairs = line.strip().split()
                word = pairs[0]
                if self.number_normalized:
                    word = normalize_word(word)
                label = pairs[-1]
                self.label_alphabet.add(label)
                self.word_alphabet.add(word)
                ## build feature alphabet
                for idx in range(self.feature_num):
                    feat_idx = pairs[idx + 1].split(']', 1)[-1]
                    self.feature_alphabets[idx].add(feat_idx)
                for char in word:
                    self.char_alphabet.add(char)
        self.word_alphabet_size = self.word_alphabet.size()
        self.char_alphabet_size = self.char_alphabet.size()
        self.label_alphabet_size = self.label_alphabet.size()
        for idx in range(self.feature_num):
            self.feature_alphabet_sizes[idx] = self.feature_alphabets[idx].size()
        startS = False
        startB = False
        for label, _ in self.label_alphabet.iteritems():
            if "S-" in label.upper():
                startS = True
            elif "B-" in label.upper():
                startB = True
        if startB:
            if startS:
                self.tagScheme = "BMES"
            else:
                self.tagScheme = "BIO"

    def fix_alphabet(self):
        self.word_alphabet.close()
        self.char_alphabet.close()
        self.label_alphabet.close()
        self.translation_alphabet.close()
        for idx in range(self.feature_num):
            self.feature_alphabets[idx].close()

    def build_pretrain_emb(self):
        if self.word_emb_dir:
            print("Load pretrained word embedding, norm: %s, dir: %s" % (self.norm_word_emb, self.word_emb_dir))
            self.pretrain_word_embedding, self.word_emb_dim = build_pretrain_embedding(
                self.word_emb_dir, self.word_alphabet, self.word_emb_dim, self.norm_word_emb)
        if self.char_emb_dir:
            print("Load pretrained char embedding, norm: %s, dir: %s" % (self.norm_char_emb, self.char_emb_dir))
            self.pretrain_char_embedding, self.char_emb_dim = build_pretrain_embedding(
                self.char_emb_dir, self.char_alphabet, self.char_emb_dim, self.norm_char_emb)
        if self.trans_embed_dir:
            print("Load pretrained trans embedding, norm: %s, dir: %s" % (self.norm_trans_emb, self.trans_embed_dir))
            self.pretrain_trans_embedding, self.trans_emb_dim = build_pretrain_embedding(
                self.trans_embed_dir, self.translation_alphabet, self.trans_emb_dim, self.norm_trans_emb)
        for idx in range(self.feature_num):
            if self.feature_emb_dirs[idx]:
                print("Load pretrained feature %s embedding:, norm: %s, dir: %s" %
                      (self.feature_name[idx], self.norm_feature_embs[idx], self.feature_emb_dirs[idx]))
                self.pretrain_feature_embeddings[idx], self.feature_emb_dims[idx] = build_pretrain_embedding(
                    self.feature_emb_dirs[idx], self.feature_alphabets[idx],
                    self.feature_emb_dims[idx], self.norm_feature_embs[idx])

    def generate_instance(self, name):
        self.fix_alphabet()
        if name == "train":
            self.train_texts, self.train_Ids = read_instance(
                self.train_dir, self.word_alphabet, self.char_alphabet,
                self.feature_alphabets, self.label_alphabet, self.number_normalized,
                self.MAX_SENTENCE_LENGTH, self.translation_id_format)
        elif name == "dev":
            self.dev_texts, self.dev_Ids = read_instance(
                self.dev_dir, self.word_alphabet, self.char_alphabet,
                self.feature_alphabets, self.label_alphabet, self.number_normalized,
                self.MAX_SENTENCE_LENGTH, self.translation_id_format)
        elif name == "test":
            self.test_texts, self.test_Ids = read_instance(
                self.test_dir, self.word_alphabet, self.char_alphabet,
                self.feature_alphabets, self.label_alphabet, self.number_normalized,
                self.MAX_SENTENCE_LENGTH, self.translation_id_format)
        elif name == "raw":
            self.raw_texts, self.raw_Ids = read_instance(
                self.raw_dir, self.word_alphabet, self.char_alphabet,
                self.feature_alphabets, self.label_alphabet, self.number_normalized,
                self.MAX_SENTENCE_LENGTH, self.translation_id_format)
        else:
            print("Error: you can only generate train/dev/test/raw instance! Illegal input:%s" % (name))

    def write_decoded_results(self, predict_results, name):
        fout = open(self.decode_dir, 'w', encoding="utf-8")
        sent_num = len(predict_results)
        content_list = []
        if name == 'raw':
            content_list = self.raw_texts
        elif name == 'test':
            content_list = self.test_texts
        elif name == 'dev':
            content_list = self.dev_texts
        elif name == 'train':
            content_list = self.train_texts
        else:
            print("Error: illegal name during writing predict result, name should be within train/dev/test/raw !")
        assert (sent_num == len(content_list))
        for idx in range(sent_num):
            sent_length = len(predict_results[idx])
            for idy in range(sent_length):
                ## content_list[idx] is a list with [word, char, label]
                fout.write(content_list[idx][0][idy] + " " + predict_results[idx][idy] + '\n')
            fout.write('\n')
        fout.close()
        print("Predict %s result has been written into file: %s" % (name, self.decode_dir))

    def load(self, data_file):
        f = open(data_file, 'rb')
        tmp_dict = pickle.load(f)
        f.close()
        self.__dict__.update(tmp_dict)

    def save(self, save_file):
        f = open(save_file, 'wb')
        pickle.dump(self.__dict__, f, 2)
        f.close()

    def write_nbest_decoded_results(self, predict_results, pred_scores, name):
        ## predict_results : [whole_sent_num, nbest, each_sent_length]
        ## pred_scores: [whole_sent_num, nbest]
        fout = open(self.decode_dir, 'w', encoding="utf-8")
        sent_num = len(predict_results)
        content_list = []
        if name == 'raw':
            content_list = self.raw_texts
        elif name == 'test':
            content_list = self.test_texts
        elif name == 'dev':
            content_list = self.dev_texts
        elif name == 'train':
            content_list = self.train_texts
        else:
            print("Error: illegal name during writing predict result, name should be within train/dev/test/raw !")
        assert (sent_num == len(content_list))
        assert (sent_num == len(pred_scores))
        for idx in range(sent_num):
            sent_length = len(predict_results[idx][0])
            nbest = len(predict_results[idx])
            score_string = "# "
            for idz in range(nbest):
                score_string += format(pred_scores[idx][idz], '.4f') + " "
            fout.write(score_string.strip() + "\n")
            for idy in range(sent_length):
                label_string = content_list[idx][0][idy] + " "
                for idz in range(nbest):
                    label_string += predict_results[idx][idz][idy] + " "
                label_string = label_string.strip() + "\n"
                fout.write(label_string)
            fout.write('\n')
        fout.close()
        print("Predict %s %s-best result has been written into file: %s" % (name, self.nbest, self.decode_dir))

    def read_config(self, config_file):
        config = config_file_to_dict(config_file)
        ## read data:
        the_item = 'train_dir'
        if the_item in config:
            self.train_dir = config[the_item]
        the_item = 'dev_dir'
        if the_item in config:
            self.dev_dir = config[the_item]
        the_item = 'test_dir'
        if the_item in config:
            self.test_dir = config[the_item]
        the_item = 'trans_dir'
        if the_item in config:
            self.trans_dir = config[the_item]
        the_item = 'raw_dir'
        if the_item in config:
            self.raw_dir = config[the_item]
        the_item = 'decode_dir'
        if the_item in config:
            self.decode_dir = config[the_item]
        the_item = 'dset_dir'
        if the_item in config:
            self.dset_dir = config[the_item]
        the_item = 'model_dir'
        if the_item in config:
            self.model_dir = config[the_item]
        the_item = 'load_model_dir'
        if the_item in config:
            self.load_model_dir = config[the_item]
        the_item = 'word_emb_dir'
        if the_item in config:
            self.word_emb_dir = config[the_item]
        the_item = 'char_emb_dir'
        if the_item in config:
            self.char_emb_dir = config[the_item]
        the_item = 'trans_embed_dir'
        if the_item in config:
            self.trans_embed_dir = config[the_item]
        the_item = 'MAX_SENTENCE_LENGTH'
        if the_item in config:
            self.MAX_SENTENCE_LENGTH = int(config[the_item])
        the_item = 'MAX_WORD_LENGTH'
        if the_item in config:
            self.MAX_WORD_LENGTH = int(config[the_item])
        the_item = 'norm_word_emb'
        if the_item in config:
            self.norm_word_emb = str2bool(config[the_item])
        the_item = 'norm_char_emb'
        if the_item in config:
            self.norm_char_emb = str2bool(config[the_item])
        the_item = 'number_normalized'
        if the_item in config:
            self.number_normalized = str2bool(config[the_item])
        the_item = 'seg'
        if the_item in config:
            self.seg = str2bool(config[the_item])
        the_item = 'word_emb_dim'
        if the_item in config:
            self.word_emb_dim = int(config[the_item])
        the_item = 'char_emb_dim'
        if the_item in config:
            self.char_emb_dim = int(config[the_item])
        the_item = 'trans_emb_dim'
        if the_item in config:
            self.trans_emb_dim = int(config[the_item])
        ## read network:
        the_item = 'use_crf'
        if the_item in config:
            self.use_crf = str2bool(config[the_item])
        the_item = 'use_char'
        if the_item in config:
            self.use_char = str2bool(config[the_item])
        the_item = 'use_trans'
        if the_item in config:
            self.use_trans = str2bool(config[the_item])
        the_item = 'word_seq_feature'
        if the_item in config:
            self.word_feature_extractor = config[the_item]
        the_item = 'char_seq_feature'
        if the_item in config:
            self.char_seq_feature = config[the_item]
        the_item = 'nbest'
        if the_item in config:
            self.nbest = int(config[the_item])
        the_item = 'feature'
        if the_item in config:
            self.feat_config = config[the_item]  ## feat_config is a dict
        ## read training setting:
        the_item = 'optimizer'
        if the_item in config:
            self.optimizer = config[the_item]
        the_item = 'ave_batch_loss'
        if the_item in config:
            self.average_batch_loss = str2bool(config[the_item])
        the_item = 'status'
        if the_item in config:
            self.status = config[the_item]
        ## read Hyperparameters:
        the_item = 'cnn_layer'
        if the_item in config:
            self.HP_cnn_layer = int(config[the_item])
        the_item = 'iteration'
        if the_item in config:
            self.HP_iteration = int(config[the_item])
        the_item = 'batch_size'
        if the_item in config:
            self.HP_batch_size = int(config[the_item])
        the_item = 'char_hidden_dim'
        if the_item in config:
            self.HP_char_hidden_dim = int(config[the_item])
        the_item = 'trans_hidden_dim'
        if the_item in config:
            self.HP_trans_hidden_dim = int(config[the_item])
        the_item = 'hidden_dim'
        if the_item in config:
            self.HP_hidden_dim = int(config[the_item])
        the_item = 'dropout'
        if the_item in config:
            self.HP_dropout = float(config[the_item])
        the_item = 'lstm_layer'
        if the_item in config:
            self.HP_lstm_layer = int(config[the_item])
        the_item = 'bilstm'
        if the_item in config:
            self.HP_bilstm = str2bool(config[the_item])
        the_item = 'gpu'
        if the_item in config:
            self.HP_gpu = str2bool(config[the_item])
        the_item = 'learning_rate'
        if the_item in config:
            self.HP_lr = float(config[the_item])
        the_item = 'lr_decay'
        if the_item in config:
            self.HP_lr_decay = float(config[the_item])
        the_item = 'clip'
        if the_item in config:
            self.HP_clip = float(config[the_item])
        the_item = 'momentum'
        if the_item in config:
            self.HP_momentum = float(config[the_item])
        the_item = 'l2'
        if the_item in config:
            self.HP_l2 = float(config[the_item])

    def build_translation_alphabet(self, trans_path):
        print("Creating translation alphabet......")
        with codecs.open(trans_path, 'r', encoding="utf-8") as f:
            lines = f.readlines()
            for line in lines:
                if len(line.strip().split(":")) == 2:
                    temp = line.strip().split(":", 1)
                    words = temp[1].split()
                    for word in words:
                        self.translation_alphabet.add(word.strip())
        self.trans_alphabet_size = self.translation_alphabet.size()

    def build_translation_dict(self, trans_path):
        print("Creating Id to Id translation dictionary......")
        translation_id_format_temp = {}
        with codecs.open(trans_path, 'r', encoding="utf-8") as f:
            lines = f.readlines()
            for line in lines:
                ids = []
                if len(line.strip().split(":")) == 2:
                    temp = line.strip().split(":")
                    word_id = self.word_alphabet.get_index(temp[0].strip())
                    translations = temp[1].split()
                    for translation in translations:
                        ids.append(self.translation_alphabet.get_index(translation.strip()))
                    translation_id_format_temp[word_id] = ids
        for word in self.word_alphabet.instances:
            if self.word_alphabet.get_index(word) in translation_id_format_temp.keys():
                self.translation_id_format[self.word_alphabet.get_index(word)] = \
                    translation_id_format_temp[self.word_alphabet.get_index(word)]
            else:
                self.translation_id_format[self.word_alphabet.get_index(word)] = [0]
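# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original file): driving the Data
# class above from a config file via read_config(). "demo.train.config" and
# "demo.dset" are hypothetical placeholders; read_config() above lists the
# keys it actually understands.
# ---------------------------------------------------------------------------
def _demo_config_pipeline():
    data = Data()
    data.read_config("demo.train.config")  # e.g. train_dir=..., word_emb_dir=...
    data.initial_feature_alphabets()       # must run before build_alphabet()
    for path in (data.train_dir, data.dev_dir, data.test_dir):
        data.build_alphabet(path)
    # translation alphabet must be built before fix_alphabet() closes it
    data.build_translation_alphabet(data.trans_dir)
    data.build_translation_dict(data.trans_dir)
    data.fix_alphabet()
    data.build_pretrain_emb()
    for name in ("train", "dev", "test"):
        data.generate_instance(name)
    data.show_data_summary()
    data.save("demo.dset")  # pickle the vocabulary for later decoding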
class Data:
    def __init__(self):
        self.MAX_SENTENCE_LENGTH = 250
        self.MAX_WORD_LENGTH = -1
        self.number_normalized = True
        self.norm_word_emb = True
        self.norm_biword_emb = True
        self.norm_gaz_emb = False
        self.word_alphabet = Alphabet('word')
        self.biword_alphabet = Alphabet('biword')
        self.char_alphabet = Alphabet('character')
        # self.word_alphabet.add(START)
        # self.word_alphabet.add(UNKNOWN)
        # self.char_alphabet.add(START)
        # self.char_alphabet.add(UNKNOWN)
        # self.char_alphabet.add(PADDING)
        self.label_alphabet = Alphabet('label', True)
        self.gaz_lower = False
        self.gaz = Gazetteer(self.gaz_lower)
        self.gaz_alphabet = Alphabet('gaz')
        self.HP_fix_gaz_emb = False
        self.HP_use_gaz = True
        self.tagScheme = "NoSeg"
        self.char_features = "LSTM"
        self.train_texts = []
        self.dev_texts = []
        self.test_texts = []
        self.raw_texts = []
        self.train_Ids = []
        self.dev_Ids = []
        self.test_Ids = []
        self.raw_Ids = []
        self.use_bigram = True
        self.word_emb_dim = 50
        self.biword_emb_dim = 50
        self.char_emb_dim = 30
        self.gaz_emb_dim = 50
        self.gaz_dropout = 0.5
        self.pretrain_word_embedding = None
        self.pretrain_biword_embedding = None
        self.pretrain_gaz_embedding = None
        self.label_size = 0
        self.word_alphabet_size = 0
        self.biword_alphabet_size = 0
        self.char_alphabet_size = 0
        self.label_alphabet_size = 0
        ### hyperparameters
        self.HP_iteration = 100
        self.HP_batch_size = 10
        self.HP_char_hidden_dim = 50
        self.HP_hidden_dim = 200
        self.HP_dropout = 0.5
        self.HP_lstm_layer = 1
        self.HP_bilstm = True
        self.HP_use_char = False
        self.HP_gpu = False
        self.HP_lr = 0.015
        self.HP_lr_decay = 0.05
        self.HP_clip = 5.0
        self.HP_momentum = 0

    def show_data_summary(self):
        print("DATA SUMMARY START:")
        print(" Tag scheme: %s" % (self.tagScheme))
        print(" MAX SENTENCE LENGTH: %s" % (self.MAX_SENTENCE_LENGTH))
        print(" MAX WORD LENGTH: %s" % (self.MAX_WORD_LENGTH))
        print(" Number normalized: %s" % (self.number_normalized))
        print(" Use bigram: %s" % (self.use_bigram))
        print(" Word alphabet size: %s" % (self.word_alphabet_size))
        print(" Biword alphabet size: %s" % (self.biword_alphabet_size))
        print(" Char alphabet size: %s" % (self.char_alphabet_size))
        print(" Gaz alphabet size: %s" % (self.gaz_alphabet.size()))
        print(" Label alphabet size: %s" % (self.label_alphabet_size))
        print(" Word embedding size: %s" % (self.word_emb_dim))
        print(" Biword embedding size: %s" % (self.biword_emb_dim))
        print(" Char embedding size: %s" % (self.char_emb_dim))
        print(" Gaz embedding size: %s" % (self.gaz_emb_dim))
        print(" Norm word emb: %s" % (self.norm_word_emb))
        print(" Norm biword emb: %s" % (self.norm_biword_emb))
        print(" Norm gaz emb: %s" % (self.norm_gaz_emb))
        print(" Norm gaz dropout: %s" % (self.gaz_dropout))
        print(" Train instance number: %s" % (len(self.train_texts)))
        print(" Dev instance number: %s" % (len(self.dev_texts)))
        print(" Test instance number: %s" % (len(self.test_texts)))
        print(" Raw instance number: %s" % (len(self.raw_texts)))
        print(" Hyperpara iteration: %s" % (self.HP_iteration))
        print(" Hyperpara batch size: %s" % (self.HP_batch_size))
        print(" Hyperpara lr: %s" % (self.HP_lr))
        print(" Hyperpara lr_decay: %s" % (self.HP_lr_decay))
        print(" Hyperpara HP_clip: %s" % (self.HP_clip))
        print(" Hyperpara momentum: %s" % (self.HP_momentum))
        print(" Hyperpara hidden_dim: %s" % (self.HP_hidden_dim))
        print(" Hyperpara dropout: %s" % (self.HP_dropout))
        print(" Hyperpara lstm_layer: %s" % (self.HP_lstm_layer))
        print(" Hyperpara bilstm: %s" % (self.HP_bilstm))
        print(" Hyperpara GPU: %s" % (self.HP_gpu))
        print(" Hyperpara use_gaz: %s" % (self.HP_use_gaz))
        print(" Hyperpara fix gaz emb: %s" % (self.HP_fix_gaz_emb))
        print(" Hyperpara use_char: %s" % (self.HP_use_char))
        if self.HP_use_char:
            print(" Char_features: %s" % (self.char_features))
        print("DATA SUMMARY END.")
        sys.stdout.flush()

    def refresh_label_alphabet(self, input_file):
        old_size = self.label_alphabet_size
        self.label_alphabet.clear(True)
        in_lines = open(input_file, 'r', encoding="utf-8").readlines()
        for line in in_lines:
            if len(line) > 2:
                pairs = line.strip().split()
                label = pairs[-1]
                self.label_alphabet.add(label)
        self.label_alphabet_size = self.label_alphabet.size()
        startS = False
        startB = False
        for label, _ in self.label_alphabet.iteritems():
            if "S-" in label.upper():
                startS = True
            elif "B-" in label.upper():
                startB = True
        if startB:
            if startS:
                self.tagScheme = "BMES"
            else:
                self.tagScheme = "BIO"
        self.fix_alphabet()
        print("Refresh label alphabet finished: old:%s -> new:%s" % (old_size, self.label_alphabet_size))

    def build_alphabet(self, input_file):
        in_lines = open(input_file, 'r', encoding="utf-8").readlines()
        for idx in range(len(in_lines)):
            line = in_lines[idx]
            if len(line) > 2:
                pairs = line.strip().split()
                word = pairs[0]
                if self.number_normalized:
                    word = normalize_word(word)
                label = pairs[-1]
                self.label_alphabet.add(label)
                self.word_alphabet.add(word)
                if idx < len(in_lines) - 1 and len(in_lines[idx + 1]) > 2:
                    biword = word + in_lines[idx + 1].strip().split()[0]
                else:
                    biword = word + NULLKEY
                self.biword_alphabet.add(biword)
                for char in word:
                    self.char_alphabet.add(char)
        self.word_alphabet_size = self.word_alphabet.size()
        self.biword_alphabet_size = self.biword_alphabet.size()
        self.char_alphabet_size = self.char_alphabet.size()
        self.label_alphabet_size = self.label_alphabet.size()
        startS = False
        startB = False
        for label, _ in self.label_alphabet.iteritems():
            if "S-" in label.upper():
                startS = True
            elif "B-" in label.upper():
                startB = True
        if startB:
            if startS:
                self.tagScheme = "BMES"
            else:
                self.tagScheme = "BIO"

    def build_gaz_file(self, gaz_file):
        ## build the gazetteer: read the word list from the gaz embedding file
        if gaz_file:
            fins = open(gaz_file, 'r', encoding="utf-8").readlines()
            for fin in fins:
                fin = fin.strip().split()[0]
                if fin:
                    self.gaz.insert(fin, "one_source")
            print("Load gaz file: ", gaz_file, " total size:", self.gaz.size())
        else:
            print("Gaz file is None, load nothing")

    def build_gaz_alphabet(self, input_file):
        in_lines = open(input_file, 'r', encoding="utf-8").readlines()
        word_list = []
        for line in in_lines:
            if len(line) > 3:
                word = line.split()[0]
                if self.number_normalized:
                    word = normalize_word(word)
                word_list.append(word)
            else:
                w_length = len(word_list)
                for idx in range(w_length):
                    matched_entity = self.gaz.enumerateMatchList(word_list[idx:])
                    for entity in matched_entity:
                        # print entity, self.gaz.searchId(entity), self.gaz.searchType(entity)
                        self.gaz_alphabet.add(entity)
                word_list = []
        print("gaz alphabet size:", self.gaz_alphabet.size())

    def fix_alphabet(self):
        self.word_alphabet.close()
        self.biword_alphabet.close()
        self.char_alphabet.close()
        self.label_alphabet.close()
        self.gaz_alphabet.close()

    def build_word_pretrain_emb(self, emb_path):
        print("build word pretrain emb...")
        self.pretrain_word_embedding, self.word_emb_dim = build_pretrain_embedding(
            emb_path, self.word_alphabet, self.word_emb_dim, self.norm_word_emb)

    def build_biword_pretrain_emb(self, emb_path):
        print("build biword pretrain emb...")
        self.pretrain_biword_embedding, self.biword_emb_dim = build_pretrain_embedding(
            emb_path, self.biword_alphabet, self.biword_emb_dim, self.norm_biword_emb)

    def build_gaz_pretrain_emb(self, emb_path):
        print("build gaz pretrain emb...")
        self.pretrain_gaz_embedding, self.gaz_emb_dim = build_pretrain_embedding(
            emb_path, self.gaz_alphabet, self.gaz_emb_dim, self.norm_gaz_emb)

    def generate_instance(self, input_file, name):
        self.fix_alphabet()
        if name == "train":
            self.train_texts, self.train_Ids = read_seg_instance(
                input_file, self.word_alphabet, self.biword_alphabet,
                self.char_alphabet, self.label_alphabet, self.number_normalized,
                self.MAX_SENTENCE_LENGTH)
        elif name == "dev":
            self.dev_texts, self.dev_Ids = read_seg_instance(
                input_file, self.word_alphabet, self.biword_alphabet,
                self.char_alphabet, self.label_alphabet, self.number_normalized,
                self.MAX_SENTENCE_LENGTH)
        elif name == "test":
            self.test_texts, self.test_Ids = read_seg_instance(
                input_file, self.word_alphabet, self.biword_alphabet,
                self.char_alphabet, self.label_alphabet, self.number_normalized,
                self.MAX_SENTENCE_LENGTH)
        elif name == "raw":
            self.raw_texts, self.raw_Ids = read_seg_instance(
                input_file, self.word_alphabet, self.biword_alphabet,
                self.char_alphabet, self.label_alphabet, self.number_normalized,
                self.MAX_SENTENCE_LENGTH)
        else:
            print("Error: you can only generate train/dev/test/raw instance! Illegal input:%s" % (name))

    def generate_instance_with_gaz(self, input_file, name):
        self.fix_alphabet()
        if name == "train":
            self.train_texts, self.train_Ids = read_instance_with_gaz(
                input_file, self.gaz, self.word_alphabet, self.biword_alphabet,
                self.char_alphabet, self.gaz_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "dev":
            self.dev_texts, self.dev_Ids = read_instance_with_gaz(
                input_file, self.gaz, self.word_alphabet, self.biword_alphabet,
                self.char_alphabet, self.gaz_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "test":
            self.test_texts, self.test_Ids = read_instance_with_gaz(
                input_file, self.gaz, self.word_alphabet, self.biword_alphabet,
                self.char_alphabet, self.gaz_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "raw":
            self.raw_texts, self.raw_Ids = read_instance_with_gaz(
                input_file, self.gaz, self.word_alphabet, self.biword_alphabet,
                self.char_alphabet, self.gaz_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        else:
            print("Error: you can only generate train/dev/test/raw instance! Illegal input:%s" % (name))

    def write_decoded_results(self, output_file, predict_results, name):
        fout = open(output_file, 'w', encoding="utf-8")
        sent_num = len(predict_results)
        content_list = []
        if name == 'raw':
            content_list = self.raw_texts
        elif name == 'test':
            content_list = self.test_texts
        elif name == 'dev':
            content_list = self.dev_texts
        elif name == 'train':
            content_list = self.train_texts
        else:
            print("Error: illegal name during writing predict result, name should be within train/dev/test/raw !")
        assert (sent_num == len(content_list))
        for idx in range(sent_num):
            sent_length = len(predict_results[idx])
            for idy in range(sent_length):
                ## content_list[idx] is a list with [word, char, label]
                fout.write(content_list[idx][0][idy] + " " + predict_results[idx][idy] + '\n')
            fout.write('\n')
        fout.close()
        print("Predict %s result has been written into file: %s" % (name, output_file))
class Data:
    def __init__(self):
        self.max_sentence_length = 200
        self.number_normalized = True
        self.norm_char_emb = True
        self.norm_gaz_emb = True
        self.dataset_name = 'msra'
        self.tagscheme = "NoSeg"
        self.char_alphabet = Alphabet('character')
        self.label_alphabet = Alphabet('label', unkflag=False)
        self.gaz_lower = False
        self.gaz = Gazetteer(self.gaz_lower)
        self.gaz_alphabet = Alphabet('gaz')
        self.train_ids = []
        self.dev_ids = []
        self.test_ids = []
        self.train_texts = []
        self.dev_texts = []
        self.test_texts = []
        self.char_emb_dim = 100
        self.gaz_emb_dim = 100
        self.pretrain_char_embedding = None
        self.pretrain_gaz_embedding = None
        self.dev_cut_num = 0
        self.train_cut_num = 0
        self.test_cut_num = 0
        self.cut_num = 0

    def show_data_summary(self):
        print("DATA SUMMARY START:")
        print(" Dataset name: %s" % self.dataset_name)
        print(" Tag scheme: %s" % (self.tagscheme))
        print(" Max Sentence Length: %s" % self.max_sentence_length)
        print(" Char alphabet size: %s" % self.char_alphabet.size())
        print(" Gaz alphabet size: %s" % self.gaz_alphabet.size())
        print(" Label alphabet size: %s" % self.label_alphabet.size())
        print(" Char embedding size: %s" % self.char_emb_dim)
        print(" Gaz embedding size: %s" % self.gaz_emb_dim)
        print(" Number normalized: %s" % self.number_normalized)
        print(" Norm char emb: %s" % self.norm_char_emb)
        print(" Norm gaz emb: %s" % self.norm_gaz_emb)
        print(" Train instance number: %s" % (len(self.train_ids)))
        print(" Dev instance number: %s" % (len(self.dev_ids)))
        print(" Test instance number: %s" % (len(self.test_ids)))
        if self.cut_num != 0:
            print(" Train&Dev cut number: %s" % self.cut_num)
        else:
            print(" Train cut number: %s" % self.train_cut_num)
            print(" Dev cut number: %s" % self.dev_cut_num)
        print(" Test cut number: %s" % self.test_cut_num)
        print("DATA SUMMARY END.")
        sys.stdout.flush()

    def build_gaz_file(self, gaz_file, skip_first_row=False, separator=" "):
        ## build gaz file,initial read gaz embedding file
        if gaz_file:
            with open(gaz_file, 'r') as f:
                i = 0
                for line in tqdm(f):
                    if i == 0:
                        i = i + 1
                        if skip_first_row:
                            _ = line.strip()
                            continue
                    fin = line.strip().split(separator)[0]
                    if fin:
                        self.gaz.insert(fin, "one_source")
            print("Load gaz file: ", gaz_file, " total size:", self.gaz.size())
        else:
            print("Gaz file is None, load nothing")

    def fix_alphabet(self):
        self.char_alphabet.close()
        self.label_alphabet.close()
        self.gaz_alphabet.close()

    def build_char_pretrain_emb(self, emb_path, skip_first_row=False, separator=" "):
        print("build char pretrain emb...")
        self.pretrain_char_embedding, self.char_emb_dim = \
            build_pretrain_embedding(emb_path, self.char_alphabet, skip_first_row,
                                     separator, self.char_emb_dim, self.norm_char_emb)

    def build_gaz_pretrain_emb(self, emb_path, skip_first_row=True, separator=" "):
        print("build gaz pretrain emb...")
        self.pretrain_gaz_embedding, self.gaz_emb_dim = build_pretrain_embedding(
            emb_path, self.gaz_alphabet, skip_first_row, separator,
            self.gaz_emb_dim, self.norm_gaz_emb)

    def generate_instance(self, input_file, name, random_split=False):
        texts, ids, cut_num = read_instance(input_file, self.gaz, self.char_alphabet,
                                            self.label_alphabet, self.gaz_alphabet,
                                            self.number_normalized,
                                            self.max_sentence_length)
        if name == "train":
            if random_split:
                random.seed(1)
                ix = [i for i in range(len(ids))]
                train_ix = random.sample(ix, int(len(ids) * 0.9))
                dev_ix = list(set(ix).difference(set(train_ix)))
                self.train_ids = [ids[ele] for ele in train_ix]
                self.dev_ids = [ids[ele] for ele in dev_ix]
                self.train_texts = [texts[ele] for ele in train_ix]
                self.dev_texts = [texts[ele] for ele in dev_ix]
                self.cut_num = cut_num
            else:
                self.train_ids = ids
                self.train_texts = texts
                self.train_cut_num = cut_num
        elif name == "dev":
            self.dev_ids = ids
            self.dev_texts = texts
            self.dev_cut_num = cut_num
        elif name == "test":
            self.test_ids = ids
            self.test_texts = texts
            self.test_cut_num = cut_num
        else:
            print("Error: you can only generate train/dev/test instance! Illegal input:%s" % name)

    def get_tag_scheme(self):
        startS = False
        startB = False
        for label, _ in self.label_alphabet.iteritems():
            if "S-" in label.upper():
                startS = True
            elif "B-" in label.upper():
                startB = True
        if startB:
            if startS:
                self.tagscheme = "BMES"
            else:
                self.tagscheme = "BIO"
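# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original file): with the Data
# class above, random_split=True carves a 90/10 train/dev split out of a
# single training file. File paths are hypothetical placeholders.
# ---------------------------------------------------------------------------
def _demo_random_split():
    data = Data()
    data.build_gaz_file("data/word_emb_100d.txt", skip_first_row=True)
    # alphabets grow while instances are read, so fix them afterwards
    data.generate_instance("data/msra_train.txt", "train", random_split=True)
    data.generate_instance("data/msra_test.txt", "test")
    data.fix_alphabet()
    data.get_tag_scheme()
    data.build_char_pretrain_emb("data/char_emb_100d.txt")
    data.build_gaz_pretrain_emb("data/word_emb_100d.txt")
    data.show_data_summary()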
class Data: """ 所有数据预处理程序都封装在Data类里面 Data类中封装了Alphabet类,Alphabet类主要功能是word转成id,将id转成词 Alphabet类的构建是通过build_alphabet函数构建的 """ # def __init__(self): self.MAX_SENTENCE_LENGTH = 250 #句子最大长度 self.number_normalized = True #是否将数字归一化 self.norm_word_emb = True #是否将词向量归一化 self.word_alphabet = Alphabet('word') #word的词表与id self.label_alphabet = Alphabet('label', True) #not end "</unk>" #约定标注方式 self.tagScheme = "NoSeg" self.train_texts = [] self.dev_texts = [] self.test_texts = [] self.train_Ids = [] self.dev_Ids = [] self.test_Ids = [] self.word_emb_dim = 50 self.pretrain_word_embedding = None self.label_size = 0 self.word_alphabet_size = 0 self.label_alphabet_size = 0 ### hyperparameters self.HP_iteration = 200 self.HP_batch_size = 32 # 1 self.HP_hidden_dim = 200 self.HP_dropout = 0.3 self.HP_lstm_layer = 1 self.HP_bilstm = True self.HP_gpu = True # true self.HP_lr = 0.01 self.HP_lr_decay = 0.05 self.weight_decay = 0.00000005 self.use_clip = False self.HP_clip = 5.0 self.HP_momentum = 0 #控制优化器的一个超参 self.random_seed = 100 def show_data_summary(self): print("DATA SUMMARY START:") print(" Tag scheme: %s" % (self.tagScheme)) print(" Use GPU: %s" % (self.HP_gpu)) print(" MAX SENTENCE LENGTH: %s" % (self.MAX_SENTENCE_LENGTH)) print(" Number normalized: %s" % (self.number_normalized)) print(" Word alphabet size: %s" % (self.word_alphabet_size)) print(" Label alphabet size: %s" % (self.label_alphabet_size - 1)) print(" Word embedding size: %s" % (self.word_emb_dim)) print(" Norm word emb: %s" % (self.norm_word_emb)) print(" Train instance number: %s" % (len(self.train_texts))) print(" Dev instance number: %s" % (len(self.dev_texts))) print(" Test instance number: %s" % (len(self.test_texts))) print("--*--整体参数设定区域--*--") print(" Hyperpara random seed: %s" % (self.random_seed)) print(" Hyperpara iteration: %s" % (self.HP_iteration)) print(" Hyperpara batch size: %s" % (self.HP_batch_size)) print(" Hyperpara lr: %s" % (self.HP_lr)) print(" Hyperpara lr_decay: %s" % (self.HP_lr_decay)) print(" Hyperpara weight_decay: %s" % (self.weight_decay)) if self.use_clip: print(" Hyperpara HP_clip: %s" % (self.HP_clip)) print(" Hyperpara momentum: %s" % (self.HP_momentum)) print("--*--LSTM参数设定区域--*--") print(" Hyperpara hidden_dim: %s" % (self.HP_hidden_dim)) print(" Hyperpara dropout: %s" % (self.HP_dropout)) print(" Hyperpara lstm_layer_num: %s" % (self.HP_lstm_layer)) print(" Hyperpara bilstm: %s" % (self.HP_bilstm)) print("DATA SUMMARY END.") sys.stdout.flush() # 构建词典 def build_alphabet(self, input_file): in_lines = open(input_file, 'r').readlines() for idx in range(len(in_lines)): line = in_lines[idx] if len(line) > 2: pairs = line.strip().split() word = pairs[0] if self.number_normalized: # True word = normalize_word(word) # 把字符中的数字变成0 label = pairs[-1] self.label_alphabet.add(label) self.word_alphabet.add(word) self.word_alphabet_size = self.word_alphabet.size() self.label_alphabet_size = self.label_alphabet.size() #从数据中确定标签格式 startS = False startB = False for label, _ in self.label_alphabet.iteritems(): if "S-" in label.upper(): startS = True elif "B-" in label.upper(): startB = True if startB: if startS: self.tagScheme = "BMES" else: self.tagScheme = "BIO" def fix_alphabet(self): self.word_alphabet.close() self.label_alphabet.close() def build_word_pretrain_emb(self, emb_path): # 载入预训练词向量 print("build word pretrain emb...") self.pretrain_word_embedding, self.word_emb_dim = build_pretrain_embedding( emb_path, self.word_alphabet, self.word_emb_dim, self.norm_word_emb) def generate_instance(self, input_file, name): # 
产生训练开发训练数据 # self.fix_alphabet() if name == "train": self.train_texts, self.train_Ids = read_instance( input_file, self.word_alphabet, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH) elif name == "dev": self.dev_texts, self.dev_Ids = read_instance( input_file, self.word_alphabet, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH) elif name == "test": self.test_texts, self.test_Ids = read_instance( input_file, self.word_alphabet, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH) else: print( "Error: you can only generate train/dev/test instance! Illegal input:%s" % (name))
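# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original file): the minimal
# train/dev/test flow for the simple Data class above. File paths are
# hypothetical placeholders.
# ---------------------------------------------------------------------------
def _demo_basic_pipeline():
    data = Data()
    for path in ("data/train.txt", "data/dev.txt", "data/test.txt"):
        data.build_alphabet(path)
    data.fix_alphabet()
    data.build_word_pretrain_emb("data/word_emb_50d.txt")
    data.generate_instance("data/train.txt", "train")
    data.generate_instance("data/dev.txt", "dev")
    data.generate_instance("data/test.txt", "test")
    data.show_data_summary()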
class Data:
    def __init__(self):
        self.MAX_SENTENCE_LENGTH = 250
        self.MAX_WORD_LENGTH = -1
        self.number_normalized = True
        self.norm_word_emb = True
        self.norm_biword_emb = True
        self.norm_gaz_emb = False
        self.word_alphabet = Alphabet('word')
        self.biword_alphabet = Alphabet('biword')
        self.char_alphabet = Alphabet('character')
        # self.word_alphabet.add(START)
        # self.word_alphabet.add(UNKNOWN)
        # self.char_alphabet.add(START)
        # self.char_alphabet.add(UNKNOWN)
        # self.char_alphabet.add(PADDING)
        self.label_alphabet = Alphabet('label', True)
        self.gaz_lower = False
        self.gaz = Gazetteer(self.gaz_lower)
        self.gaz_alphabet = Alphabet('gaz')
        self.HP_fix_gaz_emb = False
        self.HP_use_gaz = True
        self.tagScheme = "NoSeg"
        self.char_features = "LSTM"
        self.train_texts = []
        self.dev_texts = []
        self.test_texts = []
        self.raw_texts = []
        self.train_Ids = []
        self.dev_Ids = []
        self.test_Ids = []
        self.raw_Ids = []
        self.use_bigram = True
        self.word_emb_dim = 50
        self.biword_emb_dim = 50
        self.char_emb_dim = 30
        self.gaz_emb_dim = 50
        self.gaz_dropout = 0.5
        self.pretrain_word_embedding = None
        self.pretrain_biword_embedding = None
        self.pretrain_gaz_embedding = None
        self.label_size = 0
        self.word_alphabet_size = 0
        self.biword_alphabet_size = 0
        self.char_alphabet_size = 0
        self.label_alphabet_size = 0
        ### hyperparameters
        self.HP_iteration = 100
        self.HP_batch_size = 10
        self.HP_char_hidden_dim = 50
        self.HP_hidden_dim = 200
        self.HP_dropout = 0.5
        self.HP_lstm_layer = 1
        self.HP_bilstm = True
        self.HP_use_char = False
        self.HP_gpu = False
        self.HP_lr = 0.015
        self.HP_lr_decay = 0.05
        self.HP_clip = 5.0
        self.HP_momentum = 0

    def show_data_summary(self):
        print("DATA SUMMARY START:")
        print(" Tag scheme: %s" % (self.tagScheme))
        print(" MAX SENTENCE LENGTH: %s" % (self.MAX_SENTENCE_LENGTH))
        print(" MAX WORD LENGTH: %s" % (self.MAX_WORD_LENGTH))
        print(" Number normalized: %s" % (self.number_normalized))
        print(" Use bigram: %s" % (self.use_bigram))
        print(" Word alphabet size: %s" % (self.word_alphabet_size))
        print(" Biword alphabet size: %s" % (self.biword_alphabet_size))
        print(" Char alphabet size: %s" % (self.char_alphabet_size))
        print(" Gaz alphabet size: %s" % (self.gaz_alphabet.size()))
        print(" Label alphabet size: %s" % (self.label_alphabet_size))
        print(" Word embedding size: %s" % (self.word_emb_dim))
        print(" Biword embedding size: %s" % (self.biword_emb_dim))
        print(" Char embedding size: %s" % (self.char_emb_dim))
        print(" Gaz embedding size: %s" % (self.gaz_emb_dim))
        print(" Norm word emb: %s" % (self.norm_word_emb))
        print(" Norm biword emb: %s" % (self.norm_biword_emb))
        print(" Norm gaz emb: %s" % (self.norm_gaz_emb))
        print(" Norm gaz dropout: %s" % (self.gaz_dropout))
        print(" Train instance number: %s" % (len(self.train_texts)))
        print(" Dev instance number: %s" % (len(self.dev_texts)))
        print(" Test instance number: %s" % (len(self.test_texts)))
        print(" Raw instance number: %s" % (len(self.raw_texts)))
        print(" Hyperpara iteration: %s" % (self.HP_iteration))
        print(" Hyperpara batch size: %s" % (self.HP_batch_size))
        print(" Hyperpara lr: %s" % (self.HP_lr))
        print(" Hyperpara lr_decay: %s" % (self.HP_lr_decay))
        print(" Hyperpara HP_clip: %s" % (self.HP_clip))
        print(" Hyperpara momentum: %s" % (self.HP_momentum))
        print(" Hyperpara hidden_dim: %s" % (self.HP_hidden_dim))
        print(" Hyperpara dropout: %s" % (self.HP_dropout))
        print(" Hyperpara lstm_layer: %s" % (self.HP_lstm_layer))
        print(" Hyperpara bilstm: %s" % (self.HP_bilstm))
        print(" Hyperpara GPU: %s" % (self.HP_gpu))
        print(" Hyperpara use_gaz: %s" % (self.HP_use_gaz))
        print(" Hyperpara fix gaz emb: %s" % (self.HP_fix_gaz_emb))
        print(" Hyperpara use_char: %s" % (self.HP_use_char))
        if self.HP_use_char:
            print(" Char_features: %s" % (self.char_features))
        print("DATA SUMMARY END.")
        sys.stdout.flush()

    def refresh_label_alphabet(self, input_file):
        old_size = self.label_alphabet_size
        self.label_alphabet.clear(True)
        in_lines = open(input_file, 'r').readlines()
        for line in in_lines:
            if len(line) > 2:
                pairs = line.strip().split()
                label = pairs[-1]
                self.label_alphabet.add(label)
        self.label_alphabet_size = self.label_alphabet.size()
        startS = False
        startB = False
        for label, _ in self.label_alphabet.iteritems():
            if "S-" in label.upper():
                startS = True
            elif "B-" in label.upper():
                startB = True
        if startB:
            if startS:
                self.tagScheme = "BMES"
            else:
                self.tagScheme = "BIO"
        self.fix_alphabet()
        print("Refresh label alphabet finished: old:%s -> new:%s" % (old_size, self.label_alphabet_size))

    def build_alphabet(self, input_file):
        in_lines = open(input_file, 'r').readlines()
        for idx in range(len(in_lines)):
            line = in_lines[idx]
            if len(line) > 2:
                pairs = line.strip().split()
                word = pairs[0]
                if self.number_normalized:
                    word = normalize_word(word)
                label = pairs[-1]
                self.label_alphabet.add(label)
                self.word_alphabet.add(word)
                if idx < len(in_lines) - 1 and len(in_lines[idx + 1]) > 2:
                    biword = word + in_lines[idx + 1].strip().split()[0]
                else:
                    biword = word + NULLKEY
                self.biword_alphabet.add(biword)
                for char in word:
                    self.char_alphabet.add(char)
        self.word_alphabet_size = self.word_alphabet.size()
        self.biword_alphabet_size = self.biword_alphabet.size()
        self.char_alphabet_size = self.char_alphabet.size()
        self.label_alphabet_size = self.label_alphabet.size()
        startS = False
        startB = False
        for label, _ in self.label_alphabet.iteritems():
            if "S-" in label.upper():
                startS = True
            elif "B-" in label.upper():
                startB = True
        if startB:
            if startS:
                self.tagScheme = "BMES"
            else:
                self.tagScheme = "BIO"

    def build_gaz_file(self, gaz_file):
        ## build gaz file,initial read gaz embedding file
        if gaz_file:
            fins = open(gaz_file, 'r').readlines()
            for fin in fins:
                fin = fin.strip().split()[0]
                if fin:
                    self.gaz.insert(fin, "one_source")
            print("Load gaz file: ", gaz_file, " total size:", self.gaz.size())
        else:
            print("Gaz file is None, load nothing")

    def build_gaz_alphabet(self, input_file):
        in_lines = open(input_file, 'r').readlines()
        word_list = []
        for line in in_lines:
            if len(line) > 3:
                word = line.split()[0]
                if self.number_normalized:
                    word = normalize_word(word)
                word_list.append(word)
            else:
                w_length = len(word_list)
                for idx in range(w_length):
                    matched_entity = self.gaz.enumerateMatchList(word_list[idx:])
                    for entity in matched_entity:
                        # print entity, self.gaz.searchId(entity), self.gaz.searchType(entity)
                        self.gaz_alphabet.add(entity)
                word_list = []
        print("gaz alphabet size:", self.gaz_alphabet.size())

    def fix_alphabet(self):
        self.word_alphabet.close()
        self.biword_alphabet.close()
        self.char_alphabet.close()
        self.label_alphabet.close()
        self.gaz_alphabet.close()

    def build_word_pretrain_emb(self, emb_path):
        print("build word pretrain emb...")
        self.pretrain_word_embedding, self.word_emb_dim = build_pretrain_embedding(
            emb_path, self.word_alphabet, self.word_emb_dim, self.norm_word_emb)

    def build_biword_pretrain_emb(self, emb_path):
        print("build biword pretrain emb...")
        self.pretrain_biword_embedding, self.biword_emb_dim = build_pretrain_embedding(
            emb_path, self.biword_alphabet, self.biword_emb_dim, self.norm_biword_emb)

    def build_gaz_pretrain_emb(self, emb_path):
        print("build gaz pretrain emb...")
        self.pretrain_gaz_embedding, self.gaz_emb_dim = build_pretrain_embedding(
            emb_path, self.gaz_alphabet, self.gaz_emb_dim, self.norm_gaz_emb)

    def generate_instance(self, input_file, name):
        self.fix_alphabet()
        if name == "train":
            self.train_texts, self.train_Ids = read_seg_instance(
                input_file, self.word_alphabet, self.biword_alphabet,
                self.char_alphabet, self.label_alphabet, self.number_normalized,
                self.MAX_SENTENCE_LENGTH)
        elif name == "dev":
            self.dev_texts, self.dev_Ids = read_seg_instance(
                input_file, self.word_alphabet, self.biword_alphabet,
                self.char_alphabet, self.label_alphabet, self.number_normalized,
                self.MAX_SENTENCE_LENGTH)
        elif name == "test":
            self.test_texts, self.test_Ids = read_seg_instance(
                input_file, self.word_alphabet, self.biword_alphabet,
                self.char_alphabet, self.label_alphabet, self.number_normalized,
                self.MAX_SENTENCE_LENGTH)
        elif name == "raw":
            self.raw_texts, self.raw_Ids = read_seg_instance(
                input_file, self.word_alphabet, self.biword_alphabet,
                self.char_alphabet, self.label_alphabet, self.number_normalized,
                self.MAX_SENTENCE_LENGTH)
        else:
            print("Error: you can only generate train/dev/test instance! Illegal input:%s" % (name))

    def generate_instance_with_gaz(self, input_file, name):
        self.fix_alphabet()
        if name == "train":
            self.train_texts, self.train_Ids = read_instance_with_gaz(
                input_file, self.gaz, self.word_alphabet, self.biword_alphabet,
                self.char_alphabet, self.gaz_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "dev":
            self.dev_texts, self.dev_Ids = read_instance_with_gaz(
                input_file, self.gaz, self.word_alphabet, self.biword_alphabet,
                self.char_alphabet, self.gaz_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "test":
            self.test_texts, self.test_Ids = read_instance_with_gaz(
                input_file, self.gaz, self.word_alphabet, self.biword_alphabet,
                self.char_alphabet, self.gaz_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "raw":
            self.raw_texts, self.raw_Ids = read_instance_with_gaz(
                input_file, self.gaz, self.word_alphabet, self.biword_alphabet,
                self.char_alphabet, self.gaz_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        else:
            print("Error: you can only generate train/dev/test instance! Illegal input:%s" % (name))

    def write_decoded_results(self, output_file, predict_results, name):
        fout = open(output_file, 'w')
        sent_num = len(predict_results)
        content_list = []
        if name == 'raw':
            content_list = self.raw_texts
        elif name == 'test':
            content_list = self.test_texts
        elif name == 'dev':
            content_list = self.dev_texts
        elif name == 'train':
            content_list = self.train_texts
        else:
            print("Error: illegal name during writing predict result, name should be within train/dev/test/raw !")
        assert (sent_num == len(content_list))
        for idx in range(sent_num):
            sent_length = len(predict_results[idx])
            for idy in range(sent_length):
                ## content_list[idx] is a list with [word, char, label]
                fout.write(content_list[idx][0][idy] + " " + predict_results[idx][idy] + '\n')
            fout.write('\n')
        fout.close()
        print("Predict %s result has been written into file. %s" % (name, output_file))
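# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original file): reusing an
# already-built Data object (above) on a corpus with a different label set via
# refresh_label_alphabet(), which rebuilds only the label alphabet. Paths are
# hypothetical placeholders.
# ---------------------------------------------------------------------------
def _demo_refresh_labels(data):
    data.refresh_label_alphabet("data/new_task.train.bmes")
    data.generate_instance_with_gaz("data/new_task.train.bmes", "train")
    data.generate_instance_with_gaz("data/new_task.dev.bmes", "dev")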
class JointData:
    def __init__(self):
        self.MAX_SENTENCE_LENGTH = 250
        self.MAX_WORD_LENGTH = -1
        self.number_normalized = True
        self.norm_word_emb = False
        self.word_alphabet = Alphabet('word')
        self.label = [
            "O", "B-A", "I-A", "B-O", "I-O", "B-E", "I-E", "B-T", "I-T",
            "B-C", "I-C"
        ]
        self.label_alphabet = Alphabet('label', True)
        self.sentence_type_alphabet = Alphabet('sentence', True)
        self.tagScheme = "NoSeg"  ## BMES/BIO
        self.seg = True

        ### I/O
        self.train_dir = None
        self.dev_dir = None
        self.test_dir = None
        self.raw_dir = None
        self.decode_dir = None
        self.dset_dir = None  ## data vocabulary related file
        self.model_dir = None  ## model save file
        self.load_model_dir = None  ## model load file
        self.word_emb_dir = None
        self.word_emb_file = None
        self.train_texts = []
        self.dev_texts = []
        self.test_texts = []
        self.raw_texts = []
        self.train_Ids = []
        self.dev_Ids = []
        self.test_Ids = []
        self.raw_Ids = []
        self.pretrain_word_embedding = None
        self.use_pre_trained_model = None
        self.word_alphabet_size = 0
        self.opinion_label_alphabet_size = 0
        self.evidence_label_alphabet_size = 0
        self.sentence_alphabet_size = 0
        self.word_emb_dim = 50
        self.lstm_input_size = 50

        ### Networks
        self.word_feature_extractor = "LSTM"  ## "LSTM"/"CNN"/"GRU"
        self.use_crf = True
        self.nbest = None

        ## Training
        self.average_batch_loss = False
        self.optimizer = "SGD"  ## "SGD"/"AdaGrad"/"AdaDelta"/"RMSProp"/"Adam"
        self.status = "train"

        ### Hyperparameters
        self.HP_iteration = 100
        self.HP_batch_size = 10
        self.HP_hidden_dim = 200
        self.HP_attention_query_input_dim = 200
        self.HP_dropout = 0.5
        self.HP_lstm_layer = 1
        self.HP_bilstm = True
        self.HP_gpu = False
        self.HP_lr = 0.015
        self.HP_lr_decay = 0.05
        self.HP_clip = None
        self.HP_momentum = 0
        self.HP_l2 = 1e-8

    def show_data_summary(self):
        logger.info("++" * 50)
        logger.info("DATA SUMMARY START:")
        logger.info(" I/O:")
        logger.info(" Tag scheme: %s" % (self.tagScheme))
        logger.info(" MAX SENTENCE LENGTH: %s" % (self.MAX_SENTENCE_LENGTH))
        logger.info(" MAX WORD LENGTH: %s" % (self.MAX_WORD_LENGTH))
        logger.info(" Number normalized: %s" % (self.number_normalized))
        logger.info(" Word alphabet size: %s" % (self.word_alphabet_size))
        logger.info(" Opinion Label alphabet size: %s" % (self.opinion_label_alphabet_size))
        logger.info(" Evidence Label alphabet size: %s" % (self.evidence_label_alphabet_size))
        logger.info(" Word embedding dir: %s" % (self.word_emb_dir))
        logger.info(" Word embedding size: %s" % (self.word_emb_dim))
        logger.info(" Norm word emb: %s" % (self.norm_word_emb))
        logger.info(" Train file directory: %s" % (self.train_dir))
        logger.info(" Dev file directory: %s" % (self.dev_dir))
        logger.info(" Test file directory: %s" % (self.test_dir))
        logger.info(" Raw file directory: %s" % (self.raw_dir))
        logger.info(" Dset file directory: %s" % (self.dset_dir))
        logger.info(" Model file directory: %s" % (self.model_dir))
        logger.info(" Loadmodel directory: %s" % (self.load_model_dir))
        logger.info(" Decode file directory: %s" % (self.decode_dir))
        logger.info(" Train instance number: %s" % (len(self.train_texts)))
        logger.info(" Dev instance number: %s" % (len(self.dev_texts)))
        logger.info(" Test instance number: %s" % (len(self.test_texts)))
        logger.info(" Raw instance number: %s" % (len(self.raw_texts)))
        logger.info(" " + "++" * 20)
        logger.info(" Model Network:")
        logger.info(" Model use_crf: %s" % (self.use_crf))
        logger.info(" Model word extractor: %s" % (self.word_feature_extractor))
        logger.info(" " + "++" * 20)
        logger.info(" Training:")
        logger.info(" Optimizer: %s" % (self.optimizer))
        logger.info(" Iteration: %s" % (self.HP_iteration))
        logger.info(" BatchSize: %s" % (self.HP_batch_size))
        logger.info(" Average batch loss: %s" % (self.average_batch_loss))
        logger.info(" " + "++" * 20)
        logger.info(" Hyperparameters:")
        logger.info(" Hyper lr: %s" % (self.HP_lr))
        logger.info(" Hyper lr_decay: %s" % (self.HP_lr_decay))
        logger.info(" Hyper HP_clip: %s" % (self.HP_clip))
        logger.info(" Hyper momentum: %s" % (self.HP_momentum))
        logger.info(" Hyper l2: %s" % (self.HP_l2))
        logger.info(" Hyper hidden_dim: %s" % (self.HP_hidden_dim))
        logger.info(" Hyper attention_input: %s" % (self.HP_attention_query_input_dim))
        logger.info(" Hyper dropout: %s" % (self.HP_dropout))
        logger.info(" Hyper lstm_layer: %s" % (self.HP_lstm_layer))
        logger.info(" Hyper bilstm: %s" % (self.HP_bilstm))
        logger.info(" Hyper GPU: %s" % (self.HP_gpu))
        logger.info("DATA SUMMARY END.")
        logger.info("++" * 50)
        sys.stdout.flush()

    def build_alphabet(self, input_file):
        in_lines = open(input_file, 'r', encoding='utf-8').readlines()
        for line in in_lines:
            pairs = line.strip().split()
            if len(pairs) == 2:
                word = pairs[0]
                if sys.version_info[0] < 3:
                    word = word.decode('utf-8')
                if self.number_normalized:
                    word = normalize_word(word)
                self.word_alphabet.add(word)
            if len(pairs) == 1 and not line.strip() == "<end>":
                sentence_type = line.strip()
                self.sentence_type_alphabet.add(sentence_type)
        for l in self.label:
            self.label_alphabet.add(l)
        self.word_alphabet_size = self.word_alphabet.size()
        self.label_alphabet_size = self.label_alphabet.size()
        self.sentence_alphabet_size = self.sentence_type_alphabet.size()
        startS = False
        startB = False
        for label, _ in self.label_alphabet.iteritems():
            if "S-" in label.upper():
                startS = True
            elif "B-" in label.upper():
                startB = True
        if startB:
            if startS:
                self.tagScheme = "BMES"
            else:
                self.tagScheme = "BIO"

    def fix_alphabet(self):
        self.word_alphabet.close()
        self.label_alphabet.close()
        self.sentence_type_alphabet.close()

    def build_pretrain_emb(self):
        if self.word_emb_dir:
            emb_path = os.path.join(self.word_emb_dir, self.word_emb_file)
            logger.info("Load pretrained word embedding, norm: %s, dir: %s" % (self.norm_word_emb, emb_path))
            self.pretrain_word_embedding, self.word_emb_dim = build_pretrain_embedding(
                emb_path, self.word_alphabet, self.word_emb_dim, self.norm_word_emb)

    def generate_instance(self, name):
        self.fix_alphabet()
        if name == "train":
            self.train_texts, self.train_Ids = read_instance(
                self.train_dir, self.word_alphabet, self.label_alphabet,
                self.sentence_type_alphabet, self.number_normalized,
                self.MAX_SENTENCE_LENGTH)
        elif name == "dev":
            self.dev_texts, self.dev_Ids = read_instance(
                self.dev_dir, self.word_alphabet, self.label_alphabet,
                self.sentence_type_alphabet, self.number_normalized,
                self.MAX_SENTENCE_LENGTH)
        elif name == "test":
            self.test_texts, self.test_Ids = read_instance(
                self.test_dir, self.word_alphabet, self.label_alphabet,
                self.sentence_type_alphabet, self.number_normalized,
                self.MAX_SENTENCE_LENGTH)
        elif name == "raw":
            self.raw_texts, self.raw_Ids = read_instance(
                self.raw_dir, self.word_alphabet, self.label_alphabet,
                self.sentence_type_alphabet, self.number_normalized,
                self.MAX_SENTENCE_LENGTH)
        else:
            print("Error: you can only generate train/dev/test/raw instance! Illegal input:%s" % (name))

    def load(self, data_file):
        f = open(data_file, 'rb')
        tmp_dict = pickle.load(f)
        f.close()
        self.__dict__.update(tmp_dict)

    def save(self, save_file):
        f = open(save_file, 'wb')
        pickle.dump(self.__dict__, f, 2)
        f.close()

    def read_config(self, args):
        ## read data:
        self.train_dir = args.train_dir
        self.dev_dir = args.dev_dir
        self.test_dir = args.test_dir
        self.model_dir = args.model_dir
        self.word_emb_dir = args.word_emb_dir
        self.word_emb_file = args.word_emb_file
        self.norm_word_emb = str2bool(args.norm_word_emb)
        self.use_pre_trained_model = str2bool(str(args.use_pre_trained_model))
        self.number_normalized = str2bool(args.number_normalized)
        self.seg = args.seg
        self.lstm_input_size = int(args.lstm_input_size)
        self.use_crf = str2bool(str(args.use_crf))
        self.word_feature_extractor = args.word_seq_feature
        ## read training setting:
        self.optimizer = args.optimizer
        self.average_batch_loss = args.ave_batch_loss
        self.status = args.status
        self.HP_iteration = int(args.iteration)
        self.HP_batch_size = int(args.batch_size)
        self.HP_hidden_dim = int(args.hidden_dim)
        self.HP_attention_query_input_dim = int(args.attention_query_input_size)
        self.HP_dropout = float(args.dropout)
        self.HP_lstm_layer = int(args.lstm_layer)
        self.HP_bilstm = args.bilstm
        self.HP_gpu = args.gpu
        self.HP_lr = float(args.learning_rate)
        self.HP_lr_decay = float(args.lr_decay)
        self.HP_momentum = float(args.momentum)
        self.HP_l2 = float(args.l2)
        self.clip_grad = float(args.clip_grad)
        self.label_embedding_scale = float(args.label_embedding_scale)
        self.num_attention_head = int(args.num_attention_head)
        self.whether_clip_grad = str2bool(args.whether_clip_grad)
class Data:
    def __init__(self):
        self.MAX_SENTENCE_LENGTH = 250
        self.MAX_WORD_LENGTH = -1
        self.number_normalized = False
        self.norm_word_emb = True
        self.norm_biword_emb = True
        self.word_alphabet = Alphabet('word')
        self.biword_alphabet = Alphabet('biword')
        self.pos_alphabet = Alphabet('pos')
        self.label_alphabet = Alphabet('label', True)
        self.tagScheme = "NoSeg"
        self.char_features = "LSTM"
        self.train_texts = []
        self.dev_texts = []
        self.test_texts = []
        self.raw_texts = []
        self.train_Ids = []
        self.dev_Ids = []
        self.test_Ids = []
        self.raw_Ids = []
        self.use_bigram = False
        self.word_emb_dim = 50
        self.biword_emb_dim = 50
        self.pretrain_word_embedding = None
        self.pretrain_biword_embedding = None
        self.label_size = 0
        self.word_alphabet_size = 0
        self.biword_alphabet_size = 0
        self.label_alphabet_size = 0

        # hyperparameters
        self.HP_iteration = 100
        self.HP_batch_size = 16
        self.HP_char_hidden_dim = 50
        self.HP_hidden_dim = 200
        self.HP_dropout = 0.2
        self.HP_lstmdropout = 0
        self.HP_lstm_layer = 1
        self.HP_bilstm = True
        self.HP_gpu = False
        self.HP_lr = 0.015
        self.HP_lr_decay = 0.05
        self.HP_clip = 5.0
        self.HP_momentum = 0

        # attention
        self.tencent_word_embed_dim = 200
        self.pos_embed_dim = 200
        self.cross_domain = False
        self.cross_test = False
        self.use_san = False
        self.use_cnn = False
        self.use_attention = True
        self.pos_to_idx = {}
        self.external_pos = {}
        self.token_replace_prob = {}
        self.use_adam = False
        self.use_bert = False
        self.use_warmup_adam = False
        self.use_sgd = False
        self.use_adadelta = False
        self.use_window = True
        self.mode = 'train'
        self.use_tencent_dic = False

        # cross domain file
        self.computer_file = ""
        self.finance_file = ""
        self.medicine_file = ""
        self.literature_file = ""

    def show_data_summary(self):
        print("DATA SUMMARY START:")
        print(" Tag scheme: %s" % (self.tagScheme))
        print(" MAX SENTENCE LENGTH: %s" % (self.MAX_SENTENCE_LENGTH))
        print(" MAX WORD LENGTH: %s" % (self.MAX_WORD_LENGTH))
        print(" Number normalized: %s" % (self.number_normalized))
        print(" Use bigram: %s" % (self.use_bigram))
        print(" Char alphabet size: %s" % (self.word_alphabet_size))
        print(" BiChar alphabet size: %s" % (self.biword_alphabet_size))
        print(" Label alphabet size: %s" % (self.label_alphabet_size))
        print(" Char embedding size: %s" % (self.word_emb_dim))
        print(" BiChar embedding size: %s" % (self.biword_emb_dim))
        print(" Norm char emb: %s" % (self.norm_word_emb))
        print(" Norm bichar emb: %s" % (self.norm_biword_emb))
        print(" Train instance number: %s" % (len(self.train_texts)))
        print(" Dev instance number: %s" % (len(self.dev_texts)))
        print(" Test instance number: %s" % (len(self.test_texts)))
        print(" Raw instance number: %s" % (len(self.raw_texts)))
        print(" Hyperpara iteration: %s" % (self.HP_iteration))
        print(" Hyperpara batch size: %s" % (self.HP_batch_size))
        print(" Hyperpara lr: %s" % (self.HP_lr))
        print(" Hyperpara lr_decay: %s" % (self.HP_lr_decay))
        print(" Hyperpara HP_clip: %s" % (self.HP_clip))
        print(" Hyperpara momentum: %s" % (self.HP_momentum))
        print(" Hyperpara hidden_dim: %s" % (self.HP_hidden_dim))
        print(" Hyperpara dropout: %s" % (self.HP_dropout))
        print(" Hyperpara lstm_layer: %s" % (self.HP_lstm_layer))
        print(" Hyperpara bilstm: %s" % (self.HP_bilstm))
        print(" Hyperpara GPU: %s" % (self.HP_gpu))
        print(" Cross domain: %s" % self.cross_domain)
        print(" Hyperpara use window: %s" % self.use_window)
        print("DATA SUMMARY END.")
        sys.stdout.flush()

    def build_alphabet(self, input_file):
        in_lines = open(input_file, 'r', encoding='utf-8').readlines()
        for idx in range(len(in_lines)):
            line = in_lines[idx]
            if len(line) > 2:
                pairs = line.strip().split('\t')
                # word = pairs[0].decode('utf-8')
                word = pairs[0]
                if self.number_normalized:
                    word = normalize_word(word)
                # collapse the full tag to its first letter, e.g. "B-PER" -> "B-SEG"
                label = pairs[-1][0] + '-SEG'
                self.label_alphabet.add(label)
                self.word_alphabet.add(word)
                if idx < len(in_lines) - 1 and len(in_lines[idx + 1]) > 2:
                    # biword = word + in_lines[idx + 1].strip('\t').split()[0].decode('utf-8')
                    biword = word + in_lines[idx + 1].strip('\t').split()[0]
                else:
                    biword = word + NULLKEY
                self.biword_alphabet.add(biword)
        self.word_alphabet_size = self.word_alphabet.size()
        self.biword_alphabet_size = self.biword_alphabet.size()
        self.label_alphabet_size = self.label_alphabet.size()
        startS = False
        startB = False
        for label, _ in self.label_alphabet.iteritems():
            if "S-" in label.upper():
                startS = True
            elif "B-" in label.upper():
                startB = True
        if startB:
            if startS:
                self.tagScheme = "BMES"
            else:
                self.tagScheme = "BIO"

    def fix_alphabet(self):
        self.word_alphabet.close()
        self.biword_alphabet.close()
        self.label_alphabet.close()

    def build_word_pretrain_emb(self, emb_path):
        print("build word pretrain emb...")
        self.pretrain_word_embedding, self.word_emb_dim = build_pretrain_embedding(
            emb_path, self.word_alphabet, self.word_emb_dim, self.norm_word_emb)

    def build_biword_pretrain_emb(self, emb_path):
        print("build biword pretrain emb...")
        self.pretrain_biword_embedding, self.biword_emb_dim = build_pretrain_embedding(
            emb_path, self.biword_alphabet, self.biword_emb_dim, self.norm_biword_emb)

    def build_word_vec_100(self):
        self.pretrain_word_embedding, self.pretrain_biword_embedding = self.get_embedding()
        self.word_emb_dim, self.biword_emb_dim = 100, 100

    # get pre-trained embeddings (requires `import gensim` and `import numpy as np`;
    # the `.vocab` attribute assumes gensim < 4.0)
    def get_embedding(self, size=100):
        fname = 'data/wordvec_' + str(size)
        print("build pretrain word embedding from: ", fname)
        word_init_embedding = np.zeros(shape=[self.word_alphabet.size(), size])
        bi_word_init_embedding = np.zeros(shape=[self.biword_alphabet.size(), size])
        pre_trained = gensim.models.KeyedVectors.load(fname, mmap='r')
        # pre_trained_vocab = set([unicode(w.decode('utf8')) for w in pre_trained.vocab.keys()])
        pre_trained_vocab = set([w for w in pre_trained.vocab.keys()])
        c = 0
        for word, index in self.word_alphabet.iteritems():
            if word in pre_trained_vocab:
                word_init_embedding[index] = pre_trained[word]
            else:
                word_init_embedding[index] = np.random.uniform(-0.5, 0.5, size)
                c += 1
        # a biword vector is the mean of its two character vectors
        for word, index in self.biword_alphabet.iteritems():
            bi_word_init_embedding[index] = (
                word_init_embedding[self.word_alphabet.get_index(word[0])] +
                word_init_embedding[self.word_alphabet.get_index(word[1])]) / 2
        # word_init_embedding[word2id[PAD]] = np.zeros(shape=size)
        print('oov character rate %f' % (float(c) / self.word_alphabet.size()))
        return word_init_embedding, bi_word_init_embedding

    def generate_instance(self, input_file, name):
        self.fix_alphabet()
        if name == "train":
            self.train_texts, self.train_Ids = read_instance(
                input_file, self.word_alphabet, self.biword_alphabet,
                self.label_alphabet, self.number_normalized,
                self.MAX_SENTENCE_LENGTH)
        elif name == "dev":
            self.dev_texts, self.dev_Ids = read_instance(
                input_file, self.word_alphabet, self.biword_alphabet,
                self.label_alphabet, self.number_normalized,
                self.MAX_SENTENCE_LENGTH)
        elif name == "test":
            self.test_texts, self.test_Ids = read_instance(
                input_file, self.word_alphabet, self.biword_alphabet,
                self.label_alphabet, self.number_normalized,
                self.MAX_SENTENCE_LENGTH)
        elif name == "raw":
            self.raw_texts, self.raw_Ids = read_instance(
                input_file, self.word_alphabet, self.biword_alphabet,
                self.label_alphabet, self.number_normalized,
                self.MAX_SENTENCE_LENGTH)
        else:
            print("Error: you can only generate train/dev/test/raw instance! Illegal input:%s" % (name))

    def write_decoded_results(self, output_file, predict_results, name):
        fout = open(output_file, 'w', encoding='utf-8')
        sent_num = len(predict_results)
        content_list = []
        if name == 'raw':
            content_list = self.raw_texts
        elif name == 'test':
            content_list = self.test_texts
        elif name == 'dev':
            content_list = self.dev_texts
        elif name == 'train':
            content_list = self.train_texts
        else:
            print("Error: illegal name during writing predict result, name should be within train/dev/test/raw !")
        assert (sent_num == len(content_list))
        for idx in range(sent_num):
            sent_length = len(predict_results[idx])
            for idy in range(sent_length):
                ## content_list[idx] is a list with [word, char, label]
                fout.write(content_list[idx][0][idy] + "\t" + predict_results[idx][idy][0] + '\n')
            fout.write('\n')
        fout.close()
        print("Predict %s result has been written into file. %s" % (name, output_file))
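
# ---------------------------------------------------------------------------
# Usage sketch (added for illustration; not part of the original source).
# This Data variant targets word segmentation: build_alphabet() maps every
# tag to "<first letter>-SEG", and build_word_vec_100() loads gensim vectors
# from data/wordvec_100 (the name get_embedding() constructs), backing each
# bichar vector by the mean of its two character vectors. The corpus path is
# a hypothetical placeholder; gensim and numpy must be importable.
if __name__ == "__main__":
    data = Data()
    train_file = "data/seg.train.tsv"   # hypothetical tab-separated corpus
    data.build_alphabet(train_file)
    data.fix_alphabet()
    data.build_word_vec_100()           # reads gensim KeyedVectors from data/wordvec_100
    data.generate_instance(train_file, "train")
    data.show_data_summary()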
class Data:
    def __init__(self):
        self.MAX_SENTENCE_LENGTH = 250
        self.MAX_WORD_LENGTH = -1
        self.number_normalized = True
        self.norm_char_emb = True
        self.norm_bichar_emb = True
        self.norm_gaz_emb = False
        self.use_single = False
        self.char_alphabet = Alphabet('char')
        self.bichar_alphabet = Alphabet('bichar')
        self.label_alphabet = Alphabet('label', True)
        self.gaz_lower = False
        self.gaz = Gazetteer(self.gaz_lower, self.use_single)
        self.gaz_alphabet = Alphabet('gaz')
        self.HP_fix_gaz_emb = False
        self.HP_use_gaz = True
        self.tagScheme = "NoSeg"
        self.train_texts = []
        self.dev_texts = []
        self.test_texts = []
        self.raw_texts = []
        self.train_Ids = []
        self.dev_Ids = []
        self.test_Ids = []
        self.raw_Ids = []
        self.use_bichar = False
        self.char_emb_dim = 50
        self.bichar_emb_dim = 50
        self.gaz_emb_dim = 50
        self.posi_emb_dim = 30
        self.gaz_dropout = 0.5
        self.pretrain_char_embedding = None
        self.pretrain_bichar_embedding = None
        self.pretrain_gaz_embedding = None
        self.label_size = 0
        self.char_alphabet_size = 0
        self.bichar_alphabet_size = 0
        self.character_alphabet_size = 0
        self.label_alphabet_size = 0

        # hyper parameters
        self.HP_iteration = 100
        self.HP_batch_size = 1
        # self.HP_char_hidden_dim = 50  # int. Character hidden vector dimension for character sequence layer.
        self.HP_hidden_dim = 200  # int. Char hidden vector dimension for word sequence layer.
        self.HP_dropout = 0.5  # float. Dropout probability.
        self.HP_lstm_layer = 1  # int. LSTM layer number for word sequence layer.
        self.HP_bilstm = True  # boolean. Whether to use a bidirectional LSTM for the word sequence layer.
        self.HP_gpu = False
        # Word level LSTM models (e.g. char LSTM + word LSTM + CRF) would prefer a `lr` around 0.015.
        # Word level CNN models (e.g. char LSTM + word CNN + CRF) would prefer a `lr` around 0.005 and with more iterations.
        self.HP_lr = 0.015
        self.HP_lr_decay = 0.05  # float. Learning rate decay rate, only works when optimizer=SGD.
        self.HP_clip = 1.0  # float. Clip gradients larger than this threshold.
        self.HP_momentum = 0  # float. Momentum
        self.HP_use_posi = False
        self.HP_num_layer = 4
        self.HP_rethink_iter = 2
        self.model_name = 'CNN_model'
        self.posi_alphabet_size = 0

    def show_data_summary(self):
        print("DATA SUMMARY START:")
        print(" Tag scheme: %s" % (self.tagScheme))
        print(" MAX SENTENCE LENGTH: %s" % (self.MAX_SENTENCE_LENGTH))
        print(" MAX WORD LENGTH: %s" % (self.MAX_WORD_LENGTH))
        print(" Number normalized: %s" % (self.number_normalized))
        print(" Use bichar: %s" % (self.use_bichar))
        print(" Char alphabet size: %s" % (self.char_alphabet_size))
        print(" Bichar alphabet size: %s" % (self.bichar_alphabet_size))
        print(" Gaz alphabet size: %s" % (self.gaz_alphabet.size()))
        print(" Label alphabet size: %s" % (self.label_alphabet_size))
        print(" Char embedding size: %s" % (self.char_emb_dim))
        print(" Bichar embedding size: %s" % (self.bichar_emb_dim))
        print(" Gaz embedding size: %s" % (self.gaz_emb_dim))
        print(" Norm char emb: %s" % (self.norm_char_emb))
        print(" Norm bichar emb: %s" % (self.norm_bichar_emb))
        print(" Norm gaz emb: %s" % (self.norm_gaz_emb))
        print(" Norm gaz dropout: %s" % (self.gaz_dropout))
        print(" Train instance number: %s" % (len(self.train_texts)))
        print(" Dev instance number: %s" % (len(self.dev_texts)))
        print(" Test instance number: %s" % (len(self.test_texts)))
        print(" Raw instance number: %s" % (len(self.raw_texts)))
        print(" Hyperpara iteration: %s" % (self.HP_iteration))
        print(" Hyperpara batch size: %s" % (self.HP_batch_size))
        print(" Hyperpara lr: %s" % (self.HP_lr))
        print(" Hyperpara lr_decay: %s" % (self.HP_lr_decay))
        print(" Hyperpara HP_clip: %s" % (self.HP_clip))
        print(" Hyperpara momentum: %s" % (self.HP_momentum))
        print(" Hyperpara hidden_dim: %s" % (self.HP_hidden_dim))
        print(" Hyperpara dropout: %s" % (self.HP_dropout))
        print(" Hyperpara lstm_layer: %s" % (self.HP_lstm_layer))
        print(" Hyperpara bilstm: %s" % (self.HP_bilstm))
        print(" Hyperpara GPU: %s" % (self.HP_gpu))
        print(" Hyperpara use_gaz: %s" % (self.HP_use_gaz))
        print(" Hyperpara fix gaz emb: %s" % (self.HP_fix_gaz_emb))
        print("DATA SUMMARY END.")
        sys.stdout.flush()

    def refresh_label_alphabet(self, input_file):
        old_size = self.label_alphabet_size
        self.label_alphabet.clear(True)
        in_lines = open(input_file, 'r', encoding='utf-8').readlines()
        for line in in_lines:
            if len(line) > 2:
                pairs = line.strip().split()
                label = pairs[-1]
                self.label_alphabet.add(label)
        self.label_alphabet_size = self.label_alphabet.size()
        start_s = False
        start_b = False
        for label, _ in self.label_alphabet.iteritems():
            if "S-" in label.upper():
                start_s = True
            elif "B-" in label.upper():
                start_b = True
        if start_b:
            if start_s:
                self.tagScheme = "BMES"
            else:
                self.tagScheme = "BIO"
        self.fix_alphabet()
        print("Refresh label alphabet finished: old:%s -> new:%s" % (old_size, self.label_alphabet_size))

    # input lines look like "陈 B-PER"
    def build_alphabet(self, input_file):
        if input_file is None or not os.path.isfile(input_file):
            # print('[' + sys._getframe().f_code.co_name + '] file ' + str(input_file) + " cannot be found or is not a file address")
            return
        with codecs.open(input_file, 'r', 'utf-8') as fr:
            in_lines = fr.readlines()
        seqlen = 0
        for idx in range(len(in_lines)):
            line = in_lines[idx]  # e.g. '陈 B-PER\n'
            # non-empty line: collect label, char and bichar
            if len(line) > 2:  # sequence labeling data format, i.e. CoNLL 2003
                pairs = line.strip().split()  # e.g. ['陈', 'B-PER']
                char = pairs[0]
                if self.number_normalized:
                    char = normalize_char(char)  # map digits to 0
                label = pairs[-1]  # e.g. "B-PER"
                # build feature alphabet
                self.label_alphabet.add(label)
                self.char_alphabet.add(char)
                if idx < len(in_lines) - 1 and len(in_lines[idx + 1]) > 2:
                    bichar = char + in_lines[idx + 1].strip().split()[0]  # e.g. 陈元
                else:
                    bichar = char + NULLKEY
                self.bichar_alphabet.add(bichar)
                seqlen += 1
            else:
                self.posi_alphabet_size = max(seqlen, self.posi_alphabet_size)
                seqlen = 0
        self.char_alphabet_size = self.char_alphabet.size()
        self.bichar_alphabet_size = self.bichar_alphabet.size()
        self.label_alphabet_size = self.label_alphabet.size()
        start_s = False
        start_b = False
        for label, _ in self.label_alphabet.iteritems():
            if "S-" in label.upper():
                start_s = True
            elif "B-" in label.upper():
                start_b = True
        if start_b:
            if start_s:
                self.tagScheme = "BMES"
            else:
                self.tagScheme = "BIO"

    def build_gaz_file(self, gaz_file):
        # build gaz file, initial read gaz embedding file
        if gaz_file:
            with codecs.open(gaz_file, 'r', 'utf-8') as fr:
                fins = fr.readlines()
            for fin in fins:
                fin = fin.strip().split()[0]
                if fin:
                    self.gaz.insert(fin, "one_source")
            print("Load gaz file: ", gaz_file, " total size:", self.gaz.size())
        else:
            print('[' + sys._getframe().f_code.co_name + '] ' + "Gaz file is None, load nothing")

    def build_gaz_alphabet(self, input_file):
        if input_file is None or not os.path.isfile(input_file):
            # print('[' + sys._getframe().f_code.co_name + '] file ' + str(input_file) + " cannot be found or is not a file address")
            return
        with codecs.open(input_file, 'r', 'utf-8') as fr:
            in_lines = fr.readlines()
        word_list = []
        for line in in_lines:
            if len(line) > 3:
                word = line.split()[0]
                if self.number_normalized:
                    word = normalize_char(word)
                word_list.append(word)
            else:
                w_length = len(word_list)
                for idx in range(w_length):
                    matched_entity = self.gaz.enumerateMatchList(word_list[idx:])
                    for entity in matched_entity:
                        # print entity, self.gaz.searchId(entity), self.gaz.searchType(entity)
                        self.gaz_alphabet.add(entity)
                word_list = []
        print("gaz alphabet size:", self.gaz_alphabet.size())

    # Alphabet
    def fix_alphabet(self):
        self.char_alphabet.close()  # alphabet.keep_growing = False
        self.bichar_alphabet.close()
        self.label_alphabet.close()
        self.gaz_alphabet.close()

    def build_char_pretrain_emb(self, emb_path):
        print("build char pretrain emb...")
        self.pretrain_char_embedding, self.char_emb_dim = build_pretrain_embedding(
            emb_path, self.char_alphabet, self.char_emb_dim, self.norm_char_emb)

    def build_bichar_pretrain_emb(self, emb_path):
        print("build bichar pretrain emb...")
        self.pretrain_bichar_embedding, self.bichar_emb_dim = build_pretrain_embedding(
            emb_path, self.bichar_alphabet, self.bichar_emb_dim, self.norm_bichar_emb)

    def build_gaz_pretrain_emb(self, emb_path):
        print("build gaz pretrain emb...")
        self.pretrain_gaz_embedding, self.gaz_emb_dim = build_pretrain_embedding(
            emb_path, self.gaz_alphabet, self.gaz_emb_dim, self.norm_gaz_emb)

    def generate_instance(self, input_file, name):
        self.fix_alphabet()
        if name == "train":
            self.train_texts, self.train_Ids = read_seg_instance(
                input_file, self.char_alphabet, self.bichar_alphabet,
                self.label_alphabet, self.number_normalized,
                self.MAX_SENTENCE_LENGTH)
        elif name == "dev":
            self.dev_texts, self.dev_Ids = read_seg_instance(
                input_file, self.char_alphabet, self.bichar_alphabet,
                self.label_alphabet, self.number_normalized,
                self.MAX_SENTENCE_LENGTH)
        elif name == "test":
            self.test_texts, self.test_Ids = read_seg_instance(
                input_file, self.char_alphabet, self.bichar_alphabet,
                self.label_alphabet, self.number_normalized,
                self.MAX_SENTENCE_LENGTH)
        else:
            print("Error: you can only generate train/dev/test instance! Illegal input:%s" % name)

    def generate_instance_with_gaz(self, input_file, name):
        self.fix_alphabet()
        if name == "train":
            self.train_texts, self.train_Ids = read_instance_with_gaz(
                input_file, self.gaz, self.char_alphabet, self.bichar_alphabet,
                self.gaz_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "dev":
            self.dev_texts, self.dev_Ids = read_instance_with_gaz(
                input_file, self.gaz, self.char_alphabet, self.bichar_alphabet,
                self.gaz_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "test":
            self.test_texts, self.test_Ids = read_instance_with_gaz(
                input_file, self.gaz, self.char_alphabet, self.bichar_alphabet,
                self.gaz_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        else:
            print("Error: you can only generate train/dev/test instance! Illegal input:%s" % name)

    def generate_instance_with_gaz_2(self, input_file, name):
        self.fix_alphabet()
        if name == "train":
            self.train_texts, self.train_Ids = read_instance_with_gaz_2(
                self.HP_num_layer, input_file, self.gaz, self.char_alphabet,
                self.bichar_alphabet, self.gaz_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "dev":
            self.dev_texts, self.dev_Ids = read_instance_with_gaz_2(
                self.HP_num_layer, input_file, self.gaz, self.char_alphabet,
                self.bichar_alphabet, self.gaz_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "test":
            self.test_texts, self.test_Ids = read_instance_with_gaz_2(
                self.HP_num_layer, input_file, self.gaz, self.char_alphabet,
                self.bichar_alphabet, self.gaz_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        else:
            print("Error: you can only generate train/dev/test instance! Illegal input:%s" % (name))

    def generate_instance_with_gaz_3(self, input_file, name):
        self.fix_alphabet()
        if name == "train":
            self.train_texts, self.train_Ids = read_instance_with_gaz_3(
                input_file, self.gaz, self.char_alphabet, self.bichar_alphabet,
                self.gaz_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH,
                self.use_single)
        elif name == "dev":
            self.dev_texts, self.dev_Ids = read_instance_with_gaz_3(
                input_file, self.gaz, self.char_alphabet, self.bichar_alphabet,
                self.gaz_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH,
                self.use_single)
        elif name == "test":
            self.test_texts, self.test_Ids = read_instance_with_gaz_3(
                input_file, self.gaz, self.char_alphabet, self.bichar_alphabet,
                self.gaz_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH,
                self.use_single)
        else:
            print("Error: you can only generate train/dev/test instance! Illegal input:%s" % (name))

    def write_decoded_results(self, output_file, predict_results, name):
        # open with an explicit encoding and write str; the original
        # .encode('utf-8') would mix bytes and str under Python 3
        fout = open(output_file, 'w', encoding='utf-8')
        sent_num = len(predict_results)
        content_list = []
        if name == 'test':
            content_list = self.test_texts
        elif name == 'dev':
            content_list = self.dev_texts
        elif name == 'train':
            content_list = self.train_texts
        else:
            print("Error: illegal name during writing predict result, name should be within train/dev/test !")
        assert (sent_num == len(content_list))
        for idx in range(sent_num):
            sent_length = len(predict_results[idx])
            for idy in range(sent_length):
                ## content_list[idx] is a list with [word, char, label]
                fout.write(content_list[idx][0][idy] + " " + predict_results[idx][idy] + '\n')
            fout.write('\n')
        fout.close()
        print("Predict %s result has been written into file. %s" % (name, output_file))
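
# ---------------------------------------------------------------------------
# Usage sketch (added for illustration; not part of the original source).
# The class above is the char/bichar variant used with the layered CNN
# ("rethink") model: generate_instance_with_gaz_2 threads HP_num_layer into
# read_instance_with_gaz_2, so instance construction depends on the layer
# count. All file paths below are hypothetical placeholders.
if __name__ == "__main__":
    data = Data()
    train_file = "data/demo.train.char"   # hypothetical corpus path
    lexicon_file = "data/ctb.50d.vec"     # hypothetical lexicon path
    data.HP_num_layer = 4                 # layer count passed through to instance reading
    data.build_alphabet(train_file)
    data.build_gaz_file(lexicon_file)
    data.build_gaz_alphabet(train_file)
    data.fix_alphabet()
    data.generate_instance_with_gaz_2(train_file, "train")
    data.show_data_summary()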