def map_string_2_id_open(string_list, name): string_id_list = [] alphabet_string = Alphabet(name) for strings in string_list: ids = [] for string in strings: id = alphabet_string.get_index(string) ids.append(id) string_id_list.append(ids) alphabet_string.close() return string_id_list, alphabet_string
class Data: def __init__(self): self.substring_names = ['word', 'pos', 'char', 'bpe', 'word-pos'] self.substring_maxlen = 10 self.MAX_SENTENCE_LENGTH = 250 self.MAX_WORD_LENGTH = -1 self.number_normalized = True self.norm_word_emb = False self.norm_char_emb = False self.norm_trans_emb = False self.word_alphabet = Alphabet('word') self.char_alphabet = Alphabet('character') self.translation_alphabet = Alphabet('translation') self.translation_id_format = {} self.feature_names = [] self.feature_alphabets = [] self.feature_num = len(self.feature_alphabets) self.feat_config = None self.label_alphabet = Alphabet('label', True) self.tagScheme = "NoSeg" ## BMES/BIO self.seg = True ### self.task_name = None ### I/O self.data_bin_dir = None self.train_dir = None self.dev_dir = None self.test_dir = None self.raw_dir = None self.middle_dir = None self.viterbi_inputs_model_name = None self.trans_dir = None self.decode_dir = None self.model_dir = None ## model save file self.load_model_dir = None ## model load file self.word_emb_dir = None self.char_emb_dir = None self.trans_embed_dir = None self.typeinfo_dir = None self.feature_emb_dirs = [] self.train_texts = [] self.dev_texts = [] self.test_texts = [] self.raw_texts = [] self.train_Ids = [] self.dev_Ids = [] self.test_Ids = [] self.raw_Ids = [] self.pretrain_word_embedding = None self.pretrain_char_embedding = None self.pretrain_trans_embedding = None self.pretrain_feature_embeddings = [] self.label_size = 0 self.word_alphabet_size = 0 self.char_alphabet_size = 0 self.label_alphabet_size = 0 self.trans_alphabet_size = 0 self.feature_alphabet_sizes = [] self.feature_emb_dims = [] self.norm_feature_embs = [] self.word_emb_dim = 50 self.char_emb_dim = 30 self.trans_emb_dim = 100 ###Classification ## Dataset Plus self.substring_dir = None self.bpe_emb_dir = None self.pos_emb_dir = None self.pretrain_bpe_embedding = None self.pretrain_pos_embedding = None self.bpe_emb_dim = 30 self.pos_emb_dim = 30 self.bpe_alphabet_size = 0 self.pos_alphabet_size = 0 self.norm_bpe_emb = False self.norm_pos_emb = False self.bpe_texts = [] self.bpe_Ids = [] self.pos_texts = [] self.pos_Ids = [] self.label_size = 0 self.substring_train_texts = None self.substring_train_Ids = None self.substring_dev_texts = None self.substring_dev_Ids = None self.substring_test_texts = None self.substring_test_Ids = None self.substring_label_alphabet = Alphabet('substring_label', True) ###Networks self.word_feature_extractor = "LSTM" # "LSTM"/"CNN"/"GRU"/ self.use_char = True self.char_seq_feature = "CNN" # "LSTM"/"CNN"/"GRU"/None self.use_trans = False self.use_crf = True self.nbest = None self.use_mapping = False self.mapping_func = None # tanh or sigmoid # Training self.save_model = True self.state_training_name = 'default' self.average_batch_loss = False self.optimizer = "SGD" # "SGD"/"Adam" self.status = "train" self.show_loss_per_batch = 100 # Hyperparameters self.seed_num = None self.cnn_layer = 4 self.iteration = 100 self.batch_size = 10 self.char_hidden_dim = 50 self.trans_hidden_dim = 50 self.hidden_dim = 200 self.dropout = 0.5 self.lstm_layer = 1 self.bilstm = True self.gpu = False self.lr = 0.015 self.lr_decay = 0.05 self.clip = None self.momentum = 0 self.l2 = 1e-8 # circul self.circul_time = 4 self.circul_deepth = 2 self.circul_gather_output_mode = "concat" # decode prepare self.decode_prepare_mode = 'example' def init_substring_instance(self): len_names = len(self.substring_names) self.substring_train_texts = [[[] for _ in range(self.substring_maxlen)] for _ in range(len_names)] self.substring_train_Ids = [[[] for _ in range(self.substring_maxlen)] for _ in range(len_names)] self.substring_dev_texts = [[[] for _ in range(self.substring_maxlen)] for _ in range(len_names)] self.substring_dev_Ids = [[[] for _ in range(self.substring_maxlen)] for _ in range(len_names)] self.substring_test_texts = [[[] for _ in range(self.substring_maxlen)] for _ in range(len_names)] self.substring_test_Ids = [[[] for _ in range(self.substring_maxlen)] for _ in range(len_names)] def show_data_summary(self): print("++" * 50) print("DATA SUMMARY START:") print(" I/O:") print(" Tag scheme: %s" % (self.tagScheme)) print(" MAX SENTENCE LENGTH: %s" % (self.MAX_SENTENCE_LENGTH)) print(" MAX WORD LENGTH: %s" % (self.MAX_WORD_LENGTH)) print(" Number normalized: %s" % (self.number_normalized)) print(" Word alphabet size: %s" % (self.word_alphabet_size)) print(" Char alphabet size: %s" % (self.char_alphabet_size)) print(" Label alphabet size: %s" % (self.label_alphabet_size)) print(" Trans alphabet size: %s" % (self.trans_alphabet_size)) print(" Word embedding dir: %s" % (self.word_emb_dir)) print(" Char embedding dir: %s" % (self.char_emb_dir)) print(" Tran embedding dir: %s" % (self.trans_embed_dir)) print(" Word embedding size: %s" % (self.word_emb_dim)) print(" Char embedding size: %s" % (self.char_emb_dim)) print(" Tran embedding size: %s" % (self.trans_emb_dim)) print(" Norm word emb: %s" % (self.norm_word_emb)) print(" Norm char emb: %s" % (self.norm_char_emb)) print(" Norm tran emb: %s" % (self.norm_trans_emb)) print("++" * 50) print(" task name: %s" % (self.task_name)) print("++" * 50) print(" Data bin file directory: %s" % (self.data_bin_dir)) print(" Train file directory: %s" % (self.train_dir)) print(" Dev file directory: %s" % (self.dev_dir)) print(" Test file directory: %s" % (self.test_dir)) print(" Raw file directory: %s" % (self.raw_dir)) print(" Middle file directory: %s" % (self.middle_dir)) print(" viterbi inputs model name: %s" % (self.viterbi_inputs_model_name)) if self.typeinfo_dir: print(" typeinfo directory: %s" % (self.typeinfo_dir)) print(" Model file directory: %s" % (self.model_dir)) print(" Loadmodel directory: %s" % (self.load_model_dir)) print(" Decode file directory: %s" % (self.decode_dir)) print(" Train instance number: %s" % (len(self.train_texts))) print(" Dev instance number: %s" % (len(self.dev_texts))) print(" Test instance number: %s" % (len(self.test_texts))) print(" Raw instance number: %s" % (len(self.raw_texts))) print(" FEATURE num: %s" % (self.feature_num)) for idx in range(self.feature_num): print(" Fe: %s alphabet size: %s" % (self.feature_alphabets[idx].name, self.feature_alphabet_sizes[idx])) print( " Fe: %s embedding dir: %s" % (self.feature_alphabets[idx].name, self.feature_emb_dirs[idx])) print( " Fe: %s embedding size: %s" % (self.feature_alphabets[idx].name, self.feature_emb_dims[idx])) print(" Fe: %s norm emb: %s" % (self.feature_alphabets[idx].name, self.norm_feature_embs[idx])) print(" " + "++" * 20) print(" Model Network:") print(" Model use_crf: %s" % (self.use_crf)) print(" Model word extractor: %s" % (self.word_feature_extractor)) print(" Model use_char: %s" % (self.use_char)) if self.use_char: print(" Model char_seq_feature: %s" % (self.char_seq_feature)) print(" Model char_hidden_dim: %s" % (self.char_hidden_dim)) if self.use_trans: print(" Model trans_hidden_dim: %s" % (self.trans_hidden_dim)) if self.use_mapping: print(" Model mapping function: %s" % (self.mapping_func)) print(" " + "++" * 20) print(" Training:") print(" show_loss_per_batch: %s" % (self.show_loss_per_batch)) print(" save_model: %s" % (self.save_model)) print(" state_training_name: %s" % (self.state_training_name)) print(" Optimizer: %s" % (self.optimizer)) print(" Iteration: %s" % (self.iteration)) print(" BatchSize: %s" % (self.batch_size)) print(" Average batch loss: %s" % (self.average_batch_loss)) print(" " + "++" * 20) print(" Hyperparameters:") print(" Hyper seed_num: %s" % (self.seed_num)) print(" Hyper lr: %s" % (self.lr)) print(" Hyper lr_decay: %s" % (self.lr_decay)) print(" Hyper clip: %s" % (self.clip)) print(" Hyper momentum: %s" % (self.momentum)) print(" Hyper l2: %s" % (self.l2)) print(" Hyper hidden_dim: %s" % (self.hidden_dim)) print(" Hyper dropout: %s" % (self.dropout)) print(" Hyper lstm_layer: %s" % (self.lstm_layer)) print(" Hyper bilstm: %s" % (self.bilstm)) print(" Hyper GPU: %s" % (self.gpu)) print("DATA SUMMARY END.") print("++" * 50) print(" substring dir : %s" % (self.substring_dir)) print(" bpe_emb_dir dir : %s" % (self.bpe_emb_dir)) print(" pos_emb_dir dir : %s" % (self.pos_emb_dir)) print("++" * 50) print(" circul time : %s" % (self.circul_time)) print(" circul deepth : %s" % (self.circul_deepth)) print(" gather output mode : %s" % (self.circul_gather_output_mode)) print("++" * 50) print(" decode prepare mode : %s" % (self.decode_prepare_mode)) print("++" * 50) sys.stdout.flush() def make_substring_label_alphabet(self): for label in self.label_alphabet.instances: label = label.split('-')[-1] self.substring_label_alphabet.add(label) self.substring_label_alphabet.close() def initial_feature_alphabets(self): items = open(self.train_dir, 'r').readline().strip('\n').split() total_column = len(items) if total_column > 2: for idx in range(1, total_column - 1): feature_prefix = 'feature_' + str(idx) self.feature_alphabets.append(Alphabet(feature_prefix)) self.feature_names.append(feature_prefix) print "Find feature: ", feature_prefix self.feature_num = len(self.feature_alphabets) self.pretrain_feature_embeddings = [None] * self.feature_num self.feature_emb_dims = [20] * self.feature_num self.feature_emb_dirs = [None] * self.feature_num self.norm_feature_embs = [False] * self.feature_num self.feature_alphabet_sizes = [0] * self.feature_num if self.feat_config: for idx in range(self.feature_num): self.feature_emb_dims[idx] = self.feat_config[ self.feature_names[idx]]['emb_size'] self.feature_emb_dirs[idx] = self.feat_config[ self.feature_names[idx]]['emb_dir'] self.norm_feature_embs[idx] = self.feat_config[ self.feature_names[idx]]['emb_norm'] # exit(0) def build_alphabet(self, input_file): in_lines = open(input_file, 'r').readlines() for line in in_lines: if len(line) > 2: pairs = line.strip().split() word = pairs[0].decode('windows-1252') # word = pairs[0].decode('utf-8') if self.number_normalized: word = normalize_word(word) label = pairs[-1] self.label_alphabet.add(label) self.word_alphabet.add(word) ## build feature alphabet for idx in range(self.feature_num): feat_idx = pairs[idx + 1].split(']', 1)[-1] self.feature_alphabets[idx].add(feat_idx) for char in word: self.char_alphabet.add(char) self.word_alphabet_size = self.word_alphabet.size() self.char_alphabet_size = self.char_alphabet.size() self.label_alphabet_size = self.label_alphabet.size() for idx in range(self.feature_num): self.feature_alphabet_sizes[idx] = self.feature_alphabets[ idx].size() startS = False startB = False for label, _ in self.label_alphabet.iteritems(): if "S-" in label.upper(): startS = True elif "B-" in label.upper(): startB = True if startB: if startS: self.tagScheme = "BMES" else: self.tagScheme = "BIO" def build_alphabet_substring(self, input_file_dir, substring_file_prefix): ## will not read lables input_files = os.listdir(input_file_dir) print input_files for input_file in input_files: plus_feature = '' input_file_name = os.path.split(input_file)[1] if input_file_name.split('.')[0] != substring_file_prefix: continue if 'bpe' in input_file_name: plus_feature = 'bpe' elif 'word' in input_file_name: plus_feature = 'word' if plus_feature == '': continue in_lines = open(input_file_dir + input_file, 'r').readlines() for line in in_lines: if len(line.strip()) > 0: pairs = line.strip().split('\t') words = pairs[0].decode('windows-1252') # word = pairs[0].decode('utf-8') if self.number_normalized: words = normalize_word(words) labels = pairs[-1] for word in words.split(): self.word_alphabet.add(word) for char in word: self.char_alphabet.add(char) self.word_alphabet_size = self.word_alphabet.size() self.char_alphabet_size = self.char_alphabet.size() def fix_alphabet(self): self.word_alphabet.close() self.char_alphabet.close() self.label_alphabet.close() self.translation_alphabet.close() for idx in range(self.feature_num): self.feature_alphabets[idx].close() def build_pretrain_emb(self): if self.word_emb_dir: print("Load pretrained word embedding, norm: %s, dir: %s" % (self.norm_word_emb, self.word_emb_dir)) self.pretrain_word_embedding, self.word_emb_dim = build_pretrain_embedding( self.word_emb_dir, self.word_alphabet, self.word_emb_dim, self.norm_word_emb) if self.typeinfo_dir: type_info_matrix = [] with codecs.open(self.typeinfo_dir, 'r') as typeinfo_file: type_info_lines = typeinfo_file.readlines() for line in type_info_lines: line = line.rstrip().split() for i, _ in enumerate(line): line[i] = float(line[i]) line = np.array(line) type_info_matrix.append(line) print( "Caculate type info distribution,and concate word and type......" ) cos_res = [] for i, word_embed in enumerate(self.pretrain_word_embedding): word_type_info = [] if i == 0: word_type_info = np.random.random( size=len(type_info_matrix)) cos_res.append(word_type_info) else: for type_info in type_info_matrix: cos_sim = 1 - spatial.distance.cosine( word_embed, type_info) word_type_info.append(cos_sim) cos_res.append(word_type_info) cos_res = np.array(cos_res) cos_res = sigmoid(cos_res) self.pretrain_word_embedding = np.concatenate( [self.pretrain_word_embedding, cos_res], axis=1) print "type info length:{}".format(len(type_info_matrix)) self.word_emb_dim += len(type_info_matrix) print "new word dim is :{}".format(self.word_emb_dim) if self.char_emb_dir: print("Load pretrained char embedding, norm: %s, dir: %s" % (self.norm_char_emb, self.char_emb_dir)) self.pretrain_char_embedding, self.char_emb_dim = build_pretrain_embedding( self.char_emb_dir, self.char_alphabet, self.char_emb_dim, self.norm_char_emb) if self.trans_embed_dir: print("Load pretrained trans embedding, norm: %s, dir: %s" % (self.norm_trans_emb, self.trans_embed_dir)) self.pretrain_trans_embedding, self.trans_emb_dim = build_chi_pretrain_embedding( self.trans_embed_dir, self.translation_alphabet, self.trans_emb_dim, self.norm_trans_emb) for idx in range(self.feature_num): if self.feature_emb_dirs[idx]: print( "Load pretrained feature %s embedding:, norm: %s, dir: %s" % (self.feature_name[idx], self.norm_feature_embs[idx], self.feature_emb_dirs[idx])) self.pretrain_feature_embeddings[idx], self.feature_emb_dims[ idx] = build_pretrain_embedding( self.feature_emb_dirs[idx], self.feature_alphabets[idx], self.feature_emb_dims[idx], self.norm_feature_embs[idx]) def generate_instance(self, name): self.fix_alphabet() if name == "train": self.train_texts, self.train_Ids = read_instance( self.train_dir, self.word_alphabet, self.char_alphabet, self.feature_alphabets, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH, self.translation_id_format) elif name == "dev": self.dev_texts, self.dev_Ids = read_instance( self.dev_dir, self.word_alphabet, self.char_alphabet, self.feature_alphabets, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH, self.translation_id_format) elif name == "test": self.test_texts, self.test_Ids = read_instance( self.test_dir, self.word_alphabet, self.char_alphabet, self.feature_alphabets, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH, self.translation_id_format) elif name == "raw": self.raw_texts, self.raw_Ids = read_instance( self.raw_dir, self.word_alphabet, self.char_alphabet, self.feature_alphabets, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH, self.translation_id_format) else: print( "Error: you can only generate train/dev/test instance! Illegal input:%s" % (name)) def generate_instance_substring(self, substring_file_prefix): self.init_substring_instance() self.make_substring_label_alphabet() input_files = os.listdir(self.substring_dir) print input_files for input_file in input_files: input_file_name = os.path.split(input_file)[1] input_file_dir = os.path.join(self.substring_dir, input_file_name) input_file_name_split = input_file_name.split('.') if input_file_name_split[0] != substring_file_prefix: continue print('dealing %s' % (input_file_name)) name = input_file_name_split[1] feature_name = input_file_name_split[2] f_l = int(input_file_name_split[-1][3:]) #feature_len if feature_name == 'word': alphabet = self.word_alphabet elif feature_name == 'char': alphabet = self.char_alphabet elif feature_name == 'pos': alphabet = self.feature_alphabets[0] elif feature_name == 'bpe': alphabet = self.feature_alphabets[1] s_f_id = self.substring_names.index( feature_name) #substring_feature_id if name == "train": self.substring_train_texts[s_f_id][f_l], self.substring_train_Ids[s_f_id][f_l]\ = read_instance_substring(input_file_dir, alphabet, self.substring_label_alphabet, self.number_normalized) elif name == "testa": self.substring_dev_texts[s_f_id][f_l], self.substring_dev_Ids[s_f_id][f_l] \ = read_instance_substring(input_file_dir, alphabet, self.substring_label_alphabet, self.number_normalized) elif name == "testb": self.substring_test_texts[s_f_id][f_l], self.substring_test_Ids[s_f_id][f_l] \ = read_instance_substring(input_file_dir, alphabet, self.substring_label_alphabet, self.number_normalized) else: print( "Error: you can only generate train/testa/testb instance! Illegal input:%s" % (name)) def write_decoded_results(self, predict_results, name): fout = open(self.decode_dir, 'w') sent_num = len(predict_results) content_list = [] if name == 'raw': content_list = self.raw_texts elif name == 'test': content_list = self.test_texts elif name == 'dev': content_list = self.dev_texts elif name == 'train': content_list = self.train_texts else: print( "Error: illegal name during writing predict result, name should be within train/dev/test/raw !" ) assert (sent_num == len(content_list)) for idx in range(sent_num): sent_length = len(predict_results[idx]) for idy in range(sent_length): ## content_list[idx] is a list with [word, char, label] fout.write(content_list[idx][0][idy].encode('utf-8') + " " + predict_results[idx][idy] + '\n') fout.write('\n') fout.close() print("Predict %s result has been written into file. %s" % (name, self.decode_dir)) def load(self, data_file): f = open(data_file, 'rb') tmp_dict = pickle.load(f) f.close() self.__dict__.update(tmp_dict) def save(self, save_file): f = open(save_file, 'wb') pickle.dump(self.__dict__, f, 2) f.close() def write_nbest_decoded_results(self, predict_results, pred_scores, name): ## predict_results : [whole_sent_num, nbest, each_sent_length] ## pred_scores: [whole_sent_num, nbest] fout = open(self.decode_dir, 'w') sent_num = len(predict_results) content_list = [] if name == 'raw': content_list = self.raw_texts elif name == 'test': content_list = self.test_texts elif name == 'dev': content_list = self.dev_texts elif name == 'train': content_list = self.train_texts else: print( "Error: illegal name during writing predict result, name should be within train/dev/test/raw !" ) assert (sent_num == len(content_list)) assert (sent_num == len(pred_scores)) for idx in range(sent_num): sent_length = len(predict_results[idx][0]) nbest = len(predict_results[idx]) score_string = "# " for idz in range(nbest): score_string += format(pred_scores[idx][idz], '.4f') + " " fout.write(score_string.strip() + "\n") for idy in range(sent_length): label_string = content_list[idx][0][idy].encode('utf-8') + " " for idz in range(nbest): label_string += predict_results[idx][idz][idy] + " " label_string = label_string.strip() + "\n" fout.write(label_string) fout.write('\n') fout.close() print("Predict %s %s-best result has been written into file. %s" % (name, nbest, self.decode_dir)) def read_config(self, config_file): config = config_file_to_dict(config_file) ## task: the_item = 'task_name' if the_item in config: self.task_name = config[the_item] ## read data: the_item = 'data_bin_dir' if the_item in config: self.data_bin_dir = config[the_item] the_item = 'train_dir' if the_item in config: self.train_dir = config[the_item] the_item = 'dev_dir' if the_item in config: self.dev_dir = config[the_item] the_item = 'test_dir' if the_item in config: self.test_dir = config[the_item] the_item = 'trans_dir' if the_item in config: self.trans_dir = config[the_item] the_item = 'middle_dir' if the_item in config: self.middle_dir = config[the_item] the_item = 'viterbi_inputs_model_name' if the_item in config: self.viterbi_inputs_model_name = config[the_item] the_item = 'substring_dir' if the_item in config: self.substring_dir = config[the_item] the_item = 'bpe_emb_dir' if the_item in config: self.bpe_emb_dir = config[the_item] the_item = 'pos_emb_dir' if the_item in config: self.pos_emb_dir = config[the_item] the_item = 'raw_dir' if the_item in config: self.raw_dir = config[the_item] the_item = 'decode_dir' if the_item in config: self.decode_dir = config[the_item] the_item = 'model_dir' if the_item in config: self.model_dir = config[the_item] the_item = 'load_model_dir' if the_item in config: self.load_model_dir = config[the_item] the_item = 'word_emb_dir' if the_item in config: self.word_emb_dir = config[the_item] the_item = 'char_emb_dir' if the_item in config: self.char_emb_dir = config[the_item] the_item = 'trans_embed_dir' if the_item in config: self.trans_embed_dir = config[the_item] the_item = 'typeinfo_dir' if the_item in config: self.typeinfo_dir = config[the_item] the_item = 'MAX_SENTENCE_LENGTH' if the_item in config: self.MAX_SENTENCE_LENGTH = int(config[the_item]) the_item = 'MAX_WORD_LENGTH' if the_item in config: self.MAX_WORD_LENGTH = int(config[the_item]) the_item = 'norm_word_emb' if the_item in config: self.norm_word_emb = str2bool(config[the_item]) the_item = 'norm_char_emb' if the_item in config: self.norm_char_emb = str2bool(config[the_item]) the_item = 'number_normalized' if the_item in config: self.number_normalized = str2bool(config[the_item]) the_item = 'seg' if the_item in config: self.seg = str2bool(config[the_item]) the_item = 'word_emb_dim' if the_item in config: self.word_emb_dim = int(config[the_item]) the_item = 'char_emb_dim' if the_item in config: self.char_emb_dim = int(config[the_item]) the_item = 'trans_emb_dim' if the_item in config: self.trans_emb_dim = int(config[the_item]) ## read network: the_item = 'use_crf' if the_item in config: self.use_crf = str2bool(config[the_item]) the_item = 'use_char' if the_item in config: self.use_char = str2bool(config[the_item]) the_item = 'use_trans' if the_item in config: self.use_trans = str2bool(config[the_item]) the_item = 'use_mapping' if the_item in config: self.use_mapping = str2bool(config[the_item]) the_item = 'mapping_func' if the_item in config: self.mapping_func = config[the_item] the_item = 'word_seq_feature' if the_item in config: self.word_feature_extractor = config[the_item] the_item = 'char_seq_feature' if the_item in config: self.char_seq_feature = config[the_item] the_item = 'nbest' if the_item in config: self.nbest = int(config[the_item]) the_item = 'feature' if the_item in config: self.feat_config = config[the_item] ## feat_config is a dict ## read training setting: the_item = 'save_model' if the_item in config: self.save_model = str2bool(config[the_item]) the_item = 'state_training_name' if the_item in config: self.state_training_name = config[the_item] the_item = 'optimizer' if the_item in config: self.optimizer = config[the_item] the_item = 'ave_batch_loss' if the_item in config: self.average_batch_loss = str2bool(config[the_item]) the_item = 'status' if the_item in config: self.status = config[the_item] the_item = 'show_loss_per_batch' if the_item in config: self.show_loss_per_batch = int(config[the_item]) ## read Hyperparameters: the_item = 'seed_num' if the_item in config: if config[the_item] != 'None': self.seed_num = int(config[the_item]) the_item = 'cnn_layer' if the_item in config: self.cnn_layer = int(config[the_item]) the_item = 'iteration' if the_item in config: self.iteration = int(config[the_item]) the_item = 'batch_size' if the_item in config: self.batch_size = int(config[the_item]) the_item = 'char_hidden_dim' if the_item in config: self.char_hidden_dim = int(config[the_item]) the_item = 'trans_hidden_dim' if the_item in config: self.trans_hidden_dim = int(config[the_item]) the_item = 'hidden_dim' if the_item in config: self.hidden_dim = int(config[the_item]) the_item = 'dropout' if the_item in config: self.dropout = float(config[the_item]) the_item = 'lstm_layer' if the_item in config: self.lstm_layer = int(config[the_item]) the_item = 'bilstm' if the_item in config: self.bilstm = str2bool(config[the_item]) the_item = 'gpu' if the_item in config: self.gpu = str2bool(config[the_item]) the_item = 'learning_rate' if the_item in config: self.lr = float(config[the_item]) the_item = 'lr_decay' if the_item in config: self.lr_decay = float(config[the_item]) the_item = 'clip' if the_item in config: if config[the_item] == 'None': self.clip = None else: self.clip = float(config[the_item]) the_item = 'momentum' if the_item in config: self.momentum = float(config[the_item]) the_item = 'l2' if the_item in config: self.l2 = float(config[the_item]) ###base2 the_item = 'feature_name' if the_item in config: self.feature_name = config[the_item] the_item = 'feature_length' if the_item in config: self.feature_length = int(config[the_item]) the_item = 'class_num' if the_item in config: self.class_num = int(config[the_item]) the_item = 'feature_ans' if the_item in config: self.feature_ans = config[the_item] ###circul the_item = 'circul_time' if the_item in config: self.circul_time = config[the_item] the_item = 'circul_deepth' if the_item in config: self.circul_deepth = config[the_item] the_item = 'circul_gather_output_mode' if the_item in config: self.circul_gather_output_mode = config[the_item] ###decode_prepare the_item = 'decode_prepare_mode' if the_item in config: self.decode_prepare_mode = config[the_item] def read_arg(self, args): if args.task_name != None: self.task_name = args.task_name if args.data_bin_dir != None: self.data_bin_dir = args.data_bin_dir if args.train_dir != None: self.train_dir = args.train_dir if args.dev_dir != None: self.dev_dir = args.dev_dir if args.test_dir != None: self.test_dir = args.test_dir if args.trans_dir != None: self.trans_dir = args.trans_dir if args.word_emb_dir != None: self.word_emb_dir = args.word_emb_dir if args.trans_embed_dir != None: self.trans_embed_dir = args.trans_embed_dir if args.middle_dir != None: self.middle_dir = args.middle_dir if args.viterbi_inputs_model_name != None: self.viterbi_inputs_model_name = args.viterbi_inputs_model_name if args.substring_dir != None: self.substring_dir = args.substring_dir if args.bpe_emb_dir != None: self.bpe_emb_dir = args.bpe_emb_dir if args.pos_emb_dir != None: self.pos_emb_dir = args.pos_emb_dir if args.model_dir != None: self.model_dir = args.model_dir if args.norm_word_emb != None: self.norm_word_emb = args.norm_word_emb if args.norm_char_emb != None: self.norm_char_emb = args.norm_char_emb if args.word_emb_dim != None: self.word_emb_dim = args.word_emb_dim if args.char_emb_dim != None: self.char_emb_dim = args.char_emb_dim if args.trans_emb_dim != None: self.trans_emb_dim = args.trans_emb_dim if args.number_normalized != None: self.number_normalized = args.number_normalized if args.seg != None: self.seg = args.seg if args.use_crf != None: self.use_crf = args.use_crf if args.use_char != None: self.use_char = args.use_char if args.use_trans != None: self.use_trans = args.use_trans if args.word_seq_feature != None: self.word_seq_feature = args.word_seq_feature if args.char_seq_feature != None: self.char_seq_feature = args.char_seq_feature if args.nbest != None: self.nbest = args.nbest if args.status != None: self.status = args.status if args.state_training_name != None: self.state_training_name = args.state_training_name if args.save_model != None: self.save_model = args.save_model if args.optimizer != None: self.optimizer = args.optimizer if args.iteration != None: self.iteration = args.iteration if args.batch_size != None: self.batch_size = args.batch_size if args.ave_batch_loss != None: self.ave_batch_loss = args.ave_batch_loss if args.show_loss_per_batch != None: self.show_loss_per_batch = args.show_loss_per_batch if args.seed_num != None: self.seed_num = args.seed_num if args.cnn_layer != None: self.cnn_layer = args.cnn_layer if args.char_hidden_dim != None: self.char_hidden_dim = args.char_hidden_dim if args.trans_hidden_dim != None: self.trans_hidden_dim = args.trans_hidden_dim if args.hidden_dim != None: self.hidden_dim = args.hidden_dim if args.dropout != None: self.dropout = args.dropout if args.lstm_layer != None: self.lstm_layer = args.lstm_layer if args.bilstm != None: self.bilstm = args.bilstm if args.learning_rate != None: self.learning_rate = args.learning_rate if args.lr_decay != None: self.lr_decay = args.lr_decay if args.momentum != None: self.momentum = args.momentum if args.l2 != None: self.l2 = args.l2 if args.gpu != None: self.gpu = args.gpu if args.clip != None: self.clip = args.clip ###base2 if args.feature_name != None: self.feature_name = args.feature_name if args.feature_length != None: self.feature_length = args.feature_length if args.class_num != None: self.class_num = args.class_num if args.feature_ans != None: self.feature_ans = args.feature_ans ###circul if args.circul_time != None: self.circul_time = args.circul_time if args.circul_deepth != None: self.circul_deepth = args.circul_deepth if args.circul_gather_output_mode != None: self.circul_gather_output_mode = args.circul_gather_output_mode ###decode_prepare if args.decode_prepare_mode != None: self.decode_prepare_mode = args.decode_prepare_mode def build_translation_alphabet(self, trans_path): print("Creating translation alphabet......") with codecs.open(trans_path, 'r', "utf-8") as f: lines = f.readlines() for line in lines: if len(line.strip().split(":")) == 2: temp = line.strip().split(":", 1) words = temp[1].split() for word in words: self.translation_alphabet.add(word.strip()) self.trans_alphabet_size = self.translation_alphabet.size() def build_translation_dict(self, trans_path): print("Creating Id to Id translation dictionary......") translation_id_format_temp = {} with codecs.open(trans_path, 'r', "utf-8") as f: lines = f.readlines() for line in lines: ids = [] if len(line.strip().split(":", 1)) == 2: temp = line.strip().split(":", 1) word_id = self.word_alphabet.get_index(temp[0].strip()) translations = temp[1].split() for translation in translations: ids.append( self.translation_alphabet.get_index( translation.strip())) if ids == []: ids = [0] translation_id_format_temp[word_id] = ids for word in self.word_alphabet.instances: if self.word_alphabet.get_index( word) in translation_id_format_temp.keys(): self.translation_id_format[self.word_alphabet.get_index( word)] = translation_id_format_temp[ self.word_alphabet.get_index(word)] else: self.translation_id_format[self.word_alphabet.get_index( word)] = [0]
def generate_character_data(sentences_train, sentences_dev, sentences_test, max_sent_length, char_embedd_dim=80): """ generate data for charaters :param sentences_train: :param sentences_dev: :param sentences_test: :param max_sent_length: :return: C_train, C_dev, C_test, char_embedd_table """ def get_character_indexes(sentences): index_sentences = [] max_length = 0 for words in sentences: index_words = [] for word in words: index_chars = [] if len(word) > max_length: max_length = len(word) for char in word[:MAX_CHAR_LENGTH]: char_id = char_alphabet.get_index(char) index_chars.append(char_id) index_words.append(index_chars) index_sentences.append(index_words) return index_sentences, max_length def construct_tensor_char(index_sentences): C = np.empty([len(index_sentences), max_sent_length, max_char_length], dtype=np.int32) word_end_id = char_alphabet.get_index(word_end) for i in range(len(index_sentences)): words = index_sentences[i] sent_length = len(words) for j in range(sent_length): chars = words[j] char_length = len(chars) for k in range(char_length): cid = chars[k] C[i, j, k] = cid # fill index of word end after the end of word C[i, j, char_length:] = word_end_id # Zero out C after the end of the sentence C[i, sent_length:, :] = 0 return C def build_char_embedd_table(): logger.info('Dimension of char embedding dim is ' + str(char_embedd_dim)) scale = np.sqrt(3.0 / char_embedd_dim) char_embedd_table = np.random.uniform( -scale, scale, [char_alphabet.size(), char_embedd_dim]).astype( theano.config.floatX) return char_embedd_table char_alphabet = Alphabet('character') char_alphabet.get_index(word_end) index_sentences_train, max_char_length_train = get_character_indexes( sentences_train) index_sentences_dev, max_char_length_dev = get_character_indexes( sentences_dev) index_sentences_test, max_char_length_test = get_character_indexes( sentences_test) # close character alphabet char_alphabet.close() logger.info("character alphabet size: %d" % (char_alphabet.size() - 1)) max_char_length = min( MAX_CHAR_LENGTH, max(max_char_length_train, max_char_length_dev, max_char_length_test)) logger.info("Maximum character length of training set is %d" % max_char_length_train) logger.info("Maximum character length of dev set is %d" % max_char_length_dev) logger.info("Maximum character length of test set is %d" % max_char_length_test) logger.info("Maximum character length used for training is %d" % max_char_length) # fill character tensor C_train = construct_tensor_char(index_sentences_train) C_dev = construct_tensor_char(index_sentences_dev) C_test = construct_tensor_char(index_sentences_test) return C_train, C_dev, C_test, build_char_embedd_table()
class Template: def __init__(self, args): self.config = yaml.load(open('config.yaml', 'r'), Loader=yaml.FullLoader) if args.dataset not in self.config['data_list']: raise KeyError("No such dataset named {}.".format(args.dataset)) self.config['dataset'] = args.dataset self.datatype = 'binary' if self.config['dataset'] in self.config['datatype']['train_test']: self.datatype = 'train_test' self.alphabet = Alphabet('word') self.set_seed() def set_seed(self): np.random.seed(self.config['seed']) random.seed(self.config['seed']) def clean_str_sst(self, string): """ Tokenization/string cleaning for SST. Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py """ string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string) string = re.sub(r"\s{2,}", " ", string) return string.strip().lower() def clean_str(self, string): """ Tokenization/string cleaning for all datasets except for SST. Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py """ if self.config['dataset'].startswith('SST'): return self.clean_str_sst(string) string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string) string = re.sub(r"\'s", " \'s", string) string = re.sub(r"\'ve", " \'ve", string) string = re.sub(r"n\'t", " n\'t", string) string = re.sub(r"\'re", " \'re", string) string = re.sub(r"\'d", " \'d", string) string = re.sub(r"\'ll", " \'ll", string) string = re.sub(r",", " , ", string) string = re.sub(r"!", " ! ", string) string = re.sub(r"\(", " \( ", string) string = re.sub(r"\)", " \) ", string) string = re.sub(r"\?", " \? ", string) string = re.sub(r"\s{2,}", " ", string) return string.strip().lower() def read_split_file(self, mode): filelist = self.config['data_list'][self.config['dataset']] try: filename = os.path.join(self.config['dirname'], self.config['dataset'], filelist[mode]) except: return None a = open(filename, 'r', encoding='utf-8') res = [] for line in a: label, text = int(line[0]), self.clean_str(line[1:]).split() res.append((text, label)) return res def read_binary_file(self): filelist = self.config['data_list'][self.config['dataset']] modes = ['pos', 'neg'] labels = {'pos': 1, 'neg': 0} res = [] for mode in modes: filename = os.path.join(self.config['dirname'], self.config['dataset'], filelist[mode]) # print(filename) a = open(filename, 'r', encoding='latin1').read().splitlines() for line in a: line = self.clean_str(line) res.append((line.split(), labels[mode])) random.shuffle(res) return res # X, y = zip(*res) # train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=self.config['valid_rate']) def normalize_word(self, word): new_word = "" for char in word: if char.isdigit(): new_word += '0' else: new_word += char return new_word def execute(self, data_list): res_list = {} for key, data in data_list.items(): cur_res = [] for line, label in data: res_line = [] for word in line: word = self.normalize_word(word) res_line.append(self.alphabet.get_index(word)) cur_res.append((res_line, label)) # self.alphabet.close() res_list[key] = cur_res return res_list def load_pretrain_emb(self, embedding_path, skip_first_row, separator): embedd_dim = -1 embedd_dict = dict() if os.path.exists(embedding_path[0]): embedding_path = embedding_path[0] else: embedding_path = embedding_path[1] with open(embedding_path, 'r', encoding='utf-8') as file: i = 0 j = 0 for line in tqdm(file, total=3e6): if i == 0: i = i + 1 if skip_first_row: _ = line.strip() continue j = j + 1 line = line.strip() if len(line) == 0: continue tokens = line.split(separator) if embedd_dim < 0: embedd_dim = len(tokens) - 1 else: if embedd_dim + 1 == len(tokens): embedd = np.empty([1, embedd_dim]) embedd[:] = tokens[1:] embedd_dict[tokens[0]] = embedd else: continue return embedd_dict, embedd_dim, embedding_path def norm2one(self, vec): root_sum_square = np.sqrt(np.sum(np.square(vec))) return vec / root_sum_square def build_pretrain_embedding(self, embedding_path, alphabet, skip_first_row=True, separator=" ", embedd_dim=300, norm=True): embedd_dict = dict() if embedding_path != None: embedd_dict, embedd_dim, embedding_path = self.load_pretrain_emb(embedding_path, skip_first_row, separator) scale = np.sqrt(3.0 / embedd_dim) pretrain_emb = np.empty([alphabet.size(), embedd_dim]) perfect_match = 0 case_match = 0 not_match = 0 for alph, index in alphabet.iteritems(): if alph in embedd_dict: if norm: pretrain_emb[index, :] = self.norm2one(embedd_dict[alph]) else: pretrain_emb[index, :] = embedd_dict[alph] perfect_match += 1 elif alph.lower() in embedd_dict: if norm: pretrain_emb[index, :] = self.norm2one(embedd_dict[alph.lower()]) else: pretrain_emb[index, :] = embedd_dict[alph.lower()] case_match += 1 else: pretrain_emb[index, :] = np.random.uniform(-scale, scale, [1, embedd_dim]) not_match += 1 pretrained_size = len(embedd_dict) print("Embedding: %s\n pretrain num:%s, prefect match:%s, case_match:%s, oov:%s, oov%%:%s" % ( embedding_path, pretrained_size, perfect_match, case_match, not_match, (not_match + 0.) / alphabet.size())) pretrain_emb = np.float32(pretrain_emb) self.alphabet.pretrained_emb = pretrain_emb return pretrain_emb, embedd_dim def run_read_file(self): data_list = [] if self.datatype == 'train_test': modes = ['train', 'valid', 'test'] data_list = list(map(self.read_split_file, modes)) if data_list[1] is None: X, y = zip(*data_list[0]) train_x, valid_x, train_y, valid_y = train_test_split(X, y, test_size=self.config['valid_rate']) data_list[0] = list(zip(train_x, train_y)) data_list[1] = list(zip(valid_x, valid_y)) data_list = { 'train': data_list[0], 'valid': data_list[1], 'test': data_list[2] } elif self.datatype == 'binary': datalist = self.read_binary_file() X, y = zip(*datalist) kf = StratifiedKFold(n_splits=self.config['kfold'], shuffle=True) data_list = [] for train_index, test_index in kf.split(X, y): train_x = [X[w] for w in train_index] train_y = [y[w] for w in train_index] test_x = [X[w] for w in test_index] test_y = [y[w] for w in test_index] temp = {'train': list(zip(train_x, train_y)), 'test': list(zip(test_x, test_y))} temp['valid'] = temp['test'] data_list.append(temp) return data_list def forward(self): data_list = self.run_read_file() if isinstance(data_list, list): processed_list = list(map(self.execute, data_list)) else: processed_list = self.execute(data_list) pretrained_emb, emb_dim = self.build_pretrain_embedding(self.config['embedding_path'], self.alphabet, norm=True) pkl.dump((processed_list, self.alphabet, pretrained_emb, emb_dim), open(self.config['res_path'].format(self.config['dataset']), 'wb'))
def generate_character_data(sentences_train, sentences_dev, sentences_test, max_sent_length, char_embedd_dim=30): """ generate data for charaters :param sentences_train: :param sentences_dev: :param sentences_test: :param max_sent_length: :return: C_train, C_dev, C_test, char_embedd_table """ def get_character_indexes(sentences): index_sentences = [] max_length = 0 for words in sentences: index_words = [] for word in words: index_chars = [] if len(word) > max_length: max_length = len(word) for char in word[:MAX_CHAR_LENGTH]: char_id = char_alphabet.get_index(char) index_chars.append(char_id) index_words.append(index_chars) index_sentences.append(index_words) return index_sentences, max_length def construct_tensor_char(index_sentences): C = np.empty([len(index_sentences), max_sent_length, max_char_length], dtype=np.int32) word_end_id = char_alphabet.get_index(word_end) for i in range(len(index_sentences)): words = index_sentences[i] sent_length = len(words) for j in range(sent_length): chars = words[j] char_length = len(chars) for k in range(char_length): cid = chars[k] C[i, j, k] = cid # fill index of word end after the end of word C[i, j, char_length:] = word_end_id # Zero out C after the end of the sentence C[i, sent_length:, :] = 0 return C def build_char_embedd_table(): scale = np.sqrt(3.0 / char_embedd_dim) char_embedd_table = np.random.uniform(-scale, scale, [char_alphabet.size(), char_embedd_dim]).astype( theano.config.floatX) return char_embedd_table char_alphabet = Alphabet('character') char_alphabet.get_index(word_end) index_sentences_train, max_char_length_train = get_character_indexes(sentences_train) index_sentences_dev, max_char_length_dev = get_character_indexes(sentences_dev) index_sentences_test, max_char_length_test = get_character_indexes(sentences_test) # close character alphabet char_alphabet.close() logger.info("character alphabet size: %d" % (char_alphabet.size() - 1)) max_char_length = min(MAX_CHAR_LENGTH, max(max_char_length_train, max_char_length_dev, max_char_length_test)) logger.info("Maximum character length of training set is %d" % max_char_length_train) logger.info("Maximum character length of dev set is %d" % max_char_length_dev) logger.info("Maximum character length of test set is %d" % max_char_length_test) logger.info("Maximum character length used for training is %d" % max_char_length) # fill character tensor C_train = construct_tensor_char(index_sentences_train) C_dev = construct_tensor_char(index_sentences_dev) C_test = construct_tensor_char(index_sentences_test) return C_train, C_dev, C_test, build_char_embedd_table()
class VsmNormer(nn.Module): def __init__(self): super(VsmNormer, self).__init__() self.word_alphabet = Alphabet('word') self.embedding_dim = None self.word_embedding = None self.dict_alphabet = Alphabet('dict') self.dict_embedding = None self.gpu = opt.gpu def transfer_model_into_gpu(self): if torch.cuda.is_available(): self.word_embedding = self.word_embedding.cuda(self.gpu) self.dict_embedding = self.dict_embedding.cuda(self.gpu) def batch_name_to_ids(self, name): tokens = my_tokenize(name) length = len(tokens) tokens_id = np.zeros((1, length), dtype=np.int) for i, word in enumerate(tokens): word = norm_utils.word_preprocess(word) tokens_id[0][i] = self.word_alphabet.get_index(word) tokens_id = torch.from_numpy(tokens_id) if torch.cuda.is_available(): return tokens_id.cuda(self.gpu) else: return tokens_id def init_vector_for_dict(self, meddra_dict): self.dict_embedding = nn.Embedding(len(meddra_dict), self.embedding_dim) if torch.cuda.is_available(): self.dict_embedding = self.dict_embedding.cuda(self.gpu) for concept_id, concept_name in meddra_dict.items(): self.dict_alphabet.add(concept_id) with torch.no_grad(): tokens_id = self.batch_name_to_ids(concept_name) length = tokens_id.size(1) emb = self.word_embedding(tokens_id) emb = emb.unsqueeze_(1) pool = functional.avg_pool2d(emb, (length, 1)) index = norm_utils.get_dict_index(self.dict_alphabet, concept_id) self.dict_embedding.weight.data[index] = pool[0][0] def compute_similarity(self, mention_rep, concep_rep): # mention_rep is (batch, emb_dim) and concep_rep is (concept_num, emb_dim) mention_rep_norm = torch.norm(mention_rep, 2, 1, True) # batch 1 concep_rep_norm = torch.norm(concep_rep, 2, 1, True) # concept 1 a = torch.matmul(mention_rep_norm, torch.t(concep_rep_norm)) # batch, concept a = a.clamp(min=1e-8) b = torch.matmul(mention_rep, torch.t(concep_rep)) # batch, concept return b / a def forward(self, mention_word_ids): length = mention_word_ids.size(1) mention_word_emb = self.word_embedding(mention_word_ids) mention_word_emb = mention_word_emb.unsqueeze_(1) mention_word_pool = functional.avg_pool2d(mention_word_emb, (length, 1)) # batch,1,1,100 mention_word_pool = mention_word_pool.squeeze_(1).squeeze_( 1) # batch,100 # similarities = torch.t(torch.matmul(self.dict_embedding.weight.data, torch.t(mention_word_pool))) # batch, dict similarities = self.compute_similarity(mention_word_pool, self.dict_embedding.weight.data) values, indices = torch.max(similarities, 1) return values, indices def process_one_doc(self, doc, entities, dict): for entity in entities: with torch.no_grad(): tokens_id = self.batch_name_to_ids(entity.name) values, indices = self.forward(tokens_id) norm_id = norm_utils.get_dict_name(self.dict_alphabet, indices.item()) name = dict[norm_id] entity.norm_ids.append(norm_id) entity.norm_names.append(name) entity.norm_confidences.append(values.item())
class Data: def __init__(self): self.MAX_SENTENCE_LENGTH = 250 self.MAX_WORD_LENGTH = -1 self.number_normalized = True self.norm_word_emb = False self.norm_char_emb = False self.norm_trans_emb = False self.word_alphabet = Alphabet('word') self.char_alphabet = Alphabet('character') self.translation_alphabet = Alphabet('translation') self.translation_id_format = {} self.feature_name = [] self.feature_alphabets = [] self.feature_num = len(self.feature_alphabets) self.feat_config = None self.label_alphabet = Alphabet('label', True) self.tagScheme = "NoSeg" ## BMES/BIO self.seg = True ### I/O self.train_dir = None self.dev_dir = None self.test_dir = None self.raw_dir = None self.trans_dir = None self.decode_dir = None self.dset_dir = None ## data vocabulary related file self.model_dir = None ## model save file self.load_model_dir = None ## model load file self.word_emb_dir = None self.char_emb_dir = None self.trans_embed_dir = None self.typeinfo_dir = None self.feature_emb_dirs = [] self.train_texts = [] self.dev_texts = [] self.test_texts = [] self.raw_texts = [] self.train_Ids = [] self.dev_Ids = [] self.test_Ids = [] self.raw_Ids = [] self.pretrain_word_embedding = None self.pretrain_char_embedding = None self.pretrain_trans_embedding = None self.pretrain_feature_embeddings = [] self.label_size = 0 self.word_alphabet_size = 0 self.char_alphabet_size = 0 self.label_alphabet_size = 0 self.trans_alphabet_size = 0 self.feature_alphabet_sizes = [] self.feature_emb_dims = [] self.norm_feature_embs = [] self.word_emb_dim = 50 self.char_emb_dim = 30 self.trans_emb_dim = 100 ###Networks self.word_feature_extractor = "LSTM" # "LSTM"/"CNN"/"GRU"/ self.use_char = True self.char_seq_feature = "CNN" # "LSTM"/"CNN"/"GRU"/None self.use_trans = False self.use_crf = True self.nbest = None self.use_mapping = False self.mapping_func = None # tanh or sigmoid # Training self.average_batch_loss = False self.optimizer = "SGD" # "SGD"/"Adam" self.status = "train" # Hyperparameters self.HP_cnn_layer = 4 self.HP_iteration = 100 self.HP_batch_size = 10 self.HP_char_hidden_dim = 50 self.HP_trans_hidden_dim = 50 self.HP_hidden_dim = 200 self.HP_dropout = 0.5 self.HP_lstm_layer = 1 self.HP_bilstm = True self.HP_gpu = False self.HP_lr = 0.015 self.HP_lr_decay = 0.05 self.HP_clip = None self.HP_momentum = 0 self.HP_l2 = 1e-8 def show_data_summary(self): print("++" * 50) print("DATA SUMMARY START:") print(" I/O:") print(" Tag scheme: %s" % (self.tagScheme)) print(" MAX SENTENCE LENGTH: %s" % (self.MAX_SENTENCE_LENGTH)) print(" MAX WORD LENGTH: %s" % (self.MAX_WORD_LENGTH)) print(" Number normalized: %s" % (self.number_normalized)) print(" Word alphabet size: %s" % (self.word_alphabet_size)) print(" Char alphabet size: %s" % (self.char_alphabet_size)) print(" Label alphabet size: %s" % (self.label_alphabet_size)) print(" Trans alphabet size: %s" % (self.trans_alphabet_size)) print(" Word embedding dir: %s" % (self.word_emb_dir)) print(" Char embedding dir: %s" % (self.char_emb_dir)) print(" Tran embedding dir: %s" % (self.trans_embed_dir)) print(" Word embedding size: %s" % (self.word_emb_dim)) print(" Char embedding size: %s" % (self.char_emb_dim)) print(" Tran embedding size: %s" % (self.trans_emb_dim)) print(" Norm word emb: %s" % (self.norm_word_emb)) print(" Norm char emb: %s" % (self.norm_char_emb)) print(" Norm tran emb: %s" % (self.norm_trans_emb)) print(" Train file directory: %s" % (self.train_dir)) print(" Dev file directory: %s" % (self.dev_dir)) print(" Test file directory: %s" % (self.test_dir)) print(" Raw file directory: %s" % (self.raw_dir)) if self.typeinfo_dir: print(" typeinfo directory: %s" % (self.typeinfo_dir)) print(" Dset file directory: %s" % (self.dset_dir)) print(" Model file directory: %s" % (self.model_dir)) print(" Loadmodel directory: %s" % (self.load_model_dir)) print(" Decode file directory: %s" % (self.decode_dir)) print(" Train instance number: %s" % (len(self.train_texts))) print(" Dev instance number: %s" % (len(self.dev_texts))) print(" Test instance number: %s" % (len(self.test_texts))) print(" Raw instance number: %s" % (len(self.raw_texts))) print(" FEATURE num: %s" % (self.feature_num)) for idx in range(self.feature_num): print(" Fe: %s alphabet size: %s" % (self.feature_alphabets[idx].name, self.feature_alphabet_sizes[idx])) print( " Fe: %s embedding dir: %s" % (self.feature_alphabets[idx].name, self.feature_emb_dirs[idx])) print( " Fe: %s embedding size: %s" % (self.feature_alphabets[idx].name, self.feature_emb_dims[idx])) print(" Fe: %s norm emb: %s" % (self.feature_alphabets[idx].name, self.norm_feature_embs[idx])) print(" " + "++" * 20) print(" Model Network:") print(" Model use_crf: %s" % (self.use_crf)) print(" Model word extractor: %s" % (self.word_feature_extractor)) print(" Model use_char: %s" % (self.use_char)) if self.use_char: print(" Model char_seq_feature: %s" % (self.char_seq_feature)) print(" Model char_hidden_dim: %s" % (self.HP_char_hidden_dim)) if self.use_trans: print(" Model trans_hidden_dim: %s" % (self.HP_trans_hidden_dim)) if self.use_mapping: print(" Model mapping function: %s" % (self.mapping_func)) print(" " + "++" * 20) print(" Training:") print(" Optimizer: %s" % (self.optimizer)) print(" Iteration: %s" % (self.HP_iteration)) print(" BatchSize: %s" % (self.HP_batch_size)) print(" Average batch loss: %s" % (self.average_batch_loss)) print(" " + "++" * 20) print(" Hyperparameters:") print(" Hyper lr: %s" % (self.HP_lr)) print(" Hyper lr_decay: %s" % (self.HP_lr_decay)) print(" Hyper HP_clip: %s" % (self.HP_clip)) print(" Hyper momentum: %s" % (self.HP_momentum)) print(" Hyper l2: %s" % (self.HP_l2)) print(" Hyper hidden_dim: %s" % (self.HP_hidden_dim)) print(" Hyper dropout: %s" % (self.HP_dropout)) print(" Hyper lstm_layer: %s" % (self.HP_lstm_layer)) print(" Hyper bilstm: %s" % (self.HP_bilstm)) print(" Hyper GPU: %s" % (self.HP_gpu)) print("DATA SUMMARY END.") print("++" * 50) sys.stdout.flush() def initial_feature_alphabets(self): items = open(self.train_dir, 'r').readline().strip('\n').split() total_column = len(items) if total_column > 2: for idx in range(1, total_column - 1): feature_prefix = items[idx].split(']', 1)[0] + "]" self.feature_alphabets.append(Alphabet(feature_prefix)) self.feature_name.append(feature_prefix) print "Find feature: ", feature_prefix self.feature_num = len(self.feature_alphabets) self.pretrain_feature_embeddings = [None] * self.feature_num self.feature_emb_dims = [20] * self.feature_num self.feature_emb_dirs = [None] * self.feature_num self.norm_feature_embs = [False] * self.feature_num self.feature_alphabet_sizes = [0] * self.feature_num if self.feat_config: for idx in range(self.feature_num): if self.feature_name[idx] in self.feat_config: self.feature_emb_dims[idx] = self.feat_config[ self.feature_name[idx]]['emb_size'] self.feature_emb_dirs[idx] = self.feat_config[ self.feature_name[idx]]['emb_dir'] self.norm_feature_embs[idx] = self.feat_config[ self.feature_name[idx]]['emb_norm'] # exit(0) def build_alphabet(self, input_file): in_lines = open(input_file, 'r').readlines() for line in in_lines: if len(line) > 2: pairs = line.strip().split() word = pairs[0].decode('utf-8') if self.number_normalized: word = normalize_word(word) label = pairs[-1] self.label_alphabet.add(label) self.word_alphabet.add(word) ## build feature alphabet for idx in range(self.feature_num): feat_idx = pairs[idx + 1].split(']', 1)[-1] self.feature_alphabets[idx].add(feat_idx) for char in word: self.char_alphabet.add(char) self.word_alphabet_size = self.word_alphabet.size() self.char_alphabet_size = self.char_alphabet.size() self.label_alphabet_size = self.label_alphabet.size() for idx in range(self.feature_num): self.feature_alphabet_sizes[idx] = self.feature_alphabets[ idx].size() startS = False startB = False for label, _ in self.label_alphabet.iteritems(): if "S-" in label.upper(): startS = True elif "B-" in label.upper(): startB = True if startB: if startS: self.tagScheme = "BMES" else: self.tagScheme = "BIO" def fix_alphabet(self): self.word_alphabet.close() self.char_alphabet.close() self.label_alphabet.close() self.translation_alphabet.close() for idx in range(self.feature_num): self.feature_alphabets[idx].close() def build_pretrain_emb(self): if self.word_emb_dir: print("Load pretrained word embedding, norm: %s, dir: %s" % (self.norm_word_emb, self.word_emb_dir)) self.pretrain_word_embedding, self.word_emb_dim = build_pretrain_embedding( self.word_emb_dir, self.word_alphabet, self.word_emb_dim, self.norm_word_emb) if self.typeinfo_dir: type_info_matrix = [] with codecs.open(self.typeinfo_dir, 'r') as typeinfo_file: type_info_lines = typeinfo_file.readlines() for line in type_info_lines: line = line.rstrip().split() for i, _ in enumerate(line): line[i] = float(line[i]) line = np.array(line) type_info_matrix.append(line) print( "Caculate type info distribution,and concate word and type......" ) cos_res = [] for i, word_embed in enumerate(self.pretrain_word_embedding): word_type_info = [] if i == 0: word_type_info = np.random.random( size=len(type_info_matrix)) cos_res.append(word_type_info) else: for type_info in type_info_matrix: cos_sim = 1 - spatial.distance.cosine( word_embed, type_info) word_type_info.append(cos_sim) cos_res.append(word_type_info) cos_res = np.array(cos_res) cos_res = sigmoid(cos_res) self.pretrain_word_embedding = np.concatenate( [self.pretrain_word_embedding, cos_res], axis=1) print "type info length:{}".format(len(type_info_matrix)) self.word_emb_dim += len(type_info_matrix) print "new word dim is :{}".format(self.word_emb_dim) if self.char_emb_dir: print("Load pretrained char embedding, norm: %s, dir: %s" % (self.norm_char_emb, self.char_emb_dir)) self.pretrain_char_embedding, self.char_emb_dim = build_pretrain_embedding( self.char_emb_dir, self.char_alphabet, self.char_emb_dim, self.norm_char_emb) if self.trans_embed_dir: print("Load pretrained trans embedding, norm: %s, dir: %s" % (self.norm_trans_emb, self.trans_embed_dir)) self.pretrain_trans_embedding, self.trans_emb_dim = build_chi_pretrain_embedding( self.trans_embed_dir, self.translation_alphabet, self.trans_emb_dim, self.norm_trans_emb) for idx in range(self.feature_num): if self.feature_emb_dirs[idx]: print( "Load pretrained feature %s embedding:, norm: %s, dir: %s" % (self.feature_name[idx], self.norm_feature_embs[idx], self.feature_emb_dirs[idx])) self.pretrain_feature_embeddings[idx], self.feature_emb_dims[ idx] = build_pretrain_embedding( self.feature_emb_dirs[idx], self.feature_alphabets[idx], self.feature_emb_dims[idx], self.norm_feature_embs[idx]) def generate_instance(self, name): self.fix_alphabet() if name == "train": self.train_texts, self.train_Ids = read_instance( self.train_dir, self.word_alphabet, self.char_alphabet, self.feature_alphabets, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH, self.translation_id_format) elif name == "dev": self.dev_texts, self.dev_Ids = read_instance( self.dev_dir, self.word_alphabet, self.char_alphabet, self.feature_alphabets, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH, self.translation_id_format) elif name == "test": self.test_texts, self.test_Ids = read_instance( self.test_dir, self.word_alphabet, self.char_alphabet, self.feature_alphabets, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH, self.translation_id_format) elif name == "raw": self.raw_texts, self.raw_Ids = read_instance( self.raw_dir, self.word_alphabet, self.char_alphabet, self.feature_alphabets, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH, self.translation_id_format) else: print( "Error: you can only generate train/dev/test instance! Illegal input:%s" % (name)) def write_decoded_results(self, predict_results, name): fout = open(self.decode_dir, 'w') sent_num = len(predict_results) content_list = [] if name == 'raw': content_list = self.raw_texts elif name == 'test': content_list = self.test_texts elif name == 'dev': content_list = self.dev_texts elif name == 'train': content_list = self.train_texts else: print( "Error: illegal name during writing predict result, name should be within train/dev/test/raw !" ) assert (sent_num == len(content_list)) for idx in range(sent_num): sent_length = len(predict_results[idx]) for idy in range(sent_length): ## content_list[idx] is a list with [word, char, label] fout.write(content_list[idx][0][idy].encode('utf-8') + " " + predict_results[idx][idy] + '\n') fout.write('\n') fout.close() print("Predict %s result has been written into file. %s" % (name, self.decode_dir)) def load(self, data_file): f = open(data_file, 'rb') tmp_dict = pickle.load(f) f.close() self.__dict__.update(tmp_dict) def save(self, save_file): f = open(save_file, 'wb') pickle.dump(self.__dict__, f, 2) f.close() def write_nbest_decoded_results(self, predict_results, pred_scores, name): ## predict_results : [whole_sent_num, nbest, each_sent_length] ## pred_scores: [whole_sent_num, nbest] fout = open(self.decode_dir, 'w') sent_num = len(predict_results) content_list = [] if name == 'raw': content_list = self.raw_texts elif name == 'test': content_list = self.test_texts elif name == 'dev': content_list = self.dev_texts elif name == 'train': content_list = self.train_texts else: print( "Error: illegal name during writing predict result, name should be within train/dev/test/raw !" ) assert (sent_num == len(content_list)) assert (sent_num == len(pred_scores)) for idx in range(sent_num): sent_length = len(predict_results[idx][0]) nbest = len(predict_results[idx]) score_string = "# " for idz in range(nbest): score_string += format(pred_scores[idx][idz], '.4f') + " " fout.write(score_string.strip() + "\n") for idy in range(sent_length): label_string = content_list[idx][0][idy].encode('utf-8') + " " for idz in range(nbest): label_string += predict_results[idx][idz][idy] + " " label_string = label_string.strip() + "\n" fout.write(label_string) fout.write('\n') fout.close() print("Predict %s %s-best result has been written into file. %s" % (name, nbest, self.decode_dir)) def read_config(self, config_file): config = config_file_to_dict(config_file) ## read data: the_item = 'train_dir' if the_item in config: self.train_dir = config[the_item] the_item = 'dev_dir' if the_item in config: self.dev_dir = config[the_item] the_item = 'test_dir' if the_item in config: self.test_dir = config[the_item] the_item = 'trans_dir' if the_item in config: self.trans_dir = config[the_item] the_item = 'raw_dir' if the_item in config: self.raw_dir = config[the_item] the_item = 'decode_dir' if the_item in config: self.decode_dir = config[the_item] the_item = 'dset_dir' if the_item in config: self.dset_dir = config[the_item] the_item = 'model_dir' if the_item in config: self.model_dir = config[the_item] the_item = 'load_model_dir' if the_item in config: self.load_model_dir = config[the_item] the_item = 'word_emb_dir' if the_item in config: self.word_emb_dir = config[the_item] the_item = 'char_emb_dir' if the_item in config: self.char_emb_dir = config[the_item] the_item = 'trans_embed_dir' if the_item in config: self.trans_embed_dir = config[the_item] the_item = 'typeinfo_dir' if the_item in config: self.typeinfo_dir = config[the_item] the_item = 'MAX_SENTENCE_LENGTH' if the_item in config: self.MAX_SENTENCE_LENGTH = int(config[the_item]) the_item = 'MAX_WORD_LENGTH' if the_item in config: self.MAX_WORD_LENGTH = int(config[the_item]) the_item = 'norm_word_emb' if the_item in config: self.norm_word_emb = str2bool(config[the_item]) the_item = 'norm_char_emb' if the_item in config: self.norm_char_emb = str2bool(config[the_item]) the_item = 'number_normalized' if the_item in config: self.number_normalized = str2bool(config[the_item]) the_item = 'seg' if the_item in config: self.seg = str2bool(config[the_item]) the_item = 'word_emb_dim' if the_item in config: self.word_emb_dim = int(config[the_item]) the_item = 'char_emb_dim' if the_item in config: self.char_emb_dim = int(config[the_item]) the_item = 'trans_emb_dim' if the_item in config: self.trans_emb_dim = int(config[the_item]) ## read network: the_item = 'use_crf' if the_item in config: self.use_crf = str2bool(config[the_item]) the_item = 'use_char' if the_item in config: self.use_char = str2bool(config[the_item]) the_item = 'use_trans' if the_item in config: self.use_trans = str2bool(config[the_item]) the_item = 'use_mapping' if the_item in config: self.use_mapping = str2bool(config[the_item]) the_item = 'mapping_func' if the_item in config: self.mapping_func = config[the_item] the_item = 'word_seq_feature' if the_item in config: self.word_feature_extractor = config[the_item] the_item = 'char_seq_feature' if the_item in config: self.char_seq_feature = config[the_item] the_item = 'nbest' if the_item in config: self.nbest = int(config[the_item]) the_item = 'feature' if the_item in config: self.feat_config = config[the_item] ## feat_config is a dict ## read training setting: the_item = 'optimizer' if the_item in config: self.optimizer = config[the_item] the_item = 'ave_batch_loss' if the_item in config: self.average_batch_loss = str2bool(config[the_item]) the_item = 'status' if the_item in config: self.status = config[the_item] ## read Hyperparameters: the_item = 'cnn_layer' if the_item in config: self.HP_cnn_layer = int(config[the_item]) the_item = 'iteration' if the_item in config: self.HP_iteration = int(config[the_item]) the_item = 'batch_size' if the_item in config: self.HP_batch_size = int(config[the_item]) the_item = 'char_hidden_dim' if the_item in config: self.HP_char_hidden_dim = int(config[the_item]) the_item = 'trans_hidden_dim' if the_item in config: self.HP_trans_hidden_dim = int(config[the_item]) the_item = 'hidden_dim' if the_item in config: self.HP_hidden_dim = int(config[the_item]) the_item = 'dropout' if the_item in config: self.HP_dropout = float(config[the_item]) the_item = 'lstm_layer' if the_item in config: self.HP_lstm_layer = int(config[the_item]) the_item = 'bilstm' if the_item in config: self.HP_bilstm = str2bool(config[the_item]) the_item = 'gpu' if the_item in config: self.HP_gpu = str2bool(config[the_item]) the_item = 'learning_rate' if the_item in config: self.HP_lr = float(config[the_item]) the_item = 'lr_decay' if the_item in config: self.HP_lr_decay = float(config[the_item]) the_item = 'clip' if the_item in config: self.HP_clip = float(config[the_item]) the_item = 'momentum' if the_item in config: self.HP_momentum = float(config[the_item]) the_item = 'l2' if the_item in config: self.HP_l2 = float(config[the_item]) def build_translation_alphabet(self, trans_path): print("Creating translation alphabet......") with codecs.open(trans_path, 'r', "utf-8") as f: lines = f.readlines() for line in lines: if len(line.strip().split(":")) == 2: temp = line.strip().split(":", 1) words = temp[1].split() for word in words: self.translation_alphabet.add(word.strip()) self.trans_alphabet_size = self.translation_alphabet.size() def build_translation_dict(self, trans_path): print("Creating Id to Id translation dictionary......") translation_id_format_temp = {} with codecs.open(trans_path, 'r', "utf-8") as f: lines = f.readlines() for line in lines: ids = [] if len(line.strip().split(":", 1)) == 2: temp = line.strip().split(":", 1) word_id = self.word_alphabet.get_index(temp[0].strip()) translations = temp[1].split() for translation in translations: ids.append( self.translation_alphabet.get_index( translation.strip())) if ids == []: ids = [0] translation_id_format_temp[word_id] = ids for word in self.word_alphabet.instances: if self.word_alphabet.get_index( word) in translation_id_format_temp.keys(): self.translation_id_format[self.word_alphabet.get_index( word)] = translation_id_format_temp[ self.word_alphabet.get_index(word)] else: self.translation_id_format[self.word_alphabet.get_index( word)] = [0]
class Data: def __init__(self, input_file): self.original_data = open(input_file, 'r').readlines() self.index_data = [] self.word_alphabet = Alphabet('word') self.gloss_alphabet = Alphabet('gloss') self.entity_alphabet = Alphabet('entity') self.gaz_alphabet = Alphabet('gaz') self.label_alphabet = Alphabet('label') self.word_alphabet_size = 0 self.gloss_alphabet_size = 0 self.entity_alphabet_size = 0 self.gaz_alphabet_size = 0 self.label_alphabet_size = 0 ### hyperparameters self.HP_iteration = 100 self.HP_batch_size = 1 self.HP_gaz_hidden_dim = 50 self.HP_lstm_hidden_dim = 200 self.HP_dropout = 0.5 self.gaz_dropout = 0.5 self.HP_lstm_layer = 1 self.HP_bilstm = False self.HP_use_entity = False self.HP_use_gloss = True self.HP_use_gaz = False self.HP_gpu = True self.HP_lr = 0.015 self.HP_lr_decay = 0.05 self.HP_clip = 5.0 self.HP_momentum = 0 self.HP_iteration = 100 # embedding hyperparameter self.word_emb_dim = 200 self.entity_emb_dim = 50 self.gloss_features = "CNN" #["CNN","LSTM"] self.gloss_emb_dim = 200 self.gloss_hidden_dim = 300 self.pretrain_word_embedding = np.array([]) self.pretrain_gaz_embedding = None self.word_embed_path = "../LOVECC/NYM.6B.200d.txt" #"NYM_200.txt" self.gaz_embed_path = None self.gaz_emb_dim = 200 self.HP_fix_gaz_emb = True def build_alphabet(self): in_lines = self.original_data for idx in range(len(in_lines)): line = json.loads(in_lines[idx]) words = line["word_context"] for word in words: self.word_alphabet.add(word) sentence_gloss = line["babel_gloss"] for word_gloss in sentence_gloss: for phrase_gloss in word_gloss: #一个词可以匹配多个词组 if "EN" in phrase_gloss: phrase_gloss_EN = phrase_gloss["EN"] final_gloss = " . ".join(phrase_gloss_EN) for de_word in final_gloss: # for definates in phrase_gloss_EN: # for de_word in definates.split(): self.gloss_alphabet.add(de_word) entitys = line["entity_context"] for entity in entitys: self.entity_alphabet.add(entity) gazs = line["babel_phase"] for gaz in gazs: for item in gaz: self.gaz_alphabet.add(item) labels = line["detection_label"] for label in labels: self.label_alphabet.add(label) print(self.label_alphabet.get_content()) self.word_alphabet_size = self.word_alphabet.size() self.gloss_alphabet_size = self.gloss_alphabet.size() self.entity_alphabet_size = self.entity_alphabet.size() self.gaz_alphabet_size = self.gaz_alphabet.size() self.label_alphabet_size = self.label_alphabet.size() self.word_alphabet.close() self.gloss_alphabet.close() self.entity_alphabet.close() self.gaz_alphabet.close() self.label_alphabet.close() def generate_instance_Ids(self): #把输入句子变成对应的标号(Id) in_lines = self.original_data for idx in range(len(in_lines)): line = json.loads(in_lines[idx]) words = line["word_context"] words_Id = [] for word in words: words_Id.append(self.word_alphabet.get_index(word)) sentence_gloss = line["babel_gloss"] sentence_glosses_Id = [] for word_gloss in sentence_gloss: word_glosses_Id = [] for phrase_gloss in word_gloss: #一个词可以匹配多个词组 if "EN" in phrase_gloss: phrase_gloss_EN = phrase_gloss["EN"] #这是个list final_gloss = " . ".join(phrase_gloss_EN) for de_word in final_gloss: word_glosses_Id.append( self.gloss_alphabet.get_index(de_word)) sentence_glosses_Id.append(word_glosses_Id) entitys = line["entity_context"] entitys_Id = [] for entity in entitys: entitys_Id.append(self.entity_alphabet.get_index(entity)) gazs = line["babel_phase"] sentence_gazs_Id = [ ] #gazs_Id=[[[take over,take over of,...],[2,3,...]],[[legal,legal procedures,...],[1,2,...]],...,[[open the window,open the window please,...],[3,4,...]]] for gaz in gazs: word_gazs_Id = [] Ids = [] Lens = [] for item in gaz: Ids.append(self.gaz_alphabet.get_index(item)) Lens.append(len(item.split())) word_gazs_Id = [Ids, Lens] sentence_gazs_Id.append(word_gazs_Id) labels = line["detection_label"] labels_Id = [] for label in labels: labels_Id.append(self.label_alphabet.get_index(label)) self.index_data.append([ words_Id, entitys_Id, sentence_gazs_Id, sentence_glosses_Id, labels_Id ]) def load_pretrain_emb(self, embedding_path): lines = open(embedding_path, 'r', encoding="utf-8").readlines() statistic = lines[0].strip() #开头的两个统计数据:单词数,向量长度 # print(statistic) embedd_dim = int(statistic.split()[1]) embedd_dict = dict() embedd_dict["<pad>"] = [0.0 for i in range(embedd_dim)] #填充词对应的向量置为全零 # print(len(embedd_dict["<pad>"])) for line in lines[1:]: line = line.strip() if len(line) == 0: continue tokens = line.split() if embedd_dim < 0: embedd_dim = len(tokens) - 1 else: assert (embedd_dim + 1 == len(tokens)) embedd_dict[tokens[0]] = [float(i) for i in tokens[1:]] return embedd_dict, embedd_dim def norm2one(self, vec): if np.sum(vec) == 0: return vec root_sum_square = np.sqrt(np.sum(np.square(vec))) return vec / root_sum_square def build_pretrain_embedding(self, embedding_path, word_alphabet, embedd_dim=200, norm=True): embedd_dict = dict() if embedding_path != None: # 读取embedding字典 embedd_dict, embedd_dim = self.load_pretrain_emb(embedding_path) scale = np.sqrt(3.0 / embedd_dim) pretrain_emb = np.zeros([word_alphabet.size(), embedd_dim]) #pretrain_emb就是重排之后的embedding矩阵 perfect_match = 0 case_match = 0 not_match = 0 for word, index in word_alphabet.get_alphabet().items(): if word in embedd_dict: # print(word,index) # print(len(embedd_dict[word])) if norm: pretrain_emb[index] = self.norm2one(embedd_dict[word]) else: pretrain_emb[index] = embedd_dict[word] perfect_match += 1 elif word.lower() in embedd_dict: if norm: pretrain_emb[index] = self.norm2one( embedd_dict[word.lower()]) else: pretrain_emb[index] = embedd_dict[word.lower()] case_match += 1 else: pretrain_emb[index] = np.random.uniform( -scale, scale, [1, embedd_dim]) not_match += 1 pretrained_size = len(embedd_dict) # print("pad's embedding:",pretrain_emb[word_alphabet.get_index(",")]) print( "Embedding:\n pretrain word:%s, prefect match:%s, case_match:%s, oov:%s, oov%%:%s" % (pretrained_size, perfect_match, case_match, not_match, (not_match + 0.) / word_alphabet.size())) return pretrain_emb, embedd_dim #pretrain_emb就是根据alphabet的顺序重排embedding矩阵,embedd_dim是向量的纬度 def generate_embedding(self): self.pretrain_word_embedding, self.word_pretrain_dim = self.build_pretrain_embedding( self.word_embed_path, self.word_alphabet) self.pretrain_gloss_embedding, self.gloss_pretrain_dim = self.build_pretrain_embedding( self.word_embed_path, self.gloss_alphabet) self.pretrain_gaz_embedding, self.gaz_pretrain_dim = self.build_pretrain_embedding( self.word_embed_path, self.gaz_alphabet)