class Data:
    def __init__(self):
        self.relational_alphabet = Alphabet("Relation", unkflag=False, padflag=False)
        self.train_data = None
        self.valid_data = None
        self.test_data = None

    def show_data_summary(self):
        print("DATA SUMMARY START:")
        print("     Relation Alphabet Size: %s" % self.relational_alphabet.size())
        print("     Train Instance Number: %s" % len(self.train_data))
        print("     Valid Instance Number: %s" % len(self.valid_data))
        print("     Test Instance Number: %s" % len(self.test_data))
        print("DATA SUMMARY END.")
        sys.stdout.flush()

    def generate_instance(self, args, data_process):
        tokenizer = BertTokenizer.from_pretrained(args.bert_directory, cache_dir=args.cache_dir)
        if "train_file" in args:
            # Each processed instance looks like:
            # [idx, sent_id, {"relation": [], "head_start_index": [], "head_end_index": [],
            #                 "tail_start_index": [], "tail_end_index": []}]
            self.train_data = data_process(args.train_file, self.relational_alphabet, tokenizer, args.max_seq_len)
        if "valid_file" in args:
            self.valid_data = data_process(args.valid_file, self.relational_alphabet, tokenizer, args.max_seq_len)
        if "test_file" in args:
            self.test_data = data_process(args.test_file, self.relational_alphabet, tokenizer, args.max_seq_len)
        self.relational_alphabet.close()
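# Illustrative usage sketch (not part of the original code): driving
# generate_instance with an argparse-style namespace. The attribute names
# (bert_directory, cache_dir, train_file, max_seq_len) follow the fields the
# class above actually reads; the paths are hypothetical, and data_process is
# assumed to be the repo's own data-processing callable.
def _example_build_relation_data(data_process):
    from argparse import Namespace
    args = Namespace(
        bert_directory="bert-base-cased",  # any local or hub BERT path
        cache_dir=None,
        train_file="data/train.json",      # hypothetical paths
        valid_file="data/valid.json",
        test_file="data/test.json",
        max_seq_len=128,
    )
    data = Data()
    data.generate_instance(args, data_process)  # fills train/valid/test_data
    data.show_data_summary()
    return data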
class Data:
    def __init__(self):
        self.relational_alphabet = Alphabet("Relation", unkflag=False, padflag=False)
        self.train_loader = []
        self.valid_loader = []
        self.test_loader = []
        self.weight = {}

    def show_data_summary(self):
        print("DATA SUMMARY START:")
        print("     Relation Alphabet Size: %s" % self.relational_alphabet.size())
        print("     Train Instance Number: %s" % len(self.train_loader))
        print("     Valid Instance Number: %s" % len(self.valid_loader))
        print("     Test Instance Number: %s" % len(self.test_loader))
        print("DATA SUMMARY END.")
        sys.stdout.flush()

    def generate_instance(self, args, data_process):
        tokenizer = BertTokenizer.from_pretrained(args.bert_directory, do_lower_case=False)
        if "train_file" in args:
            self.train_loader = data_process(args.train_file, self.relational_alphabet, tokenizer)
            # Snapshot per-relation frequency counts before the alphabet is closed.
            self.weight = copy.deepcopy(self.relational_alphabet.index_num)
        if "valid_file" in args:
            self.valid_loader = data_process(args.valid_file, self.relational_alphabet, tokenizer)
        if "test_file" in args:
            self.test_loader = data_process(args.test_file, self.relational_alphabet, tokenizer)
        self.relational_alphabet.close()
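# A minimal sketch (an assumption, not from the original code) of one way the
# `weight` snapshot could be used: `relational_alphabet.index_num` is taken
# here to be a relation-id -> frequency mapping, so inverse-frequency class
# weights for a weighted loss can be derived from it.
def _example_relation_class_weights(data, smoothing=1.0):
    num_relations = data.relational_alphabet.size()
    counts = [data.weight.get(i, 0) for i in range(num_relations)]
    total = sum(counts) + smoothing * num_relations
    # Rarer relations get larger weights; smoothing avoids division by zero.
    return [total / (c + smoothing) for c in counts]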
class Data:
    def __init__(self):
        self.MAX_SENTENCE_LENGTH = 250
        self.MAX_WORD_LENGTH = -1
        self.number_normalized = True
        self.norm_word_emb = False
        self.norm_char_emb = False
        self.norm_trans_emb = False
        self.word_alphabet = Alphabet('word')
        self.char_alphabet = Alphabet('character')
        self.translation_alphabet = Alphabet('translation')
        self.translation_id_format = {}
        self.feature_name = []
        self.feature_alphabets = []
        self.feature_num = len(self.feature_alphabets)
        self.feat_config = None
        self.label_alphabet = Alphabet('label', True)
        self.tagScheme = "NoSeg"  ## BMES/BIO
        self.seg = True
        ### I/O
        self.train_dir = None
        self.dev_dir = None
        self.test_dir = None
        self.raw_dir = None
        self.trans_dir = None
        self.decode_dir = None
        self.dset_dir = None  ## data vocabulary related file
        self.model_dir = None  ## model save file
        self.load_model_dir = None  ## model load file
        self.word_emb_dir = None
        self.char_emb_dir = None
        self.trans_embed_dir = None
        self.feature_emb_dirs = []
        self.train_texts = []
        self.dev_texts = []
        self.test_texts = []
        self.raw_texts = []
        self.train_Ids = []
        self.dev_Ids = []
        self.test_Ids = []
        self.raw_Ids = []
        self.pretrain_word_embedding = None
        self.pretrain_char_embedding = None
        self.pretrain_trans_embedding = None
        self.pretrain_feature_embeddings = []
        self.label_size = 0
        self.word_alphabet_size = 0
        self.char_alphabet_size = 0
        self.label_alphabet_size = 0
        self.trans_alphabet_size = 0
        self.feature_alphabet_sizes = []
        self.feature_emb_dims = []
        self.norm_feature_embs = []
        self.word_emb_dim = 50
        self.char_emb_dim = 30
        self.trans_emb_dim = 100
        ### Networks
        self.word_feature_extractor = "LSTM"  ## "LSTM"/"CNN"/"GRU"
        self.use_char = True
        self.char_seq_feature = "CNN"  ## "LSTM"/"CNN"/"GRU"/None
        self.use_trans = True
        self.use_crf = True
        self.nbest = None
        ## Training
        self.average_batch_loss = False
        self.optimizer = "SGD"  ## "SGD"/"AdaGrad"/"AdaDelta"/"RMSProp"/"Adam"
        self.status = "train"
        ### Hyperparameters
        self.HP_cnn_layer = 4
        self.HP_iteration = 100
        self.HP_batch_size = 10
        self.HP_char_hidden_dim = 50
        self.HP_trans_hidden_dim = 50
        self.HP_hidden_dim = 200
        self.HP_dropout = 0.5
        self.HP_lstm_layer = 1
        self.HP_bilstm = True
        self.HP_gpu = False
        self.HP_lr = 0.015
        self.HP_lr_decay = 0.05
        self.HP_clip = None
        self.HP_momentum = 0
        self.HP_l2 = 1e-8

    def show_data_summary(self):
        print("++" * 50)
        print("DATA SUMMARY START:")
        print(" I/O:")
        print("     Tag scheme: %s" % self.tagScheme)
        print("     MAX SENTENCE LENGTH: %s" % self.MAX_SENTENCE_LENGTH)
        print("     MAX WORD LENGTH: %s" % self.MAX_WORD_LENGTH)
        print("     Number normalized: %s" % self.number_normalized)
        print("     Word alphabet size: %s" % self.word_alphabet_size)
        print("     Char alphabet size: %s" % self.char_alphabet_size)
        print("     Label alphabet size: %s" % self.label_alphabet_size)
        print("     Trans alphabet size: %s" % self.trans_alphabet_size)
        print("     Word embedding dir: %s" % self.word_emb_dir)
        print("     Char embedding dir: %s" % self.char_emb_dir)
        print("     Tran embedding dir: %s" % self.trans_embed_dir)
        print("     Word embedding size: %s" % self.word_emb_dim)
        print("     Char embedding size: %s" % self.char_emb_dim)
        print("     Tran embedding size: %s" % self.trans_emb_dim)
        print("     Norm word emb: %s" % self.norm_word_emb)
        print("     Norm char emb: %s" % self.norm_char_emb)
        print("     Norm tran emb: %s" % self.norm_trans_emb)
        print("     Train file directory: %s" % self.train_dir)
        print("     Dev file directory: %s" % self.dev_dir)
        print("     Test file directory: %s" % self.test_dir)
        print("     Raw file directory: %s" % self.raw_dir)
        print("     Dset file directory: %s" % self.dset_dir)
        print("     Model file directory: %s" % self.model_dir)
        print("     Loadmodel directory: %s" % self.load_model_dir)
        print("     Decode file directory: %s" % self.decode_dir)
        print("     Train instance number: %s" % len(self.train_texts))
        print("     Dev instance number: %s" % len(self.dev_texts))
        print("     Test instance number: %s" % len(self.test_texts))
        print("     Raw instance number: %s" % len(self.raw_texts))
        print("     FEATURE num: %s" % self.feature_num)
        for idx in range(self.feature_num):
            print("         Fe: %s alphabet size: %s" % (self.feature_alphabets[idx].name, self.feature_alphabet_sizes[idx]))
            print("         Fe: %s embedding dir: %s" % (self.feature_alphabets[idx].name, self.feature_emb_dirs[idx]))
            print("         Fe: %s embedding size: %s" % (self.feature_alphabets[idx].name, self.feature_emb_dims[idx]))
            print("         Fe: %s norm emb: %s" % (self.feature_alphabets[idx].name, self.norm_feature_embs[idx]))
        print(" " + "++" * 20)
        print(" Model Network:")
        print("     Model use_crf: %s" % self.use_crf)
        print("     Model word extractor: %s" % self.word_feature_extractor)
        print("     Model use_char: %s" % self.use_char)
        if self.use_char:
            print("     Model char_seq_feature: %s" % self.char_seq_feature)
            print("     Model char_hidden_dim: %s" % self.HP_char_hidden_dim)
        if self.use_trans:
            print("     Model trans_hidden_dim: %s" % self.HP_trans_hidden_dim)
        print(" " + "++" * 20)
        print(" Training:")
        print("     Optimizer: %s" % self.optimizer)
        print("     Iteration: %s" % self.HP_iteration)
        print("     BatchSize: %s" % self.HP_batch_size)
        print("     Average batch loss: %s" % self.average_batch_loss)
        print(" " + "++" * 20)
        print(" Hyperparameters:")
        print("     Hyper lr: %s" % self.HP_lr)
        print("     Hyper lr_decay: %s" % self.HP_lr_decay)
        print("     Hyper HP_clip: %s" % self.HP_clip)
        print("     Hyper momentum: %s" % self.HP_momentum)
        print("     Hyper l2: %s" % self.HP_l2)
        print("     Hyper hidden_dim: %s" % self.HP_hidden_dim)
        print("     Hyper dropout: %s" % self.HP_dropout)
        print("     Hyper lstm_layer: %s" % self.HP_lstm_layer)
        print("     Hyper bilstm: %s" % self.HP_bilstm)
        print("     Hyper GPU: %s" % self.HP_gpu)
        print("DATA SUMMARY END.")
        print("++" * 50)
        sys.stdout.flush()

    def initial_feature_alphabets(self):
        items = open(self.train_dir, 'r', encoding='utf-8').readline().strip('\n').split()
        total_column = len(items)
        if total_column > 2:
            for idx in range(1, total_column - 1):
                # Feature columns look like "[POS]tag", so the prefix up to ']' names the feature.
                feature_prefix = items[idx].split(']', 1)[0] + "]"
                self.feature_alphabets.append(Alphabet(feature_prefix))
                self.feature_name.append(feature_prefix)
                print("Find feature: ", feature_prefix)
        self.feature_num = len(self.feature_alphabets)
        self.pretrain_feature_embeddings = [None] * self.feature_num
        self.feature_emb_dims = [20] * self.feature_num
        self.feature_emb_dirs = [None] * self.feature_num
        self.norm_feature_embs = [False] * self.feature_num
        self.feature_alphabet_sizes = [0] * self.feature_num
        if self.feat_config:
            for idx in range(self.feature_num):
                if self.feature_name[idx] in self.feat_config:
                    self.feature_emb_dims[idx] = self.feat_config[self.feature_name[idx]]['emb_size']
                    self.feature_emb_dirs[idx] = self.feat_config[self.feature_name[idx]]['emb_dir']
                    self.norm_feature_embs[idx] = self.feat_config[self.feature_name[idx]]['emb_norm']

    def build_alphabet(self, input_file):
        print("Build alphabet......")
        in_lines = open(input_file, 'r', encoding='utf-8').readlines()
        for line in in_lines:
            if len(line) > 2:
                pairs = line.strip().split()
                word = pairs[0]  # Python 3 strings are already unicode; no .decode() needed
                if self.number_normalized:
                    word = normalize_word(word)
                label = pairs[-1]
                self.label_alphabet.add(label)
                self.word_alphabet.add(word)
                ## build feature alphabets
                for idx in range(self.feature_num):
                    feat_idx = pairs[idx + 1].split(']', 1)[-1]
                    self.feature_alphabets[idx].add(feat_idx)
                for char in word:
                    self.char_alphabet.add(char)
        self.word_alphabet_size = self.word_alphabet.size()
        self.char_alphabet_size = self.char_alphabet.size()
        self.label_alphabet_size = self.label_alphabet.size()
        for idx in range(self.feature_num):
            self.feature_alphabet_sizes[idx] = self.feature_alphabets[idx].size()
        startS = False
        startB = False
        for label, _ in self.label_alphabet.iteritems():
            if "S-" in label.upper():
                startS = True
            elif "B-" in label.upper():
                startB = True
        if startB:
            if startS:
                self.tagScheme = "BMES"
            else:
                self.tagScheme = "BIO"

    def fix_alphabet(self):
        self.word_alphabet.close()
        self.char_alphabet.close()
        self.label_alphabet.close()
        self.translation_alphabet.close()
        for idx in range(self.feature_num):
            self.feature_alphabets[idx].close()

    def build_pretrain_emb(self):
        if self.word_emb_dir:
            print("Load pretrained word embedding, norm: %s, dir: %s" % (self.norm_word_emb, self.word_emb_dir))
            self.pretrain_word_embedding, self.word_emb_dim = build_pretrain_embedding(
                self.word_emb_dir, self.word_alphabet, self.word_emb_dim, self.norm_word_emb)
        if self.char_emb_dir:
            print("Load pretrained char embedding, norm: %s, dir: %s" % (self.norm_char_emb, self.char_emb_dir))
            self.pretrain_char_embedding, self.char_emb_dim = build_pretrain_embedding(
                self.char_emb_dir, self.char_alphabet, self.char_emb_dim, self.norm_char_emb)
        if self.trans_embed_dir:
            print("Load pretrained trans embedding, norm: %s, dir: %s" % (self.norm_trans_emb, self.trans_embed_dir))
            self.pretrain_trans_embedding, self.trans_emb_dim = build_pretrain_embedding(
                self.trans_embed_dir, self.translation_alphabet, self.trans_emb_dim, self.norm_trans_emb)
        for idx in range(self.feature_num):
            if self.feature_emb_dirs[idx]:
                print("Load pretrained feature %s embedding, norm: %s, dir: %s" % (
                    self.feature_name[idx], self.norm_feature_embs[idx], self.feature_emb_dirs[idx]))
                self.pretrain_feature_embeddings[idx], self.feature_emb_dims[idx] = build_pretrain_embedding(
                    self.feature_emb_dirs[idx], self.feature_alphabets[idx],
                    self.feature_emb_dims[idx], self.norm_feature_embs[idx])

    def generate_instance(self, name):
        self.fix_alphabet()
        if name == "train":
            self.train_texts, self.train_Ids = read_instance(
                self.train_dir, self.word_alphabet, self.char_alphabet, self.feature_alphabets,
                self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH,
                self.translation_id_format)
        elif name == "dev":
            self.dev_texts, self.dev_Ids = read_instance(
                self.dev_dir, self.word_alphabet, self.char_alphabet, self.feature_alphabets,
                self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH,
                self.translation_id_format)
        elif name == "test":
            self.test_texts, self.test_Ids = read_instance(
                self.test_dir, self.word_alphabet, self.char_alphabet, self.feature_alphabets,
                self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH,
                self.translation_id_format)
        elif name == "raw":
            self.raw_texts, self.raw_Ids = read_instance(
                self.raw_dir, self.word_alphabet, self.char_alphabet, self.feature_alphabets,
                self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH,
                self.translation_id_format)
        else:
            print("Error: you can only generate train/dev/test/raw instance! Illegal input:%s" % name)

    def write_decoded_results(self, predict_results, name):
        fout = open(self.decode_dir, 'w', encoding='utf-8')
        sent_num = len(predict_results)
        content_list = []
        if name == 'raw':
            content_list = self.raw_texts
        elif name == 'test':
            content_list = self.test_texts
        elif name == 'dev':
            content_list = self.dev_texts
        elif name == 'train':
            content_list = self.train_texts
        else:
            print("Error: illegal name during writing predict result, name should be within train/dev/test/raw !")
        assert sent_num == len(content_list)
        for idx in range(sent_num):
            sent_length = len(predict_results[idx])
            for idy in range(sent_length):
                ## content_list[idx] is a list with [word, char, label]
                fout.write(content_list[idx][0][idy] + " " + predict_results[idx][idy] + '\n')
            fout.write('\n')
        fout.close()
        print("Predict %s result has been written into file. %s" % (name, self.decode_dir))

    def load(self, data_file):
        f = open(data_file, 'rb')
        tmp_dict = pickle.load(f)
        f.close()
        self.__dict__.update(tmp_dict)

    def save(self, save_file):
        f = open(save_file, 'wb')
        pickle.dump(self.__dict__, f, 2)
        f.close()

    def write_nbest_decoded_results(self, predict_results, pred_scores, name):
        ## predict_results: [whole_sent_num, nbest, each_sent_length]
        ## pred_scores: [whole_sent_num, nbest]
        fout = open(self.decode_dir, 'w', encoding='utf-8')
        sent_num = len(predict_results)
        content_list = []
        if name == 'raw':
            content_list = self.raw_texts
        elif name == 'test':
            content_list = self.test_texts
        elif name == 'dev':
            content_list = self.dev_texts
        elif name == 'train':
            content_list = self.train_texts
        else:
            print("Error: illegal name during writing predict result, name should be within train/dev/test/raw !")
        assert sent_num == len(content_list)
        assert sent_num == len(pred_scores)
        for idx in range(sent_num):
            sent_length = len(predict_results[idx][0])
            nbest = len(predict_results[idx])
            score_string = "# "
            for idz in range(nbest):
                score_string += format(pred_scores[idx][idz], '.4f') + " "
            fout.write(score_string.strip() + "\n")
            for idy in range(sent_length):
                label_string = content_list[idx][0][idy] + " "
                for idz in range(nbest):
                    label_string += predict_results[idx][idz][idy] + " "
                label_string = label_string.strip() + "\n"
                fout.write(label_string)
            fout.write('\n')
        fout.close()
        print("Predict %s %s-best result has been written into file. %s" % (name, nbest, self.decode_dir))

    def read_config(self, config_file):
        config = config_file_to_dict(config_file)
        # Each entry maps a config key to (attribute name, caster). This is
        # behavior-preserving: a key is applied only when present in the file.
        option_map = {
            ## data
            'train_dir': ('train_dir', str), 'dev_dir': ('dev_dir', str),
            'test_dir': ('test_dir', str), 'trans_dir': ('trans_dir', str),
            'raw_dir': ('raw_dir', str), 'decode_dir': ('decode_dir', str),
            'dset_dir': ('dset_dir', str), 'model_dir': ('model_dir', str),
            'load_model_dir': ('load_model_dir', str),
            'word_emb_dir': ('word_emb_dir', str),
            'char_emb_dir': ('char_emb_dir', str),
            'trans_embed_dir': ('trans_embed_dir', str),
            'MAX_SENTENCE_LENGTH': ('MAX_SENTENCE_LENGTH', int),
            'MAX_WORD_LENGTH': ('MAX_WORD_LENGTH', int),
            'norm_word_emb': ('norm_word_emb', str2bool),
            'norm_char_emb': ('norm_char_emb', str2bool),
            'number_normalized': ('number_normalized', str2bool),
            'seg': ('seg', str2bool),
            'word_emb_dim': ('word_emb_dim', int),
            'char_emb_dim': ('char_emb_dim', int),
            'trans_emb_dim': ('trans_emb_dim', int),
            ## network
            'use_crf': ('use_crf', str2bool), 'use_char': ('use_char', str2bool),
            'use_trans': ('use_trans', str2bool),
            'word_seq_feature': ('word_feature_extractor', str),
            'char_seq_feature': ('char_seq_feature', str),
            'nbest': ('nbest', int),
            ## training
            'optimizer': ('optimizer', str),
            'ave_batch_loss': ('average_batch_loss', str2bool),
            'status': ('status', str),
            ## hyperparameters
            'cnn_layer': ('HP_cnn_layer', int), 'iteration': ('HP_iteration', int),
            'batch_size': ('HP_batch_size', int),
            'char_hidden_dim': ('HP_char_hidden_dim', int),
            'trans_hidden_dim': ('HP_trans_hidden_dim', int),
            'hidden_dim': ('HP_hidden_dim', int), 'dropout': ('HP_dropout', float),
            'lstm_layer': ('HP_lstm_layer', int), 'bilstm': ('HP_bilstm', str2bool),
            'gpu': ('HP_gpu', str2bool), 'learning_rate': ('HP_lr', float),
            'lr_decay': ('HP_lr_decay', float), 'clip': ('HP_clip', float),
            'momentum': ('HP_momentum', float), 'l2': ('HP_l2', float),
        }
        for key, (attr, cast) in option_map.items():
            if key in config:
                setattr(self, attr, cast(config[key]))
        if 'feature' in config:
            self.feat_config = config['feature']  ## feat_config is a dict

    def build_translation_alphabet(self, trans_path):
        print("Creating translation alphabet......")
        with codecs.open(trans_path, 'r') as f:
            for line in f.readlines():
                if len(line.strip().split(":")) == 2:
                    temp = line.strip().split(":", 1)
                    for word in temp[1].split():
                        self.translation_alphabet.add(word.strip())
        self.trans_alphabet_size = self.translation_alphabet.size()

    def build_translation_dict(self, trans_path):
        print("Creating Id to Id translation dictionary......")
        translation_id_format_temp = {}
        with codecs.open(trans_path, 'r') as f:
            for line in f.readlines():
                ids = []
                if len(line.strip().split(":")) == 2:
                    temp = line.strip().split(":")
                    word_id = self.word_alphabet.get_index(temp[0].strip())
                    for translation in temp[1].split():
                        ids.append(self.translation_alphabet.get_index(translation.strip()))
                    translation_id_format_temp[word_id] = ids
        for word in self.word_alphabet.instances:
            word_id = self.word_alphabet.get_index(word)
            # Words without a translation entry fall back to id 0.
            self.translation_id_format[word_id] = translation_id_format_temp.get(word_id, [0])
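# Illustrative end-to-end driver for the config-driven Data class above,
# assembled from its own methods in their natural order; the config file name
# is hypothetical.
def _example_config_pipeline():
    data = Data()
    data.read_config("demo.train.config")
    data.initial_feature_alphabets()
    for path in (data.train_dir, data.dev_dir, data.test_dir):
        data.build_alphabet(path)
    data.build_translation_alphabet(data.trans_dir)
    data.build_translation_dict(data.trans_dir)
    data.build_pretrain_emb()
    for name in ("train", "dev", "test"):
        data.generate_instance(name)  # also closes the alphabets
    data.show_data_summary()
    data.save(data.dset_dir)
    return data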
class Data:
    def __init__(self):
        self.MAX_SENTENCE_LENGTH = 250
        self.MAX_WORD_LENGTH = -1
        self.number_normalized = True
        self.norm_char_emb = True
        self.norm_word_emb = False
        self.char_alphabet = Alphabet('character')
        self.label_alphabet = Alphabet('label', True)
        self.word_dict = Word_Trie()
        self.word_alphabet = Alphabet('word')
        self.train_texts = []
        self.dev_texts = []
        self.test_texts = []
        self.raw_texts = []
        self.train_Ids = []
        self.dev_Ids = []
        self.test_Ids = []
        self.raw_Ids = []
        self.char_emb_dim = 50
        self.word_emb_dim = 50
        self.pretrain_char_embedding = None
        self.pretrain_word_embedding = None
        self.label_size = 0

    def show_data_summary(self):
        print("DATA SUMMARY:")
        print("     MAX SENTENCE LENGTH: %s" % self.MAX_SENTENCE_LENGTH)
        print("     MAX WORD LENGTH: %s" % self.MAX_WORD_LENGTH)
        print("     Number normalized: %s" % self.number_normalized)
        print("     Word alphabet size: %s" % self.word_alphabet.size())
        print("     Char alphabet size: %s" % self.char_alphabet.size())
        print("     Label alphabet size: %s" % self.label_alphabet.size())
        print("     Word embedding size: %s" % self.word_emb_dim)
        print("     Char embedding size: %s" % self.char_emb_dim)
        print("     Norm char emb: %s" % self.norm_char_emb)
        print("     Norm word emb: %s" % self.norm_word_emb)
        print("     Train instance number: %s" % len(self.train_texts))
        print("     Dev instance number: %s" % len(self.dev_texts))
        print("     Test instance number: %s" % len(self.test_texts))
        print("     Raw instance number: %s" % len(self.raw_texts))
        print("DATA SUMMARY END.")
        sys.stdout.flush()

    def build_alphabet(self, input_file):
        self.char_alphabet.open()
        self.label_alphabet.open()
        with open(input_file, 'r', encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if len(line) == 0:
                    continue
                pair = line.split()
                char = pair[0]
                if self.number_normalized:
                    # Map digits to 0
                    char = normalize_word(char)
                label = pair[-1]
                self.label_alphabet.add(label)
                self.char_alphabet.add(char)
        self.label_alphabet.close()
        self.char_alphabet.close()

    def build_word_file(self, word_file):
        # Seed the word trie from the first column of the word (embedding) file.
        with open(word_file, 'r', encoding="utf-8") as f:
            for line in f:
                word = line.strip().split()[0]
                if word:
                    self.word_dict.insert(word)
        print("Building the word dict...")

    def build_word_alphabet(self, input_file):
        print("Loading file: " + input_file)
        self.word_alphabet.open()
        word_list = []
        with open(input_file, 'r', encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if len(line) > 0:
                    word = line.split()[0]
                    if self.number_normalized:
                        word = normalize_word(word)
                    word_list.append(word)
                else:
                    # Sentence boundary: match every suffix of the sentence against the trie.
                    for idx in range(len(word_list)):
                        matched_words = self.word_dict.recursive_search(word_list[idx:])
                        for matched_word in matched_words:
                            self.word_alphabet.add(matched_word)
                    word_list = []
        self.word_alphabet.close()
        print("word alphabet size:", self.word_alphabet.size())

    def build_char_pretrain_emb(self, emb_path):
        print("Building character pretrain emb...")
        self.pretrain_char_embedding, self.char_emb_dim = build_pretrain_embedding(
            emb_path, self.char_alphabet, self.norm_char_emb)

    def build_word_pretrain_emb(self, emb_path):
        print("Building word pretrain emb...")
        self.pretrain_word_embedding, self.word_emb_dim = build_pretrain_embedding(
            emb_path, self.word_alphabet, self.norm_word_emb)

    def generate_instance_with_words(self, input_file, name):
        if name == "train":
            self.train_texts, self.train_Ids = read_instance_with_gaz(
                input_file, self.word_dict, self.char_alphabet, self.word_alphabet,
                self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "dev":
            self.dev_texts, self.dev_Ids = read_instance_with_gaz(
                input_file, self.word_dict, self.char_alphabet, self.word_alphabet,
                self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "test":
            self.test_texts, self.test_Ids = read_instance_with_gaz(
                input_file, self.word_dict, self.char_alphabet, self.word_alphabet,
                self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "raw":
            self.raw_texts, self.raw_Ids = read_instance_with_gaz(
                input_file, self.word_dict, self.char_alphabet, self.word_alphabet,
                self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH)
        else:
            print("Error: you can only generate train/dev/test/raw instance! Illegal input:%s" % name)

    def write_decoded_results(self, output_file, predict_results, name):
        fout = open(output_file, 'w', encoding="utf-8")
        sent_num = len(predict_results)
        content_list = []
        if name == 'raw':
            content_list = self.raw_texts
        elif name == 'test':
            content_list = self.test_texts
        elif name == 'dev':
            content_list = self.dev_texts
        elif name == 'train':
            content_list = self.train_texts
        else:
            print("Error: illegal name during writing predict result, name should be within train/dev/test/raw !")
        assert sent_num == len(content_list)
        for idx in range(sent_num):
            sent_length = len(predict_results[idx])
            for idy in range(sent_length):
                # content_list[idx] is a list with [word, char, label]
                fout.write(content_list[idx][0][idy] + " " + predict_results[idx][idy] + '\n')
            fout.write('\n')
        fout.close()
        print("Predict %s result has been written into file. %s" % (name, output_file))
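# Illustrative pipeline for the trie-based variant above, using only the
# methods it defines; the call order (word file first, then alphabets, then
# embeddings and instances) is an assumption based on the data flow, and all
# file paths are hypothetical.
def _example_lexicon_pipeline():
    data = Data()
    data.build_word_file("data/word_vocab.txt")   # seed the word trie
    for split in ("train.txt", "dev.txt", "test.txt"):
        data.build_alphabet("data/" + split)       # chars + labels
        data.build_word_alphabet("data/" + split)  # trie-matched words
    data.build_char_pretrain_emb("data/char_emb.txt")
    data.build_word_pretrain_emb("data/word_emb.txt")
    for split, name in (("train.txt", "train"), ("dev.txt", "dev"), ("test.txt", "test")):
        data.generate_instance_with_words("data/" + split, name)
    data.show_data_summary()
    return data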
class Data:
    def __init__(self):
        self.MAX_SENTENCE_LENGTH = 250
        self.MAX_WORD_LENGTH = -1
        self.number_normalized = True
        self.norm_word_emb = True
        self.norm_biword_emb = True
        self.norm_gaz_emb = False
        self.word_alphabet = Alphabet('word')
        self.biword_alphabet = Alphabet('biword')
        self.char_alphabet = Alphabet('character')
        self.label_alphabet = Alphabet('label', True)
        # self.simi_alphabet = Alphabet('simi')  # alphabet for dictionary-similarity words
        self.gaz_lower = False
        self.gaz = Gazetteer(self.gaz_lower)
        self.gaz_alphabet = Alphabet('gaz')
        self.gaz_count = {}
        self.gaz_split = {}
        self.biword_count = {}
        self.HP_fix_gaz_emb = False
        self.HP_use_gaz = True
        self.HP_use_count = False
        self.tagScheme = "NoSeg"
        self.char_features = "LSTM"
        self.train_texts = []
        self.dev_texts = []
        self.test_texts = []
        self.raw_texts = []
        self.train_Ids = []
        self.dev_Ids = []
        self.test_Ids = []
        self.raw_Ids = []
        self.train_split_index = []
        self.dev_split_index = []
        self.use_bigram = True
        self.word_emb_dim = 200
        self.biword_emb_dim = 200
        self.char_emb_dim = 30
        self.gaz_emb_dim = 200
        self.gaz_dropout = 0.5
        self.pretrain_word_embedding = None
        self.pretrain_biword_embedding = None
        self.pretrain_gaz_embedding = None
        self.label_size = 0
        self.word_alphabet_size = 0
        self.biword_alphabet_size = 0
        self.char_alphabet_size = 0
        self.label_alphabet_size = 0
        # dictionary-similarity settings
        self.simi_dic_emb = None     # similarity embedding values
        self.simi_dic_dim = 10       # dimensionality of the similarity vector
        self.use_dictionary = False  # whether a dictionary is currently used
        self.simi_list = []          # per-character similarity values
        # self.use_gazcount = 'True'
        ### hyperparameters
        self.HP_iteration = 60
        self.HP_batch_size = 10
        self.HP_char_hidden_dim = 50
        self.HP_hidden_dim = 128
        self.HP_dropout = 0.5
        self.HP_lstm_layer = 1
        self.HP_bilstm = True
        self.HP_use_char = False
        self.HP_gpu = True
        self.HP_lr = 0.015
        self.HP_lr_decay = 0.05
        self.HP_clip = 5.0
        self.HP_momentum = 0
        self.HP_num_layer = 4

    def show_data_summary(self):
        print("DATA SUMMARY START:")
        print("     Tag scheme: %s" % self.tagScheme)
        print("     MAX SENTENCE LENGTH: %s" % self.MAX_SENTENCE_LENGTH)
        print("     MAX WORD LENGTH: %s" % self.MAX_WORD_LENGTH)
        print("     Number normalized: %s" % self.number_normalized)
        print("     Use bigram: %s" % self.use_bigram)
        print("     Word alphabet size: %s" % self.word_alphabet_size)
        print("     Biword alphabet size: %s" % self.biword_alphabet_size)
        print("     Char alphabet size: %s" % self.char_alphabet_size)
        print("     Gaz alphabet size: %s" % self.gaz_alphabet.size())
        print("     Label alphabet size: %s" % self.label_alphabet_size)
        print("     Word embedding size: %s" % self.word_emb_dim)
        print("     Biword embedding size: %s" % self.biword_emb_dim)
        print("     Char embedding size: %s" % self.char_emb_dim)
        print("     Gaz embedding size: %s" % self.gaz_emb_dim)
        print("     Norm word emb: %s" % self.norm_word_emb)
        print("     Norm biword emb: %s" % self.norm_biword_emb)
        print("     Norm gaz emb: %s" % self.norm_gaz_emb)
        print("     Norm gaz dropout: %s" % self.gaz_dropout)
        print("     Train instance number: %s" % len(self.train_texts))
        print("     Dev instance number: %s" % len(self.dev_texts))
        print("     Test instance number: %s" % len(self.test_texts))
        print("     Raw instance number: %s" % len(self.raw_texts))
        print("     Hyperpara iteration: %s" % self.HP_iteration)
        print("     Hyperpara batch size: %s" % self.HP_batch_size)
        print("     Hyperpara lr: %s" % self.HP_lr)
        print("     Hyperpara lr_decay: %s" % self.HP_lr_decay)
        print("     Hyperpara HP_clip: %s" % self.HP_clip)
        print("     Hyperpara momentum: %s" % self.HP_momentum)
        print("     Hyperpara hidden_dim: %s" % self.HP_hidden_dim)
        print("     Hyperpara dropout: %s" % self.HP_dropout)
        print("     Hyperpara lstm_layer: %s" % self.HP_lstm_layer)
        print("     Hyperpara bilstm: %s" % self.HP_bilstm)
        print("     Hyperpara GPU: %s" % self.HP_gpu)
        print("     Hyperpara use_gaz: %s" % self.HP_use_gaz)
        print("     Hyperpara fix gaz emb: %s" % self.HP_fix_gaz_emb)
        print("     Hyperpara use_char: %s" % self.HP_use_char)
        if self.HP_use_char:
            print("             Char_features: %s" % self.char_features)
        print("DATA SUMMARY END.")
        sys.stdout.flush()

    def refresh_label_alphabet(self, input_file):
        old_size = self.label_alphabet_size
        self.label_alphabet.clear(True)
        in_lines = open(input_file, 'r', encoding="utf-8").readlines()
        for line in in_lines:
            if len(line) > 2:
                pairs = line.strip().split()
                label = pairs[-1]
                self.label_alphabet.add(label)
        self.label_alphabet_size = self.label_alphabet.size()
        startS = False
        startB = False
        for label, _ in self.label_alphabet.iteritems():
            if "S-" in label.upper():
                startS = True
            elif "B-" in label.upper():
                startB = True
        if startB:
            if startS:
                self.tagScheme = "BMES"
            else:
                self.tagScheme = "BIO"
        self.fix_alphabet()
        print("Refresh label alphabet finished: old:%s -> new:%s" % (old_size, self.label_alphabet_size))

    def build_alphabet(self, input_file):
        in_lines = open(input_file, 'r', encoding="utf-8").readlines()
        seqlen = 0
        for idx in range(len(in_lines)):
            line = in_lines[idx]
            if len(line) > 2:
                pairs = line.strip().split()
                word = pairs[0]
                if self.number_normalized:
                    word = normalize_word(word)
                label = pairs[-1]
                self.label_alphabet.add(label)
                self.word_alphabet.add(word)
                if idx < len(in_lines) - 1 and len(in_lines[idx + 1]) > 2:
                    biword = word + in_lines[idx + 1].strip().split()[0]
                else:
                    biword = word + NULLKEY
                self.biword_alphabet.add(biword)
                # biword_index = self.biword_alphabet.get_index(biword)
                self.biword_count[biword] = self.biword_count.get(biword, 0) + 1
                for char in word:
                    self.char_alphabet.add(char)
                seqlen += 1  # length of the current sentence
            else:
                seqlen = 0  # blank line ends a sentence; reset the counter
        # record the size of each alphabet
        self.word_alphabet_size = self.word_alphabet.size()
        self.biword_alphabet_size = self.biword_alphabet.size()
        self.char_alphabet_size = self.char_alphabet.size()
        self.label_alphabet_size = self.label_alphabet.size()
        startS = False
        startB = False
        for label, _ in self.label_alphabet.iteritems():
            if "S-" in label.upper():
                startS = True
            elif "B-" in label.upper():
                startB = True
        if startB:
            if startS:
                self.tagScheme = "BMES"
            else:
                self.tagScheme = "BIO"

    def build_gaz_file(self, gaz_file):
        ## load the gazetteer word list; its embeddings are read separately
        if gaz_file:
            fins = open(gaz_file, 'r', encoding="utf-8").readlines()
            for fin in fins:
                fin = fin.strip().split()[0]
                if fin:
                    self.gaz.insert(fin, "one_source")
            print("Load gaz file: ", gaz_file, " total size:", self.gaz.size())
        else:
            print("Gaz file is None, load nothing")

    def build_gaz_alphabet(self, input_file, count=False):
        in_lines = open(input_file, 'r', encoding="utf-8").readlines()
        word_list = []
        for line in in_lines:
            if len(line) > 3:
                word = line.split()[0]
                if self.number_normalized:
                    word = normalize_word(word)
                word_list.append(word)
            else:
                # word_list now holds every character of the finished sentence
                w_length = len(word_list)
                entitys = []
                for idx in range(w_length):
                    matched_entity = self.gaz.enumerateMatchList(word_list[idx:])
                    entitys += matched_entity
                    for entity in matched_entity:
                        self.gaz_alphabet.add(entity)
                        index = self.gaz_alphabet.get_index(entity)
                        # initialize the count; default to 0 when the key is absent
                        self.gaz_count[index] = self.gaz_count.get(index, 0)
                if count:
                    # count the longest match first, then drop every entity it covers
                    entitys.sort(key=lambda x: -len(x))
                    while entitys:
                        longest = entitys[0]
                        longest_index = self.gaz_alphabet.get_index(longest)
                        self.gaz_count[longest_index] = self.gaz_count.get(longest_index, 0) + 1
                        gazlen = len(longest)
                        for i in range(gazlen):
                            for j in range(i + 1, gazlen + 1):
                                covering_gaz = longest[i:j]
                                if covering_gaz in entitys:
                                    entitys.remove(covering_gaz)
                word_list = []
        print("gaz alphabet size:", self.gaz_alphabet.size())

    def fix_alphabet(self):
        self.word_alphabet.close()
        self.biword_alphabet.close()
        self.char_alphabet.close()
        self.label_alphabet.close()
        self.gaz_alphabet.close()

    def build_word_pretrain_emb(self, emb_path):
        print("build word pretrain emb...")
        self.pretrain_word_embedding, self.word_emb_dim = build_pretrain_embedding(
            emb_path, self.word_alphabet, self.word_emb_dim, self.norm_word_emb)

    def build_biword_pretrain_emb(self, emb_path):
        print("build biword pretrain emb...")
        self.pretrain_biword_embedding, self.biword_emb_dim = build_pretrain_embedding(
            emb_path, self.biword_alphabet, self.biword_emb_dim, self.norm_biword_emb)

    def build_gaz_pretrain_emb(self, emb_path):
        print("build gaz pretrain emb...")
        self.pretrain_gaz_embedding, self.gaz_emb_dim = build_pretrain_embedding(
            emb_path, self.gaz_alphabet, self.gaz_emb_dim, self.norm_gaz_emb)

    def generate_instance_with_gaz(self, input_file, name):
        self.fix_alphabet()
        if name == "train":
            self.train_texts, self.train_Ids = read_instance_with_gaz(
                self.HP_num_layer, input_file, self.gaz, self.word_alphabet,
                self.biword_alphabet, self.biword_count, self.char_alphabet,
                self.gaz_alphabet, self.gaz_count, self.gaz_split,
                self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "dev":
            self.dev_texts, self.dev_Ids = read_instance_with_gaz(
                self.HP_num_layer, input_file, self.gaz, self.word_alphabet,
                self.biword_alphabet, self.biword_count, self.char_alphabet,
                self.gaz_alphabet, self.gaz_count, self.gaz_split,
                self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "test":
            self.test_texts, self.test_Ids = read_instance_with_gaz(
                self.HP_num_layer, input_file, self.gaz, self.word_alphabet,
                self.biword_alphabet, self.biword_count, self.char_alphabet,
                self.gaz_alphabet, self.gaz_count, self.gaz_split,
                self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "raw":
            self.raw_texts, self.raw_Ids = read_instance_with_gaz(
                self.HP_num_layer, input_file, self.gaz, self.word_alphabet,
                self.biword_alphabet, self.biword_count, self.char_alphabet,
                self.gaz_alphabet, self.gaz_count, self.gaz_split,
                self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH)
        else:
            print("Error: you can only generate train/dev/test/raw instance! Illegal input:%s" % name)

    def write_decoded_results(self, output_file, predict_results, name):
        fout = open(output_file, 'w', encoding="utf-8")
        sent_num = len(predict_results)
        content_list = []
        if name == 'raw':
            content_list = self.raw_texts
        elif name == 'test':
            content_list = self.test_texts
        elif name == 'dev':
            content_list = self.dev_texts
        elif name == 'train':
            content_list = self.train_texts
        else:
            print("Error: illegal name during writing predict result, name should be within train/dev/test/raw !")
        assert sent_num == len(content_list)
        for idx in range(sent_num):
            sent_length = len(predict_results[idx])
            for idy in range(sent_length):
                ## content_list[idx] is a list with [word, char, label]
                fout.write(content_list[idx][0][idy] + " " + predict_results[idx][idy] + '\n')
            fout.write('\n')
        fout.close()
        print("Predict %s result has been written into file. %s" % (name, output_file))
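# Illustrative usage of the count-aware gazetteer variant above: passing
# count=True enables the longest-match frequency counting implemented in
# build_gaz_alphabet. The call order mirrors the data flow of the methods
# themselves; all file paths are hypothetical.
def _example_gaz_count_pipeline():
    data = Data()
    for split in ("train.char", "dev.char", "test.char"):
        data.build_alphabet("data/" + split)
    data.build_gaz_file("data/ctb.50d.vec")
    for split in ("train.char", "dev.char", "test.char"):
        data.build_gaz_alphabet("data/" + split, count=True)
    data.generate_instance_with_gaz("data/train.char", "train")
    data.show_data_summary()
    return data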
class Data:
    def __init__(self):
        self.MAX_SENTENCE_LENGTH = 250
        self.MAX_WORD_LENGTH = -1
        self.number_normalized = True
        self.norm_word_emb = True
        self.norm_biword_emb = True
        self.norm_gaz_emb = False
        self.word_alphabet = Alphabet('word')
        self.biword_alphabet = Alphabet('biword')
        self.char_alphabet = Alphabet('character')
        # self.word_alphabet.add(START)
        # self.word_alphabet.add(UNKNOWN)
        # self.char_alphabet.add(START)
        # self.char_alphabet.add(UNKNOWN)
        # self.char_alphabet.add(PADDING)
        self.label_alphabet = Alphabet('label', True)
        self.gaz_lower = False
        self.gaz = Gazetteer(self.gaz_lower)
        self.gaz_alphabet = Alphabet('gaz')
        self.HP_fix_gaz_emb = False
        self.HP_use_gaz = True
        self.tagScheme = "NoSeg"
        self.char_features = "LSTM"
        self.train_texts = []
        self.dev_texts = []
        self.test_texts = []
        self.raw_texts = []
        self.train_Ids = []
        self.dev_Ids = []
        self.test_Ids = []
        self.raw_Ids = []
        self.use_bigram = True
        self.word_emb_dim = 50
        self.biword_emb_dim = 50
        self.char_emb_dim = 30
        self.gaz_emb_dim = 50
        self.gaz_dropout = 0.5
        self.pretrain_word_embedding = None
        self.pretrain_biword_embedding = None
        self.pretrain_gaz_embedding = None
        self.label_size = 0
        self.word_alphabet_size = 0
        self.biword_alphabet_size = 0
        self.char_alphabet_size = 0
        self.label_alphabet_size = 0
        ### hyperparameters
        self.HP_iteration = 100
        self.HP_batch_size = 10
        self.HP_char_hidden_dim = 50
        self.HP_hidden_dim = 200
        self.HP_dropout = 0.5
        self.HP_lstm_layer = 1
        self.HP_bilstm = True
        self.HP_use_char = False
        self.HP_gpu = False
        self.HP_lr = 0.015
        self.HP_lr_decay = 0.05
        self.HP_clip = 5.0
        self.HP_momentum = 0

    def show_data_summary(self):
        print("DATA SUMMARY START:")
        print("     Tag scheme: %s" % self.tagScheme)
        print("     MAX SENTENCE LENGTH: %s" % self.MAX_SENTENCE_LENGTH)
        print("     MAX WORD LENGTH: %s" % self.MAX_WORD_LENGTH)
        print("     Number normalized: %s" % self.number_normalized)
        print("     Use bigram: %s" % self.use_bigram)
        print("     Word alphabet size: %s" % self.word_alphabet_size)
        print("     Biword alphabet size: %s" % self.biword_alphabet_size)
        print("     Char alphabet size: %s" % self.char_alphabet_size)
        print("     Gaz alphabet size: %s" % self.gaz_alphabet.size())
        print("     Label alphabet size: %s" % self.label_alphabet_size)
        print("     Word embedding size: %s" % self.word_emb_dim)
        print("     Biword embedding size: %s" % self.biword_emb_dim)
        print("     Char embedding size: %s" % self.char_emb_dim)
        print("     Gaz embedding size: %s" % self.gaz_emb_dim)
        print("     Norm word emb: %s" % self.norm_word_emb)
        print("     Norm biword emb: %s" % self.norm_biword_emb)
        print("     Norm gaz emb: %s" % self.norm_gaz_emb)
        print("     Norm gaz dropout: %s" % self.gaz_dropout)
        print("     Train instance number: %s" % len(self.train_texts))
        print("     Dev instance number: %s" % len(self.dev_texts))
        print("     Test instance number: %s" % len(self.test_texts))
        print("     Raw instance number: %s" % len(self.raw_texts))
        print("     Hyperpara iteration: %s" % self.HP_iteration)
        print("     Hyperpara batch size: %s" % self.HP_batch_size)
        print("     Hyperpara lr: %s" % self.HP_lr)
        print("     Hyperpara lr_decay: %s" % self.HP_lr_decay)
        print("     Hyperpara HP_clip: %s" % self.HP_clip)
        print("     Hyperpara momentum: %s" % self.HP_momentum)
        print("     Hyperpara hidden_dim: %s" % self.HP_hidden_dim)
        print("     Hyperpara dropout: %s" % self.HP_dropout)
        print("     Hyperpara lstm_layer: %s" % self.HP_lstm_layer)
        print("     Hyperpara bilstm: %s" % self.HP_bilstm)
        print("     Hyperpara GPU: %s" % self.HP_gpu)
        print("     Hyperpara use_gaz: %s" % self.HP_use_gaz)
        print("     Hyperpara fix gaz emb: %s" % self.HP_fix_gaz_emb)
        print("     Hyperpara use_char: %s" % self.HP_use_char)
        if self.HP_use_char:
            print("             Char_features: %s" % self.char_features)
        print("DATA SUMMARY END.")
        sys.stdout.flush()

    def refresh_label_alphabet(self, input_file):
        old_size = self.label_alphabet_size
        self.label_alphabet.clear(True)
        in_lines = open(input_file, 'r', encoding='utf-8').readlines()
        for line in in_lines:
            if len(line) > 2:
                pairs = line.strip().split()
                label = pairs[-1]
                self.label_alphabet.add(label)
        self.label_alphabet_size = self.label_alphabet.size()
        startS = False
        startB = False
        for label, _ in self.label_alphabet.items():
            if "S-" in label.upper():
                startS = True
            elif "B-" in label.upper():
                startB = True
        if startB:
            if startS:
                self.tagScheme = "BMES"
            else:
                self.tagScheme = "BIO"
        self.fix_alphabet()
        print("Refresh label alphabet finished: old:%s -> new:%s" % (old_size, self.label_alphabet_size))

    def build_alphabet(self, input_file):
        in_lines = open(input_file, 'r', encoding='utf-8').readlines()
        for idx in range(len(in_lines)):
            line = in_lines[idx]
            if len(line) > 2:
                pairs = line.strip().split()
                word = pairs[0]  # the original's .encode().decode() round-trip is a no-op in Python 3
                if self.number_normalized:
                    word = normalize_word(word)
                label = pairs[-1]
                self.label_alphabet.add(label)
                self.word_alphabet.add(word)
                if idx < len(in_lines) - 1 and len(in_lines[idx + 1]) > 2:
                    biword = word + in_lines[idx + 1].strip().split()[0]
                else:
                    biword = word + NULLKEY
                self.biword_alphabet.add(biword)
                for char in word:
                    self.char_alphabet.add(char)
        self.word_alphabet_size = self.word_alphabet.size()
        self.biword_alphabet_size = self.biword_alphabet.size()
        self.char_alphabet_size = self.char_alphabet.size()
        self.label_alphabet_size = self.label_alphabet.size()
        startS = False
        startB = False
        for label, _ in self.label_alphabet.items():
            if "S-" in label.upper():
                startS = True
            elif "B-" in label.upper():
                startB = True
        if startB:
            if startS:
                self.tagScheme = "BMES"
            else:
                self.tagScheme = "BIO"

    def build_gaz_file(self, gaz_file):
        ## load the gazetteer word list; its embeddings are read separately
        if gaz_file:
            fins = open(gaz_file, 'r', encoding='utf-8').readlines()
            for fin in fins:
                fin = fin.strip().split()[0]
                if fin:
                    self.gaz.insert(fin, "one_source")
            print("Load gaz file: ", gaz_file, " total size:", self.gaz.size())
        else:
            print("Gaz file is None, load nothing")

    def build_gaz_alphabet(self, input_file):
        in_lines = open(input_file, 'r', encoding='utf-8').readlines()
        word_list = []
        for line in in_lines:
            if len(line) > 3:
                word = line.split()[0]
                if self.number_normalized:
                    word = normalize_word(word)
                word_list.append(word)
            else:
                w_length = len(word_list)
                for idx in range(w_length):
                    matched_entity = self.gaz.enumerateMatchList(word_list[idx:])
                    for entity in matched_entity:
                        self.gaz_alphabet.add(entity)
                word_list = []
        print("gaz alphabet size:", self.gaz_alphabet.size())

    def fix_alphabet(self):
        self.word_alphabet.close()
        self.biword_alphabet.close()
        self.char_alphabet.close()
        self.label_alphabet.close()
        self.gaz_alphabet.close()

    def build_word_pretrain_emb(self, emb_path):
        print("build word pretrain emb...")
        self.pretrain_word_embedding, self.word_emb_dim = build_pretrain_embedding(
            emb_path, self.word_alphabet, self.word_emb_dim, self.norm_word_emb)

    def build_biword_pretrain_emb(self, emb_path):
        print("build biword pretrain emb...")
        self.pretrain_biword_embedding, self.biword_emb_dim = build_pretrain_embedding(
            emb_path, self.biword_alphabet, self.biword_emb_dim, self.norm_biword_emb)

    def build_gaz_pretrain_emb(self, emb_path):
        print("build gaz pretrain emb...")
        self.pretrain_gaz_embedding, self.gaz_emb_dim = build_pretrain_embedding(
            emb_path, self.gaz_alphabet, self.gaz_emb_dim, self.norm_gaz_emb)

    def generate_instance(self, input_file, name):
        self.fix_alphabet()
        if name == "train":
            self.train_texts, self.train_Ids = read_seg_instance(
                input_file, self.word_alphabet, self.biword_alphabet, self.char_alphabet,
                self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "dev":
            self.dev_texts, self.dev_Ids = read_seg_instance(
                input_file, self.word_alphabet, self.biword_alphabet, self.char_alphabet,
                self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "test":
            self.test_texts, self.test_Ids = read_seg_instance(
                input_file, self.word_alphabet, self.biword_alphabet, self.char_alphabet,
                self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "raw":
            self.raw_texts, self.raw_Ids = read_seg_instance(
                input_file, self.word_alphabet, self.biword_alphabet, self.char_alphabet,
                self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH)
        else:
            print("Error: you can only generate train/dev/test/raw instance! Illegal input:%s" % name)

    def generate_instance_with_gaz(self, input_file, name):
        self.fix_alphabet()
        if name == "train":
            self.train_texts, self.train_Ids = read_instance_with_gaz(
                input_file, self.gaz, self.word_alphabet, self.biword_alphabet,
                self.char_alphabet, self.gaz_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "dev":
            self.dev_texts, self.dev_Ids = read_instance_with_gaz(
                input_file, self.gaz, self.word_alphabet, self.biword_alphabet,
                self.char_alphabet, self.gaz_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "test":
            self.test_texts, self.test_Ids = read_instance_with_gaz(
                input_file, self.gaz, self.word_alphabet, self.biword_alphabet,
                self.char_alphabet, self.gaz_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "raw":
            self.raw_texts, self.raw_Ids = read_instance_with_gaz(
                input_file, self.gaz, self.word_alphabet, self.biword_alphabet,
                self.char_alphabet, self.gaz_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        else:
            print("Error: you can only generate train/dev/test/raw instance! Illegal input:%s" % name)

    def write_decoded_results(self, output_file, predict_results, name):
        fout = open(output_file, 'w', encoding='utf-8')
        sent_num = len(predict_results)
        content_list = []
        if name == 'raw':
            content_list = self.raw_texts
        elif name == 'test':
            content_list = self.test_texts
        elif name == 'dev':
            content_list = self.dev_texts
        elif name == 'train':
            content_list = self.train_texts
        else:
            print("Error: illegal name during writing predict result, name should be within train/dev/test/raw !")
        assert sent_num == len(content_list)
        for idx in range(sent_num):
            sent_length = len(predict_results[idx])
            for idy in range(sent_length):
                ## content_list[idx] is a list with [word, char, label]
                fout.write(content_list[idx][0][idy] + " " + predict_results[idx][idy] + '\n')
            fout.write('\n')
        fout.close()
        print("Predict %s result has been written into file. %s" % (name, output_file))
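# Illustrative driver for the Lattice-style Data class above: build the
# alphabets, load the gazetteer, then pick seg-style or gaz-style instance
# generation. File paths are hypothetical; in practice build_alphabet and
# build_gaz_alphabet would be run over every split.
def _example_lattice_pipeline(use_gaz=True):
    data = Data()
    for path in ("data/train.bmes", "data/dev.bmes", "data/test.bmes"):
        data.build_alphabet(path)
    data.build_gaz_file("data/ctb.50d.vec")
    for path in ("data/train.bmes", "data/dev.bmes", "data/test.bmes"):
        data.build_gaz_alphabet(path)
    if use_gaz:
        data.generate_instance_with_gaz("data/train.bmes", "train")
    else:
        data.generate_instance("data/train.bmes", "train")
    data.show_data_summary()
    return data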
class Data(object):
    def __init__(self, args):
        super(Data, self).__init__()
        self.args = args
        self.data_dir = args.data_dir  # e.g. './data/gene_term_format_by_sentence.json'
        self.data_ratio = (0.9, 0.05, 0.05)  # total 2000
        self.model_save_dir = args.savemodel  # e.g. './saves/model/'
        self.output_dir = args.output  # e.g. './saves/output/'
        self.data_save_file = args.savedset  # e.g. './saves/data/dat.pkl'
        self.pos_as_feature = args.use_pos
        self.use_elmo = args.use_elmo
        self.elmodim = args.elmodim
        self.pos_emb_dim = args.posdim
        self.useSpanLen = args.use_len
        self.use_sentence_att = args.use_sent_att
        self.use_char = True
        self.ranking = 1
        self.word_alphabet = Alphabet('word')
        self.char_alphabet = Alphabet('character')
        self.ptag_alphabet = Alphabet('tag')
        self.label_alphabet = Alphabet('label', label=True)
        self.seqlabel_alphabet = Alphabet('span_label', label=True)
        self.word_alphabet_size = 0
        self.char_alphabet_size = 0
        self.ptag_alphabet_size = 0
        self.label_alphabet_size = 0
        self.seqlabel_alphabet_size = 0
        self.max_sentence_length = 500
        self.term_truples = []
        self.sent_texts = []
        self.chars = []
        self.lengths = []
        self.ptags = []
        self.seq_labels = []
        self.word_ids_sent = []
        self.char_id_sent = []
        self.tag_ids_sent = []
        self.label_ids_sent = []
        self.seq_labels_ids = []
        self.longSpan = True
        self.shortSpan = True
        self.termratio = args.term_ratio
        self.term_span = args.max_length
        self.word_feature_extractor = "LSTM"  ## "LSTM"/"CNN"/"GRU"
        self.char_feature_extractor = "CNN"  ## "LSTM"/"CNN"/"GRU"/None
        # training
        self.optimizer = 'Adam'  # "SGD"/"AdaGrad"/"AdaDelta"/"RMSProp"/"Adam"
        self.training = True
        self.average_batch_loss = True
        self.evaluate_every = args.evaluate_every  # evaluate every n batches, e.g. 10
        self.print_every = args.print_every
        self.silence = True
        self.earlystop = args.early_stop
        # Embeddings
        self.word_emb_dir = args.wordemb  # e.g. './data/glove.6B.100d.txt'
        self.char_emb_dir = args.charemb
        self.word_emb_dim = 50
        self.char_emb_dim = 30
        self.spamEm_dim = 30
        self.norm_word_emb = False
        self.norm_char_emb = False
        self.pretrain_word_embedding = None
        self.pretrain_char_embedding = None
        # HP
        self.HP_char_hidden_dim = 50
        self.HP_hidden_dim = 100
        self.HP_cnn_layer = 2
        self.HP_batch_size = 100
        self.HP_epoch = 100
        self.HP_lr = args.lr
        self.HP_lr_decay = 0.05
        self.HP_clip = None
        self.HP_l2 = 1e-8
        self.HP_dropout = args.dropout
        self.HP_lstm_layer = 2
        self.HP_bilstm = True
        self.HP_gpu = args.use_gpu
        self.HP_term_span = 6
        self.HP_momentum = 0
        # init data
        self.build_vocabs()
        self.all_instances = self.load_data()
        self.load_pretrain_emb()

    def build_vocabs(self):
        """Build word/char/tag/label alphabets from the JSON-lines data file."""
        with open(self.data_dir, 'r') as filin:
            filelines = filin.readlines()
        for lin_id, lin_cnt in enumerate(filelines):
            line = json.loads(lin_cnt.strip())
            words = line['words']
            tags = line['tags']
            terms = line['terms']
            for word in words:
                self.word_alphabet.add(word)
                for char in word:
                    self.char_alphabet.add(char)
            for tag in tags:
                self.ptag_alphabet.add(tag)
            self.sent_texts.append(words)
            self.ptags.append(tags)
            assert len(words) == len(tags)
            self.lengths.append(len(words))
            seq_label, termple = self.reformat_label(words, terms)
            self.seq_labels.append(seq_label)
            if len(terms) > 0:
                tmp_terms = []
                for itm in termple:
                    tmp_terms.append([itm[0], itm[1], itm[2]])
                    self.label_alphabet.add(itm[2])
                self.term_truples.append(tmp_terms)
            else:
                self.term_truples.append([[-1, -1, 'None']])
                self.label_alphabet.add('None')
        for ter in self.seq_labels:
            for ater in ter:
                for ate in ater:
                    self.seqlabel_alphabet.add(ate)
        self.word_alphabet_size = self.word_alphabet.size()
        self.char_alphabet_size = self.char_alphabet.size()
        self.ptag_alphabet_size = self.ptag_alphabet.size()
        self.seqlabel_alphabet_size = self.seqlabel_alphabet.size()
        self.label_alphabet_size = self.label_alphabet.size()
        self.close_alphabet()

    def load_pretrain_emb(self):
        """Load pretrained word/char embeddings if paths were given."""
        if self.word_emb_dir:
            print('Loading pretrained Word Embedding from {}'.format(self.word_emb_dir))
            self.pretrain_word_embedding, self.word_emb_dim = build_pretrain_embedding(
                self.word_emb_dir, self.word_alphabet, self.word_emb_dim, self.norm_word_emb)
        if self.char_emb_dir:
            print('Loading pretrained Char Embedding from {}'.format(self.char_emb_dir))
            # Store the char table in pretrain_char_embedding (the original
            # mistakenly wrote it to pretrain_word_embedding, overwriting the word table).
            self.pretrain_char_embedding, self.char_emb_dim = build_pretrain_embedding(
                self.char_emb_dir, self.char_alphabet, self.char_emb_dim, self.norm_char_emb)

    def load_data(self):
        """Convert texts/tags/terms to id sequences and bundle them into instances."""
        all_instances = []
        assert len(self.sent_texts) == len(self.term_truples) == len(self.ptags)
        for sent_text, ptag, term_truple, seqlabel in zip(
                self.sent_texts, self.ptags, self.term_truples, self.seq_labels):
            self.word_ids_sent.append(self.word_alphabet.get_indexs(sent_text))
            sent_char = []
            sent_char_ids = []
            for word in sent_text:
                char_list = list(word)
                sent_char.append(char_list)
                char_ids = self.char_alphabet.get_indexs(char_list)
                sent_char_ids.append(char_ids)
            seqLabel_ids = [self.seqlabel_alphabet.get_indexs(seqlab) for seqlab in seqlabel]
            self.seq_labels_ids.append(seqLabel_ids)
            self.chars.append(sent_char)
            self.char_id_sent.append(sent_char_ids)
            self.tag_ids_sent.append(self.ptag_alphabet.get_indexs(ptag))
            term_truple = [[term[0], term[1], self.label_alphabet.get_index(term[2])]
                           for term in term_truple]
            self.label_ids_sent.append(term_truple)
            all_instances.append([
                self.word_ids_sent[-1], sent_char_ids, self.tag_ids_sent[-1],
                [term_truple, seqLabel_ids], sent_text
            ])
        return all_instances

    def reformat_label(self, words, terms):
        """Turn inclusive [start, end, type] term spans into per-token label sets
        (B/I/E for multi-token terms, S for single tokens, O elsewhere)."""
        label = [[] for i in range(len(words))]
        termtruple = []
        if len(terms) > 0:
            for term in terms:
                beg = term[0]
                end = term[1]
                lab_ = term[2]
                termtruple.append((beg, end, lab_))
                if beg == end:
                    label[beg].append('S')
                    continue
                label[beg].append('B')
                label[end].append('E')
                if end - beg > 1:
                    for itm in range(beg + 1, end):
                        label[itm].append('I')
        for slab in label:
            if slab == []:
                slab.append('O')
        label = [list(set(lab)) for lab in label]  # dedupe labels from overlapping spans
        return label, termtruple

    def restore(self, data_file):
        print('Loading data from %s' % data_file)
        with open(data_file, 'rb') as filin:
            obj_dict = pkl.load(filin)
        self.__dict__.update(obj_dict)

    def save(self, save_file):
        print('Saving data to %s' % save_file)
        with open(save_file, 'wb') as filout:
            pkl.dump(self.__dict__, filout, 2)

    def close_alphabet(self):
        self.word_alphabet.close()
        self.ptag_alphabet.close()
        self.label_alphabet.close()
        self.seqlabel_alphabet.close()
        self.char_alphabet.close()
        return
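# Worked example for reformat_label as defined above (the inputs are invented,
# but the expected outputs follow directly from the method's logic): a
# three-token term becomes B/I/E, a single-token term becomes S, and untouched
# tokens become O.
def _example_reformat_label(data):
    words = ["protein", "kinase", "C", "binds", "DNA"]
    terms = [[0, 2, "GENE"], [4, 4, "GENE"]]  # inclusive [start, end, type]
    labels, truples = data.reformat_label(words, terms)
    # labels  -> [['B'], ['I'], ['E'], ['O'], ['S']]
    # truples -> [(0, 2, 'GENE'), (4, 4, 'GENE')]
    return labels, truples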
class Data:
    def __init__(self):
        self.max_sentence_length = 200
        self.number_normalized = True
        self.norm_char_emb = True
        self.norm_gaz_emb = True
        self.dataset_name = 'msra'
        self.tagscheme = "NoSeg"
        self.char_alphabet = Alphabet('character')
        self.label_alphabet = Alphabet('label', unkflag=False)
        self.gaz_lower = False
        self.gaz = Gazetteer(self.gaz_lower)
        self.gaz_alphabet = Alphabet('gaz')
        self.train_ids = []
        self.dev_ids = []
        self.test_ids = []
        self.train_texts = []
        self.dev_texts = []
        self.test_texts = []
        self.char_emb_dim = 100
        self.gaz_emb_dim = 100
        self.pretrain_char_embedding = None
        self.pretrain_gaz_embedding = None
        self.dev_cut_num = 0
        self.train_cut_num = 0
        self.test_cut_num = 0
        self.cut_num = 0

    def show_data_summary(self):
        print("DATA SUMMARY START:")
        print("     Dataset name: %s" % self.dataset_name)
        print("     Tag scheme: %s" % (self.tagscheme))
        print("     Max Sentence Length: %s" % self.max_sentence_length)
        print("     Char alphabet size: %s" % self.char_alphabet.size())
        print("     Gaz alphabet size: %s" % self.gaz_alphabet.size())
        print("     Label alphabet size: %s" % self.label_alphabet.size())
        print("     Char embedding size: %s" % self.char_emb_dim)
        print("     Gaz embedding size: %s" % self.gaz_emb_dim)
        print("     Number normalized: %s" % self.number_normalized)
        print("     Norm char emb: %s" % self.norm_char_emb)
        print("     Norm gaz emb: %s" % self.norm_gaz_emb)
        print("     Train instance number: %s" % (len(self.train_ids)))
        print("     Dev instance number: %s" % (len(self.dev_ids)))
        print("     Test instance number: %s" % (len(self.test_ids)))
        if self.cut_num != 0:
            print("     Train&Dev cut number: %s" % self.cut_num)
        else:
            print("     Train cut number: %s" % self.train_cut_num)
            print("     Dev cut number: %s" % self.dev_cut_num)
        print("     Test cut number: %s" % self.test_cut_num)
        print("DATA SUMMARY END.")
        sys.stdout.flush()

    def build_gaz_file(self, gaz_file, skip_first_row=False, separator=" "):
        ## build gaz file, initially read the gaz embedding file
        if gaz_file:
            with open(gaz_file, 'r') as file:
                i = 0
                for line in tqdm(file):
                    if i == 0:
                        i = i + 1
                        if skip_first_row:
                            _ = line.strip()
                            continue
                    fin = line.strip().split(separator)[0]
                    if fin:
                        self.gaz.insert(fin, "one_source")
            print("Load gaz file: ", gaz_file, " total size:", self.gaz.size())
        else:
            print("Gaz file is None, load nothing")

    def fix_alphabet(self):
        self.char_alphabet.close()
        self.label_alphabet.close()
        self.gaz_alphabet.close()

    def build_char_pretrain_emb(self, emb_path, skip_first_row=False, separator=" "):
        print("build char pretrain emb...")
        self.pretrain_char_embedding, self.char_emb_dim = \
            build_pretrain_embedding(emb_path, self.char_alphabet, skip_first_row,
                                     separator, self.char_emb_dim, self.norm_char_emb)

    def build_gaz_pretrain_emb(self, emb_path, skip_first_row=True, separator=" "):
        print("build gaz pretrain emb...")
        self.pretrain_gaz_embedding, self.gaz_emb_dim = build_pretrain_embedding(
            emb_path, self.gaz_alphabet, skip_first_row, separator,
            self.gaz_emb_dim, self.norm_gaz_emb)

    def generate_instance(self, input_file, name, random_split=False):
        texts, ids, cut_num = read_instance(input_file, self.gaz, self.char_alphabet,
                                            self.label_alphabet, self.gaz_alphabet,
                                            self.number_normalized,
                                            self.max_sentence_length)
        if name == "train":
            if random_split:
                random.seed(1)
                ix = [i for i in range(len(ids))]
                train_ix = random.sample(ix, int(len(ids) * 0.9))
                dev_ix = list(set(ix).difference(set(train_ix)))
                self.train_ids = [ids[ele] for ele in train_ix]
                self.dev_ids = [ids[ele] for ele in dev_ix]
                self.train_texts = [texts[ele] for ele in train_ix]
                self.dev_texts = [texts[ele] for ele in dev_ix]
                self.cut_num = cut_num
            else:
                self.train_ids = ids
                self.train_texts = texts
                self.train_cut_num = cut_num
        elif name == "dev":
            self.dev_ids = ids
            self.dev_texts = texts
            self.dev_cut_num = cut_num
        elif name == "test":
            self.test_ids = ids
            self.test_texts = texts
            self.test_cut_num = cut_num
        else:
            print("Error: you can only generate train/dev/test instance! Illegal input:%s" % name)

    def get_tag_scheme(self):
        startS = False
        startB = False
        for label, _ in self.label_alphabet.iteritems():
            if "S-" in label.upper():
                startS = True
            elif "B-" in label.upper():
                startB = True
        if startB:
            if startS:
                self.tagscheme = "BMES"
            else:
                self.tagscheme = "BIO"
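
# --- usage sketch (added for illustration; not part of the original file) ---
# A minimal sketch of how the Data class above is typically driven. All file
# paths are placeholders, and Alphabet/Gazetteer/read_instance/
# build_pretrain_embedding are assumed to come from this repo's utils, exactly
# as the class itself assumes.
def _demo_gaz_data_pipeline():
    data = Data()
    # register every lexicon word so read_instance can match gaz entries
    data.build_gaz_file("gaz_embedding.txt", skip_first_row=True)
    # reading instances also grows the char/label/gaz alphabets
    data.generate_instance("train.char.bmes", "train", random_split=True)  # 90/10 train/dev split
    data.generate_instance("test.char.bmes", "test")
    data.fix_alphabet()  # freeze alphabets before building embedding tables
    data.build_char_pretrain_emb("char_embedding.txt")
    data.build_gaz_pretrain_emb("gaz_embedding.txt")
    data.get_tag_scheme()
    data.show_data_summary()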
class Data: """ 所有数据预处理程序都封装在Data类里面 Data类中封装了Alphabet类,Alphabet类主要功能是word转成id,将id转成词 Alphabet类的构建是通过build_alphabet函数构建的 """ # def __init__(self): self.MAX_SENTENCE_LENGTH = 250 #句子最大长度 self.number_normalized = True #是否将数字归一化 self.norm_word_emb = True #是否将词向量归一化 self.word_alphabet = Alphabet('word') #word的词表与id self.label_alphabet = Alphabet('label', True) #not end "</unk>" #约定标注方式 self.tagScheme = "NoSeg" self.train_texts = [] self.dev_texts = [] self.test_texts = [] self.train_Ids = [] self.dev_Ids = [] self.test_Ids = [] self.word_emb_dim = 50 self.pretrain_word_embedding = None self.label_size = 0 self.word_alphabet_size = 0 self.label_alphabet_size = 0 ### hyperparameters self.HP_iteration = 200 self.HP_batch_size = 32 # 1 self.HP_hidden_dim = 200 self.HP_dropout = 0.3 self.HP_lstm_layer = 1 self.HP_bilstm = True self.HP_gpu = True # true self.HP_lr = 0.01 self.HP_lr_decay = 0.05 self.weight_decay = 0.00000005 self.use_clip = False self.HP_clip = 5.0 self.HP_momentum = 0 #控制优化器的一个超参 self.random_seed = 100 def show_data_summary(self): print("DATA SUMMARY START:") print(" Tag scheme: %s" % (self.tagScheme)) print(" Use GPU: %s" % (self.HP_gpu)) print(" MAX SENTENCE LENGTH: %s" % (self.MAX_SENTENCE_LENGTH)) print(" Number normalized: %s" % (self.number_normalized)) print(" Word alphabet size: %s" % (self.word_alphabet_size)) print(" Label alphabet size: %s" % (self.label_alphabet_size - 1)) print(" Word embedding size: %s" % (self.word_emb_dim)) print(" Norm word emb: %s" % (self.norm_word_emb)) print(" Train instance number: %s" % (len(self.train_texts))) print(" Dev instance number: %s" % (len(self.dev_texts))) print(" Test instance number: %s" % (len(self.test_texts))) print("--*--整体参数设定区域--*--") print(" Hyperpara random seed: %s" % (self.random_seed)) print(" Hyperpara iteration: %s" % (self.HP_iteration)) print(" Hyperpara batch size: %s" % (self.HP_batch_size)) print(" Hyperpara lr: %s" % (self.HP_lr)) print(" Hyperpara lr_decay: %s" % (self.HP_lr_decay)) print(" Hyperpara weight_decay: %s" % (self.weight_decay)) if self.use_clip: print(" Hyperpara HP_clip: %s" % (self.HP_clip)) print(" Hyperpara momentum: %s" % (self.HP_momentum)) print("--*--LSTM参数设定区域--*--") print(" Hyperpara hidden_dim: %s" % (self.HP_hidden_dim)) print(" Hyperpara dropout: %s" % (self.HP_dropout)) print(" Hyperpara lstm_layer_num: %s" % (self.HP_lstm_layer)) print(" Hyperpara bilstm: %s" % (self.HP_bilstm)) print("DATA SUMMARY END.") sys.stdout.flush() # 构建词典 def build_alphabet(self, input_file): in_lines = open(input_file, 'r').readlines() for idx in range(len(in_lines)): line = in_lines[idx] if len(line) > 2: pairs = line.strip().split() word = pairs[0] if self.number_normalized: # True word = normalize_word(word) # 把字符中的数字变成0 label = pairs[-1] self.label_alphabet.add(label) self.word_alphabet.add(word) self.word_alphabet_size = self.word_alphabet.size() self.label_alphabet_size = self.label_alphabet.size() #从数据中确定标签格式 startS = False startB = False for label, _ in self.label_alphabet.iteritems(): if "S-" in label.upper(): startS = True elif "B-" in label.upper(): startB = True if startB: if startS: self.tagScheme = "BMES" else: self.tagScheme = "BIO" def fix_alphabet(self): self.word_alphabet.close() self.label_alphabet.close() def build_word_pretrain_emb(self, emb_path): # 载入预训练词向量 print("build word pretrain emb...") self.pretrain_word_embedding, self.word_emb_dim = build_pretrain_embedding( emb_path, self.word_alphabet, self.word_emb_dim, self.norm_word_emb) def generate_instance(self, input_file, name): # 
产生训练开发训练数据 # self.fix_alphabet() if name == "train": self.train_texts, self.train_Ids = read_instance( input_file, self.word_alphabet, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH) elif name == "dev": self.dev_texts, self.dev_Ids = read_instance( input_file, self.word_alphabet, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH) elif name == "test": self.test_texts, self.test_Ids = read_instance( input_file, self.word_alphabet, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH) else: print( "Error: you can only generate train/dev/test instance! Illegal input:%s" % (name))
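
# --- usage sketch (added for illustration; not part of the original file) ---
# Hedged sketch of the usual call order for the simpler Data class above: grow
# the alphabets from every split first, close them, then map each file to id
# sequences. File names are placeholders.
def _demo_word_data_pipeline():
    data = Data()
    for f in ("train.txt", "dev.txt", "test.txt"):
        data.build_alphabet(f)  # alphabets must see all splits before fixing
    data.fix_alphabet()
    data.build_word_pretrain_emb("word_emb.txt")
    data.generate_instance("train.txt", "train")
    data.generate_instance("dev.txt", "dev")
    data.generate_instance("test.txt", "test")
    data.show_data_summary()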
class Data:
    def __init__(self):
        self.MAX_SENTENCE_LENGTH = 250
        self.MAX_WORD_LENGTH = -1
        self.number_normalized = True
        self.norm_word_emb = True
        self.norm_biword_emb = True
        self.norm_gaz_emb = False
        self.word_alphabet = Alphabet('word')
        self.biword_alphabet = Alphabet('biword')
        self.char_alphabet = Alphabet('character')
        # self.word_alphabet.add(START)
        # self.word_alphabet.add(UNKNOWN)
        # self.char_alphabet.add(START)
        # self.char_alphabet.add(UNKNOWN)
        # self.char_alphabet.add(PADDING)
        self.label_alphabet = Alphabet('label', True)
        self.gaz_lower = False
        self.gaz = Gazetteer(self.gaz_lower)
        self.gaz_alphabet = Alphabet('gaz')
        self.HP_fix_gaz_emb = False
        self.HP_use_gaz = True
        self.tagScheme = "NoSeg"
        self.char_features = "LSTM"
        self.train_texts = []
        self.dev_texts = []
        self.test_texts = []
        self.raw_texts = []
        self.train_Ids = []
        self.dev_Ids = []
        self.test_Ids = []
        self.raw_Ids = []
        self.use_bigram = True
        self.word_emb_dim = 50
        self.biword_emb_dim = 50
        self.char_emb_dim = 30
        self.gaz_emb_dim = 50
        self.gaz_dropout = 0.5
        self.pretrain_word_embedding = None
        self.pretrain_biword_embedding = None
        self.pretrain_gaz_embedding = None
        self.label_size = 0
        self.word_alphabet_size = 0
        self.biword_alphabet_size = 0
        self.char_alphabet_size = 0
        self.label_alphabet_size = 0
        ### hyperparameters
        self.HP_iteration = 100
        self.HP_batch_size = 10
        self.HP_char_hidden_dim = 50
        self.HP_hidden_dim = 200
        self.HP_dropout = 0.5
        self.HP_lstm_layer = 1
        self.HP_bilstm = True
        self.HP_use_char = False
        self.HP_gpu = False
        self.HP_lr = 0.015
        self.HP_lr_decay = 0.05
        self.HP_clip = 5.0
        self.HP_momentum = 0

    def show_data_summary(self):
        print("DATA SUMMARY START:")
        print("     Tag scheme: %s" % (self.tagScheme))
        print("     MAX SENTENCE LENGTH: %s" % (self.MAX_SENTENCE_LENGTH))
        print("     MAX WORD LENGTH: %s" % (self.MAX_WORD_LENGTH))
        print("     Number normalized: %s" % (self.number_normalized))
        print("     Use bigram: %s" % (self.use_bigram))
        print("     Word alphabet size: %s" % (self.word_alphabet_size))
        print("     Biword alphabet size: %s" % (self.biword_alphabet_size))
        print("     Char alphabet size: %s" % (self.char_alphabet_size))
        print("     Gaz alphabet size: %s" % (self.gaz_alphabet.size()))
        print("     Label alphabet size: %s" % (self.label_alphabet_size))
        print("     Word embedding size: %s" % (self.word_emb_dim))
        print("     Biword embedding size: %s" % (self.biword_emb_dim))
        print("     Char embedding size: %s" % (self.char_emb_dim))
        print("     Gaz embedding size: %s" % (self.gaz_emb_dim))
        print("     Norm word emb: %s" % (self.norm_word_emb))
        print("     Norm biword emb: %s" % (self.norm_biword_emb))
        print("     Norm gaz emb: %s" % (self.norm_gaz_emb))
        print("     Norm gaz dropout: %s" % (self.gaz_dropout))
        print("     Train instance number: %s" % (len(self.train_texts)))
        print("     Dev instance number: %s" % (len(self.dev_texts)))
        print("     Test instance number: %s" % (len(self.test_texts)))
        print("     Raw instance number: %s" % (len(self.raw_texts)))
        print("     Hyperpara iteration: %s" % (self.HP_iteration))
        print("     Hyperpara batch size: %s" % (self.HP_batch_size))
        print("     Hyperpara lr: %s" % (self.HP_lr))
        print("     Hyperpara lr_decay: %s" % (self.HP_lr_decay))
        print("     Hyperpara HP_clip: %s" % (self.HP_clip))
        print("     Hyperpara momentum: %s" % (self.HP_momentum))
        print("     Hyperpara hidden_dim: %s" % (self.HP_hidden_dim))
        print("     Hyperpara dropout: %s" % (self.HP_dropout))
        print("     Hyperpara lstm_layer: %s" % (self.HP_lstm_layer))
        print("     Hyperpara bilstm: %s" % (self.HP_bilstm))
        print("     Hyperpara GPU: %s" % (self.HP_gpu))
        print("     Hyperpara use_gaz: %s" % (self.HP_use_gaz))
        print("     Hyperpara fix gaz emb: %s" % (self.HP_fix_gaz_emb))
        print("     Hyperpara use_char: %s" % (self.HP_use_char))
        if self.HP_use_char:
            print("             Char_features: %s" % (self.char_features))
        print("DATA SUMMARY END.")
        sys.stdout.flush()

    def refresh_label_alphabet(self, input_file):
        old_size = self.label_alphabet_size
        self.label_alphabet.clear(True)
        in_lines = open(input_file, 'r').readlines()
        for line in in_lines:
            if len(line) > 2:
                pairs = line.strip().split()
                label = pairs[-1]
                self.label_alphabet.add(label)
        self.label_alphabet_size = self.label_alphabet.size()
        startS = False
        startB = False
        for label, _ in self.label_alphabet.iteritems():
            if "S-" in label.upper():
                startS = True
            elif "B-" in label.upper():
                startB = True
        if startB:
            if startS:
                self.tagScheme = "BMES"
            else:
                self.tagScheme = "BIO"
        self.fix_alphabet()
        print("Refresh label alphabet finished: old:%s -> new:%s" % (old_size, self.label_alphabet_size))

    def build_alphabet(self, input_file):
        in_lines = open(input_file, 'r').readlines()
        for idx in range(len(in_lines)):
            line = in_lines[idx]
            if len(line) > 2:
                pairs = line.strip().split()
                word = pairs[0]
                if self.number_normalized:
                    word = normalize_word(word)
                label = pairs[-1]
                self.label_alphabet.add(label)
                self.word_alphabet.add(word)
                if idx < len(in_lines) - 1 and len(in_lines[idx + 1]) > 2:
                    biword = word + in_lines[idx + 1].strip().split()[0]
                else:
                    biword = word + NULLKEY
                self.biword_alphabet.add(biword)
                for char in word:
                    self.char_alphabet.add(char)
        self.word_alphabet_size = self.word_alphabet.size()
        self.biword_alphabet_size = self.biword_alphabet.size()
        self.char_alphabet_size = self.char_alphabet.size()
        self.label_alphabet_size = self.label_alphabet.size()
        startS = False
        startB = False
        for label, _ in self.label_alphabet.iteritems():
            if "S-" in label.upper():
                startS = True
            elif "B-" in label.upper():
                startB = True
        if startB:
            if startS:
                self.tagScheme = "BMES"
            else:
                self.tagScheme = "BIO"

    def build_gaz_file(self, gaz_file):
        ## build gaz file, initially read the gaz embedding file
        if gaz_file:
            fins = open(gaz_file, 'r').readlines()
            for fin in fins:
                fin = fin.strip().split()[0]
                if fin:
                    self.gaz.insert(fin, "one_source")
            print("Load gaz file: ", gaz_file, " total size:", self.gaz.size())
        else:
            print("Gaz file is None, load nothing")

    def build_gaz_alphabet(self, input_file):
        in_lines = open(input_file, 'r').readlines()
        word_list = []
        for line in in_lines:
            if len(line) > 3:
                word = line.split()[0]
                if self.number_normalized:
                    word = normalize_word(word)
                word_list.append(word)
            else:
                w_length = len(word_list)
                for idx in range(w_length):
                    matched_entity = self.gaz.enumerateMatchList(word_list[idx:])
                    for entity in matched_entity:
                        # print entity, self.gaz.searchId(entity), self.gaz.searchType(entity)
                        self.gaz_alphabet.add(entity)
                word_list = []
        print("gaz alphabet size:", self.gaz_alphabet.size())

    def fix_alphabet(self):
        self.word_alphabet.close()
        self.biword_alphabet.close()
        self.char_alphabet.close()
        self.label_alphabet.close()
        self.gaz_alphabet.close()

    def build_word_pretrain_emb(self, emb_path):
        print("build word pretrain emb...")
        self.pretrain_word_embedding, self.word_emb_dim = build_pretrain_embedding(
            emb_path, self.word_alphabet, self.word_emb_dim, self.norm_word_emb)

    def build_biword_pretrain_emb(self, emb_path):
        print("build biword pretrain emb...")
        self.pretrain_biword_embedding, self.biword_emb_dim = build_pretrain_embedding(
            emb_path, self.biword_alphabet, self.biword_emb_dim, self.norm_biword_emb)

    def build_gaz_pretrain_emb(self, emb_path):
        print("build gaz pretrain emb...")
        self.pretrain_gaz_embedding, self.gaz_emb_dim = build_pretrain_embedding(
            emb_path, self.gaz_alphabet, self.gaz_emb_dim, self.norm_gaz_emb)

    def generate_instance(self, input_file, name):
        self.fix_alphabet()
        if name == "train":
            self.train_texts, self.train_Ids = read_seg_instance(
                input_file, self.word_alphabet, self.biword_alphabet,
                self.char_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "dev":
            self.dev_texts, self.dev_Ids = read_seg_instance(
                input_file, self.word_alphabet, self.biword_alphabet,
                self.char_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "test":
            self.test_texts, self.test_Ids = read_seg_instance(
                input_file, self.word_alphabet, self.biword_alphabet,
                self.char_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "raw":
            self.raw_texts, self.raw_Ids = read_seg_instance(
                input_file, self.word_alphabet, self.biword_alphabet,
                self.char_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        else:
            print("Error: you can only generate train/dev/test/raw instance! Illegal input:%s" % (name))

    def generate_instance_with_gaz(self, input_file, name):
        self.fix_alphabet()
        if name == "train":
            self.train_texts, self.train_Ids = read_instance_with_gaz(
                input_file, self.gaz, self.word_alphabet, self.biword_alphabet,
                self.char_alphabet, self.gaz_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "dev":
            self.dev_texts, self.dev_Ids = read_instance_with_gaz(
                input_file, self.gaz, self.word_alphabet, self.biword_alphabet,
                self.char_alphabet, self.gaz_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "test":
            self.test_texts, self.test_Ids = read_instance_with_gaz(
                input_file, self.gaz, self.word_alphabet, self.biword_alphabet,
                self.char_alphabet, self.gaz_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "raw":
            self.raw_texts, self.raw_Ids = read_instance_with_gaz(
                input_file, self.gaz, self.word_alphabet, self.biword_alphabet,
                self.char_alphabet, self.gaz_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        else:
            print("Error: you can only generate train/dev/test/raw instance! Illegal input:%s" % (name))

    def write_decoded_results(self, output_file, predict_results, name):
        fout = open(output_file, 'w')
        sent_num = len(predict_results)
        content_list = []
        if name == 'raw':
            content_list = self.raw_texts
        elif name == 'test':
            content_list = self.test_texts
        elif name == 'dev':
            content_list = self.dev_texts
        elif name == 'train':
            content_list = self.train_texts
        else:
            print("Error: illegal name during writing predict result, name should be within train/dev/test/raw !")
        assert (sent_num == len(content_list))
        for idx in range(sent_num):
            sent_length = len(predict_results[idx])
            for idy in range(sent_length):
                ## content_list[idx] is a list with [word, char, label]
                fout.write(content_list[idx][0][idy] + " " + predict_results[idx][idy] + '\n')
            fout.write('\n')
        fout.close()
        print("Predict %s result has been written into file. %s" % (name, output_file))
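
# --- usage sketch (added for illustration; not part of the original file) ---
# Hedged sketch of the lattice-style pipeline above: character alphabets come
# from the tagged files, the gazetteer trie from a lexicon embedding file, and
# the gaz alphabet only from entries actually matched in the data. All paths
# below are placeholders.
def _demo_lattice_data_pipeline():
    data = Data()
    for f in ("train.char.bmes", "dev.char.bmes", "test.char.bmes"):
        data.build_alphabet(f)
    data.build_gaz_file("lexicon.vec")
    for f in ("train.char.bmes", "dev.char.bmes", "test.char.bmes"):
        data.build_gaz_alphabet(f)
    data.fix_alphabet()
    data.build_word_pretrain_emb("char_unigram.vec")
    data.build_biword_pretrain_emb("char_bigram.vec")
    data.build_gaz_pretrain_emb("lexicon.vec")
    data.generate_instance_with_gaz("train.char.bmes", "train")
    data.generate_instance_with_gaz("dev.char.bmes", "dev")
    data.generate_instance_with_gaz("test.char.bmes", "test")
    data.show_data_summary()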
class JointData:
    def __init__(self):
        self.MAX_SENTENCE_LENGTH = 250
        self.MAX_WORD_LENGTH = -1
        self.number_normalized = True
        self.norm_word_emb = False
        self.word_alphabet = Alphabet('word')
        self.label = [
            "O", "B-A", "I-A", "B-O", "I-O", "B-E", "I-E", "B-T", "I-T",
            "B-C", "I-C"
        ]
        self.label_alphabet = Alphabet('label', True)
        self.sentence_type_alphabet = Alphabet('sentence', True)
        self.tagScheme = "NoSeg"  ## BMES/BIO
        self.seg = True
        ### I/O
        self.train_dir = None
        self.dev_dir = None
        self.test_dir = None
        self.raw_dir = None
        self.decode_dir = None
        self.dset_dir = None  ## data vocabulary related file
        self.model_dir = None  ## model save file
        self.load_model_dir = None  ## model load file
        self.word_emb_dir = None
        self.word_emb_file = None
        self.train_texts = []
        self.dev_texts = []
        self.test_texts = []
        self.raw_texts = []
        self.train_Ids = []
        self.dev_Ids = []
        self.test_Ids = []
        self.raw_Ids = []
        self.pretrain_word_embedding = None
        self.use_pre_trained_model = None
        self.word_alphabet_size = 0
        self.opinion_label_alphabet_size = 0
        self.evidence_label_alphabet_size = 0
        self.sentence_alphabet_size = 0
        self.word_emb_dim = 50
        self.lstm_input_size = 50
        ###Networks
        self.word_feature_extractor = "LSTM"  ## "LSTM"/"CNN"/"GRU"/
        self.use_crf = True
        self.nbest = None
        ## Training
        self.average_batch_loss = False
        self.optimizer = "SGD"  ## "SGD"/"AdaGrad"/"AdaDelta"/"RMSProp"/"Adam"
        self.status = "train"
        ### Hyperparameters
        self.HP_iteration = 100
        self.HP_batch_size = 10
        self.HP_hidden_dim = 200
        self.HP_attention_query_input_dim = 200
        self.HP_dropout = 0.5
        self.HP_lstm_layer = 1
        self.HP_bilstm = True
        self.HP_gpu = False
        self.HP_lr = 0.015
        self.HP_lr_decay = 0.05
        self.HP_clip = None
        self.HP_momentum = 0
        self.HP_l2 = 1e-8

    def show_data_summary(self):
        logger.info("++" * 50)
        logger.info("DATA SUMMARY START:")
        logger.info(" I/O:")
        logger.info("     Tag scheme: %s" % (self.tagScheme))
        logger.info("     MAX SENTENCE LENGTH: %s" % (self.MAX_SENTENCE_LENGTH))
        logger.info("     MAX WORD LENGTH: %s" % (self.MAX_WORD_LENGTH))
        logger.info("     Number normalized: %s" % (self.number_normalized))
        logger.info("     Word alphabet size: %s" % (self.word_alphabet_size))
        logger.info("     Opinion Label alphabet size: %s" % (self.opinion_label_alphabet_size))
        logger.info("     Evidence Label alphabet size: %s" % (self.evidence_label_alphabet_size))
        logger.info("     Word embedding dir: %s" % (self.word_emb_dir))
        logger.info("     Word embedding size: %s" % (self.word_emb_dim))
        logger.info("     Norm word emb: %s" % (self.norm_word_emb))
        logger.info("     Train file directory: %s" % (self.train_dir))
        logger.info("     Dev file directory: %s" % (self.dev_dir))
        logger.info("     Test file directory: %s" % (self.test_dir))
        logger.info("     Raw file directory: %s" % (self.raw_dir))
        logger.info("     Dset file directory: %s" % (self.dset_dir))
        logger.info("     Model file directory: %s" % (self.model_dir))
        logger.info("     Loadmodel directory: %s" % (self.load_model_dir))
        logger.info("     Decode file directory: %s" % (self.decode_dir))
        logger.info("     Train instance number: %s" % (len(self.train_texts)))
        logger.info("     Dev instance number: %s" % (len(self.dev_texts)))
        logger.info("     Test instance number: %s" % (len(self.test_texts)))
        logger.info("     Raw instance number: %s" % (len(self.raw_texts)))
        logger.info(" " + "++" * 20)
        logger.info(" Model Network:")
        logger.info("     Model use_crf: %s" % (self.use_crf))
        logger.info("     Model word extractor: %s" % (self.word_feature_extractor))
        logger.info(" " + "++" * 20)
        logger.info(" Training:")
        logger.info("     Optimizer: %s" % (self.optimizer))
        logger.info("     Iteration: %s" % (self.HP_iteration))
        logger.info("     BatchSize: %s" % (self.HP_batch_size))
        logger.info("     Average batch loss: %s" % (self.average_batch_loss))
        logger.info(" " + "++" * 20)
        logger.info(" Hyperparameters:")
        logger.info("     Hyper lr: %s" % (self.HP_lr))
        logger.info("     Hyper lr_decay: %s" % (self.HP_lr_decay))
        logger.info("     Hyper HP_clip: %s" % (self.HP_clip))
        logger.info("     Hyper momentum: %s" % (self.HP_momentum))
        logger.info("     Hyper l2: %s" % (self.HP_l2))
        logger.info("     Hyper hidden_dim: %s" % (self.HP_hidden_dim))
        logger.info("     Hyper attention_input: %s" % (self.HP_attention_query_input_dim))
        logger.info("     Hyper dropout: %s" % (self.HP_dropout))
        logger.info("     Hyper lstm_layer: %s" % (self.HP_lstm_layer))
        logger.info("     Hyper bilstm: %s" % (self.HP_bilstm))
        logger.info("     Hyper GPU: %s" % (self.HP_gpu))
        logger.info("DATA SUMMARY END.")
        logger.info("++" * 50)
        sys.stdout.flush()

    def build_alphabet(self, input_file):
        in_lines = open(input_file, 'r', encoding='utf-8').readlines()
        for line in in_lines:
            pairs = line.strip().split()
            if len(pairs) == 2:
                word = pairs[0]
                if sys.version_info[0] < 3:
                    word = word.decode('utf-8')
                if self.number_normalized:
                    word = normalize_word(word)
                self.word_alphabet.add(word)
            if len(pairs) == 1 and not line.strip() == "<end>":
                sentence_type = line.strip()
                self.sentence_type_alphabet.add(sentence_type)
        for l in self.label:
            self.label_alphabet.add(l)
        self.word_alphabet_size = self.word_alphabet.size()
        self.label_alphabet_size = self.label_alphabet.size()
        self.sentence_alphabet_size = self.sentence_type_alphabet.size()
        startS = False
        startB = False
        for label, _ in self.label_alphabet.iteritems():
            if "S-" in label.upper():
                startS = True
            elif "B-" in label.upper():
                startB = True
        if startB:
            if startS:
                self.tagScheme = "BMES"
            else:
                self.tagScheme = "BIO"

    def fix_alphabet(self):
        self.word_alphabet.close()
        self.label_alphabet.close()
        self.sentence_type_alphabet.close()

    def build_pretrain_emb(self):
        if self.word_emb_dir:
            emb_path = os.path.join(self.word_emb_dir, self.word_emb_file)
            logger.info("Load pretrained word embedding, norm: %s, dir: %s" % (self.norm_word_emb, emb_path))
            self.pretrain_word_embedding, self.word_emb_dim = build_pretrain_embedding(
                emb_path, self.word_alphabet, self.word_emb_dim, self.norm_word_emb)

    def generate_instance(self, name):
        self.fix_alphabet()
        if name == "train":
            self.train_texts, self.train_Ids = read_instance(
                self.train_dir, self.word_alphabet, self.label_alphabet,
                self.sentence_type_alphabet, self.number_normalized,
                self.MAX_SENTENCE_LENGTH)
        elif name == "dev":
            self.dev_texts, self.dev_Ids = read_instance(
                self.dev_dir, self.word_alphabet, self.label_alphabet,
                self.sentence_type_alphabet, self.number_normalized,
                self.MAX_SENTENCE_LENGTH)
        elif name == "test":
            self.test_texts, self.test_Ids = read_instance(
                self.test_dir, self.word_alphabet, self.label_alphabet,
                self.sentence_type_alphabet, self.number_normalized,
                self.MAX_SENTENCE_LENGTH)
        elif name == "raw":
            self.raw_texts, self.raw_Ids = read_instance(
                self.raw_dir, self.word_alphabet, self.label_alphabet,
                self.sentence_type_alphabet, self.number_normalized,
                self.MAX_SENTENCE_LENGTH)
        else:
            print("Error: you can only generate train/dev/test/raw instance! Illegal input:%s" % (name))

    def load(self, data_file):
        f = open(data_file, 'rb')
        tmp_dict = pickle.load(f)
        f.close()
        self.__dict__.update(tmp_dict)

    def save(self, save_file):
        f = open(save_file, 'wb')
        pickle.dump(self.__dict__, f, 2)
        f.close()

    def read_config(self, args):
        ## read data:
        self.train_dir = args.train_dir
        self.dev_dir = args.dev_dir
        self.test_dir = args.test_dir
        self.model_dir = args.model_dir
        self.word_emb_dir = args.word_emb_dir
        self.word_emb_file = args.word_emb_file
        self.norm_word_emb = str2bool(args.norm_word_emb)
        self.use_pre_trained_model = str2bool(str(args.use_pre_trained_model))
        self.number_normalized = str2bool(args.number_normalized)
        self.seg = args.seg
        self.lstm_input_size = int(args.lstm_input_size)
        self.use_crf = str2bool(str(args.use_crf))
        self.word_feature_extractor = args.word_seq_feature
        ## read training setting:
        self.optimizer = args.optimizer
        self.average_batch_loss = args.ave_batch_loss
        self.status = args.status
        self.HP_iteration = int(args.iteration)
        self.HP_batch_size = int(args.batch_size)
        self.HP_hidden_dim = int(args.hidden_dim)
        self.HP_attention_query_input_dim = int(args.attention_query_input_size)
        self.HP_dropout = float(args.dropout)
        self.HP_lstm_layer = int(args.lstm_layer)
        self.HP_bilstm = args.bilstm
        self.HP_gpu = args.gpu
        self.HP_lr = float(args.learning_rate)
        self.HP_lr_decay = float(args.lr_decay)
        self.HP_momentum = float(args.momentum)
        self.HP_l2 = float(args.l2)
        self.clip_grad = float(args.clip_grad)
        self.label_embedding_scale = float(args.label_embedding_scale)
        self.num_attention_head = int(args.num_attention_head)
        self.whether_clip_grad = str2bool(args.whether_clip_grad)
class Data:
    def __init__(self):
        self.MAX_SENTENCE_LENGTH = 250
        self.MAX_WORD_LENGTH = -1
        self.number_normalized = False
        self.norm_word_emb = True
        self.norm_biword_emb = True
        self.word_alphabet = Alphabet('word')
        self.biword_alphabet = Alphabet('biword')
        self.pos_alphabet = Alphabet('pos')
        self.label_alphabet = Alphabet('label', True)
        self.tagScheme = "NoSeg"
        self.char_features = "LSTM"
        self.train_texts = []
        self.dev_texts = []
        self.test_texts = []
        self.raw_texts = []
        self.train_Ids = []
        self.dev_Ids = []
        self.test_Ids = []
        self.raw_Ids = []
        self.use_bigram = False
        self.word_emb_dim = 50
        self.biword_emb_dim = 50
        self.pretrain_word_embedding = None
        self.pretrain_biword_embedding = None
        self.label_size = 0
        self.word_alphabet_size = 0
        self.biword_alphabet_size = 0
        self.label_alphabet_size = 0
        # hyperparameters
        self.HP_iteration = 100
        self.HP_batch_size = 16
        self.HP_char_hidden_dim = 50
        self.HP_hidden_dim = 200
        self.HP_dropout = 0.2
        self.HP_lstmdropout = 0
        self.HP_lstm_layer = 1
        self.HP_bilstm = True
        self.HP_gpu = False
        self.HP_lr = 0.015
        self.HP_lr_decay = 0.05
        self.HP_clip = 5.0
        self.HP_momentum = 0
        # attention
        self.tencent_word_embed_dim = 200
        self.pos_embed_dim = 200
        self.cross_domain = False
        self.cross_test = False
        self.use_san = False
        self.use_cnn = False
        self.use_attention = True
        self.pos_to_idx = {}
        self.external_pos = {}
        self.token_replace_prob = {}
        self.use_adam = False
        self.use_bert = False
        self.use_warmup_adam = False
        self.use_sgd = False
        self.use_adadelta = False
        self.use_window = True
        self.mode = 'train'
        self.use_tencent_dic = False
        # cross domain file
        self.computer_file = ""
        self.finance_file = ""
        self.medicine_file = ""
        self.literature_file = ""

    def show_data_summary(self):
        print("DATA SUMMARY START:")
        print("     Tag scheme: %s" % (self.tagScheme))
        print("     MAX SENTENCE LENGTH: %s" % (self.MAX_SENTENCE_LENGTH))
        print("     MAX WORD LENGTH: %s" % (self.MAX_WORD_LENGTH))
        print("     Number normalized: %s" % (self.number_normalized))
        print("     Use bigram: %s" % (self.use_bigram))
        print("     Char alphabet size: %s" % (self.word_alphabet_size))
        print("     BiChar alphabet size: %s" % (self.biword_alphabet_size))
        print("     Label alphabet size: %s" % (self.label_alphabet_size))
        print("     Char embedding size: %s" % (self.word_emb_dim))
        print("     BiChar embedding size: %s" % (self.biword_emb_dim))
        print("     Norm char emb: %s" % (self.norm_word_emb))
        print("     Norm bichar emb: %s" % (self.norm_biword_emb))
        print("     Train instance number: %s" % (len(self.train_texts)))
        print("     Dev instance number: %s" % (len(self.dev_texts)))
        print("     Test instance number: %s" % (len(self.test_texts)))
        print("     Raw instance number: %s" % (len(self.raw_texts)))
        print("     Hyperpara iteration: %s" % (self.HP_iteration))
        print("     Hyperpara batch size: %s" % (self.HP_batch_size))
        print("     Hyperpara lr: %s" % (self.HP_lr))
        print("     Hyperpara lr_decay: %s" % (self.HP_lr_decay))
        print("     Hyperpara HP_clip: %s" % (self.HP_clip))
        print("     Hyperpara momentum: %s" % (self.HP_momentum))
        print("     Hyperpara hidden_dim: %s" % (self.HP_hidden_dim))
        print("     Hyperpara dropout: %s" % (self.HP_dropout))
        print("     Hyperpara lstm_layer: %s" % (self.HP_lstm_layer))
        print("     Hyperpara bilstm: %s" % (self.HP_bilstm))
        print("     Hyperpara GPU: %s" % (self.HP_gpu))
        print("     Cross domain: %s" % self.cross_domain)
        print("     Hyperpara use window: %s" % self.use_window)
        print("DATA SUMMARY END.")
        sys.stdout.flush()

    def build_alphabet(self, input_file):
        in_lines = open(input_file, 'r').readlines()
        for idx in range(len(in_lines)):
            line = in_lines[idx]
            if len(line) > 2:
                pairs = line.strip().split('\t')
                # word = pairs[0].decode('utf-8')
                word = pairs[0]
                if self.number_normalized:
                    word = normalize_word(word)
                label = pairs[-1][0] + '-SEG'
                self.label_alphabet.add(label)
                self.word_alphabet.add(word)
                if idx < len(in_lines) - 1 and len(in_lines[idx + 1]) > 2:
                    # biword = word + in_lines[idx + 1].strip('\t').split()[0].decode('utf-8')
                    biword = word + in_lines[idx + 1].strip('\t').split()[0]
                else:
                    biword = word + NULLKEY
                self.biword_alphabet.add(biword)
        self.word_alphabet_size = self.word_alphabet.size()
        self.biword_alphabet_size = self.biword_alphabet.size()
        self.label_alphabet_size = self.label_alphabet.size()
        startS = False
        startB = False
        for label, _ in self.label_alphabet.iteritems():
            if "S-" in label.upper():
                startS = True
            elif "B-" in label.upper():
                startB = True
        if startB:
            if startS:
                self.tagScheme = "BMES"
            else:
                self.tagScheme = "BIO"

    def fix_alphabet(self):
        self.word_alphabet.close()
        self.biword_alphabet.close()
        self.label_alphabet.close()

    def build_word_pretrain_emb(self, emb_path):
        print("build word pretrain emb...")
        self.pretrain_word_embedding, self.word_emb_dim = build_pretrain_embedding(
            emb_path, self.word_alphabet, self.word_emb_dim, self.norm_word_emb)

    def build_biword_pretrain_emb(self, emb_path):
        print("build biword pretrain emb...")
        self.pretrain_biword_embedding, self.biword_emb_dim = build_pretrain_embedding(
            emb_path, self.biword_alphabet, self.biword_emb_dim, self.norm_biword_emb)

    def build_word_vec_100(self):
        self.pretrain_word_embedding, self.pretrain_biword_embedding = self.get_embedding()
        self.word_emb_dim, self.biword_emb_dim = 100, 100

    # get pre-trained embeddings
    def get_embedding(self, size=100):
        fname = 'data/wordvec_' + str(size)
        print("build pretrain word embedding from: ", fname)
        word_init_embedding = np.zeros(shape=[self.word_alphabet.size(), size])
        bi_word_init_embedding = np.zeros(shape=[self.biword_alphabet.size(), size])
        pre_trained = gensim.models.KeyedVectors.load(fname, mmap='r')
        # pre_trained_vocab = set([unicode(w.decode('utf8')) for w in pre_trained.vocab.keys()])
        pre_trained_vocab = set([w for w in pre_trained.vocab.keys()])
        c = 0
        for word, index in self.word_alphabet.iteritems():
            if word in pre_trained_vocab:
                word_init_embedding[index] = pre_trained[word]
            else:
                word_init_embedding[index] = np.random.uniform(-0.5, 0.5, size)
                c += 1
        # a bigram embedding is the average of its two character embeddings
        for word, index in self.biword_alphabet.iteritems():
            bi_word_init_embedding[index] = (
                word_init_embedding[self.word_alphabet.get_index(word[0])] +
                word_init_embedding[self.word_alphabet.get_index(word[1])]) / 2
        # word_init_embedding[word2id[PAD]] = np.zeros(shape=size)
        # bi_word_init_embedding[]
        print('oov character rate %f' % (float(c) / self.word_alphabet.size()))
        return word_init_embedding, bi_word_init_embedding

    def generate_instance(self, input_file, name):
        self.fix_alphabet()
        if name == "train":
            self.train_texts, self.train_Ids = read_instance(
                input_file, self.word_alphabet, self.biword_alphabet,
                self.label_alphabet, self.number_normalized,
                self.MAX_SENTENCE_LENGTH)
        elif name == "dev":
            self.dev_texts, self.dev_Ids = read_instance(
                input_file, self.word_alphabet, self.biword_alphabet,
                self.label_alphabet, self.number_normalized,
                self.MAX_SENTENCE_LENGTH)
        elif name == "test":
            self.test_texts, self.test_Ids = read_instance(
                input_file, self.word_alphabet, self.biword_alphabet,
                self.label_alphabet, self.number_normalized,
                self.MAX_SENTENCE_LENGTH)
        elif name == "raw":
            self.raw_texts, self.raw_Ids = read_instance(
                input_file, self.word_alphabet, self.biword_alphabet,
                self.label_alphabet, self.number_normalized,
                self.MAX_SENTENCE_LENGTH)
        else:
            print("Error: you can only generate train/dev/test/raw instance! Illegal input:%s" % (name))

    def write_decoded_results(self, output_file, predict_results, name):
        fout = open(output_file, 'w')
        sent_num = len(predict_results)
        content_list = []
        if name == 'raw':
            content_list = self.raw_texts
        elif name == 'test':
            content_list = self.test_texts
        elif name == 'dev':
            content_list = self.dev_texts
        elif name == 'train':
            content_list = self.train_texts
        else:
            print("Error: illegal name during writing predict result, name should be within train/dev/test/raw !")
        assert (sent_num == len(content_list))
        for idx in range(sent_num):
            sent_length = len(predict_results[idx])
            for idy in range(sent_length):
                ## content_list[idx] is a list with [word, char, label]
                fout.write(content_list[idx][0][idy] + "\t" + predict_results[idx][idy][0] + '\n')
            fout.write('\n')
        fout.close()
        print("Predict %s result has been written into file. %s" % (name, output_file))
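
# --- usage sketch (added for illustration; not part of the original file) ---
# Hedged sketch for the segmentation-oriented Data class above. It supports
# two embedding routes: external text embeddings via build_word_pretrain_emb /
# build_biword_pretrain_emb, or the bundled gensim KeyedVectors file via
# build_word_vec_100 (which expects data/wordvec_100 to exist, as hard-coded
# in get_embedding). File names are placeholders.
def _demo_seg_data_pipeline():
    data = Data()
    for f in ("train.tsv", "dev.tsv", "test.tsv"):
        data.build_alphabet(f)
    data.fix_alphabet()
    data.build_word_vec_100()  # loads data/wordvec_100 and averages bigrams
    data.generate_instance("train.tsv", "train")
    data.generate_instance("dev.tsv", "dev")
    data.generate_instance("test.tsv", "test")
    data.show_data_summary()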
class Data:
    def __init__(self):
        self.MAX_SENTENCE_LENGTH = 250
        self.MAX_WORD_LENGTH = -1
        self.number_normalized = True
        self.norm_char_emb = True
        self.norm_bichar_emb = True
        self.norm_gaz_emb = False
        self.use_single = False
        self.char_alphabet = Alphabet('char')
        self.bichar_alphabet = Alphabet('bichar')
        self.label_alphabet = Alphabet('label', True)
        self.gaz_lower = False
        self.gaz = Gazetteer(self.gaz_lower, self.use_single)
        self.gaz_alphabet = Alphabet('gaz')
        self.HP_fix_gaz_emb = False
        self.HP_use_gaz = True
        self.tagScheme = "NoSeg"
        self.train_texts = []
        self.dev_texts = []
        self.test_texts = []
        self.raw_texts = []
        self.train_Ids = []
        self.dev_Ids = []
        self.test_Ids = []
        self.raw_Ids = []
        self.use_bichar = False
        self.char_emb_dim = 50
        self.bichar_emb_dim = 50
        self.gaz_emb_dim = 50
        self.posi_emb_dim = 30
        self.gaz_dropout = 0.5
        self.pretrain_char_embedding = None
        self.pretrain_bichar_embedding = None
        self.pretrain_gaz_embedding = None
        self.label_size = 0
        self.char_alphabet_size = 0
        self.bichar_alphabet_size = 0
        self.character_alphabet_size = 0
        self.label_alphabet_size = 0
        # hyper parameters
        self.HP_iteration = 100
        self.HP_batch_size = 1
        self.HP_char_hidden_dim = 50  # int. Character hidden vector dimension for the character sequence layer.
        self.HP_hidden_dim = 200  # int. Char hidden vector dimension for the word sequence layer.
        self.HP_dropout = 0.5  # float. Dropout probability.
        self.HP_lstm_layer = 1  # int. LSTM layer number for the word sequence layer.
        self.HP_bilstm = True  # boolean. Whether to use a bidirectional LSTM for the word sequence layer.
        self.HP_gpu = False
        # Word level LSTM models (e.g. char LSTM + word LSTM + CRF) prefer an `lr` around 0.015.
        # Word level CNN models (e.g. char LSTM + word CNN + CRF) prefer an `lr` around 0.005, with more iterations.
        self.HP_lr = 0.015
        self.HP_lr_decay = 0.05  # float. Learning rate decay rate; only works when optimizer=SGD.
        self.HP_clip = 1.0  # float. Clip gradients larger than this value.
        self.HP_momentum = 0  # float. Momentum
        self.HP_use_posi = False
        self.HP_num_layer = 4
        self.HP_rethink_iter = 2
        self.model_name = 'CNN_model'
        self.posi_alphabet_size = 0

    def show_data_summary(self):
        print("DATA SUMMARY START:")
        print("     Tag scheme: %s" % (self.tagScheme))
        print("     MAX SENTENCE LENGTH: %s" % (self.MAX_SENTENCE_LENGTH))
        print("     MAX WORD LENGTH: %s" % (self.MAX_WORD_LENGTH))
        print("     Number normalized: %s" % (self.number_normalized))
        print("     Use bigram: %s" % (self.use_bichar))
        print("     Char alphabet size: %s" % (self.char_alphabet_size))
        print("     Bichar alphabet size: %s" % (self.bichar_alphabet_size))
        print("     Gaz alphabet size: %s" % (self.gaz_alphabet.size()))
        print("     Label alphabet size: %s" % (self.label_alphabet_size))
        print("     Word embedding size: %s" % (self.char_emb_dim))
        print("     Bichar embedding size: %s" % (self.bichar_emb_dim))
        print("     Gaz embedding size: %s" % (self.gaz_emb_dim))
        print("     Norm word emb: %s" % (self.norm_char_emb))
        print("     Norm bichar emb: %s" % (self.norm_bichar_emb))
        print("     Norm gaz emb: %s" % (self.norm_gaz_emb))
        print("     Norm gaz dropout: %s" % (self.gaz_dropout))
        print("     Train instance number: %s" % (len(self.train_texts)))
        print("     Dev instance number: %s" % (len(self.dev_texts)))
        print("     Test instance number: %s" % (len(self.test_texts)))
        print("     Raw instance number: %s" % (len(self.raw_texts)))
        print("     Hyperpara iteration: %s" % (self.HP_iteration))
        print("     Hyperpara batch size: %s" % (self.HP_batch_size))
        print("     Hyperpara lr: %s" % (self.HP_lr))
        print("     Hyperpara lr_decay: %s" % (self.HP_lr_decay))
        print("     Hyperpara HP_clip: %s" % (self.HP_clip))
        print("     Hyperpara momentum: %s" % (self.HP_momentum))
        print("     Hyperpara hidden_dim: %s" % (self.HP_hidden_dim))
        print("     Hyperpara dropout: %s" % (self.HP_dropout))
        print("     Hyperpara lstm_layer: %s" % (self.HP_lstm_layer))
        print("     Hyperpara bilstm: %s" % (self.HP_bilstm))
        print("     Hyperpara GPU: %s" % (self.HP_gpu))
        print("     Hyperpara use_gaz: %s" % (self.HP_use_gaz))
        print("     Hyperpara fix gaz emb: %s" % (self.HP_fix_gaz_emb))
        print("DATA SUMMARY END.")
        sys.stdout.flush()

    def refresh_label_alphabet(self, input_file):
        old_size = self.label_alphabet_size
        self.label_alphabet.clear(True)
        in_lines = open(input_file, 'r').readlines()
        for line in in_lines:
            if len(line) > 2:
                pairs = line.strip().split()
                label = pairs[-1]
                self.label_alphabet.add(label)
        self.label_alphabet_size = self.label_alphabet.size()
        start_s = False
        start_b = False
        for label, _ in self.label_alphabet.iteritems():
            if "S-" in label.upper():
                start_s = True
            elif "B-" in label.upper():
                start_b = True
        if start_b:
            if start_s:
                self.tagScheme = "BMES"
            else:
                self.tagScheme = "BIO"
        self.fix_alphabet()
        print("Refresh label alphabet finished: old:%s -> new:%s" % (old_size, self.label_alphabet_size))

    # input lines look like: "陈 B-PER"
    def build_alphabet(self, input_file):
        if input_file is None or not os.path.isfile(input_file):
            # print('[' + sys._getframe().f_code.co_name + '] file ' + str(input_file) + " can not be found or is not a file address")
            return
        with codecs.open(input_file, 'r', 'utf-8') as fr:
            in_lines = fr.readlines()
        seqlen = 0
        for idx in range(len(in_lines)):
            line = in_lines[idx]  # e.g. '陈 B-PER\n'
            # non-empty line: add the label, char and bichar entries
            if len(line) > 2:
                # sequence labeling data format, i.e. CoNLL 2003
                pairs = line.strip().split()  # e.g. ['陈', 'B-PER']
                char = pairs[0]  # '陈'
                if self.number_normalized:
                    char = normalize_char(char)  # map digits to 0
                label = pairs[-1]  # "B-PER"
                # build feature alphabet
                self.label_alphabet.add(label)
                self.char_alphabet.add(char)
                if idx < len(in_lines) - 1 and len(in_lines[idx + 1]) > 2:
                    bichar = char + in_lines[idx + 1].strip().split()[0]  # e.g. 陈元
                else:
                    bichar = char + NULLKEY
                self.bichar_alphabet.add(bichar)
                seqlen += 1
            else:
                self.posi_alphabet_size = max(seqlen, self.posi_alphabet_size)
                seqlen = 0
        self.char_alphabet_size = self.char_alphabet.size()
        self.bichar_alphabet_size = self.bichar_alphabet.size()
        self.label_alphabet_size = self.label_alphabet.size()
        start_s = False
        start_b = False
        for label, _ in self.label_alphabet.iteritems():
            if "S-" in label.upper():
                start_s = True
            elif "B-" in label.upper():
                start_b = True
        if start_b:
            if start_s:
                self.tagScheme = "BMES"
            else:
                self.tagScheme = "BIO"

    def build_gaz_file(self, gaz_file):
        # build gaz file, initially read the gaz embedding file
        if gaz_file:
            with codecs.open(gaz_file, 'r', 'utf-8') as fr:
                fins = fr.readlines()
            for fin in fins:
                fin = fin.strip().split()[0]
                if fin:
                    self.gaz.insert(fin, "one_source")
            print("Load gaz file: ", gaz_file, " total size:", self.gaz.size())
        else:
            print('[' + sys._getframe().f_code.co_name + '] ' + "Gaz file is None, load nothing")

    def build_gaz_alphabet(self, input_file):
        if input_file is None or not os.path.isfile(input_file):
            # print('[' + sys._getframe().f_code.co_name + '] file ' + str(input_file) + " can not be found or is not a file address")
            return
        with codecs.open(input_file, 'r', 'utf-8') as fr:
            in_lines = fr.readlines()
        word_list = []
        for line in in_lines:
            if len(line) > 3:
                word = line.split()[0]
                if self.number_normalized:
                    word = normalize_char(word)
                word_list.append(word)
            else:
                w_length = len(word_list)
                for idx in range(w_length):
                    matched_entity = self.gaz.enumerateMatchList(word_list[idx:])
                    for entity in matched_entity:
                        # print entity, self.gaz.searchId(entity), self.gaz.searchType(entity)
                        self.gaz_alphabet.add(entity)
                word_list = []
        print("gaz alphabet size:", self.gaz_alphabet.size())

    # Alphabet
    def fix_alphabet(self):
        self.char_alphabet.close()  # alphabet.keep_growing=False
        self.bichar_alphabet.close()
        self.label_alphabet.close()
        self.gaz_alphabet.close()

    def build_char_pretrain_emb(self, emb_path):
        print("build char pretrain emb...")
        self.pretrain_char_embedding, self.char_emb_dim = build_pretrain_embedding(
            emb_path, self.char_alphabet, self.char_emb_dim, self.norm_char_emb)

    def build_bichar_pretrain_emb(self, emb_path):
        print("build bichar pretrain emb...")
        self.pretrain_bichar_embedding, self.bichar_emb_dim = build_pretrain_embedding(
            emb_path, self.bichar_alphabet, self.bichar_emb_dim, self.norm_bichar_emb)

    def build_gaz_pretrain_emb(self, emb_path):
        print("build gaz pretrain emb...")
        self.pretrain_gaz_embedding, self.gaz_emb_dim = build_pretrain_embedding(
            emb_path, self.gaz_alphabet, self.gaz_emb_dim, self.norm_gaz_emb)

    def generate_instance(self, input_file, name):
        self.fix_alphabet()
        if name == "train":
            self.train_texts, self.train_Ids = read_seg_instance(
                input_file, self.char_alphabet, self.bichar_alphabet,
                self.label_alphabet, self.number_normalized,
                self.MAX_SENTENCE_LENGTH)
        elif name == "dev":
            self.dev_texts, self.dev_Ids = read_seg_instance(
                input_file, self.char_alphabet, self.bichar_alphabet,
                self.label_alphabet, self.number_normalized,
                self.MAX_SENTENCE_LENGTH)
        elif name == "test":
            self.test_texts, self.test_Ids = read_seg_instance(
                input_file, self.char_alphabet, self.bichar_alphabet,
                self.label_alphabet, self.number_normalized,
                self.MAX_SENTENCE_LENGTH)
        else:
            print("Error: you can only generate train/dev/test instance! Illegal input:%s" % name)

    def generate_instance_with_gaz(self, input_file, name):
        self.fix_alphabet()
        if name == "train":
            self.train_texts, self.train_Ids = read_instance_with_gaz(
                input_file, self.gaz, self.char_alphabet, self.bichar_alphabet,
                self.gaz_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "dev":
            self.dev_texts, self.dev_Ids = read_instance_with_gaz(
                input_file, self.gaz, self.char_alphabet, self.bichar_alphabet,
                self.gaz_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "test":
            self.test_texts, self.test_Ids = read_instance_with_gaz(
                input_file, self.gaz, self.char_alphabet, self.bichar_alphabet,
                self.gaz_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        else:
            print("Error: you can only generate train/dev/test instance! Illegal input:%s" % name)

    def generate_instance_with_gaz_2(self, input_file, name):
        self.fix_alphabet()
        if name == "train":
            self.train_texts, self.train_Ids = read_instance_with_gaz_2(
                self.HP_num_layer, input_file, self.gaz, self.char_alphabet,
                self.bichar_alphabet, self.gaz_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "dev":
            self.dev_texts, self.dev_Ids = read_instance_with_gaz_2(
                self.HP_num_layer, input_file, self.gaz, self.char_alphabet,
                self.bichar_alphabet, self.gaz_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "test":
            self.test_texts, self.test_Ids = read_instance_with_gaz_2(
                self.HP_num_layer, input_file, self.gaz, self.char_alphabet,
                self.bichar_alphabet, self.gaz_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        else:
            print("Error: you can only generate train/dev/test instance! Illegal input:%s" % (name))

    def generate_instance_with_gaz_3(self, input_file, name):
        self.fix_alphabet()
        if name == "train":
            self.train_texts, self.train_Ids = read_instance_with_gaz_3(
                input_file, self.gaz, self.char_alphabet, self.bichar_alphabet,
                self.gaz_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH, self.use_single)
        elif name == "dev":
            self.dev_texts, self.dev_Ids = read_instance_with_gaz_3(
                input_file, self.gaz, self.char_alphabet, self.bichar_alphabet,
                self.gaz_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH, self.use_single)
        elif name == "test":
            self.test_texts, self.test_Ids = read_instance_with_gaz_3(
                input_file, self.gaz, self.char_alphabet, self.bichar_alphabet,
                self.gaz_alphabet, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH, self.use_single)
        else:
            print("Error: you can only generate train/dev/test instance! Illegal input:%s" % (name))

    def write_decoded_results(self, output_file, predict_results, name):
        fout = open(output_file, 'w')
        sent_num = len(predict_results)
        content_list = []
        if name == 'test':
            content_list = self.test_texts
        elif name == 'dev':
            content_list = self.dev_texts
        elif name == 'train':
            content_list = self.train_texts
        else:
            print("Error: illegal name during writing predict result, name should be within train/dev/test !")
        assert (sent_num == len(content_list))
        for idx in range(sent_num):
            sent_length = len(predict_results[idx])
            for idy in range(sent_length):
                ## content_list[idx] is a list with [word, char, label]
                fout.write(content_list[idx][0][idy] + " " + predict_results[idx][idy] + '\n')
            fout.write('\n')
        fout.close()
        print("Predict %s result has been written into file. %s" % (name, output_file))
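
# --- usage sketch (added for illustration; not part of the original file) ---
# Hedged sketch for the CNN/rethink Data class above; generate_instance_with_gaz_2
# threads HP_num_layer through instance creation so the reader can prepare
# layer-wise gaz matches. All file names are placeholders.
def _demo_cnn_gaz_data_pipeline():
    data = Data()
    for f in ("train.char.bmes", "dev.char.bmes", "test.char.bmes"):
        data.build_alphabet(f)
    data.build_gaz_file("lexicon.vec")
    for f in ("train.char.bmes", "dev.char.bmes", "test.char.bmes"):
        data.build_gaz_alphabet(f)
    data.fix_alphabet()
    data.build_char_pretrain_emb("char.vec")
    data.build_bichar_pretrain_emb("bichar.vec")
    data.build_gaz_pretrain_emb("lexicon.vec")
    data.generate_instance_with_gaz_2("train.char.bmes", "train")  # layer-aware gaz matching
    data.generate_instance_with_gaz_2("dev.char.bmes", "dev")
    data.generate_instance_with_gaz_2("test.char.bmes", "test")
    data.show_data_summary()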
class Data(object):
    def __init__(self, data_config_file, alphabet_path, if_train=True):
        if if_train:
            with open(data_config_file, 'r') as rf:
                self.data_config = yaml.load(rf, Loader=yaml.FullLoader)
            # init data file
            mode = self.data_config['mode']
            self.data_file = os.path.join(ROOT_PATH, self.data_config['data'][mode])
            # init ac tree
            specific_words_file = os.path.join(
                ROOT_PATH, self.data_config['specific_words_file'])
            self.trees = Trees.build_trees(specific_words_file)
            # init alphabet
            self.char_alphabet = Alphabet('char')
            self.intent_alphabet = Alphabet('intent')
            self.label_alphabet = Alphabet('label', label=True)
            self.char_alphabet_size, self.intent_alphabet_size, self.label_alphabet_size = -1, -1, -1
            # pad length
            self.char_max_length = self.data_config['char_max_length']
            # read data file
            with open(self.data_file, 'r') as rf:
                self.corpus = rf.readlines()
            self.build_alphabet(alphabet_path)
            self.texts, self.ids = self.read_instance()
            self.train_texts, self.train_ids, self.dev_texts, self.dev_ids, \
                self.test_texts, self.test_ids = self.sample_split()
        else:
            # inference use
            self.char_alphabet = Alphabet('char', keep_growing=False)
            self.intent_alphabet = Alphabet('intent', keep_growing=False)
            self.label_alphabet = Alphabet('label', label=True, keep_growing=False)

    def build_alphabet(self, alphabet_path):
        for line in self.corpus:
            line = ast.literal_eval(line)
            char, char_label, seg_list, intent = line['char'], line[
                'char_label'], line['word'], line['intent']
            for word in seg_list:
                # lexicon
                lexi_feat = []
                for lexi_type, lb in self.trees.lexi_trees.items():
                    lexi_feat.append(lb.search(word))
                for n in range(len(lexi_feat)):
                    if lexi_feat[n] is None or lexi_feat[n] == '_STEM_':
                        lexi_feat[n] = 0
                    else:
                        lexi_feat[n] = 1
                # abstract the lexicon feature vector into a single pseudo-character
                lexi_feat = ''.join([str(i) for i in lexi_feat])
                self.char_alphabet.add(lexi_feat)
            # char
            for c in char:
                self.char_alphabet.add(normalize_word(c))
            # intent
            self.intent_alphabet.add(intent)
            # label
            for label in char_label:
                self.label_alphabet.add(label)
        # alphabet_size
        self.char_alphabet_size = self.char_alphabet.size()
        self.intent_alphabet_size = self.intent_alphabet.size()
        self.label_alphabet_size = self.label_alphabet.size()
        # close alphabet
        self.fix_alphabet()
        # write alphabet:
        if not os.path.exists(alphabet_path):
            with open(alphabet_path, 'wb') as wbf:
                pickle.dump(self.char_alphabet.instance2index, wbf)
                pickle.dump(self.intent_alphabet.instance2index, wbf)
                pickle.dump(self.label_alphabet.instance2index, wbf)
                pickle.dump(self.label_alphabet.instances, wbf)
                pickle.dump(self.char_alphabet_size, wbf)
                pickle.dump(self.intent_alphabet_size, wbf)
                pickle.dump(self.label_alphabet_size, wbf)

    def read_instance(self):
        """
        Read the full data here without truncation; truncation is done in
        functions.py.
        """
        texts, ids = [], []
        for idx, line in enumerate(self.corpus):
            line = ast.literal_eval(line)
            intent_id_list = []
            # word: '0010000' -> merged into a single label
            seq_char, seq_char_id_list, seq_label, seq_label_id_list = [], [], [], []
            char, char_label, seg_list, intent = line['char'], line[
                'char_label'], line['word'], line['intent']
            # one-hot style lexicon features
            lexicons = []
            # character index ranges
            word_indices = []
            start = 0
            flag = True  # set to False to bail out to the next corpus line
            for word in seg_list:
                if flag is True:
                    end = start + len(word)
                    lexi_feat = []
                    for lexi_type, lb in self.trees.lexi_trees.items():
                        lexi_feat.append(lb.search(word))
                    for n in range(len(lexi_feat)):
                        if lexi_feat[n] is None or lexi_feat[n] == '_STEM_':
                            lexi_feat[n] = 0
                        else:
                            lexi_feat[n] = 1
                    lexi_feat = ''.join([str(i) for i in lexi_feat])
                    lexicons.append(lexi_feat)
                    word_indices.append([start, end])
                    # char
                    # e.g. '0010000'
                    if '1' in lexi_feat:
                        seq_char.append(lexi_feat)
                        seq_char_id_list.append(
                            self.char_alphabet.get_index(lexi_feat))
                        # e.g. ["B-room", "I-room", "I-room"]
                        specific_word_label = char_label[start:end]
                        tmp_label = [
                            swl.split('-')[-1] for swl in specific_word_label
                        ]
                        if len(set(tmp_label)) > 1:
                            # decide whether to filter out this line
                            print('Filtered: %s' % line['text'], word, tmp_label)
                            flag = False
                        else:
                            assert len(set(tmp_label)) == 1
                            if tmp_label[0] == 'O':
                                tmp_label = 'O'
                            else:
                                tmp_label = 'B' + '-' + tmp_label[0]
                            seq_label += [tmp_label]
                            seq_label_id_list += [
                                self.label_alphabet.get_index(tmp_label)
                            ]
                    # e.g. '0000000'
                    else:
                        for c in word:
                            seq_char.append(c)
                            seq_char_id_list.append(
                                self.char_alphabet.get_index(normalize_word(c)))
                        seq_label += char_label[start:end]
                        seq_label_id_list += [
                            self.label_alphabet.get_index(cl)
                            for cl in char_label[start:end]
                        ]
                    start = end
                else:
                    break  # move on to the next corpus line
            intent_id_list.append(self.intent_alphabet.get_index(intent))
            if idx % 10000 == 0:
                logger.info('read instance : %s' % idx)
            if flag is True:
                # text, char, intent, sequence_label
                texts.append([line['text'], seq_char, intent, seq_label])
                ids.append([seq_char_id_list, intent_id_list, seq_label_id_list])
        # save the corpus in its new form to make debugging easier
        output_path = self.data_config['data']['output']
        with open(output_path, 'w') as wf:
            for text in texts:
                line_data = dict()
                line_data['text'] = text[0]
                line_data['char'] = text[1]
                line_data['intent'] = text[2]
                line_data['char_label'] = text[-1]
                wf.write(json.dumps(line_data, ensure_ascii=False) + '\n')
        return texts, ids

    def fix_alphabet(self):
        self.char_alphabet.close()
        self.intent_alphabet.close()
        self.label_alphabet.close()

    # data sampling
    def sample_split(self):
        sampling_rate = self.data_config['sampling_rate']
        indexes = list(range(len(self.ids)))
        random.shuffle(indexes)
        shuffled_texts = [self.texts[i] for i in indexes]
        shuffled_ids = [self.ids[i] for i in indexes]
        logger.info('Top 10 shuffled indexes: %s' % indexes[:10])
        n = int(len(shuffled_ids) * sampling_rate)
        dev_texts, dev_ids = shuffled_texts[:n], shuffled_ids[:n]
        test_texts, test_ids = shuffled_texts[n:2 * n], shuffled_ids[n:2 * n]
        train_texts, train_ids = shuffled_texts[2 * n:], shuffled_ids[2 * n:]
        return train_texts, train_ids, dev_texts, dev_ids, test_texts, test_ids
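
# --- usage sketch (added for illustration; not part of the original file) ---
# Hedged sketch of the two construction modes of the config-driven Data class
# above: training mode reads the yaml config, builds the alphabets, and pickles
# them; inference mode creates closed alphabets and restores that pickle. The
# reload below mirrors the dump order in build_alphabet and assumes Alphabet
# exposes instance2index/instances, as build_alphabet itself does. Paths are
# placeholders.
def _demo_config_data_pipeline():
    # training: builds alphabets, writes alphabet.pkl, splits the corpus
    data = Data("data_config.yaml", "alphabet.pkl", if_train=True)
    print(len(data.train_ids), len(data.dev_ids), len(data.test_ids))

    # inference: alphabets start closed (keep_growing=False) and are restored
    infer = Data("data_config.yaml", "alphabet.pkl", if_train=False)
    with open("alphabet.pkl", 'rb') as rbf:
        infer.char_alphabet.instance2index = pickle.load(rbf)
        infer.intent_alphabet.instance2index = pickle.load(rbf)
        infer.label_alphabet.instance2index = pickle.load(rbf)
        infer.label_alphabet.instances = pickle.load(rbf)
        infer.char_alphabet_size = pickle.load(rbf)
        infer.intent_alphabet_size = pickle.load(rbf)
        infer.label_alphabet_size = pickle.load(rbf)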