def main():
    HOME_DIR = "semeval_parsed"
    input_fname = '200M'
    outdir = HOME_DIR + '_' + input_fname
    print outdir
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    ddir = 'semeval/binary'
    train16 = "task-BD-train-2016.tsv"
    dev2016 = "task-BD-dev-2016.tsv"
    devtest2016 = "task-BD-devtest-2016.tsv"
    test2016 = "SemEval2016-task4-test.subtask-BD.txt"

    fname_vocab = os.path.join(outdir, 'vocab.pickle')
    alphabet = cPickle.load(open(fname_vocab))
    dummy_word_idx = alphabet.fid
    print "alphabet", len(alphabet)
    print 'dummy_word:', dummy_word_idx

    topic_alphabet = Alphabet(start_feature_id=0)
    topic_alphabet.add('UNKNOWN_TOPIC_IDX')
    dummy_topic_idx = topic_alphabet.fid

    print "Loading Semeval Data"
    # save each SemEval split to separate files
    files = [train16, dev2016, devtest2016, test2016]
    for fname in files:
        fname_ext = os.path.join(ddir, fname)
        tid, topics, tweets, sentiments = load_data(fname_ext, topic_alphabet)
        print "Number of tweets:", len(tweets)

        tweet_idx = pts.convert2indices(tweets, alphabet, dummy_word_idx)
        topic_idx = get_topic_indices(tweets, topics, topic_alphabet)

        basename, _ = os.path.splitext(os.path.basename(fname))
        np.save(os.path.join(outdir, '{}.tids.npy'.format(basename)), tid)
        np.save(os.path.join(outdir, '{}.tweets.npy'.format(basename)), tweet_idx)
        np.save(os.path.join(outdir, '{}.sentiments.npy'.format(basename)), sentiments)
        np.save(os.path.join(outdir, '{}.topics.npy'.format(basename)), topic_idx)

    cPickle.dump(
        topic_alphabet,
        open(os.path.join(outdir, 'vocab_{}.pickle'.format('topic')), 'w'))
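# NOTE: the scripts in this file assume an `Alphabet` class mapping symbols to
# integer feature ids. Two constructor styles appear below
# (Alphabet(start_feature_id=0) and Alphabet('word') / Alphabet('label', True)),
# so the exact project class may differ; the following is only a minimal,
# assumed sketch of the shared interface (add/get/fid/size/close/len), not the
# project's implementation. Methods such as iteritems(), clear(), and open()
# used further down are omitted here.
class AlphabetSketch(object):
    def __init__(self, name='alphabet', label=False, start_feature_id=1):
        self.name = name
        self.label = label
        self.instance2index = {}
        self.instances = []
        self.fid = start_feature_id  # next feature id to hand out
        self.keep_growing = True

    def add(self, instance):
        # register a new symbol and assign it the next id
        if self.keep_growing and instance not in self.instance2index:
            self.instance2index[instance] = self.fid
            self.instances.append(instance)
            self.fid += 1

    def get(self, instance):
        # fall back to index 0 for unseen symbols (an assumption)
        return self.instance2index.get(instance, 0)

    def size(self):
        return len(self.instances)

    def close(self):
        # freeze the alphabet so lookups no longer grow it
        self.keep_growing = False

    def __len__(self):
        return len(self.instances)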
def build_alphabet(input_file, train_files, test_files):
    event_alphabet = Alphabet("eventid")
    # deal with the train files
    for train_day in train_files:
        train_path = input_file + "/" + train_day
        files = os.listdir(train_path)  # list all file names under the folder
        for file in files:  # iterate over the mid folder
            in_lines = open(train_path + "/" + file, 'r', encoding='utf-8').readlines()
            for idx in range(len(in_lines)):
                eventid = in_lines[idx].split('\t')[0]
                event_alphabet.add(eventid)
    # deal with the test files
    for test_day in test_files:
        test_path = input_file + "/" + test_day
        files = os.listdir(test_path)  # list all file names under the folder
        for file in files:  # iterate over the mid folder
            in_lines = open(test_path + "/" + file, 'r', encoding='utf-8').readlines()
            for idx in range(len(in_lines)):
                eventid = in_lines[idx].split('\t')[0]
                event_alphabet.add(eventid)
    return event_alphabet
def main():
    outdir = "preprocessed_data"
    out_file = 'vocal_wembext.pickle'
    fname, delimiter, ndim = (
        'embeddings/smiley_tweets_embedding_multilingual300M', ' ', 52)
    word2vec = load_glove_vec(fname, {}, delimiter, ndim)

    alphabet = Alphabet(start_feature_id=0)
    alphabet.add('UNKNOWN_WORD_IDX')
    alphabet.add('DUMMY_WORD_IDX')
    dummy_word_idx = alphabet.get('DUMMY_WORD_IDX')
    for token in word2vec.keys():
        alphabet.add(token)
    print 'Alphabet before purge:', len(alphabet)

    cPickle.dump(alphabet, open(os.path.join(outdir, out_file), 'wb'))
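# Usage sketch (assumption, not part of the project): the pickled alphabet
# written above is meant to be read back and used to map tokens to embedding
# rows, e.g.:
#
#   alphabet = cPickle.load(open('preprocessed_data/vocal_wembext.pickle'))
#   idxs = [alphabet.get(tok) for tok in 'new york city'.split()]
#
# What get() returns for unseen tokens depends on the project's Alphabet;
# here it is assumed to fall back to the UNKNOWN_WORD_IDX entry added first.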
def main():
    data_dir = 'tweets/hashtag_top100_smileys_tweets_{}.gz'
    output_dir_tweets = 'parsed_tweets/hashtag_top100_smiley_tweets_{}.tweets.npy'
    output_dir_hashtags = 'parsed_tweets/hashtag_top100_smiley_tweets_{}.hashtags.npy'
    outdir = 'parsed_tweets'

    alphabet_words = Alphabet(start_feature_id=0)
    alphabet_words.add('UNKNOWN_WORD_IDX')
    alphabet_words.add('DUMMY_WORD_IDX')
    # was a bare, undefined DUMMY_WORD_IDX name; resolved through the
    # alphabet as in the other preprocessing scripts
    dummy_word_idx = alphabet_words.get('DUMMY_WORD_IDX')

    alphabet_hashtags = Alphabet(start_feature_id=0)
    alphabet_hashtags.add('UNKNOWN_HASHTAG_IDX')

    for inp in ('train', 'test'):
        store_file(data_dir.format(inp), output_dir_tweets.format(inp),
                   alphabet_words, alphabet_hashtags, dummy_word_idx,
                   output_dir_hashtags.format(inp))

    cPickle.dump(alphabet_words,
                 open(os.path.join(outdir, 'vocab_words.pickle'), 'w'))
    cPickle.dump(alphabet_hashtags,
                 open(os.path.join(outdir, 'vocab_hashtags.pickle'), 'w'))
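# The .npy arrays written by store_file can later be memory-mapped so training
# does not have to load everything into RAM. A small, assumed usage sketch
# (np.load and mmap_mode are standard NumPy):
import numpy as np

tweets = np.load('parsed_tweets/hashtag_top100_smiley_tweets_train.tweets.npy',
                 mmap_mode='r')
hashtags = np.load('parsed_tweets/hashtag_top100_smiley_tweets_train.hashtags.npy',
                   mmap_mode='r')
assert len(tweets) == len(hashtags)  # one hashtag label per indexed tweet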
# stoplist.update(punct)

# merge inputs to compute word frequencies
_, ext = os.path.splitext(os.path.basename(train))
all_fname = "/tmp/trec-merged" + ext
files = ' '.join([train, dev, test])
subprocess.call("/bin/cat {} > {}".format(files, all_fname), shell=True)

unique_questions, qids, questions, answers, labels = load_data(all_fname, resample=False)
docs = answers + unique_questions
word2dfs = compute_dfs(docs)
print word2dfs.items()[:10]

# map words to ids
alphabet = Alphabet(start_feature_id=0)
alphabet.add('UNKNOWN_WORD_IDX')
add_to_vocab(answers, alphabet)
add_to_vocab(questions, alphabet)
basename = os.path.basename(train)
cPickle.dump(alphabet, open(os.path.join(outdir, 'vocab.pickle'), 'w'))
print "alphabet size=", len(alphabet)

# dump embedding file
dummy_word_idx = alphabet.fid
dump_embedding(outdir, 'embeddings/aquaint+wiki.txt.gz.ndim=50.bin', alphabet)

# summarize max sentence length
q_max_sent_length = max(map(lambda x: len(x), questions))
a_max_sent_length = max(map(lambda x: len(x), answers))
print 'q_max_sent_length', q_max_sent_length
print 'a_max_sent_length', a_max_sent_length
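# compute_dfs itself is not shown in this file. A minimal sketch consistent
# with how it is used above (mapping each word to its document frequency over
# tokenized docs) could look like the following; the name and the assumption
# that each doc is a list of tokens are mine, not the project's:
from collections import Counter

def compute_dfs_sketch(docs):
    """Count, for each word, the number of documents it appears in."""
    df = Counter()
    for doc in docs:
        for word in set(doc):  # count each word once per document
            df[word] += 1
    return dict(df)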
def main(args):
    if not os.path.exists(args.test_eval_dir):
        os.makedirs(args.test_eval_dir)
    if not os.path.exists(args.eval_dir):
        os.makedirs(args.eval_dir)
    if not os.path.exists(args.model_dir):
        os.makedirs(args.model_dir)

    #### print config ####
    print(args)

    #### add labels ####
    label_alphabet = Alphabet('label', True)
    label_alphabet.add("O")
    label_alphabet.add("B-T")
    label_alphabet.add("I-T")
    label_alphabet.add("B-P")
    label_alphabet.add("I-P")

    # read data
    print("Loading data....")
    datasets = torch.load(args.data)
    train_set = datasets["train"]
    test_set = datasets["test"]
    train_dataloader = read_data(train_set, "train", args.batchSize)
    eval_dataloader = read_data(test_set, "test", args.batchSize)

    #### load BERT config ####
    print("Loading BERT config....")
    bert_config = BertConfig.from_json_file(args.bert_json_dir)

    #### define model ####
    model = opinionMining(args, bert_config, label_alphabet)

    if args.mode == "test":
        assert args.test_model != ""
        model = torch.load(args.test_model)
        test_start = time.time()
        # evaluate
        RP, RR, RF, TP, TR, TF, OP, OR, OF = evaluate(
            eval_dataloader, test_set, model,
            args.test_eval_dir + "/test_output", args)
        test_finish = time.time()
        test_cost = test_finish - test_start
        print("test: time: %.2fs, speed: %.2fst/s" % (test_cost, 0))
        print("relation result: Precision: %.4f; Recall: %.4f; F1: %.4f" % (RP, RR, RF))
        print("target result: Precision: %.4f; Recall: %.4f; F1: %.4f" % (TP, TR, TF))
        print("opinion result: Precision: %.4f; Recall: %.4f; F1: %.4f" % (OP, OR, OF))
    else:
        print("Loading model from pretrained checkpoint: " + args.bert_checkpoint_dir)
        model = bert_load_state_dict(
            model, torch.load(args.bert_checkpoint_dir, map_location='cpu'))

        #### define optimizers ####
        num_train_steps = int(len(train_set) / args.batchSize * args.iteration)
        param_optimizer = list(model.named_parameters())
        optimizer_grouped_parameters = [{
            'params': [p for n, p in param_optimizer if "bert" in n],
            'weight_decay': 0.01
        }, {
            'params': [p for n, p in param_optimizer if "bert" not in n],
            'lr': args.lr_rate,
            'weight_decay': 0.01
        }]
        optimizer_grouped_parameters_r = [{
            'params': [p for n, p in param_optimizer if "bert" in n],
            'weight_decay': 0.01
        }, {
            'params': [p for n, p in param_optimizer if "relation" in n],
            'lr': args.R_lr_rate,
            'weight_decay': 0.01
        }]
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=2e-05, warmup=0.1, t_total=num_train_steps)
        optimizer_r = BertAdam(optimizer_grouped_parameters_r,
                               lr=2e-05, warmup=0.1, t_total=num_train_steps)

        #### train ####
        print("start training......")
        best_Score = -10000
        lr = args.lr_rate
        for idx in range(args.iteration):
            epoch_start = time.time()
            temp_start = epoch_start
            print("Epoch: %s/%s" % (idx, args.iteration))
            if idx > 10:
                lr = lr * args.lr_decay
                print(lr)
                optimizer.param_groups[1]["lr"] = lr
                optimizer_r.param_groups[1]["lr"] = lr

            sample_loss = 0
            total_loss = 0
            right_target_token = 0
            whole_target_token = 0
            right_relation_token = 0
            whole_relation_token = 0

            model.train()
            model.zero_grad()
            for step, batch in enumerate(train_dataloader):
                if args.ifgpu:
                    batch = tuple(t.cuda() for t in batch)
                all_input_ids, all_input_mask, all_segment_ids, all_relations, all_labels = batch
                # truncate the batch to its longest real sequence
                max_seq_len = torch.max(torch.sum(all_input_mask, dim=1))
                all_input_ids = all_input_ids[:, :max_seq_len].contiguous()
                all_input_mask = all_input_mask[:, :max_seq_len].contiguous()
                all_segment_ids = all_segment_ids[:, :max_seq_len].contiguous()
                all_relations = all_relations[:, :max_seq_len, :max_seq_len].contiguous()
                all_labels = all_labels[:, :max_seq_len].contiguous()

                tloss, rloss, targetPredict, relationPredict = model.neg_log_likelihood_loss(
                    all_input_ids, all_segment_ids, all_labels, all_relations, all_input_mask)

                # check right number
                targetRight, targetWhole = targetPredictCheck(
                    targetPredict, all_labels, all_input_mask)
                relationRight, relationWhole = relationPredictCheck(
                    relationPredict, all_relations)

                # accumulate right and whole label counts
                right_target_token += targetRight
                whole_target_token += targetWhole
                right_relation_token += relationRight
                whole_relation_token += relationWhole

                # accumulate loss
                sample_loss += rloss.data[0] + tloss.data[0]
                total_loss += rloss.data[0] + tloss.data[0]

                # print train info
                if step % 20 == 0:
                    temp_time = time.time()
                    temp_cost = temp_time - temp_start
                    temp_start = temp_time
                    print(" Instance: %s; Time: %.2fs; loss: %.4f; target_acc: %s/%s=%.4f; relation_acc: %s/%s=%.4f"
                          % (step * args.batchSize, temp_cost, sample_loss,
                             right_target_token, whole_target_token,
                             (right_target_token + 0.) / whole_target_token,
                             right_relation_token, whole_relation_token,
                             (right_relation_token + 0.) / whole_relation_token))
                    if sample_loss > 1e8 or str(sample_loss) == "nan":
                        print("ERROR: LOSS EXPLOSION (>1e8) ! PLEASE SET PROPER PARAMETERS AND STRUCTURE! EXIT....")
                        exit(1)
                    sys.stdout.flush()
                    sample_loss = 0

                if step % 2 == 0:
                    loss = 9 * rloss + tloss
                    # the backward call was commented out in the original;
                    # without it the joint step would apply stale gradients
                    loss.backward()
                    optimizer.step()
                    optimizer.zero_grad()
                else:
                    rloss.backward()
                    optimizer_r.step()
                    optimizer_r.zero_grad()

            temp_time = time.time()
            temp_cost = temp_time - temp_start
            print(" Instance: %s; Time: %.2fs; loss: %.4f; target_acc: %s/%s=%.4f; relation_acc: %s/%s=%.4f"
                  % (step * args.batchSize, temp_cost, sample_loss,
                     right_target_token, whole_target_token,
                     (right_target_token + 0.) / whole_target_token,
                     right_relation_token, whole_relation_token,
                     (right_relation_token + 0.) / whole_relation_token))

            epoch_finish = time.time()
            epoch_cost = epoch_finish - epoch_start
            print("Epoch: %s training finished. Time: %.2fs, speed: %.2fst/s, total loss: %s"
                  % (idx, epoch_cost, len(train_set) / epoch_cost, total_loss))
            print("totalloss:", total_loss)
            if total_loss > 1e8 or str(total_loss) == "nan":
                print("ERROR: LOSS EXPLOSION (>1e8) ! PLEASE SET PROPER PARAMETERS AND STRUCTURE! EXIT....")
                exit(1)

            # evaluate
            RP, RR, RF, TP, TR, TF, OP, OR, OF = evaluate(
                eval_dataloader, test_set, model,
                args.eval_dir + "/test_output_" + str(idx), args)
            test_finish = time.time()
            test_cost = test_finish - epoch_finish
            current_Score = RF
            print("test: time: %.2fs, speed: %.2fst/s" % (test_cost, 0))
            print("relation result: Precision: %.4f; Recall: %.4f; F1: %.4f" % (RP, RR, RF))
            print("target result: Precision: %.4f; Recall: %.4f; F1: %.4f" % (TP, TR, TF))
            print("opinion result: Precision: %.4f; Recall: %.4f; F1: %.4f" % (OP, OR, OF))

            if current_Score > best_Score:
                print("Exceed previous best f score with target f: %.4f and opinion f: %.4f and relation f: %.4f"
                      % (TF, OF, RF))
                model_name = args.model_dir + "/modelFinal.model"
                print("Save current best model in file:", model_name)
                torch.save(model, model_name)
                best_Score = current_Score

            gc.collect()
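# Design note: the loop above alternates a joint update (weighted relation +
# target loss, 9 * rloss + tloss, on even steps) with a relation-only update
# (odd steps), each driven by its own BertAdam parameter grouping. A
# stripped-down, self-contained toy of that schedule (plain SGD and dummy
# losses, purely illustrative and not the project's model):
import torch

w = torch.nn.Parameter(torch.zeros(2))
opt_joint = torch.optim.SGD([w], lr=0.1)
opt_rel = torch.optim.SGD([w], lr=0.1)
for step in range(4):
    t_loss = (w[0] - 1.0) ** 2  # stand-in for the target (sequence-label) loss
    r_loss = (w[1] - 1.0) ** 2  # stand-in for the relation loss
    if step % 2 == 0:
        (9 * r_loss + t_loss).backward()
        opt_joint.step()
        opt_joint.zero_grad()
    else:
        r_loss.backward()
        opt_rel.step()
        opt_rel.zero_grad()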
class Data:
    def __init__(self):
        self.MAX_SENTENCE_LENGTH = 250
        self.MAX_WORD_LENGTH = -1
        self.number_normalized = True
        self.norm_word_emb = False
        self.norm_char_emb = False
        self.word_alphabet = Alphabet('word')
        self.char_alphabet = Alphabet('character')
        # self.word_alphabet.add(START)
        # self.word_alphabet.add(UNKNOWN)
        # self.char_alphabet.add(START)
        # self.char_alphabet.add(UNKNOWN)
        # self.char_alphabet.add(PADDING)
        self.label_alphabet = Alphabet('label', True)
        self.tagScheme = "NoSeg"
        self.char_features = "LSTM"  ## "LSTM"/"CNN"

        self.train_texts = []
        self.dev_texts = []
        self.test_texts = []
        self.raw_texts = []
        self.train_Ids = []
        self.dev_Ids = []
        self.test_Ids = []
        self.raw_Ids = []

        self.word_emb_dim = 50
        self.char_emb_dim = 30
        self.pretrain_word_embedding = None
        self.pretrain_char_embedding = None
        self.label_size = 0
        self.word_alphabet_size = 0
        self.char_alphabet_size = 0
        self.label_alphabet_size = 0

        ### hyperparameters
        self.HP_iteration = 100
        self.HP_batch_size = 10
        self.HP_average_batch_loss = False
        self.HP_char_hidden_dim = 50
        self.HP_hidden_dim = 50
        self.HP_dropout = 0.5
        self.HP_lstm_layer = 1
        self.HP_bilstm = True
        self.HP_use_char = False
        self.HP_gpu = False
        self.HP_lr = 0.015
        self.HP_lr_decay = 0.05
        self.HP_clip = None
        self.HP_momentum = 0

    def show_data_summary(self):
        print("DATA SUMMARY START:")
        print(" Tag scheme: %s" % (self.tagScheme))
        print(" MAX SENTENCE LENGTH: %s" % (self.MAX_SENTENCE_LENGTH))
        print(" MAX WORD LENGTH: %s" % (self.MAX_WORD_LENGTH))
        print(" Number normalized: %s" % (self.number_normalized))
        print(" Word alphabet size: %s" % (self.word_alphabet_size))
        print(" Char alphabet size: %s" % (self.char_alphabet_size))
        print(" Label alphabet size: %s" % (self.label_alphabet_size))
        print(" Word embedding size: %s" % (self.word_emb_dim))
        print(" Char embedding size: %s" % (self.char_emb_dim))
        print(" Norm word emb: %s" % (self.norm_word_emb))
        print(" Norm char emb: %s" % (self.norm_char_emb))
        print(" Train instance number: %s" % (len(self.train_texts)))
        print(" Dev instance number: %s" % (len(self.dev_texts)))
        print(" Test instance number: %s" % (len(self.test_texts)))
        print(" Raw instance number: %s" % (len(self.raw_texts)))
        print(" Hyper iteration: %s" % (self.HP_iteration))
        print(" Hyper batch size: %s" % (self.HP_batch_size))
        print(" Hyper average batch: %s" % (self.HP_average_batch_loss))
        print(" Hyper lr: %s" % (self.HP_lr))
        print(" Hyper lr_decay: %s" % (self.HP_lr_decay))
        print(" Hyper HP_clip: %s" % (self.HP_clip))
        print(" Hyper momentum: %s" % (self.HP_momentum))
        print(" Hyper hidden_dim: %s" % (self.HP_hidden_dim))
        print(" Hyper dropout: %s" % (self.HP_dropout))
        print(" Hyper lstm_layer: %s" % (self.HP_lstm_layer))
        print(" Hyper bilstm: %s" % (self.HP_bilstm))
        print(" Hyper GPU: %s" % (self.HP_gpu))
        print(" Hyper use_char: %s" % (self.HP_use_char))
        if self.HP_use_char:
            print(" Char_features: %s" % (self.char_features))
        print("DATA SUMMARY END.")
        sys.stdout.flush()

    def refresh_label_alphabet(self, input_file):
        old_size = self.label_alphabet_size
        self.label_alphabet.clear(True)
        in_lines = open(input_file, 'r').readlines()
        for line in in_lines:
            if len(line) > 2:
                pairs = line.strip().split()
                label = pairs[-1]
                self.label_alphabet.add(label)
        self.label_alphabet_size = self.label_alphabet.size()
        startS = False
        startB = False
        for label, _ in self.label_alphabet.iteritems():
            if "S-" in label.upper():
                startS = True
            elif "B-" in label.upper():
                startB = True
        if startB:
            if startS:
                self.tagScheme = "BMES"
            else:
                self.tagScheme = "BIO"
        self.fix_alphabet()
        print("Refresh label alphabet finished: old:%s -> new:%s"
              % (old_size, self.label_alphabet_size))

    def extend_word_char_alphabet(self, input_file_list):
        old_word_size = self.word_alphabet_size
        old_char_size = self.char_alphabet_size
        for input_file in input_file_list:
            in_lines = open(input_file, 'r').readlines()
            for line in in_lines:
                if len(line) > 2:
                    pairs = line.strip().split()
                    word = pairs[0]
                    if self.number_normalized:
                        word = normalize_word(word)
                    self.word_alphabet.add(word)
                    for char in word:
                        self.char_alphabet.add(char)
        self.word_alphabet_size = self.word_alphabet.size()
        self.char_alphabet_size = self.char_alphabet.size()
        print("Extend word/char alphabet finished!")
        print(" old word:%s -> new word:%s" % (old_word_size, self.word_alphabet_size))
        print(" old char:%s -> new char:%s" % (old_char_size, self.char_alphabet_size))
        for input_file in input_file_list:
            print(" from file:%s" % (input_file))

    def build_alphabet(self, input_file):
        in_lines_string = open(input_file + ".string.txt", 'r').readlines()
        in_lines_label = open(input_file + ".label.txt", 'r').readlines()
        for line_string, line_label in zip(in_lines_string, in_lines_label):
            # debug prints left from development
            print(line_label)
            print(line_string)
            line_label = line_label[:-1].split(',')
            line_string = line_string[:-1]
            assert len(line_label) == len(line_string)
            for i in range(len(line_label)):
                self.label_alphabet.add(line_label[i])
                self.word_alphabet.add(line_string[i])
        self.char_alphabet.add("*")
        self.word_alphabet_size = self.word_alphabet.size()
        self.char_alphabet_size = self.char_alphabet.size()
        self.label_alphabet_size = self.label_alphabet.size()
        startS = False
        startB = False
        for label, _ in self.label_alphabet.iteritems():
            if "S-" in label.upper():
                startS = True
            elif "B-" in label.upper():
                startB = True
        if startB:
            if startS:
                self.tagScheme = "BMES"
            else:
                self.tagScheme = "BIO"

    def fix_alphabet(self):
        self.word_alphabet.close()
        self.char_alphabet.close()
        self.label_alphabet.close()

    def build_word_pretrain_emb(self, emb_path):
        self.pretrain_word_embedding, self.word_emb_dim = build_pretrain_embedding(
            emb_path, self.word_alphabet, self.word_emb_dim, self.norm_word_emb)

    def build_char_pretrain_emb(self, emb_path):
        self.pretrain_char_embedding, self.char_emb_dim = build_pretrain_embedding(
            emb_path, self.char_alphabet, self.char_emb_dim, self.norm_char_emb)

    def generate_instance(self, input_file, name):
        self.fix_alphabet()
        if name == "train":
            self.train_texts, self.train_Ids = read_instance(
                input_file, self.word_alphabet, self.char_alphabet,
                self.label_alphabet, self.number_normalized,
                self.MAX_SENTENCE_LENGTH)
        elif name == "dev":
            self.dev_texts, self.dev_Ids = read_instance(
                input_file, self.word_alphabet, self.char_alphabet,
                self.label_alphabet, self.number_normalized,
                self.MAX_SENTENCE_LENGTH)
        elif name == "test":
            self.test_texts, self.test_Ids = read_instance(
                input_file, self.word_alphabet, self.char_alphabet,
                self.label_alphabet, self.number_normalized,
                self.MAX_SENTENCE_LENGTH)
        elif name == "raw":
            self.raw_texts, self.raw_Ids = read_instance(
                input_file, self.word_alphabet, self.char_alphabet,
                self.label_alphabet, self.number_normalized,
                self.MAX_SENTENCE_LENGTH)
        else:
            print("Error: you can only generate train/dev/test/raw instance! Illegal input:%s" % (name))

    def write_decoded_results(self, output_file, predict_results, name):
        fout = open(output_file, 'w')
        sent_num = len(predict_results)
        content_list = []
        if name == 'raw':
            content_list = self.raw_texts
        elif name == 'test':
            content_list = self.test_texts
        elif name == 'dev':
            content_list = self.dev_texts
        elif name == 'train':
            content_list = self.train_texts
        else:
            print("Error: illegal name during writing predict result, name should be within train/dev/test/raw !")
        assert (sent_num == len(content_list))
        for idx in range(sent_num):
            sent_length = len(predict_results[idx])
            for idy in range(sent_length):
                ## content_list[idx] is a list with [word, char, label]
                fout.write(content_list[idx][0][idy].encode('utf-8') + " "
                           + predict_results[idx][idy] + '\n')
            fout.write('\n')
        fout.close()
        print("Predict %s result has been written into file. %s" % (name, output_file))
class Data:
    def __init__(self):
        self.MAX_SENTENCE_LENGTH = 250
        self.number_normalized = True
        self.word_alphabet = Alphabet('word')
        self.char_alphabet = Alphabet('character')
        self.feature_name = []
        self.feature_alphabets = []
        self.feature_num = len(self.feature_alphabets)
        self.feat_config = None
        self.feature_name2id = {}
        self.label_alphabet = Alphabet('label', True)
        self.tagScheme = "BMES"

        ### I/O
        self.train_dir = None
        self.dev_dir = None
        self.test_dir = None
        self.word_emb_dir = None

        self.train_texts = []
        self.dev_texts = []
        self.test_texts = []
        self.train_Ids = []
        self.dev_Ids = []
        self.test_Ids = []

        self.pretrain_word_embedding = None
        self.label_size = 0
        self.word_alphabet_size = 0
        self.char_alphabet_size = 0
        self.label_alphabet_size = 0
        self.feature_alphabet_sizes = []
        self.feature_emb_dims = []
        self.word_emb_dim = 50
        self.char_emb_dim = 30

        self.nbest = None
        self.HP_iteration = 100
        self.HP_batch_size = 10
        self.HP_char_hidden_dim = 50
        self.HP_hidden_dim = 200
        self.HP_dropout = 0.5
        self.HP_gpu = False
        self.HP_lr = 0.015
        self.HP_l2 = 1e-8

        # both
        self.full_data = False
        self.tune_wordemb = False

        # relation
        self.max_seq_len = 500
        self.pad_idx = 0
        self.sent_window = 3
        # self.output = None
        self.unk_ratio = 1
        self.seq_feature_size = 256
        self.re_feature_name = []
        self.re_feature_name2id = {}
        self.re_feature_alphabets = []
        self.re_feature_num = len(self.re_feature_alphabets)
        self.re_feat_config = None
        self.re_feature_emb_dims = []
        self.re_feature_alphabet_sizes = []
        self.re_train_X = []
        self.re_dev_X = []
        self.re_test_X = []
        self.re_train_Y = []
        self.re_dev_Y = []
        self.re_test_Y = []
        self.patience = 10
        # self.pretrained_model_dir = None

    def copy_alphabet(self, other):
        self.word_alphabet = copy.deepcopy(other.word_alphabet)
        self.char_alphabet = copy.deepcopy(other.char_alphabet)
        # note: feature_alphabets is deep-copied again below, so this loop is
        # redundant; kept as in the original
        for feature_alphabet in other.feature_alphabets:
            self.feature_alphabets.append(copy.deepcopy(feature_alphabet))
        self.label_alphabet = copy.deepcopy(other.label_alphabet)
        self.feature_name = copy.deepcopy(other.feature_name)
        self.feature_alphabets = copy.deepcopy(other.feature_alphabets)
        self.feature_num = len(self.feature_alphabets)
        self.feature_name2id = copy.deepcopy(other.feature_name2id)
        self.feature_alphabet_sizes = copy.deepcopy(other.feature_alphabet_sizes)
        self.feature_emb_dims = copy.deepcopy(other.feature_emb_dims)

        for re_feature_alphabet in other.re_feature_alphabets:
            self.re_feature_alphabets.append(copy.deepcopy(re_feature_alphabet))
        self.re_feature_name = copy.deepcopy(other.re_feature_name)
        self.re_feature_name2id = copy.deepcopy(other.re_feature_name2id)
        self.re_feature_alphabets = copy.deepcopy(other.re_feature_alphabets)
        self.re_feature_num = len(self.re_feature_alphabets)
        self.re_feature_emb_dims = copy.deepcopy(other.re_feature_emb_dims)
        self.re_feature_alphabet_sizes = copy.deepcopy(other.re_feature_alphabet_sizes)

    def show_data_summary(self):
        print("++" * 50)
        print("DATA SUMMARY START:")
        print(" Tag scheme: %s" % (self.tagScheme))
        print(" MAX SENTENCE LENGTH: %s" % (self.MAX_SENTENCE_LENGTH))
        print(" Number normalized: %s" % (self.number_normalized))
        print(" Word alphabet size: %s" % (self.word_alphabet_size))
        print(" Char alphabet size: %s" % (self.char_alphabet_size))
        print(" Label alphabet size: %s" % (self.label_alphabet_size))
        print(" Word embedding dir: %s" % (self.word_emb_dir))
        print(" Word embedding size: %s" % (self.word_emb_dim))
        print(" Char embedding size: %s" % (self.char_emb_dim))
        print(" Train file directory: %s" % (self.train_dir))
        print(" Dev file directory: %s" % (self.dev_dir))
        print(" Test file directory: %s" % (self.test_dir))
        print(" Train instance number: %s" % (len(self.train_texts)))
        print(" Dev instance number: %s" % (len(self.dev_texts)))
        print(" Test instance number: %s" % (len(self.test_texts)))
        print(" FEATURE num: %s" % (self.feature_num))
        for idx in range(self.feature_num):
            print(" Fe: %s alphabet size: %s"
                  % (self.feature_alphabets[idx].name, self.feature_alphabet_sizes[idx]))
            print(" Fe: %s embedding size: %s"
                  % (self.feature_alphabets[idx].name, self.feature_emb_dims[idx]))
        print(" Model char_hidden_dim: %s" % (self.HP_char_hidden_dim))
        print(" Iteration: %s" % (self.HP_iteration))
        print(" BatchSize: %s" % (self.HP_batch_size))
        print(" Hyper lr: %s" % (self.HP_lr))
        print(" Hyper l2: %s" % (self.HP_l2))
        print(" Hyper hidden_dim: %s" % (self.HP_hidden_dim))
        print(" Hyper dropout: %s" % (self.HP_dropout))
        print(" Hyper GPU: %s" % (self.HP_gpu))
        print(" Hyper NBEST: %s" % (self.nbest))
        print(" full data: %s" % (self.full_data))
        print(" Tune word embeddings: %s" % (self.tune_wordemb))
        print(" max sequence length: %s" % (self.max_seq_len))
        print(" pad index: %s" % (self.pad_idx))
        print(" patience: %s" % (self.patience))
        print(" sentence window: %s" % (self.sent_window))
        # print(" Output directory: %s" % (self.output))
        print(" The ratio of negative instances used (0~1): %s" % (self.unk_ratio))
        print(" Size of sequence feature representation: %s" % (self.seq_feature_size))
        print(" RE FEATURE num: %s" % (self.re_feature_num))
        for idx in range(self.re_feature_num):
            print(" Fe: %s alphabet size: %s"
                  % (self.re_feature_alphabets[idx].name, self.re_feature_alphabet_sizes[idx]))
            print(" Fe: %s embedding size: %s"
                  % (self.re_feature_alphabets[idx].name, self.re_feature_emb_dims[idx]))
        print(" RE Train instance number: %s" % (len(self.re_train_Y)))
        print(" RE Dev instance number: %s" % (len(self.re_dev_Y)))
        print(" RE Test instance number: %s" % (len(self.re_test_Y)))
        # print(" pretrained_model_dir: %s" % (self.pretrained_model_dir))
        print("DATA SUMMARY END.")
        print("++" * 50)
        sys.stdout.flush()

    def initial_feature_alphabets(self):
        feature_prefix = '[Cap]'
        self.feature_alphabets.append(Alphabet(feature_prefix))
        self.feature_name.append(feature_prefix)
        self.feature_name2id[feature_prefix] = 0

        feature_prefix = '[POS]'
        self.feature_alphabets.append(Alphabet(feature_prefix))
        self.feature_name.append(feature_prefix)
        self.feature_name2id[feature_prefix] = 1

        self.feature_num = len(self.feature_alphabets)
        self.feature_emb_dims = [20] * self.feature_num
        self.feature_alphabet_sizes = [0] * self.feature_num
        if self.feat_config:
            for idx in range(self.feature_num):
                if self.feature_name[idx] in self.feat_config:
                    self.feature_emb_dims[idx] = self.feat_config[self.feature_name[idx]]['emb_size']

    def build_alphabet(self, documents):
        for doc in documents:
            for sentence in doc:
                for token in sentence:
                    word = token['word']
                    if self.number_normalized:
                        word = normalize_word(word)
                    label = token['label']
                    self.label_alphabet.add(label)
                    self.word_alphabet.add(word)
                    ## build feature alphabets
                    self.feature_alphabets[0].add(token['cap'])
                    self.feature_alphabets[1].add(token['pos'])
                    for char in word:
                        self.char_alphabet.add(char)
        self.word_alphabet_size = self.word_alphabet.size()
        self.char_alphabet_size = self.char_alphabet.size()
        self.label_alphabet_size = self.label_alphabet.size()
        for idx in range(self.feature_num):
            self.feature_alphabet_sizes[idx] = self.feature_alphabets[idx].size()

    def fix_alphabet(self):
        self.word_alphabet.close()
        self.char_alphabet.close()
        self.label_alphabet.close()
        for idx in range(self.feature_num):
            self.feature_alphabets[idx].close()

    def open_alphabet(self):
        self.word_alphabet.open()
        self.char_alphabet.open()
        # label alphabet stays closed
        # self.label_alphabet.open()
        for idx in range(self.feature_num):
            self.feature_alphabets[idx].open()

    def initial_re_feature_alphabets(self):
        id = 0
        for k, v in self.re_feat_config.items():
            self.re_feature_alphabets.append(Alphabet(k))
            self.re_feature_name.append(k)
            self.re_feature_name2id[k] = id
            id += 1
        self.re_feature_num = len(self.re_feature_alphabets)
        self.re_feature_emb_dims = [20] * self.re_feature_num
        self.re_feature_alphabet_sizes = [0] * self.re_feature_num
        if self.re_feat_config:
            for idx in range(self.re_feature_num):
                if self.re_feature_name[idx] in self.re_feat_config:
                    self.re_feature_emb_dims[idx] = self.re_feat_config[self.re_feature_name[idx]]['emb_size']

    def build_re_feature_alphabets(self, tokens, entities, relations):
        entity_type_alphabet = self.re_feature_alphabets[self.re_feature_name2id['[ENTITY_TYPE]']]
        entity_alphabet = self.re_feature_alphabets[self.re_feature_name2id['[ENTITY]']]
        relation_alphabet = self.re_feature_alphabets[self.re_feature_name2id['[RELATION]']]
        token_num_alphabet = self.re_feature_alphabets[self.re_feature_name2id['[TOKEN_NUM]']]
        entity_num_alphabet = self.re_feature_alphabets[self.re_feature_name2id['[ENTITY_NUM]']]
        position_alphabet = self.re_feature_alphabets[self.re_feature_name2id['[POSITION]']]

        for i, doc_token in enumerate(tokens):
            doc_entity = entities[i]
            doc_relation = relations[i]

            sent_idx = 0
            sentence = doc_token[(doc_token['sent_idx'] == sent_idx)]
            while sentence.shape[0] != 0:
                entities_in_sentence = doc_entity[(doc_entity['sent_idx'] == sent_idx)]
                for _, entity in entities_in_sentence.iterrows():
                    entity_type_alphabet.add(entity['type'])
                    tk_idx = entity['tf_start']
                    while tk_idx <= entity['tf_end']:
                        # assume 'text' is in column 0
                        entity_alphabet.add(my_utils1.normalizeWord(sentence.iloc[tk_idx, 0]))
                        tk_idx += 1
                sent_idx += 1
                sentence = doc_token[(doc_token['sent_idx'] == sent_idx)]

            for _, relation in doc_relation.iterrows():
                relation_alphabet.add(relation['type'])

        # was range(data.max_seq_len), which leaked a module-level global
        for i in range(self.max_seq_len):
            token_num_alphabet.add(i)
            entity_num_alphabet.add(i)
            position_alphabet.add(i)
            position_alphabet.add(-i)

        for idx in range(self.re_feature_num):
            self.re_feature_alphabet_sizes[idx] = self.re_feature_alphabets[idx].size()

    def fix_re_alphabet(self):
        for alphabet in self.re_feature_alphabets:
            alphabet.close()

    def open_re_alphabet(self):
        for alphabet in self.re_feature_alphabets:
            if alphabet.name == '[RELATION]':  # label alphabet stays closed
                continue
            alphabet.open()

    def build_pretrain_emb(self):
        if self.word_emb_dir:
            logging.info("Load pretrained word embedding, dir: %s" % (self.word_emb_dir))
            self.pretrain_word_embedding, self.word_emb_dim = build_pretrain_embedding(
                self.word_emb_dir, self.word_alphabet, self.word_emb_dim)

    def generate_instance(self, name, documents):
        self.fix_alphabet()
        if name == "train":
            self.train_texts, self.train_Ids = read_instance(
                documents, self.word_alphabet, self.char_alphabet,
                self.feature_alphabets, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "dev":
            self.dev_texts, self.dev_Ids = read_instance(
                documents, self.word_alphabet, self.char_alphabet,
                self.feature_alphabets, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        elif name == "test":
            self.test_texts, self.test_Ids = read_instance(
                documents, self.word_alphabet, self.char_alphabet,
                self.feature_alphabets, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH)
        else:
            logging.info("Error: you can only generate train/dev/test instance! Illegal input:%s" % (name))

    def generate_re_instance(self, name, tokens, entities, relations, names):
        self.fix_re_alphabet()
        if name == "train":
            self.re_train_X, self.re_train_Y = relation_extraction.getRelationInstance2(
                tokens, entities, relations, names, self)
        elif name == "dev":
            self.re_dev_X, self.re_dev_Y = relation_extraction.getRelationInstance2(
                tokens, entities, relations, names, self)
        elif name == "test":
            self.re_test_X, self.re_test_Y = relation_extraction.getRelationInstance2(
                tokens, entities, relations, names, self)
        else:
            logging.info("Error: you can only generate train/dev/test instance! Illegal input:%s" % (name))

    def load(self, data_file):
        f = open(data_file, 'rb')
        tmp_dict = pickle.load(f)
        f.close()
        self.__dict__.update(tmp_dict)

    def save(self, save_file):
        f = open(save_file, 'wb')
        pickle.dump(self.__dict__, f, 2)
        f.close()

    def clear_data(self):
        self.train_texts = []
        self.dev_texts = []
        self.test_texts = []
        self.train_Ids = []
        self.dev_Ids = []
        self.test_Ids = []
        self.re_train_X = []
        self.re_dev_X = []
        self.re_test_X = []
        self.re_train_Y = []
        self.re_dev_Y = []
        self.re_test_Y = []
        self.pretrain_word_embedding = None

    def read_config(self, config_file, opt):
        config = config_file_to_dict(config_file)
        ## read data:
        self.train_dir = opt.train_dir
        self.dev_dir = opt.dev_dir
        self.test_dir = opt.test_dir
        self.word_emb_dir = opt.word_emb_file

        the_item = 'MAX_SENTENCE_LENGTH'
        if the_item in config:
            self.MAX_SENTENCE_LENGTH = int(config[the_item])
        the_item = 'number_normalized'
        if the_item in config:
            self.number_normalized = str2bool(config[the_item])
        the_item = 'word_emb_dim'
        if the_item in config:
            self.word_emb_dim = int(config[the_item])
        the_item = 'char_emb_dim'
        if the_item in config:
            self.char_emb_dim = int(config[the_item])
        the_item = 'nbest'
        if the_item in config:
            self.nbest = int(config[the_item])
        the_item = 'feature'
        if the_item in config:
            self.feat_config = config[the_item]  ## feat_config is a dict
        the_item = 'iteration'
        if the_item in config:
            self.HP_iteration = int(config[the_item])
        the_item = 'batch_size'
        if the_item in config:
            self.HP_batch_size = int(config[the_item])
        the_item = 'char_hidden_dim'
        if the_item in config:
            self.HP_char_hidden_dim = int(config[the_item])
        the_item = 'hidden_dim'
        if the_item in config:
            self.HP_hidden_dim = int(config[the_item])
        the_item = 'dropout'
        if the_item in config:
            self.HP_dropout = float(config[the_item])
        the_item = 'gpu'
        if the_item in config:
            self.HP_gpu = int(config[the_item])
        the_item = 'learning_rate'
        if the_item in config:
            self.HP_lr = float(config[the_item])
        the_item = 'l2'
        if the_item in config:
            self.HP_l2 = float(config[the_item])

        # both
        the_item = 'full_data'
        if the_item in config:
            self.full_data = str2bool(config[the_item])
        the_item = 'tune_wordemb'
        if the_item in config:
            self.tune_wordemb = str2bool(config[the_item])

        the_item = 'max_seq_len'
        if the_item in config:
            self.max_seq_len = int(config[the_item])
        the_item = 'pad_idx'
        if the_item in config:
            self.pad_idx = int(config[the_item])
        the_item = 'sent_window'
        if the_item in config:
            self.sent_window = int(config[the_item])
        # the_item = 'output'
        # if the_item in config:
        #     self.output = config[the_item]
        the_item = 'unk_ratio'
        if the_item in config:
            self.unk_ratio = float(config[the_item])
        the_item = 'seq_feature_size'
        if the_item in config:
            self.seq_feature_size = int(config[the_item])
        the_item = 're_feature'
        if the_item in config:
            self.re_feat_config = config[the_item]  ## re_feat_config is a dict
        the_item = 'patience'
        if the_item in config:
            self.patience = int(config[the_item])
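# read_config consumes the dict produced by config_file_to_dict. Assuming a
# simple one-option-per-line key=value format (an assumption -- the parser
# itself is not shown in this file), a matching config could look like:
#
#   MAX_SENTENCE_LENGTH=250
#   word_emb_dim=50
#   char_emb_dim=30
#   batch_size=10
#   learning_rate=0.015
#   max_seq_len=500
#   sent_window=3
#   patience=10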
class Data:
    def __init__(self):
        self.substring_names = ['word', 'pos', 'char', 'bpe', 'word-pos']
        self.substring_maxlen = 10
        self.MAX_SENTENCE_LENGTH = 250
        self.MAX_WORD_LENGTH = -1
        self.number_normalized = True
        self.norm_word_emb = False
        self.norm_char_emb = False
        self.norm_trans_emb = False
        self.word_alphabet = Alphabet('word')
        self.char_alphabet = Alphabet('character')
        self.translation_alphabet = Alphabet('translation')
        self.translation_id_format = {}
        self.feature_names = []
        self.feature_alphabets = []
        self.feature_num = len(self.feature_alphabets)
        self.feat_config = None
        self.label_alphabet = Alphabet('label', True)
        self.tagScheme = "NoSeg"  ## BMES/BIO
        self.seg = True

        ### task
        self.task_name = None

        ### I/O
        self.data_bin_dir = None
        self.train_dir = None
        self.dev_dir = None
        self.test_dir = None
        self.raw_dir = None
        self.middle_dir = None
        self.viterbi_inputs_model_name = None
        self.trans_dir = None
        self.decode_dir = None
        self.model_dir = None  ## model save file
        self.load_model_dir = None  ## model load file
        self.word_emb_dir = None
        self.char_emb_dir = None
        self.trans_embed_dir = None
        self.typeinfo_dir = None
        self.feature_emb_dirs = []

        self.train_texts = []
        self.dev_texts = []
        self.test_texts = []
        self.raw_texts = []
        self.train_Ids = []
        self.dev_Ids = []
        self.test_Ids = []
        self.raw_Ids = []

        self.pretrain_word_embedding = None
        self.pretrain_char_embedding = None
        self.pretrain_trans_embedding = None
        self.pretrain_feature_embeddings = []

        self.label_size = 0
        self.word_alphabet_size = 0
        self.char_alphabet_size = 0
        self.label_alphabet_size = 0
        self.trans_alphabet_size = 0
        self.feature_alphabet_sizes = []
        self.feature_emb_dims = []
        self.norm_feature_embs = []
        self.word_emb_dim = 50
        self.char_emb_dim = 30
        self.trans_emb_dim = 100

        ### Classification / Dataset Plus
        self.substring_dir = None
        self.bpe_emb_dir = None
        self.pos_emb_dir = None
        self.pretrain_bpe_embedding = None
        self.pretrain_pos_embedding = None
        self.bpe_emb_dim = 30
        self.pos_emb_dim = 30
        self.bpe_alphabet_size = 0
        self.pos_alphabet_size = 0
        self.norm_bpe_emb = False
        self.norm_pos_emb = False
        self.bpe_texts = []
        self.bpe_Ids = []
        self.pos_texts = []
        self.pos_Ids = []
        self.label_size = 0
        self.substring_train_texts = None
        self.substring_train_Ids = None
        self.substring_dev_texts = None
        self.substring_dev_Ids = None
        self.substring_test_texts = None
        self.substring_test_Ids = None
        self.substring_label_alphabet = Alphabet('substring_label', True)

        ### Networks
        self.word_feature_extractor = "LSTM"  ## "LSTM"/"CNN"/"GRU"
        self.use_char = True
        self.char_seq_feature = "CNN"  ## "LSTM"/"CNN"/"GRU"/None
        self.use_trans = False
        self.use_crf = True
        self.nbest = None
        self.use_mapping = False
        self.mapping_func = None  # tanh or sigmoid

        # Training
        self.save_model = True
        self.state_training_name = 'default'
        self.average_batch_loss = False
        self.optimizer = "SGD"  ## "SGD"/"Adam"
        self.status = "train"
        self.show_loss_per_batch = 100

        # Hyperparameters
        self.seed_num = None
        self.cnn_layer = 4
        self.iteration = 100
        self.batch_size = 10
        self.char_hidden_dim = 50
        self.trans_hidden_dim = 50
        self.hidden_dim = 200
        self.dropout = 0.5
        self.lstm_layer = 1
        self.bilstm = True
        self.gpu = False
        self.lr = 0.015
        self.lr_decay = 0.05
        self.clip = None
        self.momentum = 0
        self.l2 = 1e-8

        # circul
        self.circul_time = 4
        self.circul_deepth = 2
        self.circul_gather_output_mode = "concat"

        # decode prepare
        self.decode_prepare_mode = 'example'

    def init_substring_instance(self):
        len_names = len(self.substring_names)
        self.substring_train_texts = [[[] for _ in range(self.substring_maxlen)]
                                      for _ in range(len_names)]
        self.substring_train_Ids = [[[] for _ in range(self.substring_maxlen)]
                                    for _ in range(len_names)]
        self.substring_dev_texts = [[[] for _ in range(self.substring_maxlen)]
                                    for _ in range(len_names)]
        self.substring_dev_Ids = [[[] for _ in range(self.substring_maxlen)]
                                  for _ in range(len_names)]
        self.substring_test_texts = [[[] for _ in range(self.substring_maxlen)]
                                     for _ in range(len_names)]
        self.substring_test_Ids = [[[] for _ in range(self.substring_maxlen)]
                                   for _ in range(len_names)]

    def show_data_summary(self):
        print("++" * 50)
        print("DATA SUMMARY START:")
        print(" I/O:")
        print(" Tag scheme: %s" % (self.tagScheme))
        print(" MAX SENTENCE LENGTH: %s" % (self.MAX_SENTENCE_LENGTH))
        print(" MAX WORD LENGTH: %s" % (self.MAX_WORD_LENGTH))
        print(" Number normalized: %s" % (self.number_normalized))
        print(" Word alphabet size: %s" % (self.word_alphabet_size))
        print(" Char alphabet size: %s" % (self.char_alphabet_size))
        print(" Label alphabet size: %s" % (self.label_alphabet_size))
        print(" Trans alphabet size: %s" % (self.trans_alphabet_size))
        print(" Word embedding dir: %s" % (self.word_emb_dir))
        print(" Char embedding dir: %s" % (self.char_emb_dir))
        print(" Tran embedding dir: %s" % (self.trans_embed_dir))
        print(" Word embedding size: %s" % (self.word_emb_dim))
        print(" Char embedding size: %s" % (self.char_emb_dim))
        print(" Tran embedding size: %s" % (self.trans_emb_dim))
        print(" Norm word emb: %s" % (self.norm_word_emb))
        print(" Norm char emb: %s" % (self.norm_char_emb))
        print(" Norm tran emb: %s" % (self.norm_trans_emb))
        print("++" * 50)
        print(" task name: %s" % (self.task_name))
        print("++" * 50)
        print(" Data bin file directory: %s" % (self.data_bin_dir))
        print(" Train file directory: %s" % (self.train_dir))
        print(" Dev file directory: %s" % (self.dev_dir))
        print(" Test file directory: %s" % (self.test_dir))
        print(" Raw file directory: %s" % (self.raw_dir))
        print(" Middle file directory: %s" % (self.middle_dir))
        print(" viterbi inputs model name: %s" % (self.viterbi_inputs_model_name))
        if self.typeinfo_dir:
            print(" typeinfo directory: %s" % (self.typeinfo_dir))
        print(" Model file directory: %s" % (self.model_dir))
        print(" Loadmodel directory: %s" % (self.load_model_dir))
        print(" Decode file directory: %s" % (self.decode_dir))
        print(" Train instance number: %s" % (len(self.train_texts)))
        print(" Dev instance number: %s" % (len(self.dev_texts)))
        print(" Test instance number: %s" % (len(self.test_texts)))
        print(" Raw instance number: %s" % (len(self.raw_texts)))
        print(" FEATURE num: %s" % (self.feature_num))
        for idx in range(self.feature_num):
            print(" Fe: %s alphabet size: %s"
                  % (self.feature_alphabets[idx].name, self.feature_alphabet_sizes[idx]))
            print(" Fe: %s embedding dir: %s"
                  % (self.feature_alphabets[idx].name, self.feature_emb_dirs[idx]))
            print(" Fe: %s embedding size: %s"
                  % (self.feature_alphabets[idx].name, self.feature_emb_dims[idx]))
            print(" Fe: %s norm emb: %s"
                  % (self.feature_alphabets[idx].name, self.norm_feature_embs[idx]))
        print(" " + "++" * 20)
        print(" Model Network:")
        print(" Model use_crf: %s" % (self.use_crf))
        print(" Model word extractor: %s" % (self.word_feature_extractor))
        print(" Model use_char: %s" % (self.use_char))
        if self.use_char:
            print(" Model char_seq_feature: %s" % (self.char_seq_feature))
            print(" Model char_hidden_dim: %s" % (self.char_hidden_dim))
        if self.use_trans:
            print(" Model trans_hidden_dim: %s" % (self.trans_hidden_dim))
        if self.use_mapping:
            print(" Model mapping function: %s" % (self.mapping_func))
        print(" " + "++" * 20)
        print(" Training:")
        print(" show_loss_per_batch: %s" % (self.show_loss_per_batch))
        print(" save_model: %s" % (self.save_model))
        print(" state_training_name: %s" % (self.state_training_name))
        print(" Optimizer: %s" % (self.optimizer))
        print(" Iteration: %s" % (self.iteration))
        print(" BatchSize: %s" % (self.batch_size))
        print(" Average batch loss: %s" % (self.average_batch_loss))
        print(" " + "++" * 20)
        print(" Hyperparameters:")
        print(" Hyper seed_num: %s" % (self.seed_num))
        print(" Hyper lr: %s" % (self.lr))
        print(" Hyper lr_decay: %s" % (self.lr_decay))
        print(" Hyper clip: %s" % (self.clip))
        print(" Hyper momentum: %s" % (self.momentum))
        print(" Hyper l2: %s" % (self.l2))
        print(" Hyper hidden_dim: %s" % (self.hidden_dim))
        print(" Hyper dropout: %s" % (self.dropout))
        print(" Hyper lstm_layer: %s" % (self.lstm_layer))
        print(" Hyper bilstm: %s" % (self.bilstm))
        print(" Hyper GPU: %s" % (self.gpu))
        print("DATA SUMMARY END.")
        print("++" * 50)
        print(" substring dir : %s" % (self.substring_dir))
        print(" bpe_emb_dir dir : %s" % (self.bpe_emb_dir))
        print(" pos_emb_dir dir : %s" % (self.pos_emb_dir))
        print("++" * 50)
        print(" circul time : %s" % (self.circul_time))
        print(" circul deepth : %s" % (self.circul_deepth))
        print(" gather output mode : %s" % (self.circul_gather_output_mode))
        print("++" * 50)
        print(" decode prepare mode : %s" % (self.decode_prepare_mode))
        print("++" * 50)
        sys.stdout.flush()

    def make_substring_label_alphabet(self):
        for label in self.label_alphabet.instances:
            label = label.split('-')[-1]
            self.substring_label_alphabet.add(label)
        self.substring_label_alphabet.close()

    def initial_feature_alphabets(self):
        items = open(self.train_dir, 'r').readline().strip('\n').split()
        total_column = len(items)
        if total_column > 2:
            for idx in range(1, total_column - 1):
                feature_prefix = 'feature_' + str(idx)
                self.feature_alphabets.append(Alphabet(feature_prefix))
                self.feature_names.append(feature_prefix)
                print "Find feature: ", feature_prefix
        self.feature_num = len(self.feature_alphabets)
        self.pretrain_feature_embeddings = [None] * self.feature_num
        self.feature_emb_dims = [20] * self.feature_num
        self.feature_emb_dirs = [None] * self.feature_num
        self.norm_feature_embs = [False] * self.feature_num
        self.feature_alphabet_sizes = [0] * self.feature_num
        if self.feat_config:
            for idx in range(self.feature_num):
                self.feature_emb_dims[idx] = self.feat_config[self.feature_names[idx]]['emb_size']
                self.feature_emb_dirs[idx] = self.feat_config[self.feature_names[idx]]['emb_dir']
                self.norm_feature_embs[idx] = self.feat_config[self.feature_names[idx]]['emb_norm']
        # exit(0)

    def build_alphabet(self, input_file):
        in_lines = open(input_file, 'r').readlines()
        for line in in_lines:
            if len(line) > 2:
                pairs = line.strip().split()
                word = pairs[0].decode('windows-1252')
                # word = pairs[0].decode('utf-8')
                if self.number_normalized:
                    word = normalize_word(word)
                label = pairs[-1]
                self.label_alphabet.add(label)
                self.word_alphabet.add(word)
                ## build feature alphabets
                for idx in range(self.feature_num):
                    feat_idx = pairs[idx + 1].split(']', 1)[-1]
                    self.feature_alphabets[idx].add(feat_idx)
                for char in word:
                    self.char_alphabet.add(char)
        self.word_alphabet_size = self.word_alphabet.size()
        self.char_alphabet_size = self.char_alphabet.size()
        self.label_alphabet_size = self.label_alphabet.size()
        for idx in range(self.feature_num):
            self.feature_alphabet_sizes[idx] = self.feature_alphabets[idx].size()
        startS = False
        startB = False
        for label, _ in self.label_alphabet.iteritems():
            if "S-" in label.upper():
                startS = True
            elif "B-" in label.upper():
                startB = True
        if startB:
            if startS:
                self.tagScheme = "BMES"
            else:
                self.tagScheme = "BIO"

    def build_alphabet_substring(self, input_file_dir, substring_file_prefix):
        ## will not read labels
        input_files = os.listdir(input_file_dir)
        print input_files
        for input_file in input_files:
            plus_feature = ''
            input_file_name = os.path.split(input_file)[1]
            if input_file_name.split('.')[0] != substring_file_prefix:
                continue
            if 'bpe' in input_file_name:
                plus_feature = 'bpe'
            elif 'word' in input_file_name:
                plus_feature = 'word'
            if plus_feature == '':
                continue
            in_lines = open(input_file_dir + input_file, 'r').readlines()
            for line in in_lines:
                if len(line.strip()) > 0:
                    pairs = line.strip().split('\t')
                    words = pairs[0].decode('windows-1252')
                    # words = pairs[0].decode('utf-8')
                    if self.number_normalized:
                        words = normalize_word(words)
                    labels = pairs[-1]
                    for word in words.split():
                        self.word_alphabet.add(word)
                        for char in word:
                            self.char_alphabet.add(char)
        self.word_alphabet_size = self.word_alphabet.size()
        self.char_alphabet_size = self.char_alphabet.size()

    def fix_alphabet(self):
        self.word_alphabet.close()
        self.char_alphabet.close()
        self.label_alphabet.close()
        self.translation_alphabet.close()
        for idx in range(self.feature_num):
            self.feature_alphabets[idx].close()

    def build_pretrain_emb(self):
        if self.word_emb_dir:
            print("Load pretrained word embedding, norm: %s, dir: %s"
                  % (self.norm_word_emb, self.word_emb_dir))
            self.pretrain_word_embedding, self.word_emb_dim = build_pretrain_embedding(
                self.word_emb_dir, self.word_alphabet, self.word_emb_dim, self.norm_word_emb)
            if self.typeinfo_dir:
                type_info_matrix = []
                with codecs.open(self.typeinfo_dir, 'r') as typeinfo_file:
                    type_info_lines = typeinfo_file.readlines()
                for line in type_info_lines:
                    line = line.rstrip().split()
                    for i, _ in enumerate(line):
                        line[i] = float(line[i])
                    line = np.array(line)
                    type_info_matrix.append(line)
                print("Calculate type info distribution, and concatenate word and type......")
                cos_res = []
                for i, word_embed in enumerate(self.pretrain_word_embedding):
                    word_type_info = []
                    if i == 0:
                        word_type_info = np.random.random(size=len(type_info_matrix))
                        cos_res.append(word_type_info)
                    else:
                        for type_info in type_info_matrix:
                            cos_sim = 1 - spatial.distance.cosine(word_embed, type_info)
                            word_type_info.append(cos_sim)
                        cos_res.append(word_type_info)
                cos_res = np.array(cos_res)
                cos_res = sigmoid(cos_res)
                self.pretrain_word_embedding = np.concatenate(
                    [self.pretrain_word_embedding, cos_res], axis=1)
                print "type info length:{}".format(len(type_info_matrix))
                self.word_emb_dim += len(type_info_matrix)
                print "new word dim is :{}".format(self.word_emb_dim)
        if self.char_emb_dir:
            print("Load pretrained char embedding, norm: %s, dir: %s"
                  % (self.norm_char_emb, self.char_emb_dir))
            self.pretrain_char_embedding, self.char_emb_dim = build_pretrain_embedding(
                self.char_emb_dir, self.char_alphabet, self.char_emb_dim, self.norm_char_emb)
        if self.trans_embed_dir:
            print("Load pretrained trans embedding, norm: %s, dir: %s"
                  % (self.norm_trans_emb, self.trans_embed_dir))
            self.pretrain_trans_embedding, self.trans_emb_dim = build_chi_pretrain_embedding(
                self.trans_embed_dir, self.translation_alphabet, self.trans_emb_dim,
                self.norm_trans_emb)
        for idx in range(self.feature_num):
            if self.feature_emb_dirs[idx]:
                # was self.feature_name[idx]; this class defines feature_names
                print("Load pretrained feature %s embedding:, norm: %s, dir: %s"
                      % (self.feature_names[idx], self.norm_feature_embs[idx],
                         self.feature_emb_dirs[idx]))
                self.pretrain_feature_embeddings[idx], self.feature_emb_dims[idx] = build_pretrain_embedding(
                    self.feature_emb_dirs[idx], self.feature_alphabets[idx],
                    self.feature_emb_dims[idx], self.norm_feature_embs[idx])

    def generate_instance(self, name):
        self.fix_alphabet()
        if name == "train":
            self.train_texts, self.train_Ids = read_instance(
                self.train_dir, self.word_alphabet, self.char_alphabet,
                self.feature_alphabets, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH,
                self.translation_id_format)
        elif name == "dev":
            self.dev_texts, self.dev_Ids = read_instance(
                self.dev_dir, self.word_alphabet, self.char_alphabet,
                self.feature_alphabets, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH,
                self.translation_id_format)
        elif name == "test":
            self.test_texts, self.test_Ids = read_instance(
                self.test_dir, self.word_alphabet, self.char_alphabet,
                self.feature_alphabets, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH,
                self.translation_id_format)
        elif name == "raw":
            self.raw_texts, self.raw_Ids = read_instance(
                self.raw_dir, self.word_alphabet, self.char_alphabet,
                self.feature_alphabets, self.label_alphabet,
                self.number_normalized, self.MAX_SENTENCE_LENGTH,
                self.translation_id_format)
        else:
            print("Error: you can only generate train/dev/test/raw instance! Illegal input:%s" % (name))

    def generate_instance_substring(self, substring_file_prefix):
        self.init_substring_instance()
        self.make_substring_label_alphabet()
        input_files = os.listdir(self.substring_dir)
        print input_files
        for input_file in input_files:
            input_file_name = os.path.split(input_file)[1]
            input_file_dir = os.path.join(self.substring_dir, input_file_name)
            input_file_name_split = input_file_name.split('.')
            if input_file_name_split[0] != substring_file_prefix:
                continue
            print('dealing with %s' % (input_file_name))
            name = input_file_name_split[1]
            feature_name = input_file_name_split[2]
            f_l = int(input_file_name_split[-1][3:])  # feature length
            if feature_name == 'word':
                alphabet = self.word_alphabet
            elif feature_name == 'char':
                alphabet = self.char_alphabet
            elif feature_name == 'pos':
                alphabet = self.feature_alphabets[0]
            elif feature_name == 'bpe':
                alphabet = self.feature_alphabets[1]
            s_f_id = self.substring_names.index(feature_name)  # substring feature id
            if name == "train":
                self.substring_train_texts[s_f_id][f_l], self.substring_train_Ids[s_f_id][f_l] \
                    = read_instance_substring(input_file_dir, alphabet,
                                              self.substring_label_alphabet,
                                              self.number_normalized)
            elif name == "testa":
                self.substring_dev_texts[s_f_id][f_l], self.substring_dev_Ids[s_f_id][f_l] \
                    = read_instance_substring(input_file_dir, alphabet,
                                              self.substring_label_alphabet,
                                              self.number_normalized)
            elif name == "testb":
                self.substring_test_texts[s_f_id][f_l], self.substring_test_Ids[s_f_id][f_l] \
                    = read_instance_substring(input_file_dir, alphabet,
                                              self.substring_label_alphabet,
                                              self.number_normalized)
            else:
                print("Error: you can only generate train/testa/testb instance! Illegal input:%s" % (name))

    def write_decoded_results(self, predict_results, name):
        fout = open(self.decode_dir, 'w')
        sent_num = len(predict_results)
        content_list = []
        if name == 'raw':
            content_list = self.raw_texts
        elif name == 'test':
            content_list = self.test_texts
        elif name == 'dev':
            content_list = self.dev_texts
        elif name == 'train':
            content_list = self.train_texts
        else:
            print("Error: illegal name during writing predict result, name should be within train/dev/test/raw !")
        assert (sent_num == len(content_list))
        for idx in range(sent_num):
            sent_length = len(predict_results[idx])
            for idy in range(sent_length):
                ## content_list[idx] is a list with [word, char, label]
                fout.write(content_list[idx][0][idy].encode('utf-8') + " "
                           + predict_results[idx][idy] + '\n')
            fout.write('\n')
        fout.close()
        print("Predict %s result has been written into file. %s" % (name, self.decode_dir))

    def load(self, data_file):
        f = open(data_file, 'rb')
        tmp_dict = pickle.load(f)
        f.close()
        self.__dict__.update(tmp_dict)

    def save(self, save_file):
        f = open(save_file, 'wb')
        pickle.dump(self.__dict__, f, 2)
        f.close()

    def write_nbest_decoded_results(self, predict_results, pred_scores, name):
        ## predict_results : [whole_sent_num, nbest, each_sent_length]
        ## pred_scores: [whole_sent_num, nbest]
        fout = open(self.decode_dir, 'w')
        sent_num = len(predict_results)
        content_list = []
        if name == 'raw':
            content_list = self.raw_texts
        elif name == 'test':
            content_list = self.test_texts
        elif name == 'dev':
            content_list = self.dev_texts
        elif name == 'train':
            content_list = self.train_texts
        else:
            print("Error: illegal name during writing predict result, name should be within train/dev/test/raw !")
        assert (sent_num == len(content_list))
        assert (sent_num == len(pred_scores))
        for idx in range(sent_num):
            sent_length = len(predict_results[idx][0])
            nbest = len(predict_results[idx])
            score_string = "# "
            for idz in range(nbest):
                score_string += format(pred_scores[idx][idz], '.4f') + " "
            fout.write(score_string.strip() + "\n")
            for idy in range(sent_length):
                label_string = content_list[idx][0][idy].encode('utf-8') + " "
                for idz in range(nbest):
                    label_string += predict_results[idx][idz][idy] + " "
                label_string = label_string.strip() + "\n"
                fout.write(label_string)
            fout.write('\n')
        fout.close()
        print("Predict %s %s-best result has been written into file. %s"
              % (name, nbest, self.decode_dir))

    def read_config(self, config_file):
        config = config_file_to_dict(config_file)
        ## task:
        the_item = 'task_name'
        if the_item in config:
            self.task_name = config[the_item]
        ## read data:
        the_item = 'data_bin_dir'
        if the_item in config:
            self.data_bin_dir = config[the_item]
        the_item = 'train_dir'
        if the_item in config:
            self.train_dir = config[the_item]
        the_item = 'dev_dir'
        if the_item in config:
            self.dev_dir = config[the_item]
        the_item = 'test_dir'
        if the_item in config:
            self.test_dir = config[the_item]
        the_item = 'trans_dir'
        if the_item in config:
            self.trans_dir = config[the_item]
        the_item = 'middle_dir'
        if the_item in config:
            self.middle_dir = config[the_item]
        the_item = 'viterbi_inputs_model_name'
        if the_item in config:
            self.viterbi_inputs_model_name = config[the_item]
        the_item = 'substring_dir'
        if the_item in config:
            self.substring_dir = config[the_item]
        the_item = 'bpe_emb_dir'
        if the_item in config:
            self.bpe_emb_dir = config[the_item]
        the_item = 'pos_emb_dir'
        if the_item in config:
            self.pos_emb_dir = config[the_item]
        the_item = 'raw_dir'
        if the_item in config:
            self.raw_dir = config[the_item]
        the_item = 'decode_dir'
        if the_item in config:
            self.decode_dir = config[the_item]
        the_item = 'model_dir'
        if the_item in config:
            self.model_dir = config[the_item]
        the_item = 'load_model_dir'
        if the_item in config:
            self.load_model_dir = config[the_item]
        the_item = 'word_emb_dir'
        if the_item in config:
            self.word_emb_dir = config[the_item]
        the_item = 'char_emb_dir'
        if the_item in config:
            self.char_emb_dir = config[the_item]
        the_item = 'trans_embed_dir'
        if the_item in config:
            self.trans_embed_dir = config[the_item]
        the_item = 'typeinfo_dir'
        if the_item in config:
            self.typeinfo_dir = config[the_item]
        the_item = 'MAX_SENTENCE_LENGTH'
        if the_item in config:
            self.MAX_SENTENCE_LENGTH = int(config[the_item])
        the_item = 'MAX_WORD_LENGTH'
        if the_item in config:
            self.MAX_WORD_LENGTH = int(config[the_item])
        the_item = 'norm_word_emb'
        if the_item in config:
            self.norm_word_emb = str2bool(config[the_item])
        the_item = 'norm_char_emb'
        if the_item in config:
            self.norm_char_emb = str2bool(config[the_item])
        the_item = 'number_normalized'
        if the_item in config:
            self.number_normalized = str2bool(config[the_item])
        the_item = 'seg'
        if the_item in config:
            self.seg = str2bool(config[the_item])
        the_item = 'word_emb_dim'
        if the_item in config:
            self.word_emb_dim = int(config[the_item])
        the_item = 'char_emb_dim'
        if the_item in config:
            self.char_emb_dim = int(config[the_item])
        the_item = 'trans_emb_dim'
        if the_item in config:
            self.trans_emb_dim = int(config[the_item])

        ## read network:
        the_item = 'use_crf'
        if the_item in config:
            self.use_crf = str2bool(config[the_item])
        the_item = 'use_char'
        if the_item in config:
            self.use_char = str2bool(config[the_item])
        the_item = 'use_trans'
        if the_item in config:
            self.use_trans = str2bool(config[the_item])
        the_item = 'use_mapping'
        if the_item in config:
            self.use_mapping = str2bool(config[the_item])
        the_item = 'mapping_func'
        if the_item in config:
            self.mapping_func = config[the_item]
        the_item = 'word_seq_feature'
        if the_item in config:
            self.word_feature_extractor = config[the_item]
        the_item = 'char_seq_feature'
        if the_item in config:
            self.char_seq_feature = config[the_item]
        the_item = 'nbest'
        if the_item in config:
            self.nbest = int(config[the_item])
        the_item = 'feature'
        if the_item in config:
            self.feat_config = config[the_item]  ## feat_config is a dict

        ## read training setting:
        the_item = 'save_model'
        if the_item in config:
            self.save_model = str2bool(config[the_item])
        the_item = 'state_training_name'
        if the_item in config:
            self.state_training_name = config[the_item]
        the_item = 'optimizer'
        if the_item in config:
            self.optimizer = config[the_item]
        the_item = 'ave_batch_loss'
        if the_item in config:
            self.average_batch_loss = str2bool(config[the_item])
        the_item = 'status'
        if the_item in config:
            self.status = config[the_item]
        the_item = 'show_loss_per_batch'
        if the_item in config:
            self.show_loss_per_batch = int(config[the_item])

        ## read Hyperparameters:
        the_item = 'seed_num'
        if the_item in config:
            if config[the_item] != 'None':
                self.seed_num = int(config[the_item])
        the_item = 'cnn_layer'
        if the_item in config:
            self.cnn_layer = int(config[the_item])
        the_item = 'iteration'
        if the_item in config:
            self.iteration = int(config[the_item])
        the_item = 'batch_size'
        if the_item in config:
            self.batch_size = int(config[the_item])
        the_item = 'char_hidden_dim'
        if the_item in config:
            self.char_hidden_dim = int(config[the_item])
        the_item = 'trans_hidden_dim'
        if the_item in config:
            self.trans_hidden_dim = int(config[the_item])
        the_item = 'hidden_dim'
        if the_item in config:
            self.hidden_dim = int(config[the_item])
        the_item = 'dropout'
        if the_item in config:
            self.dropout = float(config[the_item])
        the_item = 'lstm_layer'
        if the_item in config:
            self.lstm_layer = int(config[the_item])
        the_item = 'bilstm'
        if the_item in config:
            self.bilstm = str2bool(config[the_item])
        the_item = 'gpu'
        if the_item in config:
            self.gpu = str2bool(config[the_item])
        the_item = 'learning_rate'
        if the_item in config:
            self.lr = float(config[the_item])
        the_item = 'lr_decay'
        if the_item in config:
            self.lr_decay = float(config[the_item])
        the_item = 'clip'
        if the_item in config:
            if config[the_item] == 'None':
                self.clip = None
            else:
                self.clip = float(config[the_item])
        the_item = 'momentum'
        if the_item in config:
            self.momentum = float(config[the_item])
        the_item = 'l2'
        if the_item in config:
            self.l2 = float(config[the_item])

        ### base2
        the_item = 'feature_name'
        if the_item in config:
            self.feature_name = config[the_item]
        the_item = 'feature_length'
        if the_item in config:
            self.feature_length = int(config[the_item])
        the_item = 'class_num'
        if the_item in config:
            self.class_num = int(config[the_item])
        the_item = 'feature_ans'
        if the_item in config:
            self.feature_ans = config[the_item]

        ### circul
        the_item = 'circul_time'
        if the_item in config:
            self.circul_time = config[the_item]
        the_item = 'circul_deepth'
        if the_item in config:
            self.circul_deepth = config[the_item]
        the_item = 'circul_gather_output_mode'
        if the_item in config:
            self.circul_gather_output_mode = config[the_item]

        ### decode_prepare
        the_item = 'decode_prepare_mode'
        if the_item in config:
            self.decode_prepare_mode = config[the_item]

    def read_arg(self, args):
        if args.task_name != None:
            self.task_name = args.task_name
        if args.data_bin_dir != None:
            self.data_bin_dir = args.data_bin_dir
        if args.train_dir != None:
            self.train_dir = args.train_dir
        if args.dev_dir != None:
            self.dev_dir = args.dev_dir
        if args.test_dir != None:
            self.test_dir = args.test_dir
        if args.trans_dir != None:
            self.trans_dir = args.trans_dir
        if args.word_emb_dir != None:
            self.word_emb_dir = args.word_emb_dir
        if args.trans_embed_dir != None:
            self.trans_embed_dir = args.trans_embed_dir
        if args.middle_dir != None:
            self.middle_dir = args.middle_dir
        if args.viterbi_inputs_model_name != None:
            self.viterbi_inputs_model_name = args.viterbi_inputs_model_name
        if args.substring_dir != None:
            self.substring_dir = args.substring_dir
        if args.bpe_emb_dir != None:
            self.bpe_emb_dir = args.bpe_emb_dir
        if args.pos_emb_dir != None:
            self.pos_emb_dir = args.pos_emb_dir
        if args.model_dir != None:
            self.model_dir = args.model_dir
        if args.norm_word_emb != None:
            self.norm_word_emb = args.norm_word_emb
        if args.norm_char_emb != None:
            self.norm_char_emb = args.norm_char_emb
        if args.word_emb_dim != None:
            self.word_emb_dim = args.word_emb_dim
        if args.char_emb_dim != None:
            self.char_emb_dim = args.char_emb_dim
        if args.trans_emb_dim != None:
            self.trans_emb_dim = args.trans_emb_dim
        if args.number_normalized != None:
            self.number_normalized = args.number_normalized
        if args.seg != None:
            self.seg = args.seg
        if args.use_crf != None:
            self.use_crf = args.use_crf
        if args.use_char != None:
            self.use_char = args.use_char
        if args.use_trans != None:
            self.use_trans = args.use_trans
        # note: the next three assignments are aligned with the attributes
        # read_config sets (word_feature_extractor, average_batch_loss, lr);
        # the original wrote to names no other code reads
        if args.word_seq_feature != None:
            self.word_feature_extractor = args.word_seq_feature
        if args.char_seq_feature != None:
            self.char_seq_feature = args.char_seq_feature
        if args.nbest != None:
            self.nbest = args.nbest
        if args.status != None:
            self.status = args.status
        if args.state_training_name != None:
            self.state_training_name = args.state_training_name
        if args.save_model != None:
            self.save_model = args.save_model
        if args.optimizer != None:
            self.optimizer = args.optimizer
        if args.iteration != None:
            self.iteration = args.iteration
        if args.batch_size != None:
            self.batch_size = args.batch_size
        if args.ave_batch_loss != None:
            self.average_batch_loss = args.ave_batch_loss
        if args.show_loss_per_batch != None:
            self.show_loss_per_batch = args.show_loss_per_batch
        if args.seed_num != None:
            self.seed_num = args.seed_num
        if args.cnn_layer != None:
            self.cnn_layer = args.cnn_layer
        if args.char_hidden_dim != None:
            self.char_hidden_dim = args.char_hidden_dim
        if args.trans_hidden_dim != None:
            self.trans_hidden_dim = args.trans_hidden_dim
        if args.hidden_dim != None:
            self.hidden_dim = args.hidden_dim
        if args.dropout != None:
            self.dropout = args.dropout
        if args.lstm_layer != None:
            self.lstm_layer = args.lstm_layer
        if args.bilstm != None:
            self.bilstm = args.bilstm
        if args.learning_rate != None:
            self.lr = args.learning_rate
        if args.lr_decay != None:
            self.lr_decay = args.lr_decay
        if args.momentum != None:
            self.momentum = args.momentum
        if args.l2 != None:
            self.l2 = args.l2
        if args.gpu != None:
            self.gpu = args.gpu
        if args.clip != None:
            self.clip = args.clip
        ### base2
        if args.feature_name != None:
            self.feature_name = args.feature_name
        if args.feature_length != None:
            self.feature_length = args.feature_length
        if args.class_num != None:
            self.class_num = args.class_num
        if args.feature_ans != None:
            self.feature_ans = args.feature_ans
        ### circul
        if args.circul_time != None:
            self.circul_time = args.circul_time
        if args.circul_deepth != None:
            self.circul_deepth = args.circul_deepth
        if args.circul_gather_output_mode != None:
            self.circul_gather_output_mode = args.circul_gather_output_mode
        ### decode_prepare
        if args.decode_prepare_mode != None:
            # the original snippet broke off here; completed from the
            # surrounding pattern
            self.decode_prepare_mode = args.decode_prepare_mode
self.decode_prepare_mode = args.decode_prepare_mode def build_translation_alphabet(self, trans_path): print("Creating translation alphabet......") with codecs.open(trans_path, 'r', "utf-8") as f: lines = f.readlines() for line in lines: if len(line.strip().split(":")) == 2: temp = line.strip().split(":", 1) words = temp[1].split() for word in words: self.translation_alphabet.add(word.strip()) self.trans_alphabet_size = self.translation_alphabet.size() def build_translation_dict(self, trans_path): print("Creating Id to Id translation dictionary......") translation_id_format_temp = {} with codecs.open(trans_path, 'r', "utf-8") as f: lines = f.readlines() for line in lines: ids = [] if len(line.strip().split(":", 1)) == 2: temp = line.strip().split(":", 1) word_id = self.word_alphabet.get_index(temp[0].strip()) translations = temp[1].split() for translation in translations: ids.append( self.translation_alphabet.get_index( translation.strip())) if ids == []: ids = [0] translation_id_format_temp[word_id] = ids for word in self.word_alphabet.instances: if self.word_alphabet.get_index( word) in translation_id_format_temp.keys(): self.translation_id_format[self.word_alphabet.get_index( word)] = translation_id_format_temp[ self.word_alphabet.get_index(word)] else: self.translation_id_format[self.word_alphabet.get_index( word)] = [0]
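# Illustrative only: a minimal config file that the read_config() above could
# consume, assuming config_file_to_dict() turns "key=value" lines into a dict
# (the exact syntax of the real config format is an assumption). Any value set
# here is later overridden by a matching command-line flag in read_arg().
#
#     task_name=demo_ner
#     train_dir=data/train.txt
#     dev_dir=data/dev.txt
#     test_dir=data/test.txt
#     word_emb_dim=100
#     char_emb_dim=30
#     use_crf=True
#     use_char=True
#     learning_rate=0.015
#     nbest=1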
seen = set()
unique_questions = []
for q, qid in zip(questions, qids):
    if qid not in seen:
        seen.add(qid)
        unique_questions.append(q)

docs = answers + unique_questions
# compute document frequencies
word2dfs = compute_dfs(docs)
print word2dfs.items()[:10]
#########
# vocabulary: maps each word to a numeric id
alphabet = Alphabet(start_feature_id=0)
alphabet.add('UNKNOWN_WORD_IDX')
add_to_vocab(answers, alphabet)
add_to_vocab(questions, alphabet)
basename = os.path.basename(train)
cPickle.dump(alphabet, open(os.path.join(outdir, 'vocab.pickle'), 'w'))
print "alphabet", len(alphabet)  # number of distinct words in the vocabulary
dummy_word_idx = alphabet.fid
# map(function, iterable, ...) applies function to every element of the
# iterable; here it yields the lengths of the longest question and answer.
q_max_sent_length = max(map(lambda x: len(x), questions))
a_max_sent_length = max(map(lambda x: len(x), answers))
print 'q_max_sent_length', q_max_sent_length
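# Hedged sketch of the compute_dfs() helper used above (the project's real
# implementation is not shown here): it maps every token to the number of
# documents containing it at least once, assuming each document is a list of
# tokens.
from collections import defaultdict

def compute_dfs(docs):
    word2dfs = defaultdict(int)
    for doc in docs:
        for token in set(doc):  # set(): count each token once per document
            word2dfs[token] += 1
    return dict(word2dfs)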
import cPickle
import os
from alphabet import Alphabet
import operator

data_dir = 'preprocessed_data'
fnames = [
    'vocab_en300M', 'vocab_german40M', 'vocab_italian_44M',
    'vocab_netherlands40M'
]

new_alphabet = Alphabet(start_feature_id=0)
new_alphabet.add('UNKNOWN_WORD_IDX')
dummy_word_idx = new_alphabet.fid

for fname in fnames:
    appfname = '{}.pickle'.format(fname)
    fname_vocab = os.path.join(data_dir, appfname)
    alphabet = cPickle.load(open(fname_vocab))
    print "alphabet", len(alphabet)
    word_freq = map(lambda x: (x[0], x[1][1]), alphabet.items())
    sorted_x = sorted(word_freq, key=operator.itemgetter(1),
                      reverse=True)[:650000]
    print len(sorted_x)
    print sorted_x[0]
    for word, freq in sorted_x:
        new_alphabet.add(word)
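# Assumed layout, inferred from the map() call above: alphabet.items() yields
# (word, (feature_id, frequency)) pairs, so x[1][1] selects the corpus
# frequency used for the 650k-word cut-off. A plausible final step, not part
# of the original snippet, is to persist the merged multilingual vocabulary:
out_fname = os.path.join(data_dir, 'vocab_merged.pickle')  # hypothetical name
cPickle.dump(new_alphabet, open(out_fname, 'w'))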
class Data:
    def __init__(self, args):
        # Alphabet
        self.word_alphabet = Alphabet('word')
        self.char_alphabet = Alphabet('character')
        self.label_alphabet = Alphabet('label', True)
        # data
        self.train_texts = []
        self.dev_texts = []
        self.test_texts = []
        self.train_Ids = []
        self.dev_Ids = []
        self.test_Ids = []
        self.input_size = 0
        self.pretrain_word_embedding = None
        self.pretrain_char_embedding = None
        self.word_alphabet_size = 0
        self.char_alphabet_size = 0
        self.label_alphabet_size = 0
        # hyper parameters
        self.HP_word_emb_dim = args.embedding_size
        self.HP_char_emb_dim = args.char_embedding_size
        self.HP_iteration = args.max_epoch
        self.HP_batch_size = args.batch_size
        self.HP_char_hidden_dim = args.char_hidden_dim
        self.HP_hidden_dim = args.hidden_size
        self.HP_dropout = args.dropout
        self.HP_char_dropout = args.char_dropout
        self.HP_use_char = True if args.char_encoder else False
        self.HP_char_features = args.char_encoder
        self.HP_gpu = torch.cuda.is_available() and args.gpu
        self.HP_lr = args.lr
        self.HP_model_name = args.model_name
        self.HP_encoder_type = args.encoder
        self.HP_optim = args.optim
        self.HP_number_normalized = args.number_normalized
        self.HP_seed = args.seed
        self.HP_l2 = args.l2
        self.HP_kernel_size = args.kernel_size
        self.HP_kernel_num = args.kernel_num
        # self.HP_lr_decay = 0.05
        # self.HP_clip = None
        # self.HP_momentum = 0
        # self.HP_lstm_layer = 1
        # self.HP_bilstm = True

    def show_data_summary(self):
        print("DATA SUMMARY START:")
        print("     Word alphabet size: %s" % self.word_alphabet_size)
        print("     Char alphabet size: %s" % self.char_alphabet_size)
        print("     Label alphabet size: %s" % self.label_alphabet_size)
        print("     Word embedding size: %s" % self.HP_word_emb_dim)
        print("     Char embedding size: %s" % self.HP_char_emb_dim)
        print("     Train instance number: %s" % (len(self.train_texts)))
        print("     Dev instance number: %s" % (len(self.dev_texts)))
        print("     Test instance number: %s" % (len(self.test_texts)))
        print("     Hyper iteration: %s" % self.HP_iteration)
        print("     Hyper batch size: %s" % self.HP_batch_size)
        print("     Hyper lr: %s" % self.HP_lr)
        print("     Hyper hidden_dim: %s" % self.HP_hidden_dim)
        print("     Hyper dropout: %s" % self.HP_dropout)
        print("     Hyper GPU: %s" % self.HP_gpu)
        print("     Hyper use_char: %s" % self.HP_use_char)
        if self.HP_use_char:
            print("     Char_features: %s" % self.HP_char_features)
        print("DATA SUMMARY END.")
        sys.stdout.flush()

    def build_alphabet(self, input_file):
        in_lines = open(input_file, 'r').readlines()
        for line in in_lines:
            line = line.strip()
            if line:
                pairs = line.strip().split()
                label = pairs[0].strip()
                self.label_alphabet.add(label)
                for word in pairs[2:]:
                    if self.HP_number_normalized:
                        word = normalize_word(word)
                    self.word_alphabet.add(word)
                    for char in word:
                        self.char_alphabet.add(char)
        self.word_alphabet_size = self.word_alphabet.size()
        self.char_alphabet_size = self.char_alphabet.size()
        self.label_alphabet_size = self.label_alphabet.size()

    def extend_word_char_alphabet(self, input_file_list):
        """Extend the word/char alphabets with every file in input_file_list."""
        old_word_size = self.word_alphabet_size
        old_char_size = self.char_alphabet_size
        for input_file in input_file_list:
            in_lines = open(input_file, 'r').readlines()
            for line in in_lines:
                line = line.strip()
                if line:
                    pairs = line.strip().split()
                    for word in pairs[2:]:
                        if self.HP_number_normalized:
                            word = normalize_word(word)  # digits in the word are normalized to 0
                        self.word_alphabet.add(word)
                        for char in word:
                            self.char_alphabet.add(char)
        self.word_alphabet_size = self.word_alphabet.size()
        self.char_alphabet_size = self.char_alphabet.size()
        print("Extend word/char alphabet finished!")
        print("     old word:%s -> new word:%s" % (old_word_size, self.word_alphabet_size))
        print("     old char:%s -> new char:%s" % (old_char_size, self.char_alphabet_size))
        for input_file in input_file_list:
            print("     from file:%s" % input_file)

    def fix_alphabet(self):
        self.word_alphabet.close()
        self.char_alphabet.close()
        self.label_alphabet.close()

    def generate_instance(self, input_file, name):
        self.fix_alphabet()
        if name == "train":
            self.train_texts, self.train_Ids = read_instance(
                input_file, self.word_alphabet, self.char_alphabet,
                self.label_alphabet, self.HP_number_normalized)
        elif name == "dev":
            self.dev_texts, self.dev_Ids = read_instance(
                input_file, self.word_alphabet, self.char_alphabet,
                self.label_alphabet, self.HP_number_normalized)
        elif name == "test":
            self.test_texts, self.test_Ids = read_instance(
                input_file, self.word_alphabet, self.char_alphabet,
                self.label_alphabet, self.HP_number_normalized)
        else:
            print(
                "Error: you can only generate train/dev/test instance! Illegal input:%s"
                % name)

    def build_word_pretrain_emb(self, emb_path):
        """Load pretrained word embeddings from emb_path."""
        self.pretrain_word_embedding, self.HP_word_emb_dim = build_pretrain_embedding(
            emb_path, self.word_alphabet, self.HP_word_emb_dim)

    def build_char_pretrain_emb(self, emb_path):
        """Load pretrained char embeddings from emb_path."""
        self.pretrain_char_embedding, self.HP_char_emb_dim = build_pretrain_embedding(
            emb_path, self.char_alphabet, self.HP_char_emb_dim)
class Data: def __init__(self): self.MAX_SENTENCE_LENGTH = 250 self.MAX_WORD_LENGTH = -1 self.number_normalized = True self.norm_word_emb = False self.norm_char_emb = False self.word_alphabet = Alphabet('word') self.char_alphabet = Alphabet('character') self.feature_name = [] self.feature_alphabets = [] self.feature_num = len(self.feature_alphabets) self.feat_config = None self.feature_name2id = {} self.label_alphabet = Alphabet('label',True) self.tagScheme = "NoSeg" ## BMES/BIO self.seg = True ### I/O self.train_dir = None self.dev_dir = None self.test_dir = None self.model_dir = None ## model save file self.word_emb_dir = None self.char_emb_dir = None self.feature_emb_dirs = [] self.train_texts = [] self.dev_texts = [] self.test_texts = [] self.train_Ids = [] self.dev_Ids = [] self.test_Ids = [] self.pretrain_word_embedding = None self.pretrain_char_embedding = None self.pretrain_feature_embeddings = [] self.label_size = 0 self.word_alphabet_size = 0 self.char_alphabet_size = 0 self.label_alphabet_size = 0 self.feature_alphabet_sizes = [] self.feature_emb_dims = [] self.norm_feature_embs = [] self.word_emb_dim = 50 self.char_emb_dim = 30 ###Networks self.word_feature_extractor = "LSTM" ## "LSTM"/"CNN"/"GRU"/ self.use_char = True self.char_feature_extractor = "CNN" ## "LSTM"/"CNN"/"GRU"/None self.use_crf = True self.nbest = None ## Training self.average_batch_loss = False ### Hyperparameters self.HP_cnn_layer = 4 self.HP_iteration = 100 self.HP_batch_size = 10 self.HP_char_hidden_dim = 50 self.HP_hidden_dim = 200 self.HP_dropout = 0.5 self.HP_lstm_layer = 1 self.HP_bilstm = True self.HP_gpu = False self.HP_lr = 0.015 self.HP_lr_decay = 0.05 self.HP_clip = None self.HP_momentum = 0 self.HP_l2 = 1e-8 # both self.full_data = False self.tune_wordemb = False # relation self.pretrain = None self.max_seq_len = 500 self.pad_idx = 1 self.sent_window = 3 self.output =None self.unk_ratio=1 self.seq_feature_size=256 self.max_epoch = 100 self.feature_extractor=None self.re_feature_name = [] self.re_feature_name2id = {} self.re_feature_alphabets = [] self.re_feature_num = len(self.re_feature_alphabets) self.re_feat_config = None self.re_train_X = [] self.re_dev_X = [] self.re_test_X = [] self.re_train_Y = [] self.re_dev_Y = [] self.re_test_Y = [] def show_data_summary(self): print("++"*50) print("DATA SUMMARY START:") print(" I/O:") print(" Tag scheme: %s"%(self.tagScheme)) print(" MAX SENTENCE LENGTH: %s"%(self.MAX_SENTENCE_LENGTH)) print(" MAX WORD LENGTH: %s"%(self.MAX_WORD_LENGTH)) print(" Number normalized: %s"%(self.number_normalized)) print(" Word alphabet size: %s"%(self.word_alphabet_size)) print(" Char alphabet size: %s"%(self.char_alphabet_size)) print(" Label alphabet size: %s"%(self.label_alphabet_size)) print(" Word embedding dir: %s"%(self.word_emb_dir)) print(" Char embedding dir: %s"%(self.char_emb_dir)) print(" Word embedding size: %s"%(self.word_emb_dim)) print(" Char embedding size: %s"%(self.char_emb_dim)) print(" Norm word emb: %s"%(self.norm_word_emb)) print(" Norm char emb: %s"%(self.norm_char_emb)) print(" Train file directory: %s"%(self.train_dir)) print(" Dev file directory: %s"%(self.dev_dir)) print(" Test file directory: %s"%(self.test_dir)) print(" Model file directory: %s"%(self.model_dir)) print(" Train instance number: %s"%(len(self.train_texts))) print(" Dev instance number: %s"%(len(self.dev_texts))) print(" Test instance number: %s"%(len(self.test_texts))) print(" FEATURE num: %s"%(self.feature_num)) for idx in range(self.feature_num): print(" Fe: %s alphabet size: 
%s"%(self.feature_alphabets[idx].name, self.feature_alphabet_sizes[idx])) print(" Fe: %s embedding dir: %s"%(self.feature_alphabets[idx].name, self.feature_emb_dirs[idx])) print(" Fe: %s embedding size: %s"%(self.feature_alphabets[idx].name, self.feature_emb_dims[idx])) print(" Fe: %s norm emb: %s"%(self.feature_alphabets[idx].name, self.norm_feature_embs[idx])) # for k, v in self.feat_config.items(): # print(" Feature: %s, size %s, norm %s, dir %s"%(k, v['emb_size'], v['emb_norm'], v['emb_dir'])) print(" "+"++"*20) print(" Model Network:") print(" Model use_crf: %s"%(self.use_crf)) print(" Model word extractor: %s"%(self.word_feature_extractor)) print(" Model use_char: %s"%(self.use_char)) if self.use_char: print(" Model char extractor: %s"%(self.char_feature_extractor)) print(" Model char_hidden_dim: %s"%(self.HP_char_hidden_dim)) print(" "+"++"*20) print(" Training:") print(" Optimizer: %s"%(self.optimizer)) print(" Iteration: %s"%(self.HP_iteration)) print(" BatchSize: %s"%(self.HP_batch_size)) print(" Average batch loss: %s"%(self.average_batch_loss)) print(" "+"++"*20) print(" Hyperparameters:") print(" Hyper lr: %s"%(self.HP_lr)) print(" Hyper lr_decay: %s"%(self.HP_lr_decay)) print(" Hyper HP_clip: %s"%(self.HP_clip)) print(" Hyper momentum: %s"%(self.HP_momentum)) print(" Hyper l2: %s"%(self.HP_l2)) print(" Hyper hidden_dim: %s"%(self.HP_hidden_dim)) print(" Hyper dropout: %s"%(self.HP_dropout)) print(" Hyper lstm_layer: %s"%(self.HP_lstm_layer)) print(" Hyper bilstm: %s"%(self.HP_bilstm)) print(" Hyper GPU: %s"%(self.HP_gpu)) print(" Hyper NBEST: %s"%(self.nbest)) print(" " + "++" * 20) print(" Both:") print(" full data: %s" % (self.full_data)) print(" Tune word embeddings: %s" % (self.tune_wordemb)) print(" "+"++"*20) print(" Relation:") print(" Pretrain directory: %s" % (self.pretrain)) print(" max sequence length: %s" % (self.max_seq_len)) print(" pad index: %s" % (self.pad_idx)) print(" sentence window: %s" % (self.sent_window)) print(" Output directory: %s" % (self.output)) print(" The ratio using negative instnaces 0~1: %s" % (self.unk_ratio)) print(" Size of seqeuence feature representation: %s" % (self.seq_feature_size)) print(" Iteration for relation training: %s" % (self.max_epoch)) print(" feature_extractor: %s" % (self.feature_extractor)) print(" RE FEATURE num: %s"%(self.re_feature_num)) for idx in range(self.re_feature_num): print(" Fe: %s alphabet size: %s"%(self.re_feature_alphabets[idx].name, self.re_feature_alphabet_sizes[idx])) print(" Fe: %s embedding dir: %s"%(self.re_feature_alphabets[idx].name, self.re_feature_emb_dirs[idx])) print(" Fe: %s embedding size: %s"%(self.re_feature_alphabets[idx].name, self.re_feature_emb_dims[idx])) print(" Fe: %s norm emb: %s"%(self.re_feature_alphabets[idx].name, self.re_norm_feature_embs[idx])) print(" RE Train instance number: %s"%(len(self.re_train_Y))) print(" RE Dev instance number: %s"%(len(self.re_dev_Y))) print(" RE Test instance number: %s"%(len(self.re_test_Y))) print("DATA SUMMARY END.") print("++"*50) sys.stdout.flush() def initial_feature_alphabets(self, input_file): items = open(input_file,'r').readline().strip('\n').split() total_column = len(items) if total_column > 2: id = 0 for idx in range(1, total_column-1): feature_prefix = items[idx].split(']',1)[0]+"]" self.feature_alphabets.append(Alphabet(feature_prefix)) self.feature_name.append(feature_prefix) self.feature_name2id[feature_prefix] = id id += 1 print "Find feature: ", feature_prefix self.feature_num = len(self.feature_alphabets) 
self.pretrain_feature_embeddings = [None]*self.feature_num self.feature_emb_dims = [20]*self.feature_num self.feature_emb_dirs = [None]*self.feature_num self.norm_feature_embs = [False]*self.feature_num self.feature_alphabet_sizes = [0]*self.feature_num if self.feat_config: for idx in range(self.feature_num): if self.feature_name[idx] in self.feat_config: self.feature_emb_dims[idx] = self.feat_config[self.feature_name[idx]]['emb_size'] self.feature_emb_dirs[idx] = self.feat_config[self.feature_name[idx]]['emb_dir'] self.norm_feature_embs[idx] = self.feat_config[self.feature_name[idx]]['emb_norm'] # exit(0) def build_alphabet(self, input_file): in_lines = open(input_file,'r').readlines() for line in in_lines: if len(line) > 2: pairs = line.strip().split() word = pairs[0].decode('utf-8') if self.number_normalized: word = normalize_word(word) label = pairs[-1] self.label_alphabet.add(label) self.word_alphabet.add(word) ## build feature alphabet for idx in range(self.feature_num): feat_idx = pairs[idx+1].split(']',1)[-1] self.feature_alphabets[idx].add(feat_idx) for char in word: self.char_alphabet.add(char) self.word_alphabet_size = self.word_alphabet.size() self.char_alphabet_size = self.char_alphabet.size() self.label_alphabet_size = self.label_alphabet.size() for idx in range(self.feature_num): self.feature_alphabet_sizes[idx] = self.feature_alphabets[idx].size() startS = False startB = False for label,_ in self.label_alphabet.iteritems(): if "S-" in label.upper(): startS = True elif "B-" in label.upper(): startB = True if startB: if startS: self.tagScheme = "BMES" else: self.tagScheme = "BIO" def fix_alphabet(self): self.word_alphabet.close() self.char_alphabet.close() self.label_alphabet.close() for idx in range(self.feature_num): self.feature_alphabets[idx].close() def initial_re_feature_alphabets(self): id = 0 for k, v in self.re_feat_config.items(): self.re_feature_alphabets.append(Alphabet(k)) self.re_feature_name.append(k) self.re_feature_name2id[k] = id id += 1 self.re_feature_num = len(self.re_feature_alphabets) self.re_pretrain_feature_embeddings = [None]*self.re_feature_num self.re_feature_emb_dims = [20]*self.re_feature_num self.re_feature_emb_dirs = [None]*self.re_feature_num self.re_norm_feature_embs = [False]*self.re_feature_num self.re_feature_alphabet_sizes = [0]*self.re_feature_num if self.re_feat_config: for idx in range(self.re_feature_num): if self.re_feature_name[idx] in self.re_feat_config: self.re_feature_emb_dims[idx] = self.re_feat_config[self.re_feature_name[idx]]['emb_size'] self.re_feature_emb_dirs[idx] = self.re_feat_config[self.re_feature_name[idx]]['emb_dir'] self.re_norm_feature_embs[idx] = self.re_feat_config[self.re_feature_name[idx]]['emb_norm'] def build_re_feature_alphabets(self, tokens, entities, relations): entity_type_alphabet = self.re_feature_alphabets[self.re_feature_name2id['[ENTITY_TYPE]']] entity_alphabet = self.re_feature_alphabets[self.re_feature_name2id['[ENTITY]']] relation_alphabet = self.re_feature_alphabets[self.re_feature_name2id['[RELATION]']] token_num_alphabet = self.re_feature_alphabets[self.re_feature_name2id['[TOKEN_NUM]']] entity_num_alphabet = self.re_feature_alphabets[self.re_feature_name2id['[ENTITY_NUM]']] position_alphabet = self.re_feature_alphabets[self.re_feature_name2id['[POSITION]']] for i, doc_token in enumerate(tokens): doc_entity = entities[i] doc_relation = relations[i] sent_idx = 0 sentence = doc_token[(doc_token['sent_idx'] == sent_idx)] while sentence.shape[0] != 0: entities_in_sentence = 
doc_entity[(doc_entity['sent_idx'] == sent_idx)] for _, entity in entities_in_sentence.iterrows(): entity_type_alphabet.add(entity['type']) tk_idx = entity['tf_start'] while tk_idx <= entity['tf_end']: entity_alphabet.add( my_utils1.normalizeWord(sentence.iloc[tk_idx, 0])) # assume 'text' is in 0 column tk_idx += 1 sent_idx += 1 sentence = doc_token[(doc_token['sent_idx'] == sent_idx)] for _, relation in doc_relation.iterrows(): relation_alphabet.add(relation['type']) for i in range(self.max_seq_len): token_num_alphabet.add(i) entity_num_alphabet.add(i) position_alphabet.add(i) position_alphabet.add(-i) for idx in range(self.re_feature_num): self.re_feature_alphabet_sizes[idx] = self.re_feature_alphabets[idx].size() def fix_re_alphabet(self): for alphabet in self.re_feature_alphabets: alphabet.close() def build_pretrain_emb(self): if self.word_emb_dir: print("Load pretrained word embedding, norm: %s, dir: %s"%(self.norm_word_emb, self.word_emb_dir)) self.pretrain_word_embedding, self.word_emb_dim = build_pretrain_embedding(self.word_emb_dir, self.word_alphabet, self.word_emb_dim, self.norm_word_emb) if self.char_emb_dir: print("Load pretrained char embedding, norm: %s, dir: %s"%(self.norm_char_emb, self.char_emb_dir)) self.pretrain_char_embedding, self.char_emb_dim = build_pretrain_embedding(self.char_emb_dir, self.char_alphabet, self.char_emb_dim, self.norm_char_emb) for idx in range(self.feature_num): if self.feature_emb_dirs[idx]: print("Load pretrained feature %s embedding, norm: %s, dir: %s"%(self.feature_name[idx], self.norm_feature_embs[idx], self.feature_emb_dirs[idx])) self.pretrain_feature_embeddings[idx], self.feature_emb_dims[idx] = build_pretrain_embedding(self.feature_emb_dirs[idx], self.feature_alphabets[idx], self.feature_emb_dims[idx], self.norm_feature_embs[idx]) def build_re_pretrain_emb(self): for idx in range(self.re_feature_num): if self.re_feature_emb_dirs[idx]: print("Load pretrained re feature %s embedding, norm: %s, dir: %s" % (self.re_feature_name[idx], self.re_norm_feature_embs[idx], self.re_feature_emb_dirs[idx])) self.re_pretrain_feature_embeddings[idx], self.re_feature_emb_dims[idx] = build_pretrain_embedding( self.re_feature_emb_dirs[idx], self.re_feature_alphabets[idx], self.re_feature_emb_dims[idx], self.re_norm_feature_embs[idx]) def generate_instance(self, name, input_file): self.fix_alphabet() if name == "train": self.train_texts, self.train_Ids = read_instance(input_file, self.word_alphabet, self.char_alphabet, self.feature_alphabets, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH) elif name == "dev": self.dev_texts, self.dev_Ids = read_instance(input_file, self.word_alphabet, self.char_alphabet, self.feature_alphabets, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH) elif name == "test": self.test_texts, self.test_Ids = read_instance(input_file, self.word_alphabet, self.char_alphabet, self.feature_alphabets, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH) else: print("Error: you can only generate train/dev/test instance! 
Illegal input:%s"%(name)) def generate_re_instance(self, name, tokens, entities, relations, names): self.fix_re_alphabet() if name == "train": self.re_train_X, self.re_train_Y = relation_extraction.getRelationInstance2(tokens, entities, relations, names, self) elif name == "dev": self.re_dev_X, self.re_dev_Y = relation_extraction.getRelationInstance2(tokens, entities, relations, names, self) elif name == "test": self.re_test_X, self.re_test_Y = relation_extraction.getRelationInstance2(tokens, entities, relations, names, self) else: print("Error: you can only generate train/dev/test instance! Illegal input:%s"%(name)) def load(self,data_file): f = open(data_file, 'rb') tmp_dict = pickle.load(f) f.close() self.__dict__.update(tmp_dict) def save(self,save_file): f = open(save_file, 'wb') pickle.dump(self.__dict__, f, 2) f.close() def read_config(self,config_file): config = config_file_to_dict(config_file) ## read data: the_item = 'train_dir' if the_item in config: self.train_dir = config[the_item] the_item = 'dev_dir' if the_item in config: self.dev_dir = config[the_item] the_item = 'test_dir' if the_item in config: self.test_dir = config[the_item] the_item = 'model_dir' if the_item in config: self.model_dir = config[the_item] the_item = 'word_emb_dir' if the_item in config: self.word_emb_dir = config[the_item] the_item = 'char_emb_dir' if the_item in config: self.char_emb_dir = config[the_item] the_item = 'MAX_SENTENCE_LENGTH' if the_item in config: self.MAX_SENTENCE_LENGTH = int(config[the_item]) the_item = 'MAX_WORD_LENGTH' if the_item in config: self.MAX_WORD_LENGTH = int(config[the_item]) the_item = 'norm_word_emb' if the_item in config: self.norm_word_emb = str2bool(config[the_item]) the_item = 'norm_char_emb' if the_item in config: self.norm_char_emb = str2bool(config[the_item]) the_item = 'number_normalized' if the_item in config: self.number_normalized = str2bool(config[the_item]) the_item = 'seg' if the_item in config: self.seg = str2bool(config[the_item]) the_item = 'word_emb_dim' if the_item in config: self.word_emb_dim = int(config[the_item]) the_item = 'char_emb_dim' if the_item in config: self.char_emb_dim = int(config[the_item]) ## read network: the_item = 'use_crf' if the_item in config: self.use_crf = str2bool(config[the_item]) the_item = 'use_char' if the_item in config: self.use_char = str2bool(config[the_item]) the_item = 'word_seq_feature' if the_item in config: self.word_feature_extractor = config[the_item] the_item = 'char_seq_feature' if the_item in config: self.char_feature_extractor = config[the_item] the_item = 'nbest' if the_item in config: self.nbest = int(config[the_item]) the_item = 'feature' if the_item in config: self.feat_config = config[the_item] ## feat_config is a dict ## read training setting: the_item = 'optimizer' if the_item in config: self.optimizer = config[the_item] the_item = 'ave_batch_loss' if the_item in config: self.average_batch_loss = str2bool(config[the_item]) ## read Hyperparameters: the_item = 'cnn_layer' if the_item in config: self.HP_cnn_layer = int(config[the_item]) the_item = 'iteration' if the_item in config: self.HP_iteration = int(config[the_item]) the_item = 'batch_size' if the_item in config: self.HP_batch_size = int(config[the_item]) the_item = 'char_hidden_dim' if the_item in config: self.HP_char_hidden_dim = int(config[the_item]) the_item = 'hidden_dim' if the_item in config: self.HP_hidden_dim = int(config[the_item]) the_item = 'dropout' if the_item in config: self.HP_dropout = float(config[the_item]) the_item = 'lstm_layer' 
if the_item in config: self.HP_lstm_layer = int(config[the_item]) the_item = 'bilstm' if the_item in config: self.HP_bilstm = str2bool(config[the_item]) the_item = 'gpu' if the_item in config: self.HP_gpu = int(config[the_item]) the_item = 'learning_rate' if the_item in config: self.HP_lr = float(config[the_item]) the_item = 'lr_decay' if the_item in config: self.HP_lr_decay = float(config[the_item]) the_item = 'clip' if the_item in config: self.HP_clip = float(config[the_item]) the_item = 'momentum' if the_item in config: self.HP_momentum = float(config[the_item]) the_item = 'l2' if the_item in config: self.HP_l2 = float(config[the_item]) # both the_item = 'full_data' if the_item in config: self.full_data = str2bool(config[the_item]) the_item = 'tune_wordemb' if the_item in config: self.tune_wordemb = str2bool(config[the_item]) # relation the_item = 'pretrain' if the_item in config: self.pretrain = config[the_item] the_item = 'max_seq_len' if the_item in config: self.max_seq_len = int(config[the_item]) the_item = 'pad_idx' if the_item in config: self.pad_idx = int(config[the_item]) the_item = 'sent_window' if the_item in config: self.sent_window = int(config[the_item]) the_item = 'output' if the_item in config: self.output = config[the_item] the_item = 'unk_ratio' if the_item in config: self.unk_ratio = float(config[the_item]) the_item = 'seq_feature_size' if the_item in config: self.seq_feature_size = int(config[the_item]) the_item = 'max_epoch' if the_item in config: self.max_epoch = int(config[the_item]) the_item = 'feature_extractor' if the_item in config: self.feature_extractor = config[the_item] the_item = 're_feature' if the_item in config: self.re_feat_config = config[the_item] ## feat_config is a dict
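# Hedged sketch of the str2bool() helper that both read_config() methods
# above rely on; the real implementation may accept more spellings, but a
# minimal version only needs to recognise the usual literal forms.
def str2bool(string):
    return string in ("True", "true", "TRUE")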
class Data:
    def __init__(self, input_file):
        self.original_data = open(input_file, 'r').readlines()
        self.index_data = []
        self.word_alphabet = Alphabet('word')
        self.gloss_alphabet = Alphabet('gloss')
        self.entity_alphabet = Alphabet('entity')
        self.gaz_alphabet = Alphabet('gaz')
        self.label_alphabet = Alphabet('label')
        self.word_alphabet_size = 0
        self.gloss_alphabet_size = 0
        self.entity_alphabet_size = 0
        self.gaz_alphabet_size = 0
        self.label_alphabet_size = 0
        ### hyperparameters
        self.HP_iteration = 100
        self.HP_batch_size = 1
        self.HP_gaz_hidden_dim = 50
        self.HP_lstm_hidden_dim = 200
        self.HP_dropout = 0.5
        self.gaz_dropout = 0.5
        self.HP_lstm_layer = 1
        self.HP_bilstm = False
        self.HP_use_entity = False
        self.HP_use_gloss = True
        self.HP_use_gaz = False
        self.HP_gpu = True
        self.HP_lr = 0.015
        self.HP_lr_decay = 0.05
        self.HP_clip = 5.0
        self.HP_momentum = 0
        # embedding hyperparameter
        self.word_emb_dim = 200
        self.entity_emb_dim = 50
        self.gloss_features = "CNN"  # ["CNN","LSTM"]
        self.gloss_emb_dim = 200
        self.gloss_hidden_dim = 300
        self.pretrain_word_embedding = np.array([])
        self.pretrain_gaz_embedding = None
        self.word_embed_path = "../LOVECC/NYM.6B.200d.txt"  # "NYM_200.txt"
        self.gaz_embed_path = None
        self.gaz_emb_dim = 200
        self.HP_fix_gaz_emb = True

    def build_alphabet(self):
        in_lines = self.original_data
        for idx in range(len(in_lines)):
            line = json.loads(in_lines[idx])
            words = line["word_context"]
            for word in words:
                self.word_alphabet.add(word)
            sentence_gloss = line["babel_gloss"]
            for word_gloss in sentence_gloss:
                for phrase_gloss in word_gloss:  # one word can match several phrases
                    if "EN" in phrase_gloss:
                        phrase_gloss_EN = phrase_gloss["EN"]
                        final_gloss = " . ".join(phrase_gloss_EN)
                        for de_word in final_gloss:
                            # for definates in phrase_gloss_EN:
                            #     for de_word in definates.split():
                            self.gloss_alphabet.add(de_word)
            entitys = line["entity_context"]
            for entity in entitys:
                self.entity_alphabet.add(entity)
            gazs = line["babel_phase"]
            for gaz in gazs:
                for item in gaz:
                    self.gaz_alphabet.add(item)
            labels = line["detection_label"]
            for label in labels:
                self.label_alphabet.add(label)
        print(self.label_alphabet.get_content())
        self.word_alphabet_size = self.word_alphabet.size()
        self.gloss_alphabet_size = self.gloss_alphabet.size()
        self.entity_alphabet_size = self.entity_alphabet.size()
        self.gaz_alphabet_size = self.gaz_alphabet.size()
        self.label_alphabet_size = self.label_alphabet.size()
        self.word_alphabet.close()
        self.gloss_alphabet.close()
        self.entity_alphabet.close()
        self.gaz_alphabet.close()
        self.label_alphabet.close()

    def generate_instance_Ids(self):
        # convert the input sentences into their corresponding id sequences
        in_lines = self.original_data
        for idx in range(len(in_lines)):
            line = json.loads(in_lines[idx])
            words = line["word_context"]
            words_Id = []
            for word in words:
                words_Id.append(self.word_alphabet.get_index(word))
            sentence_gloss = line["babel_gloss"]
            sentence_glosses_Id = []
            for word_gloss in sentence_gloss:
                word_glosses_Id = []
                for phrase_gloss in word_gloss:  # one word can match several phrases
                    if "EN" in phrase_gloss:
                        phrase_gloss_EN = phrase_gloss["EN"]  # this is a list
                        final_gloss = " . ".join(phrase_gloss_EN)
                        for de_word in final_gloss:
                            word_glosses_Id.append(
                                self.gloss_alphabet.get_index(de_word))
                sentence_glosses_Id.append(word_glosses_Id)
            entitys = line["entity_context"]
            entitys_Id = []
            for entity in entitys:
                entitys_Id.append(self.entity_alphabet.get_index(entity))
            gazs = line["babel_phase"]
            sentence_gazs_Id = []  # gazs_Id=[[[take over,take over of,...],[2,3,...]],[[legal,legal procedures,...],[1,2,...]],...,[[open the window,open the window please,...],[3,4,...]]]
            for gaz in gazs:
                word_gazs_Id = []
                Ids = []
                Lens = []
                for item in gaz:
                    Ids.append(self.gaz_alphabet.get_index(item))
                    Lens.append(len(item.split()))
                word_gazs_Id = [Ids, Lens]
                sentence_gazs_Id.append(word_gazs_Id)
            labels = line["detection_label"]
            labels_Id = []
            for label in labels:
                labels_Id.append(self.label_alphabet.get_index(label))
            self.index_data.append([
                words_Id, entitys_Id, sentence_gazs_Id, sentence_glosses_Id,
                labels_Id
            ])

    def load_pretrain_emb(self, embedding_path):
        lines = open(embedding_path, 'r', encoding="utf-8").readlines()
        statistic = lines[0].strip()  # header line holds two statistics: word count, vector dimension
        # print(statistic)
        embedd_dim = int(statistic.split()[1])
        embedd_dict = dict()
        embedd_dict["<pad>"] = [0.0 for i in range(embedd_dim)]  # the padding token's vector is all zeros
        # print(len(embedd_dict["<pad>"]))
        for line in lines[1:]:
            line = line.strip()
            if len(line) == 0:
                continue
            tokens = line.split()
            if embedd_dim < 0:
                embedd_dim = len(tokens) - 1
            else:
                assert (embedd_dim + 1 == len(tokens))
            embedd_dict[tokens[0]] = [float(i) for i in tokens[1:]]
        return embedd_dict, embedd_dim

    def norm2one(self, vec):
        if np.sum(vec) == 0:
            return vec
        root_sum_square = np.sqrt(np.sum(np.square(vec)))
        return vec / root_sum_square

    def build_pretrain_embedding(self, embedding_path, word_alphabet,
                                 embedd_dim=200, norm=True):
        embedd_dict = dict()
        if embedding_path != None:
            # load the embedding dictionary
            embedd_dict, embedd_dim = self.load_pretrain_emb(embedding_path)
        scale = np.sqrt(3.0 / embedd_dim)
        pretrain_emb = np.zeros([word_alphabet.size(), embedd_dim])  # the embedding matrix, reordered to follow the alphabet
        perfect_match = 0
        case_match = 0
        not_match = 0
        for word, index in word_alphabet.get_alphabet().items():
            if word in embedd_dict:
                # print(word,index)
                # print(len(embedd_dict[word]))
                if norm:
                    pretrain_emb[index] = self.norm2one(embedd_dict[word])
                else:
                    pretrain_emb[index] = embedd_dict[word]
                perfect_match += 1
            elif word.lower() in embedd_dict:
                if norm:
                    pretrain_emb[index] = self.norm2one(
                        embedd_dict[word.lower()])
                else:
                    pretrain_emb[index] = embedd_dict[word.lower()]
                case_match += 1
            else:
                pretrain_emb[index] = np.random.uniform(
                    -scale, scale, [1, embedd_dim])
                not_match += 1
        pretrained_size = len(embedd_dict)
        # print("pad's embedding:",pretrain_emb[word_alphabet.get_index(",")])
        print(
            "Embedding:\n     pretrain word:%s, perfect match:%s, case_match:%s, oov:%s, oov%%:%s"
            % (pretrained_size, perfect_match, case_match, not_match,
               (not_match + 0.) / word_alphabet.size()))
        # pretrain_emb is the embedding matrix reordered to the alphabet's
        # order; embedd_dim is the dimensionality of the vectors.
        return pretrain_emb, embedd_dim

    def generate_embedding(self):
        self.pretrain_word_embedding, self.word_pretrain_dim = self.build_pretrain_embedding(
            self.word_embed_path, self.word_alphabet)
        self.pretrain_gloss_embedding, self.gloss_pretrain_dim = self.build_pretrain_embedding(
            self.word_embed_path, self.gloss_alphabet)
        self.pretrain_gaz_embedding, self.gaz_pretrain_dim = self.build_pretrain_embedding(
            self.word_embed_path, self.gaz_alphabet)
# -*- coding: utf-8 -*-
# @Author: Shaowei Chen,     Contact: [email protected]
# @Date: 2020-4-27

import sys
import argparse
import torch
from alphabet import Alphabet

sys.path.append("../")

word_alphabet = Alphabet('word', True)
label_alphabet = Alphabet('label', True)
label_alphabet.add("O")
label_alphabet.add("B")
label_alphabet.add("I")
relation_alphabet = Alphabet('relation', True)
char_alphabet = Alphabet('char', True)


class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, tokens, token_ids, token_mask, chars, char_ids,
                 char_mask, charLength, tokenLength, labels, label_ids,
                 relations, gold_relations):
        self.tokens = tokens
        self.token_ids = token_ids
        self.token_mask = token_mask
        self.tokenLength = tokenLength
        self.labels = labels
        self.label_ids = label_ids
        self.relations = relations
        # The original snippet breaks off after self.relations; the remaining
        # assignments below are a straightforward completion assumed to
        # mirror the parameter list.
        self.chars = chars
        self.char_ids = char_ids
        self.char_mask = char_mask
        self.charLength = charLength
        self.gold_relations = gold_relations
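# Hedged sketch (not project code): how the module-level alphabets above can
# turn a tokenized sentence into the id sequences an InputFeatures instance
# stores, assuming Alphabet.get_index() returns an integer id.
def encode_tokens(tokens):
    token_ids = [word_alphabet.get_index(t) for t in tokens]
    char_ids = [[char_alphabet.get_index(c) for c in t] for t in tokens]
    return token_ids, char_ids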
def main(argv): vocab_dir = 'preprocessed_data' load_vocab = False parse_200M = True smiley_tweets_fname = '' smiley_tweets = '' fname_vocab = '' n_max_tweets = np.inf outdir = '' parse_random_tweets = False try: opts, args = getopt.getopt( argv, "v:t:m:nr", ["vocab=", "tweets=", "max_tweets=", "no_big="]) except getopt.GetoptError: print 'usage: -v <vocab> -t <tweets> -m <max_tweets> [-n] [-r]' sys.exit(2) for opt, arg in opts: if opt in ("-v", "--vocab"): load_vocab = True fname_vocab = os.path.join(vocab_dir, '{}.pickle'.format(arg)) elif opt in ("-t", "--tweets"): smiley_tweets_fname = arg smiley_tweets = 'semeval/{}.gz'.format(arg) outdir = 'preprocessed_data_{}'.format(arg) if not os.path.exists(outdir): os.makedirs(outdir) model_dir = 'misc/{}'.format(arg) if not os.path.exists(model_dir): os.makedirs(model_dir) elif opt in ("-m", "--max_tweets"): n_max_tweets = int(arg) elif opt == '-n': parse_200M = False elif opt == '-r': parse_random_tweets = True dev2013 = "semeval/dev2013-task-B.tsv" dev2016 = "semeval/dev2016-task-A.tsv" devtest2016 = "semeval/devtest2016-task-A.tsv" test2013_sms = "semeval/test2013sms-task-B.tsv" test2013_twitter = "semeval/test2013-task-B.tsv" test2014_livejournal = "semeval/test2014lj-task-B.tsv" test2014_sarcasm = "semeval/test2014sarcasm-task-B.tsv" test2014_twitter = "semeval/test2014-task-B.tsv" test2015 = "semeval/test2015-task-B.tsv" test2016 = "semeval/test2016-task-A.tsv" train2013 = "semeval/train2013-task-B.tsv" train16 = "semeval/train2016-task-A.tsv" de_train = "semeval/de_train.tsv" de_test = "semeval/de_test.tsv" it_test = "semeval/it_test.tsv" it_train = "semeval/it_train.tsv" nl_train = "semeval/nl_train.tsv" nl_test = "semeval/nl_test.tsv" de_en_test = "semeval/de_eng_n.tsv" de_no_en_test = "semeval/de_no_eng_n.tsv" if load_vocab: alphabet = cPickle.load(open(fname_vocab)) dummy_word_idx = alphabet.get('DUMMY_WORD_IDX', DUMMY_WORD_IDX) print "alphabet", len(alphabet) print 'dummy_word:', dummy_word_idx else: alphabet = Alphabet(start_feature_id=0) alphabet.add('UNKNOWN_WORD_IDX') alphabet.add('DUMMY_WORD_IDX') dummy_word_idx = DUMMY_WORD_IDX print "Loading Semeval Data" #ncol is the number of columns inside the files in semeval files = [ (train2013, 4), (dev2013, 4), (test2013_sms, 4), (test2013_twitter, 4), (test2014_twitter, 4), (test2014_livejournal, 4), (test2014_sarcasm, 4), (test2015, 4), (train16, 3), (dev2016, 3), (devtest2016, 3), (test2016, 3), (de_test, 4), (de_train, 4), (it_test, 4), (it_train, 4), (nl_test, 4), (nl_train, 4), (de_en_test, 4), (de_no_en_test, 4), ] if parse_random_tweets: outdir = outdir + '_random' files = map(lambda x: (os.path.join('random_tweets', x), 3), os.listdir('random_tweets')) if not os.path.exists(outdir): os.makedirs(outdir) for fname, ncols in files: tid, tweets, sentiments = load_data(fname, alphabet, ncols=ncols) print "Number of tweets:", len(tweets) tweet_idx = p_utils.convert2indices(tweets, alphabet, dummy_word_idx) basename, _ = os.path.splitext(os.path.basename(fname)) np.save(os.path.join(outdir, '{}.tids.npy'.format(basename)), tid) np.save(os.path.join(outdir, '{}.tweets.npy'.format(basename)), tweet_idx) np.save(os.path.join(outdir, '{}.sentiments.npy'.format(basename)), sentiments) if parse_200M: print "Loading Smiley Data" basename, _ = os.path.splitext(os.path.basename('smiley_tweets')) nTweets = p_utils.store_file( smiley_tweets, os.path.join(outdir, '{}.tweets.npy'.format(basename)), alphabet, dummy_word_idx, sentiment_fname=os.path.join(outdir, '{}.sentiments.npy'.format(basename)), 
max_tweets=n_max_tweets) print "Number of tweets:", nTweets nTf = open('misc/{}/nTweets.txt'.format(smiley_tweets_fname), 'wb') nTf.write(str(nTweets)) nTf.close() cPickle.dump(alphabet, open(os.path.join(outdir, 'last_vocab.pickle'), 'wb'))
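# Example invocation (hedged: the script name is a placeholder; the flag
# names come from the getopt spec in main() above):
#
#     python parse_tweets.py --vocab vocab_en300M --tweets smiley_tweets_200M \
#         --max_tweets 1000000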
class Data: def __init__(self): self.MAX_SENTENCE_LENGTH = 250 self.MAX_WORD_LENGTH = -1 self.number_normalized = True # self.punctuation_filter = True self.norm_word_emb = True self.norm_biword_emb = True self.norm_gaz_emb = False self.word_alphabet = Alphabet('word') self.biword_alphabet = Alphabet('biword') self.char_alphabet = Alphabet('character') # self.word_alphabet.add(START) # self.word_alphabet.add(UNKNOWN) # self.char_alphabet.add(START) # self.char_alphabet.add(UNKNOWN) # self.char_alphabet.add(PADDING) self.label_alphabet = Alphabet('label', True) self.gaz_lower = False self.gaz = Gazetteer(self.gaz_lower) self.gaz_alphabet = Alphabet('gaz') self.HP_fix_gaz_emb = False self.HP_use_gaz = True self.tagScheme = "NoSeg" self.char_features = "LSTM" self.train_texts = [] self.dev_texts = [] self.test_texts = [] self.raw_texts = [] self.train_Ids = [] self.dev_Ids = [] self.test_Ids = [] self.raw_Ids = [] self.use_bigram = True self.word_emb_dim = 50 self.biword_emb_dim = 50 self.char_emb_dim = 30 self.gaz_emb_dim = 50 self.gaz_dropout = 0.5 self.pretrain_word_embedding = None self.pretrain_biword_embedding = None self.pretrain_gaz_embedding = None self.label_size = 0 self.word_alphabet_size = 0 self.biword_alphabet_size = 0 self.char_alphabet_size = 0 self.label_alphabet_size = 0 ### hyperparameters self.HP_iteration = 100 self.HP_batch_size = 10 self.HP_char_hidden_dim = 50 self.HP_hidden_dim = 200 self.HP_dropout = 0.5 self.HP_lstm_layer = 1 self.HP_bilstm = True self.HP_use_char = False self.HP_gpu = False self.HP_lr = 0.015 self.HP_lr_decay = 0.05 self.HP_clip = 5.0 self.HP_momentum = 0 def show_data_summary(self): addLogSectionMark("DATA SUMMARY") print("DATA SUMMARY START:") print(" Tag scheme: %s" % (self.tagScheme)) print(" MAX SENTENCE LENGTH: %s" % (self.MAX_SENTENCE_LENGTH)) print(" MAX WORD LENGTH: %s" % (self.MAX_WORD_LENGTH)) print(" Number normalized: %s" % (self.number_normalized)) # print(" Punctuation filter: %s" % (self.punctuation_filter)) print(" Use bigram: %s" % (self.use_bigram)) print(" Word alphabet size: %s" % (self.word_alphabet_size)) print(" Biword alphabet size: %s" % (self.biword_alphabet_size)) print(" Char alphabet size: %s" % (self.char_alphabet_size)) print(" Gaz alphabet size: %s" % (self.gaz_alphabet.size())) print(" Label alphabet size: %s" % (self.label_alphabet_size)) print(" Word embedding size: %s" % (self.word_emb_dim)) print(" Biword embedding size: %s" % (self.biword_emb_dim)) print(" Char embedding size: %s" % (self.char_emb_dim)) print(" Gaz embedding size: %s" % (self.gaz_emb_dim)) print(" Norm word emb: %s" % (self.norm_word_emb)) print(" Norm biword emb: %s" % (self.norm_biword_emb)) print(" Norm gaz emb: %s" % (self.norm_gaz_emb)) print(" Norm gaz dropout: %s" % (self.gaz_dropout)) print(" Train instance number: %s" % (len(self.train_texts))) print(" Dev instance number: %s" % (len(self.dev_texts))) print(" Test instance number: %s" % (len(self.test_texts))) print(" Raw instance number: %s" % (len(self.raw_texts))) print(" Hyperpara iteration: %s" % (self.HP_iteration)) print(" Hyperpara batch size: %s" % (self.HP_batch_size)) print(" Hyperpara lr: %s" % (self.HP_lr)) print(" Hyperpara lr_decay: %s" % (self.HP_lr_decay)) print(" Hyperpara HP_clip: %s" % (self.HP_clip)) print(" Hyperpara momentum: %s" % (self.HP_momentum)) print(" Hyperpara hidden_dim: %s" % (self.HP_hidden_dim)) print(" Hyperpara dropout: %s" % (self.HP_dropout)) print(" Hyperpara lstm_layer: %s" % (self.HP_lstm_layer)) print(" Hyperpara bilstm: %s" % 
(self.HP_bilstm)) print(" Hyperpara GPU: %s" % (self.HP_gpu)) print(" Hyperpara use_gaz: %s" % (self.HP_use_gaz)) print(" Hyperpara fix gaz emb: %s" % (self.HP_fix_gaz_emb)) print(" Hyperpara use_char: %s" % (self.HP_use_char)) logger.info(" Tag scheme: %s" % (self.tagScheme)) logger.info(" MAX SENTENCE LENGTH: %s" % (self.MAX_SENTENCE_LENGTH)) logger.info(" MAX WORD LENGTH: %s" % (self.MAX_WORD_LENGTH)) logger.info(" Number normalized: %s" % (self.number_normalized)) logger.info(" Use bigram: %s" % (self.use_bigram)) logger.info(" Word alphabet size: %s" % (self.word_alphabet_size)) logger.info(" Biword alphabet size: %s" % (self.biword_alphabet_size)) logger.info(" Char alphabet size: %s" % (self.char_alphabet_size)) logger.info(" Gaz alphabet size: %s" % (self.gaz_alphabet.size())) logger.info(" Label alphabet size: %s" % (self.label_alphabet_size)) logger.info(" Word embedding size: %s" % (self.word_emb_dim)) logger.info(" Biword embedding size: %s" % (self.biword_emb_dim)) logger.info(" Char embedding size: %s" % (self.char_emb_dim)) logger.info(" Gaz embedding size: %s" % (self.gaz_emb_dim)) logger.info(" Norm word emb: %s" % (self.norm_word_emb)) logger.info(" Norm biword emb: %s" % (self.norm_biword_emb)) logger.info(" Norm gaz emb: %s" % (self.norm_gaz_emb)) logger.info(" Norm gaz dropout: %s" % (self.gaz_dropout)) logger.info(" Train instance number: %s" % (len(self.train_texts))) logger.info(" Dev instance number: %s" % (len(self.dev_texts))) logger.info(" Test instance number: %s" % (len(self.test_texts))) logger.info(" Raw instance number: %s" % (len(self.raw_texts))) logger.info(" Hyperpara iteration: %s" % (self.HP_iteration)) logger.info(" Hyperpara batch size: %s" % (self.HP_batch_size)) logger.info(" Hyperpara lr: %s" % (self.HP_lr)) logger.info(" Hyperpara lr_decay: %s" % (self.HP_lr_decay)) logger.info(" Hyperpara HP_clip: %s" % (self.HP_clip)) logger.info(" Hyperpara momentum: %s" % (self.HP_momentum)) logger.info(" Hyperpara hidden_dim: %s" % (self.HP_hidden_dim)) logger.info(" Hyperpara dropout: %s" % (self.HP_dropout)) logger.info(" Hyperpara lstm_layer: %s" % (self.HP_lstm_layer)) logger.info(" Hyperpara bilstm: %s" % (self.HP_bilstm)) logger.info(" Hyperpara GPU: %s" % (self.HP_gpu)) logger.info(" Hyperpara use_gaz: %s" % (self.HP_use_gaz)) logger.info(" Hyperpara fix gaz emb: %s" % (self.HP_fix_gaz_emb)) print(" Hyperpara use_char: %s" % (self.HP_use_char)) if self.HP_use_char: print(" Char_features: %s" % (self.char_features)) logger.info(" Char_features: %s" % (self.char_features)) print("DATA SUMMARY END.") sys.stdout.flush() def refresh_label_alphabet(self, input_file): old_size = self.label_alphabet_size self.label_alphabet.clear(True) in_lines = open(input_file, 'r').readlines() for line in in_lines: if len(line) > 2: pairs = line.strip().split() label = pairs[-1] self.label_alphabet.add(label) self.label_alphabet_size = self.label_alphabet.size() startS = False startB = False for label, _ in self.label_alphabet.iteritems(): if "S-" in label.upper(): startS = True elif "B-" in label.upper(): startB = True if startB: if startS: self.tagScheme = "BMES" else: self.tagScheme = "BIO" self.fix_alphabet() print("Refresh label alphabet finished: old:%s -> new:%s" % (old_size, self.label_alphabet_size)) def build_alphabet(self, input_file): in_lines = open(input_file, 'r').readlines() for idx in xrange(len(in_lines)): line = in_lines[idx] if len(line) > 2: pairs = line.strip().split() word = pairs[0].decode('utf-8') if self.number_normalized: word = 
normalize_word(word) label = pairs[-1] self.label_alphabet.add(label) self.word_alphabet.add(word) if idx < len(in_lines) - 1 and len(in_lines[idx + 1]) > 2: biword = word + in_lines[ idx + 1].strip().split()[0].decode('utf-8') else: biword = word + NULLKEY self.biword_alphabet.add(biword) for char in word: self.char_alphabet.add(char) self.word_alphabet_size = self.word_alphabet.size() self.biword_alphabet_size = self.biword_alphabet.size() self.char_alphabet_size = self.char_alphabet.size() self.label_alphabet_size = self.label_alphabet.size() startS = False startB = False for label, _ in self.label_alphabet.iteritems(): if "S-" in label.upper(): startS = True elif "B-" in label.upper(): startB = True if startB: if startS: self.tagScheme = "BMES" else: self.tagScheme = "BIO" def build_gaz_file(self, gaz_file): ## build gaz file,initial read gaz embedding file if gaz_file: fins = open(gaz_file, 'r').readlines() for fin in fins: fin = fin.strip().split()[0].decode('utf-8') if fin: self.gaz.insert(fin, "one_source") print "Load gaz file: ", gaz_file, " total size:", self.gaz.size() else: print "Gaz file is None, load nothing" def build_gaz_alphabet(self, input_file): in_lines = open(input_file, 'r').readlines() word_list = [] for line in in_lines: if len(line) > 3: word = line.split()[0].decode('utf-8') if self.number_normalized: word = normalize_word(word) word_list.append(word) else: w_length = len(word_list) for idx in range(w_length): matched_entity = self.gaz.enumerateMatchList( word_list[idx:]) for entity in matched_entity: # print entity, self.gaz.searchId(entity),self.gaz.searchType(entity) self.gaz_alphabet.add(entity) word_list = [] print "gaz alphabet size:", self.gaz_alphabet.size() def fix_alphabet(self): self.word_alphabet.close() self.biword_alphabet.close() self.char_alphabet.close() self.label_alphabet.close() self.gaz_alphabet.close() def build_word_pretrain_emb(self, emb_path): print "build word pretrain emb..." self.pretrain_word_embedding, self.word_emb_dim = build_pretrain_embedding( emb_path, self.word_alphabet, self.word_emb_dim, self.norm_word_emb) def build_biword_pretrain_emb(self, emb_path): print "build biword pretrain emb..." self.pretrain_biword_embedding, self.biword_emb_dim = build_pretrain_embedding( emb_path, self.biword_alphabet, self.biword_emb_dim, self.norm_biword_emb) def build_gaz_pretrain_emb(self, emb_path): print "build gaz pretrain emb..." 
self.pretrain_gaz_embedding, self.gaz_emb_dim = build_pretrain_embedding( emb_path, self.gaz_alphabet, self.gaz_emb_dim, self.norm_gaz_emb) def generate_instance(self, input_file, name): self.fix_alphabet() if name == "train": self.train_texts, self.train_Ids = read_seg_instance( input_file, self.word_alphabet, self.biword_alphabet, self.char_alphabet, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH) elif name == "dev": self.dev_texts, self.dev_Ids = read_seg_instance( input_file, self.word_alphabet, self.biword_alphabet, self.char_alphabet, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH) elif name == "test": self.test_texts, self.test_Ids = read_seg_instance( input_file, self.word_alphabet, self.biword_alphabet, self.char_alphabet, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH) elif name == "raw": self.raw_texts, self.raw_Ids = read_seg_instance( input_file, self.word_alphabet, self.biword_alphabet, self.char_alphabet, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH) else: print( "Error: you can only generate train/dev/test instance! Illegal input:%s" % (name)) def generate_instance_with_gaz(self, input_file, name): self.fix_alphabet() if name == "train": self.train_texts, self.train_Ids = read_instance_with_gaz( input_file, self.gaz, self.word_alphabet, self.biword_alphabet, self.char_alphabet, self.gaz_alphabet, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH) elif name == "dev": self.dev_texts, self.dev_Ids = read_instance_with_gaz( input_file, self.gaz, self.word_alphabet, self.biword_alphabet, self.char_alphabet, self.gaz_alphabet, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH) elif name == "test": self.test_texts, self.test_Ids = read_instance_with_gaz( input_file, self.gaz, self.word_alphabet, self.biword_alphabet, self.char_alphabet, self.gaz_alphabet, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH) elif name == "raw": self.raw_texts, self.raw_Ids = read_instance_with_gaz( input_file, self.gaz, self.word_alphabet, self.biword_alphabet, self.char_alphabet, self.gaz_alphabet, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH) elif name == "sentence": self.raw_texts, self.raw_Ids = read_instance_with_gaz_text( input_file, self.gaz, self.word_alphabet, self.biword_alphabet, self.char_alphabet, self.gaz_alphabet, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH) else: print( "Error: you can only generate train/dev/test instance! Illegal input:%s" % (name)) def write_decoded_results(self, output_file, predict_results, name): fout = open(output_file, 'w') sent_num = len(predict_results) content_list = [] if name == 'raw': content_list = self.raw_texts elif name == 'test': content_list = self.test_texts elif name == 'dev': content_list = self.dev_texts elif name == 'train': content_list = self.train_texts else: print( "Error: illegal name during writing predict result, name should be within train/dev/test/raw !" ) assert (sent_num == len(content_list)) for idx in range(sent_num): sent_length = len(predict_results[idx]) for idy in range(sent_length): ## content_list[idx] is a list with [word, char, label] fout.write(content_list[idx][0][idy].encode('utf-8') + " " + predict_results[idx][idy] + '\n') fout.write('\n') fout.close() print("Predict %s result has been written into file. 
%s" % (name, output_file)) def write_decoded_results_back(self, predict_results, name): sent_num = len(predict_results) content_list = [] if name == 'raw': content_list = self.raw_texts elif name == 'test': content_list = self.test_texts elif name == 'dev': content_list = self.dev_texts elif name == 'train': content_list = self.train_texts else: print( "Error: illegal name during writing predict result, name should be within train/dev/test/raw !" ) assert (sent_num == len(content_list)) result = [] for idx in range(sent_num): sent_length = len(predict_results[idx]) for idy in range(sent_length): ## content_list[idx] is a list with [word, char, label] print(content_list[idx][0][idy].encode('utf-8') + " " + predict_results[idx][idy] + '\n') for idx in range(sent_num): sent_length = len(predict_results[idx]) data = {'start': '', 'end': "", 'value': '', 'entity': ''} value = '' for idy in range(sent_length): pre_su_item = predict_results[idx][idy].split('-') if pre_su_item[0] == 'S': data['start'] = str(idy) data['end'] = str(idy + 1) data['value'] = content_list[idx][0][idy].encode('utf-8') data['entity'] = pre_su_item[1] result.append(data) data = {'start': '', 'end': "", 'value': '', 'entity': ''} if pre_su_item[0] == 'B': data['start'] = str(idy) value = value + (content_list[idx][0][idy].encode('utf-8')) if pre_su_item[0] == 'E': value = value + (content_list[idx][0][idy].encode('utf-8')) data['end'] = str(idy + 1) data['value'] = value data['entity'] = pre_su_item[1] result.append(data) data = {'start': '', 'end': "", 'value': '', 'entity': ''} value = '' if pre_su_item[0] == 'I': value = value + (content_list[idx][0][idy].encode('utf-8')) return result def write_http_data(self, output_file, inputData, name): fout = open(output_file, 'w') get_num = len(inputData) start = 0 numOfParagram = int(math.ceil(get_num / 5.0)) num_start_sentence = start num_end_sentence = numOfParagram if name == "test": num_start_sentence = 0 num_end_sentence = numOfParagram elif name == "dev": num_start_sentence = numOfParagram num_end_sentence = numOfParagram * 2 elif name == "train": num_start_sentence = numOfParagram * 2 num_end_sentence = get_num for idx in range(num_start_sentence, num_end_sentence): text = inputData[idx]["text"] entities = inputData[idx]["entities"] idText = 1 inWord = False tagReady = False entity_name = '' for Text in text: ## content_list[idx] is a list with [word, char, label] tagReady = False for entity in entities: if not inWord: if entity['start'] + 1 == entity['end'] and entity[ 'end'] == idText: fout.write( Text.encode('utf-8') + " " + "S-" + entity['entity'].encode('utf-8') + '\n') tagReady = True break if entity['start'] + 1 == idText: fout.write( Text.encode('utf-8') + " " + "B-" + entity['entity'].encode('utf-8') + '\n') tagReady = True inWord = True entity_name = entity['entity'].encode('utf-8') break else: if entity['end'] == idText: fout.write( Text.encode('utf-8') + " " + "E-" + entity_name + '\n') tagReady = True inWord = False break if not tagReady: if not inWord: fout.write(Text.encode('utf-8') + " " + "O" + '\n') else: fout.write( Text.encode('utf-8') + " " + "I-" + entity_name + '\n') idText = idText + 1 fout.write('\n') fout.close() print("Predict input data has been written into file. %s" % (output_file))
if opt.use_char:
    enc_char_alphabet = Alphabet('enc_char')
else:
    enc_char_alphabet = None
if opt.method == 'cla':  # classification needs no decoder vocabulary
    dec_word_alphabet = None
    dec_char_alphabet = None
else:
    dec_word_alphabet = Alphabet('dec_word')
    if opt.use_char:
        dec_char_alphabet = Alphabet('dec_char')
    else:
        dec_char_alphabet = None
    # the sequence markers only exist when there is a decoder vocabulary;
    # adding them outside this branch would crash on a None alphabet
    dec_word_alphabet.add('<SOS>')
    dec_word_alphabet.add('<EOS>')
build_alphabet(enc_word_alphabet, enc_char_alphabet, dec_word_alphabet,
               dec_char_alphabet, train_datapoints)
build_alphabet_1(enc_word_alphabet, enc_char_alphabet, dec_word_alphabet,
                 dec_char_alphabet, dev_datapoints)
if len(test_documents) != 0:
    build_alphabet_1(enc_word_alphabet, enc_char_alphabet, dec_word_alphabet,
                     dec_char_alphabet, test_datapoints)
if opt.pretraining:
    build_alphabet(enc_word_alphabet, enc_char_alphabet, dec_word_alphabet,
                   dec_char_alphabet, dict_datapoints)
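# Every snippet in this collection leans on an Alphabet, a bidirectional
# string<->index map. A minimal sketch of the interface these call sites
# assume (add / get_index / size / close); the real class also carries fields
# such as fid and start_feature_id that are omitted here:
class MiniAlphabet(object):
    UNKNOWN = '</unk>'

    def __init__(self, name, label=False):
        self.name = name
        self._locked = False
        self._index = {}
        self._items = []
        if not label:  # label alphabets are usually built without an UNK entry
            self.add(self.UNKNOWN)

    def add(self, item):
        if not self._locked and item not in self._index:
            self._index[item] = len(self._items)
            self._items.append(item)

    def get_index(self, item):
        # unseen items fall back to the UNK id once the alphabet is closed
        return self._index.get(item, self._index.get(self.UNKNOWN, 0))

    def size(self):
        return len(self._items)

    def close(self):
        self._locked = True

# usage: a = MiniAlphabet('dec_word'); a.add('<SOS>'); a.add('<EOS>'); a.close()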
class BinarySource(Source):
    """
    Source for binary classification data in the following format: one example
    per line, as whitespace-separated feature:value pairs (feature and value
    are separated by `sep`, ':' by default). E.g.:

        1 f1:1.0 f2:1.0 f3:1.0
        -1 f2:1.0 f3:1.0 f8:1.0
        -1 f1:1.0 f2:1.0
        1 f8:1.0 f9:1.0 f10:1.0
    """

    def __init__(self, data, encoding="utf-8", feature_alphabet=None,
                 alphabet_pop=True, alphabet_lock=True, sep=":",
                 bias=False, bias_prefix="@@BIAS@@"):
        Source.__init__(self, data, encoding=encoding)
        self._Instance = BinaryClassificationInstance
        if feature_alphabet is not None:
            self._feature_alphabet = feature_alphabet
        else:
            self._feature_alphabet = Alphabet(locked=False)
        self._sep = sep
        self._bias = bias
        self._bias_prefix = bias_prefix
        if alphabet_pop:
            self._populate_alphabet()
        if alphabet_lock:
            self.lock_alphabet()
        else:
            self.unlock_alphabet()

    def _parse(self):
        """ yield (label, feature list) pairs parsed from the stream """
        sep = self._sep
        for line in self._stream:
            line = line.rstrip()
            items = line.split()
            cl = items[0]
            assert cl in [POS_LAB, NEG_LAB]
            feats = []
            if self._bias:
                feats.append((self._bias_prefix, 1.0))  # implicit bias
            for s in items[1:]:
                try:
                    f, v = s.rsplit(sep, 1)
                    v = float(v)
                    feats.append((f, v))
                except ValueError:
                    sys.exit("Datasource error: make sure you use the right datasource format.")
            yield (cl, feats)

    def _populate_alphabet(self):
        print >> sys.stderr, "Populating feature alphabet... ",
        self.unlock_alphabet()
        if self._stream_type == "generator":
            for i, gen_inst in enumerate(self._stream):  # read stream directly
                sys.stderr.write("%s" % "\b" * len(str(i)) + str(i))
                featvals = gen_inst.get_featvals()
                for (f, _) in featvals:
                    self._feature_alphabet.add(f)
        else:
            try:
                for tag, feats in self._parse():
                    for f, _ in feats:
                        self._feature_alphabet.add(f)
            except ValueError:
                sys.exit("Datasource error: make sure you use the right data format.")
            # rewind stream
            try:
                self.rewind()
            except TypeError:
                sys.exit("TypeError: make sure rewind() is used only on files.")
        print >> sys.stderr, " done."
        print >> sys.stderr, "Number of features: %s" % self._feature_alphabet.size()

    def unlock_alphabet(self):
        self._feature_alphabet.unlock()

    def lock_alphabet(self):
        self._feature_alphabet.lock()

    def set_alphabet(self, feature_alphabet):
        self._feature_alphabet = feature_alphabet

    def get_alphabet(self):
        return self._feature_alphabet

    def get_input(self):
        for label, feats in self._parse():
            yield label, feats

    def __iter__(self):
        """ instance generator """
        feature_alphabet = self._feature_alphabet
        assert not (feature_alphabet.empty() and feature_alphabet.locked()), "Feature alphabet is empty!"
        if self._stream_type in ["file", "list"]:
            for idx, (label, feats) in enumerate(self._parse()):
                if not feature_alphabet.locked():  # dynamic feature alphabet
                    for (f, _) in feats:
                        feature_alphabet.add(f)
                yield self._Instance(idx, label, feats, feature_alphabet)
        elif self._stream_type == "generator":
            for idx, gen_inst in enumerate(self._stream):  # read stream directly
                featvals = gen_inst.get_featvals()
                label = gen_inst.get_label()
                if not feature_alphabet.locked():  # dynamic feature alphabet
                    for (f, _) in featvals:
                        feature_alphabet.add(f)
                # the original passed an undefined `label_alphabet` here; the
                # instance signature matches the file/list branch above
                yield self._Instance(idx, label, featvals, feature_alphabet)

    def size(self):
        s = len(list(self._stream))
        self.rewind()
        return s
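# The _parse method above reads one example per line in the sparse
# "label feat:val feat:val ..." format. A standalone sketch of that parsing
# (the POS_LAB/NEG_LAB values are assumed from the class docstring):
POS_LAB, NEG_LAB = "1", "-1"

def parse_example(line, sep=":"):
    items = line.rstrip().split()
    label = items[0]
    assert label in (POS_LAB, NEG_LAB)
    # rsplit keeps separators inside feature names intact, e.g. "url:http://x:1.0"
    feats = [(f, float(v)) for f, v in (s.rsplit(sep, 1) for s in items[1:])]
    return label, feats

# parse_example("1 f1:1.0 f2:0.5") -> ('1', [('f1', 1.0), ('f2', 0.5)])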
class VsmNormer(nn.Module): def __init__(self): super(VsmNormer, self).__init__() self.word_alphabet = Alphabet('word') self.embedding_dim = None self.word_embedding = None self.dict_alphabet = Alphabet('dict') self.dict_embedding = None self.gpu = opt.gpu def transfer_model_into_gpu(self): if torch.cuda.is_available(): self.word_embedding = self.word_embedding.cuda(self.gpu) self.dict_embedding = self.dict_embedding.cuda(self.gpu) def batch_name_to_ids(self, name): tokens = my_tokenize(name) length = len(tokens) tokens_id = np.zeros((1, length), dtype=np.int) for i, word in enumerate(tokens): word = norm_utils.word_preprocess(word) tokens_id[0][i] = self.word_alphabet.get_index(word) tokens_id = torch.from_numpy(tokens_id) if torch.cuda.is_available(): return tokens_id.cuda(self.gpu) else: return tokens_id def init_vector_for_dict(self, meddra_dict): self.dict_embedding = nn.Embedding(len(meddra_dict), self.embedding_dim) if torch.cuda.is_available(): self.dict_embedding = self.dict_embedding.cuda(self.gpu) for concept_id, concept_name in meddra_dict.items(): self.dict_alphabet.add(concept_id) with torch.no_grad(): tokens_id = self.batch_name_to_ids(concept_name) length = tokens_id.size(1) emb = self.word_embedding(tokens_id) emb = emb.unsqueeze_(1) pool = functional.avg_pool2d(emb, (length, 1)) index = norm_utils.get_dict_index(self.dict_alphabet, concept_id) self.dict_embedding.weight.data[index] = pool[0][0] def compute_similarity(self, mention_rep, concep_rep): # mention_rep is (batch, emb_dim) and concep_rep is (concept_num, emb_dim) mention_rep_norm = torch.norm(mention_rep, 2, 1, True) # batch 1 concep_rep_norm = torch.norm(concep_rep, 2, 1, True) # concept 1 a = torch.matmul(mention_rep_norm, torch.t(concep_rep_norm)) # batch, concept a = a.clamp(min=1e-8) b = torch.matmul(mention_rep, torch.t(concep_rep)) # batch, concept return b / a def forward(self, mention_word_ids): length = mention_word_ids.size(1) mention_word_emb = self.word_embedding(mention_word_ids) mention_word_emb = mention_word_emb.unsqueeze_(1) mention_word_pool = functional.avg_pool2d(mention_word_emb, (length, 1)) # batch,1,1,100 mention_word_pool = mention_word_pool.squeeze_(1).squeeze_( 1) # batch,100 # similarities = torch.t(torch.matmul(self.dict_embedding.weight.data, torch.t(mention_word_pool))) # batch, dict similarities = self.compute_similarity(mention_word_pool, self.dict_embedding.weight.data) values, indices = torch.max(similarities, 1) return values, indices def process_one_doc(self, doc, entities, dict): for entity in entities: with torch.no_grad(): tokens_id = self.batch_name_to_ids(entity.name) values, indices = self.forward(tokens_id) norm_id = norm_utils.get_dict_name(self.dict_alphabet, indices.item()) name = dict[norm_id] entity.norm_ids.append(norm_id) entity.norm_names.append(name) entity.norm_confidences.append(values.item())
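# compute_similarity above is batched cosine similarity between mention
# vectors and the whole concept embedding table. The same computation in
# plain numpy (a sketch for illustration, not part of the model code):
import numpy as np

def cosine_sim(mentions, concepts, eps=1e-8):
    # mentions: (batch, dim); concepts: (num_concepts, dim)
    m_norm = np.linalg.norm(mentions, axis=1, keepdims=True)  # (batch, 1)
    c_norm = np.linalg.norm(concepts, axis=1, keepdims=True)  # (num, 1)
    denom = np.clip(m_norm * c_norm.T, eps, None)             # (batch, num)
    return mentions.dot(concepts.T) / denom

# cosine_sim(np.array([[1.0, 0.0]]),
#            np.array([[1.0, 0.0], [0.0, 1.0]]))  -> [[1., 0.]]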
class Data:
    def __init__(self, opt):
        self.train_data = None
        self.dev_data = None
        self.test_data = None
        self.word_alphabet = Alphabet('word')
        self.char_alphabet = Alphabet('character')
        self.label_alphabet = Alphabet('label', True)
        self.train_texts = None
        self.train_Ids = None
        self.dev_texts = None
        self.dev_Ids = None
        self.test_texts = None
        self.test_Ids = None
        self.pretrain_word_embedding = None
        self.word_emb_dim = opt.word_emb_dim
        self.config = self.read_config(opt.config)
        self.feat_config = None
        the_item = 'ner_feature'
        if the_item in self.config:
            self.feat_config = self.config[the_item]  ## [POS]:{emb_size:20}
        self.feature_alphabets = []
        self.feature_emb_dims = []
        # guard: feat_config stays None when 'ner_feature' is absent from the config
        if self.feat_config is not None:
            for k, v in self.feat_config.items():
                self.feature_alphabets.append(Alphabet(k))
                self.feature_emb_dims.append(int(v['emb_size']))

    def clear(self):
        self.train_data = None
        self.dev_data = None
        self.test_data = None
        self.train_texts = None
        self.train_Ids = None
        self.dev_texts = None
        self.dev_Ids = None
        self.test_texts = None
        self.test_Ids = None
        self.pretrain_word_embedding = None

    def build_alphabet(self, data):
        for document in data:
            for sentence in document.sentences:
                for token in sentence:
                    word = token['text']
                    if opt.ner_number_normalized:
                        word = normalize_word(word)
                    self.word_alphabet.add(word)
                    if token.get('label') is not None:
                        self.label_alphabet.add(token['label'])
                    if self.feat_config is not None:
                        for alphabet in self.feature_alphabets:
                            if alphabet.name == '[POS]':
                                alphabet.add(token['pos'])
                            elif alphabet.name == '[Cap]':
                                alphabet.add(token['cap'])
                    for char in word:
                        self.char_alphabet.add(char)

    def fix_alphabet(self):
        self.word_alphabet.close()
        self.char_alphabet.close()
        self.label_alphabet.close()

    def load(self, data_file):
        f = open(data_file, 'rb')
        tmp_dict = pk.load(f)
        f.close()
        self.__dict__.update(tmp_dict)

    def save(self, save_file):
        f = open(save_file, 'wb')
        pk.dump(self.__dict__, f, 2)
        f.close()

    def read_config(self, config_file):
        config = config_file_to_dict(config_file)
        return config
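# The load/save pair above persists the whole object by pickling its __dict__
# and restoring it with update(). The pattern in isolation (Snapshot and the
# output path are illustrative):
import pickle

class Snapshot(object):
    def save(self, path):
        with open(path, 'wb') as f:
            pickle.dump(self.__dict__, f, 2)  # protocol 2 stays py2-readable

    def load(self, path):
        with open(path, 'rb') as f:
            self.__dict__.update(pickle.load(f))

# s = Snapshot(); s.vocab_size = 123; s.save('/tmp/data.dset')
# t = Snapshot(); t.load('/tmp/data.dset')  # -> t.vocab_size == 123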
def main(argv):
    outdir = "preprocessed_data"
    out_file = ''
    out_reduced = ''
    in_file = ''
    max_tweets = np.inf
    fwemb_vocabulary = None
    try:
        # '--wfilter' and '--maxTweets' take arguments, so they need '='
        opts, args = getopt.getopt(
            argv, "i:o:f:m:", ["ifile=", "ofile=", "wfilter=", "maxTweets="])
    except getopt.GetoptError:
        print 'test.py -i <inputfile> -o <outputfile>'
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-o", "--ofile"):
            out_file = '{}.pickle'.format(arg)
            out_reduced = '{}_reduced.pickle'.format(arg)
        elif opt in ("-i", "--ifile"):
            in_file = 'semeval/{}.gz'.format(arg)
        elif opt in ('-f', '--wfilter'):
            fwemb_vocabulary = load_glove_vocabulary('embeddings/{}'.format(arg), ' ')
        elif opt in ('-m', '--maxTweets'):
            max_tweets = int(arg)
    print outdir
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    # unsupervised data
    alphabet = Alphabet(start_feature_id=0)
    alphabet.add('UNKNOWN_WORD_IDX')
    dummy_word_idx = alphabet.fid
    tknzr = TweetTokenizer(reduce_len=True)
    fnames_gz = [in_file]
    counter = 0
    for fname in fnames_gz:
        with gzip.open(fname, 'r') as f:
            for tweet in tqdm(f):
                tweet = tknzr.tokenize(preprocess_tweet(tweet))
                for token in tweet:
                    if fwemb_vocabulary:
                        if token in fwemb_vocabulary:
                            alphabet.add(token)
                    else:
                        alphabet.add(token)
                counter += 1
                if (counter % 1000000) == 0:
                    print 'Processed tweets: {}'.format(counter)
                    print 'Alphabet length: {}'.format(len(alphabet))
                if counter > max_tweets:
                    break
    print 'Alphabet before purge:', len(alphabet)
    cPickle.dump(alphabet, open(os.path.join(outdir, out_file), 'wb'))
    # Keep only frequent words. Adding back into the same alphabet is a no-op,
    # so the purge has to build a fresh, reduced alphabet.
    reduced_alphabet = Alphabet(start_feature_id=0)
    reduced_alphabet.add('UNKNOWN_WORD_IDX')
    for word, (idx, freq) in tqdm(alphabet.items()):
        if freq > 10:
            reduced_alphabet.add(word)
    reduced_alphabet.add('DUMMY_WORD_IDX')
    print "Alphabet after purge:", len(reduced_alphabet)
    cPickle.dump(reduced_alphabet, open(os.path.join(outdir, out_reduced), 'wb'))
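# The purge step above keeps only tokens whose corpus frequency clears a
# cutoff. The same idea with collections.Counter (cutoff 2 here instead of
# the script's 10; all values are illustrative):
from collections import Counter

demo_tokens = ['the', 'the', 'the', 'cat', 'sat', 'sat']
demo_counts = Counter(demo_tokens)
demo_vocab = ['UNKNOWN_WORD_IDX'] + sorted(w for w, c in demo_counts.items() if c >= 2)
demo_word2id = {w: i for i, w in enumerate(demo_vocab)}
# demo_word2id == {'UNKNOWN_WORD_IDX': 0, 'sat': 1, 'the': 2}; 'cat' maps to UNKNOWN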
def create_alphabets(alphabet_directory, data_paths, max_vocabulary_size,
                     normalize_digits=True):
    logger = utils.get_logger("Create Alphabets")
    word_alphabet = Alphabet('word')
    pos_alphabet = Alphabet('pos')
    type_alphabet = Alphabet('type')
    if not gfile.Exists(alphabet_directory):
        logger.info("Creating Alphabets: %s" % alphabet_directory)
        pos_alphabet.add(ROOT_POS)
        type_alphabet.add(ROOT_TYPE)
        pos_alphabet.add(PAD_POS)
        type_alphabet.add(PAD_TYPE)
        vocab = dict()
        for data_path in data_paths:
            logger.info("Processing data: %s" % data_path)
            with gfile.GFile(data_path, mode="r") as fin:
                for line in fin:
                    line = line.decode('utf-8').strip()
                    if len(line) == 0:
                        continue
                    tokens = line.split()
                    word = DIGIT_RE.sub(b"0", tokens[1]) if normalize_digits else tokens[1]
                    pos = tokens[4]
                    dep_type = tokens[7]
                    pos_alphabet.add(pos)
                    type_alphabet.add(dep_type)
                    if word in vocab:
                        vocab[word] += 1
                    else:
                        vocab[word] = 1
        vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
        logger.info("Total Vocabulary Size: %d" % len(vocab_list))
        logger.info("POS Alphabet Size: %d" % pos_alphabet.size())
        logger.info("Type Alphabet Size: %d" % type_alphabet.size())
        if len(vocab_list) > max_vocabulary_size:
            vocab_list = vocab_list[:max_vocabulary_size]
        for word in vocab_list:
            word_alphabet.add(word)
        word_alphabet.save(alphabet_directory)
        pos_alphabet.save(alphabet_directory)
        type_alphabet.save(alphabet_directory)
    else:
        word_alphabet.load(alphabet_directory)
        pos_alphabet.load(alphabet_directory)
        type_alphabet.load(alphabet_directory)
    word_alphabet.close()
    pos_alphabet.close()
    type_alphabet.close()
    return word_alphabet, pos_alphabet, type_alphabet
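# create_alphabets above folds every digit to '0' before counting, then caps
# the vocabulary at max_vocabulary_size ordered by frequency. Both steps in
# miniature (the sample words and cap are illustrative):
import re
from collections import Counter

DEMO_DIGIT_RE = re.compile(r"\d")

def normalize_digits_demo(word):
    return DEMO_DIGIT_RE.sub("0", word)  # "2024" -> "0000", so all years share one type

demo_word_counts = Counter(normalize_digits_demo(w) for w in ["2024", "2025", "year", "year"])
demo_max_vocabulary_size = 2
demo_vocab_list = [w for w, _ in demo_word_counts.most_common(demo_max_vocabulary_size)]
# demo_word_counts: {"0000": 2, "year": 2}; the list keeps the most frequent types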
class Data: def __init__(self): self.MAX_SENTENCE_LENGTH = 250 self.MAX_WORD_LENGTH = -1 self.number_normalized = True self.norm_word_emb = False self.norm_char_emb = False self.word_alphabet = Alphabet('word') self.char_alphabet = Alphabet('character') self.feature_name = [] self.feature_alphabets = [] self.feature_num = len(self.feature_alphabets) self.feat_config = None self.label_alphabet = {0: Alphabet('label', True)} self.tagScheme = "NoSeg" ## BMES/BIO self.seg = True ### I/O self.train_dir = None self.dev_dir = None self.test_dir = None self.raw_dir = None self.decode_dir = None self.dset_dir = None ## data vocabulary related file self.model_dir = None ## model save file self.load_model_dir = None ## model load file self.word_emb_dir = None self.char_emb_dir = None self.feature_emb_dirs = [] self.train_texts = [] self.dev_texts = [] self.test_texts = [] self.raw_texts = [] self.train_Ids = [] self.dev_Ids = [] self.test_Ids = [] self.raw_Ids = [] self.pretrain_word_embedding = None self.pretrain_char_embedding = None self.pretrain_feature_embeddings = [] #Added for pretraining self.PRETRAINED_ALL = "all" self.PRETRAINED_LSTMS = "lstms" self.pretrained_model = None self.pretrained_part = None self.label_size = 0 self.word_alphabet_size = 0 self.char_alphabet_size = 0 #self.label_alphabet_size = 0 self.label_alphabet_sizes = {0: 0} self.feature_alphabet_sizes = [] self.feature_emb_dims = [] self.norm_feature_embs = [] self.word_emb_dim = 50 self.char_emb_dim = 30 ###Networks self.word_feature_extractor = "LSTM" ## "LSTM"/"CNN"/"GRU"/ self.use_char = True self.char_feature_extractor = "CNN" ## "LSTM"/"CNN"/"GRU"/None self.use_crf = True self.nbest = None ## Training self.average_batch_loss = False self.optimizer = "SGD" ## "SGD"/"AdaGrad"/"AdaDelta"/"RMSProp"/"Adam" self.status = "train" ### Hyperparameters self.HP_cnn_layer = 4 self.HP_iteration = 100 self.HP_batch_size = 10 self.HP_char_hidden_dim = 50 self.HP_hidden_dim = 200 self.HP_feature_default_size = 20 self.HP_dropout = 0.5 self.HP_lstm_layer = 1 self.HP_bilstm = True self.HP_gpu = False self.HP_lr = 0.015 self.HP_lr_decay = 0.05 self.HP_clip = None self.HP_momentum = 0 self.HP_l2 = 1e-8 #D: The number of tasks to be solved self.HP_tasks = 1 self.HP_main_tasks = self.HP_tasks self.HP_tasks_weights = [1] self.optimize_with_evalb = False self.optimize_with_las = False self.offset = False self.choice_of_best_model = "avg" self.language = "English" # self.HP_tasks_inputs = [self.LSTMOUT] #Policy Gradient self.No_samples = 8 self.pg_variance_reduce = True self.variance_reduce_burn_in = 999 self.pg_valsteps = 1000 self.entropy_regularisation = True self.entropy_reg_coeff = 0.01 #Hyper-parameters for disjoint training self.train_task_ids = [] self.dev_task_ids = [] self.test_task_ids = [] self.raw_task_ids = [] self.disjoint = True self.datasets = {} self.tasks_metrics = {} self.HP_tasks_weight_decays = [0] def show_data_summary(self): print("++" * 50) print("DATA SUMMARY START:") print(" I/O:") print(" Tag scheme: %s" % (self.tagScheme)) print(" MAX SENTENCE LENGTH: %s" % (self.MAX_SENTENCE_LENGTH)) print(" MAX WORD LENGTH: %s" % (self.MAX_WORD_LENGTH)) print(" Number normalized: %s" % (self.number_normalized)) print(" Word alphabet size: %s" % (self.word_alphabet_size)) print(" Char alphabet size: %s" % (self.char_alphabet_size)) for idtask in self.label_alphabet: print(" Label alphabet size for task %s: %s" % (idtask, self.label_alphabet_sizes[idtask])) #print(" Label alphabet size: %s"%(self.label_alphabet_size)) print(" Word 
embedding dir: %s" % (self.word_emb_dir)) print(" Char embedding dir: %s" % (self.char_emb_dir)) print(" Word embedding size: %s" % (self.word_emb_dim)) print(" Char embedding size: %s" % (self.char_emb_dim)) print(" Norm word emb: %s" % (self.norm_word_emb)) print(" Norm char emb: %s" % (self.norm_char_emb)) print(" Train file directory: %s" % (self.train_dir)) print(" Dev file directory: %s" % (self.dev_dir)) print(" Test file directory: %s" % (self.test_dir)) print(" Raw file directory: %s" % (self.raw_dir)) print(" Dset file directory: %s" % (self.dset_dir)) print(" Model file directory: %s" % (self.model_dir)) print(" Pretrained model : %s" % (self.pretrained_model)) print(" Pretrained part : %s" % (self.pretrained_part)) print(" Loadmodel directory: %s" % (self.load_model_dir)) print(" Decode file directory: %s" % (self.decode_dir)) print(" Train instance number: %s" % (len(self.train_texts))) print(" Dev instance number: %s" % (len(self.dev_texts))) print(" Test instance number: %s" % (len(self.test_texts))) print(" Raw instance number: %s" % (len(self.raw_texts))) print(" FEATURE num: %s" % (self.feature_num)) for idx in range(self.feature_num): print(" Fe: %s alphabet size: %s" % (self.feature_alphabets[idx].name, self.feature_alphabet_sizes[idx])) print( " Fe: %s embedding dir: %s" % (self.feature_alphabets[idx].name, self.feature_emb_dirs[idx])) print( " Fe: %s embedding size: %s" % (self.feature_alphabets[idx].name, self.feature_emb_dims[idx])) print(" Fe: %s norm emb: %s" % (self.feature_alphabets[idx].name, self.norm_feature_embs[idx])) print(" " + "++" * 20) print(" Model Network:") print(" Model use_crf: %s" % (self.use_crf)) print(" Model word extractor: %s" % (self.word_feature_extractor)) print(" Model use_char: %s" % (self.use_char)) if self.use_char: print(" Model char extractor: %s" % (self.char_feature_extractor)) print(" Model char_hidden_dim: %s" % (self.HP_char_hidden_dim)) print(" " + "++" * 20) print(" Training:") print(" Optimizer: %s" % (self.optimizer)) print(" Iteration: %s" % (self.HP_iteration)) print(" BatchSize: %s" % (self.HP_batch_size)) print(" Average batch loss: %s" % (self.average_batch_loss)) print(" " + "++" * 20) print(" Hyperparameters:") print(" Hyper lr: %s" % (self.HP_lr)) print(" Hyper lr_decay: %s" % (self.HP_lr_decay)) print(" Hyper HP_clip: %s" % (self.HP_clip)) print(" Hyper momentum: %s" % (self.HP_momentum)) print(" Hyper l2: %s" % (self.HP_l2)) print(" Hyper hidden_dim: %s" % (self.HP_hidden_dim)) print(" Hyper dropout: %s" % (self.HP_dropout)) print(" Hyper lstm_layer: %s" % (self.HP_lstm_layer)) print(" Hyper bilstm: %s" % (self.HP_bilstm)) print(" Hyper GPU: %s" % (self.HP_gpu)) print(" Hyper number of tasks: %s" % (self.HP_tasks)) print("DATA SUMMARY END.") print("++" * 50) sys.stdout.flush() def initial_feature_alphabets(self): for l in open(self.train_dir, 'r').readlines(): if not l.startswith("#") and not l.startswith("-BOS-"): items = l.strip("\n").split() break total_column = len(items) if total_column > 2: for idx in range(1, total_column - 1): feature_prefix = items[idx].split(']', 1)[0] + "]" self.feature_alphabets.append(Alphabet(feature_prefix)) self.feature_name.append(feature_prefix) print "Find feature: ", feature_prefix self.feature_num = len(self.feature_alphabets) self.pretrain_feature_embeddings = [None] * self.feature_num self.feature_emb_dims = [self.HP_feature_default_size ] * self.feature_num #self.feature_emb_dims = [20]*self.feature_num self.feature_emb_dirs = [None] * self.feature_num 
self.norm_feature_embs = [False] * self.feature_num self.feature_alphabet_sizes = [0] * self.feature_num if self.feat_config: for idx in range(self.feature_num): if self.feature_name[idx] in self.feat_config: self.feature_emb_dims[idx] = self.feat_config[ self.feature_name[idx]]['emb_size'] self.feature_emb_dirs[idx] = self.feat_config[ self.feature_name[idx]]['emb_dir'] self.norm_feature_embs[idx] = self.feat_config[ self.feature_name[idx]]['emb_norm'] def build_alphabet(self, input_file): sample_corpus = None in_lines = open(input_file, 'r').readlines() for line in in_lines: if line.upper().startswith( TREEBANK_LINE ): #Check the treebank this sentence comes from sample_corpus = "[" + line.upper().replace(TREEBANK_LINE, "").strip() + "]" elif len(line) > 2: pairs = line.strip().split() word = pairs[0].decode('utf-8') if self.number_normalized: word = normalize_word(word) label = pairs[-1] if self.HP_tasks > 1 or not self.disjoint: #self.task_config[sample_corpus]["nb_tasks"] > 1: label = parse_multitask_label(label) else: label = [label] if len(label) != len( self.label_alphabet) and not self.disjoint: raise ValueError( "The number of tasks and the number of labels in the output column do not match" ) init_label_alp_index = 0 if not self.disjoint else self.task_config[ sample_corpus]["idstask"] for idtask, l in enumerate(label, init_label_alp_index): #for idtask, l in enumerate(label): self.label_alphabet[idtask].add(l) self.word_alphabet.add(word) for idx in range(self.feature_num): feat_idx = pairs[idx + 1].split(']', 1)[-1] self.feature_alphabets[idx].add(feat_idx) for char in word: self.char_alphabet.add(char) self.word_alphabet_size = self.word_alphabet.size() self.char_alphabet_size = self.char_alphabet.size() for idtask in self.label_alphabet: self.label_alphabet_sizes[idtask] = self.label_alphabet[ idtask].size() for idx in range(self.feature_num): self.feature_alphabet_sizes[idx] = self.feature_alphabets[ idx].size() for idtask in self.label_alphabet: startS = False startB = False for label, _ in self.label_alphabet[idtask].iteritems(): if "S-" in label.upper(): startS = True elif "B-" in label.upper(): startB = True if startB: if startS: self.tagScheme = "BMES" else: self.tagScheme = "BIO" def fix_alphabet(self): self.word_alphabet.close() self.char_alphabet.close() for idtask in self.label_alphabet: self.label_alphabet[idtask].close() for idx in range(self.feature_num): self.feature_alphabets[idx].close() def build_pretrain_emb(self): if self.word_emb_dir: print("Load pretrained word embedding, norm: %s, dir: %s" % (self.norm_word_emb, self.word_emb_dir)) self.pretrain_word_embedding, self.word_emb_dim = build_pretrain_embedding( self.word_emb_dir, self.word_alphabet, self.word_emb_dim, self.norm_word_emb) if self.char_emb_dir: print("Load pretrained char embedding, norm: %s, dir: %s" % (self.norm_char_emb, self.char_emb_dir)) self.pretrain_char_embedding, self.char_emb_dim = build_pretrain_embedding( self.char_emb_dir, self.char_alphabet, self.char_emb_dim, self.norm_char_emb) for idx in range(self.feature_num): if self.feature_emb_dirs[idx]: print( "Load pretrained feature %s embedding:, norm: %s, dir: %s" % (self.feature_name[idx], self.norm_feature_embs[idx], self.feature_emb_dirs[idx])) self.pretrain_feature_embeddings[idx], self.feature_emb_dims[ idx] = build_pretrain_embedding( self.feature_emb_dirs[idx], self.feature_alphabets[idx], self.feature_emb_dims[idx], self.norm_feature_embs[idx]) def generate_instance(self, name): self.fix_alphabet() if name == "train": 
self.train_texts, self.train_Ids = read_instance( self.train_dir, self.word_alphabet, self.char_alphabet, self.feature_alphabets, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH, self.task_config if self.disjoint else None) elif name == "dev": self.dev_texts, self.dev_Ids = read_instance( self.dev_dir, self.word_alphabet, self.char_alphabet, self.feature_alphabets, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH, self.task_config if self.disjoint else None) elif name == "test": self.test_texts, self.test_Ids = read_instance( self.test_dir, self.word_alphabet, self.char_alphabet, self.feature_alphabets, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH, self.task_config if self.disjoint else None) elif name == "raw": self.raw_texts, self.raw_Ids = read_instance( self.raw_dir, self.word_alphabet, self.char_alphabet, self.feature_alphabets, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH, self.task_config if self.disjoint else None) else: print( "Error: you can only generate train/dev/test instance! Illegal input:%s" % (name)) def write_decoded_results(self, predict_results, name, indexes=None): fout = open(self.decode_dir, 'w') content_list = [] if name == 'raw': content_list = self.raw_texts elif name == 'test': content_list = self.test_texts elif name == 'dev': content_list = self.dev_texts elif name == 'train': content_list = self.train_texts else: print( "Error: illegal name during writing predict result, name should be within train/dev/test/raw !" ) for task_predict_results in predict_results: sent_num = len(task_predict_results) assert (sent_num == len(content_list)) for idx in range(sent_num): if indexes is not None and idx not in indexes: continue sent_length = len( predict_results[0] [idx]) #Index 0 to know the length of the input sentence for idy in range(sent_length): ## content_list[idx] is a list with [word, char, label] inputs = [] for id_input in range(len(content_list[idx]) - 2): if content_list[idx][id_input][0] != []: if type(content_list[idx][id_input][idy]) == type([]): for feature in content_list[idx][id_input][idy]: inputs.append(feature.encode('utf-8')) else: inputs.append(content_list[idx][id_input] [idy].encode('utf-8')) outputs = [] for task in predict_results: outputs.append(task[idx][idy]) fout.write("\t".join(inputs) + "\t" + "{}".join(outputs) + '\n') fout.write('\n') fout.close() print("Predict %s result has been written into file. %s" % (name, self.decode_dir)) def load(self, data_file): f = open(data_file, 'rb') tmp_dict = pickle.load(f) f.close() self.__dict__.update(tmp_dict) def save(self, save_file): f = open(save_file, 'wb') pickle.dump(self.__dict__, f, 2) f.close() def write_nbest_decoded_results(self, predict_results, pred_scores, name): fout = open(self.decode_dir, 'w') sent_num = len(predict_results) content_list = [] if name == 'raw': content_list = self.raw_texts elif name == 'test': content_list = self.test_texts elif name == 'dev': content_list = self.dev_texts elif name == 'train': content_list = self.train_texts else: print( "Error: illegal name during writing predict result, name should be within train/dev/test/raw !" 
) for idtask_predict_results, task_predict_results in enumerate( predict_results): sent_num = len(task_predict_results) assert (sent_num == len(content_list)) for idx in range(sent_num): score_string = "# " for idtask_predict_results, task_predict_results in enumerate( predict_results): sent_length = len(task_predict_results[idx][0]) nbest = len(task_predict_results[0]) #Printing the probabilities for idz in range(nbest): score_string += format( pred_scores[idtask_predict_results][idx][idz], '.4f') + " " fout.write(score_string.strip() + "\t") fout.write("\n") for idy in range(sent_length): label_string = content_list[idx][0][idy].encode('utf-8') + "\t" for ifeat in range(len(content_list[idx][1][idy])): label_string += content_list[idx][1][idy][ifeat].encode( 'utf-8') + "\t" for idtask_predict_results, task_predict_results in enumerate( predict_results): for idz in range(nbest): label_string += task_predict_results[idx][idz][ idy] + "," label_string = label_string.strip().strip(",") + "{}" fout.write(label_string) fout.write('\n') fout.write('\n') fout.close() print("Predict %s %s-best result has been written into file. %s" % (name, nbest, self.decode_dir)) def read_config(self, config_file): config = config_file_to_dict(config_file) ## read data: the_item = 'train_dir' if the_item in config: self.train_dir = config[the_item] the_item = 'dev_dir' if the_item in config: self.dev_dir = config[the_item] the_item = 'test_dir' if the_item in config: self.test_dir = config[the_item] the_item = 'raw_dir' if the_item in config: self.raw_dir = config[the_item] the_item = 'decode_dir' if the_item in config: self.decode_dir = config[the_item] the_item = 'dset_dir' if the_item in config: self.dset_dir = config[the_item] the_item = 'model_dir' if the_item in config: self.model_dir = config[the_item] the_item = 'load_model_dir' if the_item in config: self.load_model_dir = config[the_item] the_item = 'word_emb_dir' if the_item in config: self.word_emb_dir = config[the_item] the_item = 'char_emb_dir' if the_item in config: self.char_emb_dir = config[the_item] the_item = 'MAX_SENTENCE_LENGTH' if the_item in config: self.MAX_SENTENCE_LENGTH = int(config[the_item]) the_item = 'MAX_WORD_LENGTH' if the_item in config: self.MAX_WORD_LENGTH = int(config[the_item]) the_item = 'norm_word_emb' if the_item in config: self.norm_word_emb = str2bool(config[the_item]) the_item = 'norm_char_emb' if the_item in config: self.norm_char_emb = str2bool(config[the_item]) the_item = 'number_normalized' if the_item in config: self.number_normalized = str2bool(config[the_item]) the_item = 'seg' if the_item in config: self.seg = str2bool(config[the_item]) the_item = 'word_emb_dim' if the_item in config: self.word_emb_dim = int(config[the_item]) the_item = 'char_emb_dim' if the_item in config: self.char_emb_dim = int(config[the_item]) ## read network: the_item = 'use_crf' if the_item in config: self.use_crf = str2bool(config[the_item]) the_item = 'use_char' if the_item in config: self.use_char = str2bool(config[the_item]) the_item = 'word_seq_feature' if the_item in config: self.word_feature_extractor = config[the_item] the_item = 'char_seq_feature' if the_item in config: self.char_feature_extractor = config[the_item] the_item = 'nbest' if the_item in config: self.nbest = int(config[the_item]) the_item = 'feature' if the_item in config: self.feat_config = config[the_item] ## feat_config is a dict the_item = 'feature_default_size' if the_item in config: self.HP_feature_default_size = int(config[the_item]) ## read training setting: 
the_item = 'optimizer' if the_item in config: self.optimizer = config[the_item] the_item = 'ave_batch_loss' if the_item in config: self.average_batch_loss = str2bool(config[the_item]) the_item = 'status' if the_item in config: self.status = config[the_item] ## read Hyperparameters: the_item = 'cnn_layer' if the_item in config: self.HP_cnn_layer = int(config[the_item]) the_item = 'iteration' if the_item in config: self.HP_iteration = int(config[the_item]) the_item = 'batch_size' if the_item in config: self.HP_batch_size = int(config[the_item]) the_item = 'char_hidden_dim' if the_item in config: self.HP_char_hidden_dim = int(config[the_item]) the_item = 'hidden_dim' if the_item in config: self.HP_hidden_dim = int(config[the_item]) the_item = 'dropout' if the_item in config: self.HP_dropout = float(config[the_item]) the_item = 'lstm_layer' if the_item in config: self.HP_lstm_layer = int(config[the_item]) the_item = 'bilstm' if the_item in config: self.HP_bilstm = str2bool(config[the_item]) the_item = 'gpu' if the_item in config: self.HP_gpu = str2bool(config[the_item]) the_item = 'learning_rate' if the_item in config: self.HP_lr = float(config[the_item]) the_item = 'lr_decay' if the_item in config: self.HP_lr_decay = float(config[the_item]) the_item = 'clip' if the_item in config: self.HP_clip = float(config[the_item]) the_item = 'momentum' if the_item in config: self.HP_momentum = float(config[the_item]) the_item = 'l2' if the_item in config: self.HP_l2 = float(config[the_item]) #Hyperparameters for auxiliary tasks over the same treebank the_item = 'disjoint' if the_item in config: self.disjoint = str2bool(config[the_item]) if not self.disjoint: the_item = 'tasks' if the_item in config: self.HP_tasks = int(config[the_item]) if self.HP_tasks > 1: self.label_alphabet = { idtask: Alphabet('label', True) for idtask in range(self.HP_tasks) } self.label_alphabet_sizes = { idtask: self.label_alphabet[idtask].size() for idtask in range(self.HP_tasks) } the_item = "main_tasks" if the_item in config: self.HP_main_tasks = int(config[the_item]) print self.HP_main_tasks, self.HP_tasks if self.HP_main_tasks > self.HP_tasks: raise ValueError( "HP_main_tasks cannot be greater than HP_tasks") the_item = 'tasks_weights' if the_item in config: self.HP_tasks_weights = map(float, config[the_item].split("|")) else: #Hyperparameters for auxiliary tasks over a different treebank the_item = 'dataset' if the_item in config: self.task_config = config[the_item] ## feat_config is a dict self.HP_tasks = sum([ self.task_config[idtask]["nb_tasks"] for idtask in self.task_config ]) self.HP_main_tasks = sum([ self.task_config[idtask]["nb_tasks"] for idtask in self.task_config if self.task_config[idtask]["main"] ]) self.label_alphabet = { idtask: Alphabet('label', True) for idtask in range(self.HP_tasks) } self.label_alphabet_sizes = { idtask: self.label_alphabet[idtask].size() for idtask in range(self.HP_tasks) } self.HP_tasks_weights = [] self.HP_tasks_weight_decays = [] for idtask in self.task_config: for weight in self.task_config[idtask]["weight"]: self.HP_tasks_weights.append(weight) if "weight_decay" in self.task_config[idtask]: for weight_decay in self.task_config[idtask][ "weight_decay"]: self.HP_tasks_weight_decays.append(weight_decay) else: for j in range(self.task_config[idtask]["nb_tasks"]): self.HP_tasks_weight_decays.append(0) self.dataset_ids = { treebank: range( self.task_config[treebank]["idstask"], self.task_config[treebank]["idstask"] + self.task_config[treebank]["nb_tasks"]) for id, treebank in 
enumerate(self.task_config)}
            self.ignore_after_epoch = {
                treebank: self.task_config[treebank]["ignore_after_epoch"]
                if "ignore_after_epoch" in self.task_config[treebank]
                else self.HP_iteration + 1
                for treebank in self.task_config}
            self.inv_dataset_ids = {}
            for tb in self.dataset_ids:
                for subtask in self.dataset_ids[tb]:
                    self.inv_dataset_ids[subtask] = tb
            self.task_metric = {}
            for dataset in self.task_config:
                for i in range(self.task_config[dataset]["idstask"],
                               self.task_config[dataset]["idstask"] + self.task_config[dataset]["nb_tasks"]):
                    if "metric" in self.task_config[dataset]:
                        self.task_metric[i] = self.task_config[dataset]["metric"]
        the_item = 'evaluate'
        if the_item in config: self.evaluate = config[the_item]
        the_item = "gold_dev_trees"
        if the_item in config: self.gold_dev_trees = config[the_item]
        the_item = "gold_dev_dep"
        if the_item in config: self.gold_dev_dep = config[the_item]
        the_item = "combine_dependency_offset"
        if the_item in config: self.offset = str2bool(config[the_item])
        the_item = "pretrained_model"
        if the_item in config: self.pretrained_model = config[the_item]
        the_item = "pretrained_part"
        if the_item in config:
            if config[the_item].lower() not in [self.PRETRAINED_ALL, self.PRETRAINED_LSTMS]:
                raise ValueError("Invalid value for pretrained_part (must be 'all' or 'lstms')")
            self.pretrained_part = config[the_item]
        the_item = "optimize_with_las"
        if the_item in config: self.optimize_with_las = str2bool(config[the_item])
        the_item = "gold_train_trees"
        if the_item in config: self.gold_train_trees = config[the_item]
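# read_config above probes the dict once per key with `the_item in config`.
# A table-driven equivalent of the same pattern, assuming config is the flat
# dict that config_file_to_dict returns (apply_config, demo_schema and Opts
# are illustrative names, not from the original sources):
def apply_config(obj, config, schema):
    # schema maps config key -> (attribute name, conversion function)
    for key, (attr, conv) in schema.items():
        if key in config:
            setattr(obj, attr, conv(config[key]))

demo_schema = {
    'iteration':     ('HP_iteration', int),
    'learning_rate': ('HP_lr', float),
}

class Opts(object):
    pass

demo_opts = Opts()
apply_config(demo_opts, {'iteration': '50', 'learning_rate': '0.015'}, demo_schema)
# demo_opts.HP_iteration == 50, demo_opts.HP_lr == 0.015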
class Data: def __init__(self): self.MAX_SENTENCE_LENGTH = 230 self.MAX_WORD_LENGTH = -1 self.number_normalized = False self.norm_word_emb = True self.norm_biword_emb = True self.norm_gaz_emb = False self.word_alphabet = Alphabet('word') self.biword_alphabet = Alphabet('biword') self.char_alphabet = Alphabet('character') # self.word_alphabet.add(START) # self.word_alphabet.add(UNKNOWN) # self.char_alphabet.add(START) # self.char_alphabet.add(UNKNOWN) # self.char_alphabet.add(PADDING) self.label_alphabet = Alphabet('label', True) self.gaz_lower = False self.gaz = Gazetteer(self.gaz_lower) self.gaz_alphabet = Alphabet('gaz') self.HP_fix_gaz_emb = False self.HP_use_gaz = True self.tagScheme = "BMES" self.char_features = "LSTM" self.train_texts = [] self.dev_texts = [] self.test_texts = [] self.raw_texts = [] self.train_Ids = [] self.dev_Ids = [] self.test_Ids = [] self.raw_Ids = [] self.use_bigram = False self.word_emb_dim = 50 self.biword_emb_dim = 50 self.char_emb_dim = 50 self.gaz_emb_dim = 50 self.gaz_dropout = 0.5 self.pretrain_word_embedding = None self.pretrain_biword_embedding = None self.pretrain_gaz_embedding = None self.label_size = 0 self.word_alphabet_size = 0 self.biword_alphabet_size = 0 self.char_alphabet_size = 0 self.label_alphabet_size = 0 # hyperparameters self.HP_iteration = 100 self.HP_batch_size = 1 self.HP_char_hidden_dim = 50 self.HP_hidden_dim = 200 self.HP_dropout = 0.5 self.HP_lstm_layer = 1 self.HP_bilstm = True self.HP_use_char = True self.HP_gpu = False self.HP_lr = 0.015 self.HP_lr_decay = 0.05 self.HP_clip = 5.0 self.HP_momentum = 0 def show_data_summary(self): print("DATA SUMMARY START:") print(" Tag scheme: %s" % (self.tagScheme)) print(" MAX SENTENCE LENGTH: %s" % (self.MAX_SENTENCE_LENGTH)) print(" MAX WORD LENGTH: %s" % (self.MAX_WORD_LENGTH)) print(" Number normalized: %s" % (self.number_normalized)) print(" Use bigram: %s" % (self.use_bigram)) print(" Word alphabet size: %s" % (self.word_alphabet_size)) print(" Biword alphabet size: %s" % (self.biword_alphabet_size)) print(" Char alphabet size: %s" % (self.char_alphabet_size)) print(" Gaz alphabet size: %s" % (self.gaz_alphabet.size())) print(" Label alphabet size: %s" % (self.label_alphabet_size)) print(" Word embedding size: %s" % (self.word_emb_dim)) print(" Biword embedding size: %s" % (self.biword_emb_dim)) print(" Char embedding size: %s" % (self.char_emb_dim)) print(" Gaz embedding size: %s" % (self.gaz_emb_dim)) print(" Norm word emb: %s" % (self.norm_word_emb)) print(" Norm biword emb: %s" % (self.norm_biword_emb)) print(" Norm gaz emb: %s" % (self.norm_gaz_emb)) print(" Norm gaz dropout: %s" % (self.gaz_dropout)) print(" Train instance number: %s" % (len(self.train_texts))) print(" Dev instance number: %s" % (len(self.dev_texts))) print(" Test instance number: %s" % (len(self.test_texts))) print(" Raw instance number: %s" % (len(self.raw_texts))) print(" Hyperpara iteration: %s" % (self.HP_iteration)) print(" Hyperpara batch size: %s" % (self.HP_batch_size)) print(" Hyperpara lr: %s" % (self.HP_lr)) print(" Hyperpara lr_decay: %s" % (self.HP_lr_decay)) print(" Hyperpara HP_clip: %s" % (self.HP_clip)) print(" Hyperpara momentum: %s" % (self.HP_momentum)) print(" Hyperpara hidden_dim: %s" % (self.HP_hidden_dim)) print(" Hyperpara dropout: %s" % (self.HP_dropout)) print(" Hyperpara lstm_layer: %s" % (self.HP_lstm_layer)) print(" Hyperpara bilstm: %s" % (self.HP_bilstm)) print(" Hyperpara GPU: %s" % (self.HP_gpu)) print(" Hyperpara use_gaz: %s" % (self.HP_use_gaz)) print(" Hyperpara fix gaz emb: 
%s" % (self.HP_fix_gaz_emb)) print(" Hyperpara use_char: %s" % (self.HP_use_char)) if self.HP_use_char: print(" Char_features: %s" % (self.char_features)) print("DATA SUMMARY END.") sys.stdout.flush() def refresh_label_alphabet(self, input_file): old_size = self.label_alphabet_size self.label_alphabet.clear(True) in_lines = open(input_file, 'r').readlines() for line in in_lines: if len(line) > 2: pairs = line.strip().split() label = pairs[-1] self.label_alphabet.add(label) self.label_alphabet_size = self.label_alphabet.size() startS = False startB = False for label, _ in self.label_alphabet.iteritems(): if "S-" in label.upper(): startS = True elif "B-" in label.upper(): startB = True if startB: if startS: self.tagScheme = "BMES" else: self.tagScheme = "BIO" self.fix_alphabet() print("Refresh label alphabet finished: old:%s -> new:%s" % (old_size, self.label_alphabet_size)) def build_alphabet(self, input_file): in_lines = open(input_file, 'r').readlines() for idx in xrange(len(in_lines)): line = in_lines[idx] if len(line) > 2: pairs = line.strip().split() word = pairs[0].decode('utf-8') if self.number_normalized: word = normalize_word(word) # 获取label label = pairs[-1] # 安装出现顺序添加 self.label_alphabet.add(label) self.word_alphabet.add(word) if idx < len(in_lines) - 1 and len(in_lines[idx + 1]) > 2: biword = word + in_lines[ idx + 1].strip().split()[0].decode('utf-8') else: biword = word + NULLKEY self.biword_alphabet.add(biword) for char in word: self.char_alphabet.add(char) self.word_alphabet_size = self.word_alphabet.size() self.biword_alphabet_size = self.biword_alphabet.size() self.char_alphabet_size = self.char_alphabet.size() self.label_alphabet_size = self.label_alphabet.size() startS = False startB = False # 判断是否属于BIO,BMES,BIOES其中一�? for label, _ in self.label_alphabet.iteritems(): if "S-" in label.upper(): startS = True elif "B-" in label.upper(): startB = True if startB: if startS: # 如果有S则为BMES或BIOES self.tagScheme = "BMES" else: # 没有则为BIO self.tagScheme = "BIO" def build_gaz_file(self, gaz_file): # build gaz file,initial read gaz embedding file if gaz_file: fins = open(gaz_file, 'r').readlines() for fin in fins: fin = fin.strip().split()[0].decode('utf-8') if fin: self.gaz.insert(fin, "one_source") print "Load gaz file: ", gaz_file, " total size:", self.gaz.size() else: print "Gaz file is None, load nothing" def build_gaz_alphabet(self, input_file): in_lines = open(input_file, 'r').readlines() word_list = [] for line in in_lines: if len(line) > 3: word = line.split()[0].decode('utf-8') if self.number_normalized: word = normalize_word(word) word_list.append(word) else: w_length = len(word_list) for idx in range(w_length): matched_entity = self.gaz.enumerateMatchList( word_list[idx:]) for entity in matched_entity: # print entity, self.gaz.searchId(entity),self.gaz.searchType(entity) self.gaz_alphabet.add(entity) word_list = [] print "gaz alphabet size:", self.gaz_alphabet.size() def fix_alphabet(self): self.word_alphabet.close() self.biword_alphabet.close() self.char_alphabet.close() self.label_alphabet.close() self.gaz_alphabet.close() def build_word_pretrain_emb(self, emb_path): print "build word pretrain emb..." self.pretrain_word_embedding, self.word_emb_dim = build_pretrain_embedding( emb_path, self.word_alphabet, self.word_emb_dim, self.norm_word_emb) def build_radical_pretrain_emb(self, emb_path): print "build radical pretrain emb..." 
self.pretrain_word_embedding, self.word_emb_dim = build_radical_pretrain_embedding( emb_path, self.word_alphabet, self.word_emb_dim, self.norm_word_emb) def build_biword_pretrain_emb(self, emb_path): print "build biword pretrain emb..." self.pretrain_biword_embedding, self.biword_emb_dim = build_pretrain_embedding( emb_path, self.biword_alphabet, self.biword_emb_dim, self.norm_biword_emb) def build_gaz_pretrain_emb(self, emb_path): print "build gaz pretrain emb..." self.pretrain_gaz_embedding, self.gaz_emb_dim = build_pretrain_embedding( emb_path, self.gaz_alphabet, self.gaz_emb_dim, self.norm_gaz_emb) def generate_instance(self, input_file, name): self.fix_alphabet() if name == "train": self.train_texts, self.train_Ids = read_seg_instance( input_file, self.word_alphabet, self.biword_alphabet, self.char_alphabet, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH) elif name == "dev": self.dev_texts, self.dev_Ids = read_seg_instance( input_file, self.word_alphabet, self.biword_alphabet, self.char_alphabet, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH) elif name == "test": self.test_texts, self.test_Ids = read_seg_instance( input_file, self.word_alphabet, self.biword_alphabet, self.char_alphabet, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH) elif name == "raw": self.raw_texts, self.raw_Ids = read_seg_instance( input_file, self.word_alphabet, self.biword_alphabet, self.char_alphabet, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH) else: print( "Error: you can only generate train/dev/test instance! Illegal input:%s" % (name)) def generate_instance_with_gaz(self, input_file, name): self.fix_alphabet() if name == "train": self.train_texts, self.train_Ids = read_instance_with_gaz( input_file, self.gaz, self.word_alphabet, self.biword_alphabet, self.char_alphabet, self.gaz_alphabet, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH) elif name == "dev": self.dev_texts, self.dev_Ids = read_instance_with_gaz( input_file, self.gaz, self.word_alphabet, self.biword_alphabet, self.char_alphabet, self.gaz_alphabet, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH) elif name == "test": self.test_texts, self.test_Ids = read_instance_with_gaz( input_file, self.gaz, self.word_alphabet, self.biword_alphabet, self.char_alphabet, self.gaz_alphabet, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH) elif name == "raw": self.raw_texts, self.raw_Ids = read_instance_with_gaz( input_file, self.gaz, self.word_alphabet, self.biword_alphabet, self.char_alphabet, self.gaz_alphabet, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH) else: print( "Error: you can only generate train/dev/test instance! Illegal input:%s" % (name)) def write_decoded_results(self, output_file, predict_results, name): fout = open(output_file, 'w') sent_num = len(predict_results) content_list = [] if name == 'raw': content_list = self.raw_texts elif name == 'test': content_list = self.test_texts elif name == 'dev': content_list = self.dev_texts elif name == 'train': content_list = self.train_texts else: print( "Error: illegal name during writing predict result, name should be within train/dev/test/raw !" 
) assert (sent_num == len(content_list)) for idx in range(sent_num): sent_length = len(predict_results[idx]) for idy in range(sent_length): # content_list[idx] is a list with [word, char, label] fout.write(content_list[idx][0][idy].encode('utf-8') + " " + predict_results[idx][idy] + '\n') fout.write('\n') fout.close() print("Predict %s result has been written into file. %s" % (name, output_file))
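# build_gaz_alphabet above asks the gazetteer for every lexicon entry that
# matches a prefix of the remaining sentence (enumerateMatchList). A real
# Gazetteer uses a trie; a plain set scan is enough to sketch the behavior
# (enumerate_matches and the sample lexicon are illustrative):
def enumerate_matches(chars, lexicon, max_len=10):
    matches = []
    for length in range(1, min(max_len, len(chars)) + 1):
        cand = ''.join(chars[:length])
        if cand in lexicon:
            matches.append(cand)
    return matches

demo_lexicon = {u'北京', u'北京大学', u'大学'}
demo_sentence = list(u'北京大学在海淀')
for i in range(len(demo_sentence)):
    for entity in enumerate_matches(demo_sentence[i:], demo_lexicon):
        pass  # each matched entity would be added to gaz_alphabet
# position 0 yields u'北京' and u'北京大学'; position 2 yields u'大学'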
class Data: def __init__(self): self.MAX_SENTENCE_LENGTH = 250 self.MAX_WORD_LENGTH = -1 self.number_normalized = True self.norm_word_emb = False self.norm_char_emb = False self.word_alphabet = Alphabet('word') self.char_alphabet = Alphabet('character') self.feature_name = [] self.feature_alphabets = [] self.feature_num = len(self.feature_alphabets) self.feat_config = None self.label_alphabet = Alphabet('label',True) self.tagScheme = "NoSeg" ## BMES/BIO self.seg = True ### I/O self.train_dir = None self.dev_dir = None self.test_dir = None self.raw_dir = None self.decode_dir = None self.dset_dir = None ## data vocabulary related file self.model_dir = None ## model save file self.load_model_dir = None ## model load file self.word_emb_dir = None self.char_emb_dir = None self.feature_emb_dirs = [] self.train_texts = [] self.dev_texts = [] self.test_texts = [] self.raw_texts = [] self.train_Ids = [] self.dev_Ids = [] self.test_Ids = [] self.raw_Ids = [] self.pretrain_word_embedding = None self.pretrain_char_embedding = None self.pretrain_feature_embeddings = [] self.label_size = 0 self.word_alphabet_size = 0 self.char_alphabet_size = 0 self.label_alphabet_size = 0 self.feature_alphabet_sizes = [] self.feature_emb_dims = [] self.norm_feature_embs = [] self.word_emb_dim = 50 self.char_emb_dim = 30 ###Networks self.word_feature_extractor = "LSTM" ## "LSTM"/"CNN"/"GRU"/ self.use_char = True self.char_feature_extractor = "CNN" ## "LSTM"/"CNN"/"GRU"/None self.use_crf = True self.nbest = None ## Training self.average_batch_loss = False self.optimizer = "SGD" ## "SGD"/"AdaGrad"/"AdaDelta"/"RMSProp"/"Adam" self.status = "train" ### Hyperparameters self.HP_cnn_layer = 4 self.HP_iteration = 100 self.HP_batch_size = 10 self.HP_char_hidden_dim = 50 self.HP_hidden_dim = 200 self.HP_dropout = 0.5 self.HP_lstm_layer = 1 self.HP_bilstm = True self.HP_gpu = False self.HP_lr = 0.015 self.HP_lr_decay = 0.05 self.HP_clip = None self.HP_momentum = 0 self.HP_l2 = 1e-8 def show_data_summary(self): print("++"*50) print("DATA SUMMARY START:") print(" I/O:") print(" Tag scheme: %s"%(self.tagScheme)) print(" MAX SENTENCE LENGTH: %s"%(self.MAX_SENTENCE_LENGTH)) print(" MAX WORD LENGTH: %s"%(self.MAX_WORD_LENGTH)) print(" Number normalized: %s"%(self.number_normalized)) print(" Word alphabet size: %s"%(self.word_alphabet_size)) print(" Char alphabet size: %s"%(self.char_alphabet_size)) print(" Label alphabet size: %s"%(self.label_alphabet_size)) print(" Word embedding dir: %s"%(self.word_emb_dir)) print(" Char embedding dir: %s"%(self.char_emb_dir)) print(" Word embedding size: %s"%(self.word_emb_dim)) print(" Char embedding size: %s"%(self.char_emb_dim)) print(" Norm word emb: %s"%(self.norm_word_emb)) print(" Norm char emb: %s"%(self.norm_char_emb)) print(" Train file directory: %s"%(self.train_dir)) print(" Dev file directory: %s"%(self.dev_dir)) print(" Test file directory: %s"%(self.test_dir)) print(" Raw file directory: %s"%(self.raw_dir)) print(" Dset file directory: %s"%(self.dset_dir)) print(" Model file directory: %s"%(self.model_dir)) print(" Loadmodel directory: %s"%(self.load_model_dir)) print(" Decode file directory: %s"%(self.decode_dir)) print(" Train instance number: %s"%(len(self.train_texts))) print(" Dev instance number: %s"%(len(self.dev_texts))) print(" Test instance number: %s"%(len(self.test_texts))) print(" Raw instance number: %s"%(len(self.raw_texts))) print(" FEATURE num: %s"%(self.feature_num)) for idx in range(self.feature_num): print(" Fe: %s alphabet size: 
%s"%(self.feature_alphabets[idx].name, self.feature_alphabet_sizes[idx])) print(" Fe: %s embedding dir: %s"%(self.feature_alphabets[idx].name, self.feature_emb_dirs[idx])) print(" Fe: %s embedding size: %s"%(self.feature_alphabets[idx].name, self.feature_emb_dims[idx])) print(" Fe: %s norm emb: %s"%(self.feature_alphabets[idx].name, self.norm_feature_embs[idx])) print(" "+"++"*20) print(" Model Network:") print(" Model use_crf: %s"%(self.use_crf)) print(" Model word extractor: %s"%(self.word_feature_extractor)) print(" Model use_char: %s"%(self.use_char)) if self.use_char: print(" Model char extractor: %s"%(self.char_feature_extractor)) print(" Model char_hidden_dim: %s"%(self.HP_char_hidden_dim)) print(" "+"++"*20) print(" Training:") print(" Optimizer: %s"%(self.optimizer)) print(" Iteration: %s"%(self.HP_iteration)) print(" BatchSize: %s"%(self.HP_batch_size)) print(" Average batch loss: %s"%(self.average_batch_loss)) print(" "+"++"*20) print(" Hyperparameters:") print(" Hyper lr: %s"%(self.HP_lr)) print(" Hyper lr_decay: %s"%(self.HP_lr_decay)) print(" Hyper HP_clip: %s"%(self.HP_clip)) print(" Hyper momentum: %s"%(self.HP_momentum)) print(" Hyper l2: %s"%(self.HP_l2)) print(" Hyper hidden_dim: %s"%(self.HP_hidden_dim)) print(" Hyper dropout: %s"%(self.HP_dropout)) print(" Hyper lstm_layer: %s"%(self.HP_lstm_layer)) print(" Hyper bilstm: %s"%(self.HP_bilstm)) print(" Hyper GPU: %s"%(self.HP_gpu)) print("DATA SUMMARY END.") print("++"*50) sys.stdout.flush() def initial_feature_alphabets(self): items = open(self.train_dir,'r').readline().strip('\n').split() total_column = len(items) if total_column > 2: for idx in range(1, total_column-1): feature_prefix = items[idx].split(']',1)[0]+"]" self.feature_alphabets.append(Alphabet(feature_prefix)) self.feature_name.append(feature_prefix) print "Find feature: ", feature_prefix self.feature_num = len(self.feature_alphabets) self.pretrain_feature_embeddings = [None]*self.feature_num self.feature_emb_dims = [20]*self.feature_num self.feature_emb_dirs = [None]*self.feature_num self.norm_feature_embs = [False]*self.feature_num self.feature_alphabet_sizes = [0]*self.feature_num if self.feat_config: for idx in range(self.feature_num): if self.feature_name[idx] in self.feat_config: self.feature_emb_dims[idx] = self.feat_config[self.feature_name[idx]]['emb_size'] self.feature_emb_dirs[idx] = self.feat_config[self.feature_name[idx]]['emb_dir'] self.norm_feature_embs[idx] = self.feat_config[self.feature_name[idx]]['emb_norm'] # exit(0) def build_alphabet(self, input_file): in_lines = open(input_file,'r').readlines() for line in in_lines: if len(line) > 2: pairs = line.strip().split() word = pairs[0].decode('utf-8') if self.number_normalized: word = normalize_word(word) label = pairs[-1] self.label_alphabet.add(label) self.word_alphabet.add(word) ## build feature alphabet for idx in range(self.feature_num): feat_idx = pairs[idx+1].split(']',1)[-1] self.feature_alphabets[idx].add(feat_idx) for char in word: self.char_alphabet.add(char) self.word_alphabet_size = self.word_alphabet.size() self.char_alphabet_size = self.char_alphabet.size() self.label_alphabet_size = self.label_alphabet.size() for idx in range(self.feature_num): self.feature_alphabet_sizes[idx] = self.feature_alphabets[idx].size() startS = False startB = False for label,_ in self.label_alphabet.iteritems(): if "S-" in label.upper(): startS = True elif "B-" in label.upper(): startB = True if startB: if startS: self.tagScheme = "BMES" else: self.tagScheme = "BIO" def fix_alphabet(self): 
    def fix_alphabet(self):
        self.word_alphabet.close()
        self.char_alphabet.close()
        self.label_alphabet.close()
        for idx in range(self.feature_num):
            self.feature_alphabets[idx].close()

    def build_pretrain_emb(self):
        if self.word_emb_dir:
            print("Load pretrained word embedding, norm: %s, dir: %s" % (self.norm_word_emb, self.word_emb_dir))
            self.pretrain_word_embedding, self.word_emb_dim = build_pretrain_embedding(self.word_emb_dir, self.word_alphabet, self.word_emb_dim, self.norm_word_emb)
        if self.char_emb_dir:
            print("Load pretrained char embedding, norm: %s, dir: %s" % (self.norm_char_emb, self.char_emb_dir))
            self.pretrain_char_embedding, self.char_emb_dim = build_pretrain_embedding(self.char_emb_dir, self.char_alphabet, self.char_emb_dim, self.norm_char_emb)
        for idx in range(self.feature_num):
            if self.feature_emb_dirs[idx]:
                print("Load pretrained feature %s embedding, norm: %s, dir: %s" % (self.feature_name[idx], self.norm_feature_embs[idx], self.feature_emb_dirs[idx]))
                self.pretrain_feature_embeddings[idx], self.feature_emb_dims[idx] = build_pretrain_embedding(self.feature_emb_dirs[idx], self.feature_alphabets[idx], self.feature_emb_dims[idx], self.norm_feature_embs[idx])
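    # build_pretrain_embedding (from the utility module) is assumed to read
    # a GloVe/word2vec-style text file, one token per line followed by its
    # vector components, e.g. (hypothetical line):
    #
    #   the 0.418 0.24968 -0.41242 ...
    #
    # and to fall back to random vectors for tokens missing from the file.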
%s"%(name, self.decode_dir)) def load(self,data_file): f = open(data_file, 'rb') tmp_dict = pickle.load(f) f.close() self.__dict__.update(tmp_dict) def save(self,save_file): f = open(save_file, 'wb') pickle.dump(self.__dict__, f, 2) f.close() def write_nbest_decoded_results(self, predict_results, pred_scores, name): ## predict_results : [whole_sent_num, nbest, each_sent_length] ## pred_scores: [whole_sent_num, nbest] fout = open(self.decode_dir,'w') sent_num = len(predict_results) content_list = [] if name == 'raw': content_list = self.raw_texts elif name == 'test': content_list = self.test_texts elif name == 'dev': content_list = self.dev_texts elif name == 'train': content_list = self.train_texts else: print("Error: illegal name during writing predict result, name should be within train/dev/test/raw !") assert(sent_num == len(content_list)) assert(sent_num == len(pred_scores)) for idx in range(sent_num): sent_length = len(predict_results[idx][0]) nbest = len(predict_results[idx]) score_string = "# " for idz in range(nbest): score_string += format(pred_scores[idx][idz], '.4f')+" " fout.write(score_string.strip() + "\n") for idy in range(sent_length): label_string = content_list[idx][0][idy].encode('utf-8') + " " for idz in range(nbest): label_string += predict_results[idx][idz][idy]+" " label_string = label_string.strip() + "\n" fout.write(label_string) fout.write('\n') fout.close() print("Predict %s %s-best result has been written into file. %s"%(name,nbest, self.decode_dir)) def read_config(self,config_file): config = config_file_to_dict(config_file) ## read data: the_item = 'train_dir' if the_item in config: self.train_dir = config[the_item] the_item = 'dev_dir' if the_item in config: self.dev_dir = config[the_item] the_item = 'test_dir' if the_item in config: self.test_dir = config[the_item] the_item = 'raw_dir' if the_item in config: self.raw_dir = config[the_item] the_item = 'decode_dir' if the_item in config: self.decode_dir = config[the_item] the_item = 'dset_dir' if the_item in config: self.dset_dir = config[the_item] the_item = 'model_dir' if the_item in config: self.model_dir = config[the_item] the_item = 'load_model_dir' if the_item in config: self.load_model_dir = config[the_item] the_item = 'word_emb_dir' if the_item in config: self.word_emb_dir = config[the_item] the_item = 'char_emb_dir' if the_item in config: self.char_emb_dir = config[the_item] the_item = 'MAX_SENTENCE_LENGTH' if the_item in config: self.MAX_SENTENCE_LENGTH = int(config[the_item]) the_item = 'MAX_WORD_LENGTH' if the_item in config: self.MAX_WORD_LENGTH = int(config[the_item]) the_item = 'norm_word_emb' if the_item in config: self.norm_word_emb = str2bool(config[the_item]) the_item = 'norm_char_emb' if the_item in config: self.norm_char_emb = str2bool(config[the_item]) the_item = 'number_normalized' if the_item in config: self.number_normalized = str2bool(config[the_item]) the_item = 'seg' if the_item in config: self.seg = str2bool(config[the_item]) the_item = 'word_emb_dim' if the_item in config: self.word_emb_dim = int(config[the_item]) the_item = 'char_emb_dim' if the_item in config: self.char_emb_dim = int(config[the_item]) ## read network: the_item = 'use_crf' if the_item in config: self.use_crf = str2bool(config[the_item]) the_item = 'use_char' if the_item in config: self.use_char = str2bool(config[the_item]) the_item = 'word_seq_feature' if the_item in config: self.word_feature_extractor = config[the_item] the_item = 'char_seq_feature' if the_item in config: self.char_feature_extractor = 
    def read_config(self, config_file):
        config = config_file_to_dict(config_file)
        ## read data:
        the_item = 'train_dir'
        if the_item in config: self.train_dir = config[the_item]
        the_item = 'dev_dir'
        if the_item in config: self.dev_dir = config[the_item]
        the_item = 'test_dir'
        if the_item in config: self.test_dir = config[the_item]
        the_item = 'raw_dir'
        if the_item in config: self.raw_dir = config[the_item]
        the_item = 'decode_dir'
        if the_item in config: self.decode_dir = config[the_item]
        the_item = 'dset_dir'
        if the_item in config: self.dset_dir = config[the_item]
        the_item = 'model_dir'
        if the_item in config: self.model_dir = config[the_item]
        the_item = 'load_model_dir'
        if the_item in config: self.load_model_dir = config[the_item]
        the_item = 'word_emb_dir'
        if the_item in config: self.word_emb_dir = config[the_item]
        the_item = 'char_emb_dir'
        if the_item in config: self.char_emb_dir = config[the_item]
        the_item = 'MAX_SENTENCE_LENGTH'
        if the_item in config: self.MAX_SENTENCE_LENGTH = int(config[the_item])
        the_item = 'MAX_WORD_LENGTH'
        if the_item in config: self.MAX_WORD_LENGTH = int(config[the_item])
        the_item = 'norm_word_emb'
        if the_item in config: self.norm_word_emb = str2bool(config[the_item])
        the_item = 'norm_char_emb'
        if the_item in config: self.norm_char_emb = str2bool(config[the_item])
        the_item = 'number_normalized'
        if the_item in config: self.number_normalized = str2bool(config[the_item])
        the_item = 'seg'
        if the_item in config: self.seg = str2bool(config[the_item])
        the_item = 'word_emb_dim'
        if the_item in config: self.word_emb_dim = int(config[the_item])
        the_item = 'char_emb_dim'
        if the_item in config: self.char_emb_dim = int(config[the_item])
        ## read network:
        the_item = 'use_crf'
        if the_item in config: self.use_crf = str2bool(config[the_item])
        the_item = 'use_char'
        if the_item in config: self.use_char = str2bool(config[the_item])
        the_item = 'word_seq_feature'
        if the_item in config: self.word_feature_extractor = config[the_item]
        the_item = 'char_seq_feature'
        if the_item in config: self.char_feature_extractor = config[the_item]
        the_item = 'nbest'
        if the_item in config: self.nbest = int(config[the_item])
        the_item = 'feature'
        if the_item in config: self.feat_config = config[the_item]  ## feat_config is a dict
        ## read training setting:
        the_item = 'optimizer'
        if the_item in config: self.optimizer = config[the_item]
        the_item = 'ave_batch_loss'
        if the_item in config: self.average_batch_loss = str2bool(config[the_item])
        the_item = 'status'
        if the_item in config: self.status = config[the_item]
        ## read Hyperparameters:
        the_item = 'cnn_layer'
        if the_item in config: self.HP_cnn_layer = int(config[the_item])
        the_item = 'iteration'
        if the_item in config: self.HP_iteration = int(config[the_item])
        the_item = 'batch_size'
        if the_item in config: self.HP_batch_size = int(config[the_item])
        the_item = 'char_hidden_dim'
        if the_item in config: self.HP_char_hidden_dim = int(config[the_item])
        the_item = 'hidden_dim'
        if the_item in config: self.HP_hidden_dim = int(config[the_item])
        the_item = 'dropout'
        if the_item in config: self.HP_dropout = float(config[the_item])
        the_item = 'lstm_layer'
        if the_item in config: self.HP_lstm_layer = int(config[the_item])
        the_item = 'bilstm'
        if the_item in config: self.HP_bilstm = str2bool(config[the_item])
        the_item = 'gpu'
        if the_item in config: self.HP_gpu = str2bool(config[the_item])
        the_item = 'learning_rate'
        if the_item in config: self.HP_lr = float(config[the_item])
        the_item = 'lr_decay'
        if the_item in config: self.HP_lr_decay = float(config[the_item])
        the_item = 'clip'
        if the_item in config: self.HP_clip = float(config[the_item])
        the_item = 'momentum'
        if the_item in config: self.HP_momentum = float(config[the_item])
        the_item = 'l2'
        if the_item in config: self.HP_l2 = float(config[the_item])
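# A minimal, hypothetical usage sketch for the Data class above. The
# config file name and keys below are illustrative only; read_config
# accepts the keys handled above, e.g.:
#
#   train_dir=data/train.bmes
#   dev_dir=data/dev.bmes
#   test_dir=data/test.bmes
#   word_emb_dir=data/glove.6B.50d.txt
#   use_crf=True
#   learning_rate=0.015

# Bind the class defined above, since a second class with the same name
# follows later in this document.
_DataV1 = Data


def _demo_build_data():
    """Sketch only: wire up alphabets, embeddings and instances."""
    data = _DataV1()
    data.read_config('demo.train.config')  # hypothetical config file
    data.initial_feature_alphabets()
    for part in [data.train_dir, data.dev_dir, data.test_dir]:
        data.build_alphabet(part)
    data.fix_alphabet()
    data.build_pretrain_emb()
    for name in ['train', 'dev', 'test']:
        data.generate_instance(name)
    data.show_data_summary()
    data.save('data.dset')  # hypothetical dataset snapshot path
    return data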
# A second, lighter-weight variant of the Data class; START, UNKNOWN and
# PADDING are special-token constants assumed to be defined alongside
# Alphabet in the companion module.


class Data:
    def __init__(self):
        self.MAX_SENTENCE_LENGTH = 512
        self.MAX_WORD_LENGTH = -1
        self.number_normalized = False
        self.word_alphabet = Alphabet('word')
        self.char_alphabet = Alphabet('character')
        self.word_alphabet.add(START)
        self.word_alphabet.add(UNKNOWN)
        self.char_alphabet.add(START)
        self.char_alphabet.add(UNKNOWN)
        self.char_alphabet.add(PADDING)
        self.label_alphabet = Alphabet('label')
        self.tagScheme = "NoSeg"
        self.train_texts = []
        self.dev_texts = []
        self.test_texts = []
        self.raw_texts = []
        self.train_Ids = []
        self.dev_Ids = []
        self.test_Ids = []
        self.raw_Ids = []
        self.word_emb_dim = 50
        self.pretrain_word_embedding = None
        self.label_size = 0
        self.word_alphabet_size = 0
        self.char_alphabet_size = 0
        self.label_alphabet_size = 0
        ### hyperparameters
        self.HP_batch_size = 10
        self.HP_hidden_dim = 200
        self.HP_dropout = 0.5
        self.HP_lstm_layer = 1
        self.HP_bilstm = True
        self.HP_use_char = True
        self.HP_gpu = False
        self.HP_lr = 0.015
        self.HP_lr_decay = 0
        self.HP_clip = 5.0
        self.HP_momentum = 0

    def show_data_summary(self):
        print("DATA SUMMARY START:")
        print("     Tag scheme: %s" % (self.tagScheme))
        print("     MAX SENTENCE LENGTH: %s" % (self.MAX_SENTENCE_LENGTH))
        print("     MAX WORD LENGTH: %s" % (self.MAX_WORD_LENGTH))
        print("     Number normalized: %s" % (self.number_normalized))
        print("     Word alphabet size: %s" % (self.word_alphabet_size))
        print("     Char alphabet size: %s" % (self.char_alphabet_size))
        print("     Label alphabet size: %s" % (self.label_alphabet_size))
        print("     Word embedding size: %s" % (self.word_emb_dim))
        print("     Train instance number: %s" % (len(self.train_texts)))
        print("     Dev instance number: %s" % (len(self.dev_texts)))
        print("     Test instance number: %s" % (len(self.test_texts)))
        print("     Raw instance number: %s" % (len(self.raw_texts)))
        print("     Hyperpara batch size: %s" % (self.HP_batch_size))
        print("     Hyperpara lr: %s" % (self.HP_lr))
        print("     Hyperpara lr_decay: %s" % (self.HP_lr_decay))
        print("     Hyperpara HP_clip: %s" % (self.HP_clip))
        print("     Hyperpara momentum: %s" % (self.HP_momentum))
        print("     Hyperpara hidden_dim: %s" % (self.HP_hidden_dim))
        print("     Hyperpara dropout: %s" % (self.HP_dropout))
        print("     Hyperpara lstm_layer: %s" % (self.HP_lstm_layer))
        print("     Hyperpara bilstm: %s" % (self.HP_bilstm))
        print("     Hyperpara use_char: %s" % (self.HP_use_char))
        print("     Hyperpara GPU: %s" % (self.HP_gpu))
        print("DATA SUMMARY END.")
        sys.stdout.flush()

    def build_alphabet(self, input_file):
        in_lines = open(input_file, 'r').readlines()
        for line in in_lines:
            if len(line) > 2:
                pairs = line.strip().split()
                word = pairs[0]
                if self.number_normalized:
                    word = normalize_word(word)
                label = pairs[-1]
                self.label_alphabet.add(label)
                self.word_alphabet.add(word)
                for char in word:
                    self.char_alphabet.add(char)
        self.word_alphabet_size = self.word_alphabet.size()
        self.char_alphabet_size = self.char_alphabet.size()
        self.label_alphabet_size = self.label_alphabet.size()
        startS = False
        startB = False
        for label, _ in self.label_alphabet.iteritems():
            if "S-" in label.upper():
                startS = True
            elif "B-" in label.upper():
                startB = True
        if startB:
            if startS:
                self.tagScheme = "BMES"
            else:
                self.tagScheme = "BIO"

    def fix_alphabet(self):
        self.word_alphabet.close()
        self.char_alphabet.close()
        self.label_alphabet.close()

    def build_word_pretrain_emb(self, emb_path, norm=False):
        self.pretrain_word_embedding, self.word_emb_dim = build_pretrain_embedding(
            emb_path, self.word_alphabet, self.word_emb_dim, norm)
    def generate_instance(self, input_file, name):
        self.fix_alphabet()
        if name == "train":
            self.train_texts, self.train_Ids = read_instance(
                input_file, self.word_alphabet, self.char_alphabet,
                self.label_alphabet, self.number_normalized, self.MAX_WORD_LENGTH)
        elif name == "dev":
            self.dev_texts, self.dev_Ids = read_instance(
                input_file, self.word_alphabet, self.char_alphabet,
                self.label_alphabet, self.number_normalized, self.MAX_WORD_LENGTH)
        elif name == "test":
            self.test_texts, self.test_Ids = read_instance(
                input_file, self.word_alphabet, self.char_alphabet,
                self.label_alphabet, self.number_normalized, self.MAX_WORD_LENGTH)
        else:
            print("Error: you can only generate train/dev/test instance! Illegal input: %s" % (name))
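# A similar minimal sketch for this lighter Data variant; all file paths
# are illustrative only.
def _demo_build_simple_data():
    data = Data()
    for part in ['data/train.bmes', 'data/dev.bmes', 'data/test.bmes']:  # hypothetical paths
        data.build_alphabet(part)
    data.fix_alphabet()
    data.build_word_pretrain_emb('data/glove.6B.50d.txt')  # hypothetical embedding file
    data.generate_instance('data/train.bmes', 'train')
    data.generate_instance('data/dev.bmes', 'dev')
    data.generate_instance('data/test.bmes', 'test')
    data.show_data_summary()
    return data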