def main(config):
    # load vocabs
    vocab_words = load_vocab(config.words_filename)
    vocab_tags = load_vocab(config.tags_filename)
    vocab_chars = load_vocab(config.chars_filename)

    # get processing functions
    processing_word = get_processing_word(vocab_words, vocab_chars,
                                          lowercase=True, chars=config.chars)
    processing_tag = get_processing_word(vocab_tags, lowercase=False,
                                         allow_unk=False)

    # get pre trained embeddings
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

    # create dataset
    dev = CoNLLDataset(config.dev_filename, processing_word,
                       processing_tag, config.max_iter)
    test = CoNLLDataset(config.test_filename, processing_word,
                        processing_tag, config.max_iter)
    train = CoNLLDataset(config.train_filename, processing_word,
                         processing_tag, config.max_iter)

    # build model
    model = NERModel(config, embeddings, ntags=len(vocab_tags),
                     nchars=len(vocab_chars))
    model.build()

    # train, evaluate and interact
    model.train(train, dev, vocab_tags)
    model.evaluate(test, vocab_tags)
    model.interactive_shell(vocab_tags, processing_word)
def main(config):
    # load vocabs
    vocab_words = load_vocab(config.words_filename)
    vocab_mor_tags = load_vocab(config.mor_tags_filename)
    vocab_tags = load_vocab(config.tags_filename)
    vocab_chars = load_vocab(config.chars_filename)
    vocab_lex_tags = load_vocab(config.lex_tags_filename)

    # get processing functions
    processing_word = get_processing_word(vocab_words, vocab_chars,
                                          lowercase=True, chars=config.chars)
    processing_mor_tag = get_processing_word(vocab_mor_tags, lowercase=False)
    processing_tag = get_processing_word(vocab_tags, lowercase=False)
    processing_lex_tag = get_processing_word(vocab_lex_tags, lowercase=False)

    # get pre trained embeddings
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

    # create dataset
    dev = Data(config.dev_filename, processing_word, processing_mor_tag,
               processing_lex_tag, processing_tag, config.max_iter)
    test = Data(config.test_filename, processing_word, processing_mor_tag,
                processing_lex_tag, processing_tag, config.max_iter)
    train = Data(config.train_filename, processing_word, processing_mor_tag,
                 processing_lex_tag, processing_tag, config.max_iter)

    cnn_model = CnnLstmCrfModel(config, embeddings, ntags=len(vocab_tags),
                                nchars=len(vocab_chars))
    cnn_model.build()
    cnn_model.train(train, dev, vocab_tags)
    cnn_model.evaluate(test, vocab_tags)
def doeval():
    parser = argparse.ArgumentParser(description='Text CNN classifier')
    # a trained model must be specified
    parser.add_argument('--model', type=str, default="model/textcnn.model",
                        help='model to load for evaluation')
    conf = Config()
    # print the model configuration
    conf.dump()
    args = parser.parse_args()

    print("Loading test data")
    # do not shuffle the data at evaluation time
    eval_iter, text_field, label_field = data_utils.text_dataloader(
        conf.eval_dir, conf.batch_size, shuffle=False)

    # load and initialise the model
    if os.path.exists(args.model):
        print('Found model file, loading model: {}'.format(args.model))
        cnn = torch.load(args.model)
    else:
        print("Model file not found, exiting")
        sys.exit(-1)

    # load the saved vocabularies
    text_field.vocab = data_utils.load_vocab("model/text.vocab")
    label_field.vocab = data_utils.load_vocab("model/label.vocab")

    # run the evaluation
    model_utils.eval(eval_iter, cnn, conf)
def main(config):
    # load vocabs
    vocab_words = load_vocab(config.words_filename)
    vocab_mor_tags = load_vocab(config.mor_tags_filename)
    vocab_tags = load_vocab(config.tags_filename)
    vocab_chars = load_vocab(config.chars_filename)
    vocab_lex_tags = load_vocab(config.lex_tags_filename)

    # get processing functions
    processing_word = get_processing_word(vocab_words, vocab_chars,
                                          lowercase=True, chars=config.chars)
    processing_mor_tag = get_processing_word(vocab_mor_tags, lowercase=False)
    processing_tag = get_processing_word(vocab_tags, lowercase=False)
    processing_lex_tag = get_processing_word(vocab_lex_tags, lowercase=False)

    # get pre trained embeddings
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

    cnn_model = CnnLstmCrfModel(config, embeddings, ntags=len(vocab_tags),
                                nchars=len(vocab_chars))
    cnn_model.build()
    cnn_model.write_tag_result_test(vocab_tags, processing_word,
                                    processing_mor_tag, processing_lex_tag)
def build_data(config):
    """Procedure to build data.

    Args:
        config: defines attributes needed in the function

    Returns:
        creates vocab files from the datasets
        creates a npz embedding file from trimmed glove vectors
    """
    processing_word = get_processing_word(lowercase=True)

    # Generators
    dev = CoNLLDataset(config.dev_filename, processing_word)
    # test = CoNLLDataset(config.test_filename, processing_word)
    train = CoNLLDataset(config.train_filename, processing_word)

    # Build Word and Tag vocab
    vocab_words, vocab_tags, vocab_pos = get_vocabs([train, dev])
    vocab_glove = get_glove_vocab(config.glove_filename)
    vocab_glove_uni = get_glove_vocab(config.glove_uni_filename)
    vocab_feature = get_pos_glove_vocab(config.glove_filename)

    # vocab = vocab_words & vocab_glove
    vocab = vocab_glove | vocab_words
    vocab.add(UNK)
    vocab.add(NUM)

    vocab_pos = vocab_feature
    vocab_pos.add(UNK)
    vocab_pos.add(NUM)

    # Save vocab
    write_vocab(vocab, config.words_filename)
    write_vocab(vocab_glove_uni, config.uni_words_filename)
    write_vocab(vocab_tags, config.tags_filename)
    write_vocab(vocab_pos, config.pos_filename)

    # Trim GloVe Vectors
    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.glove_filename,
                                 config.trimmed_filename, config.t_dim)

    vocab = load_vocab(config.uni_words_filename)
    export_trimmed_uni_vectors(vocab, config.NEdic_filename,
                               config.trimmed_dic, config.dic_dim)
    export_trimmed_uni_vectors(vocab, config.glove_uni_filename,
                               config.uni_trimmed_filename, config.dim)

    vocab_feature = load_vocab(config.pos_filename)
    export_trimmed_pos_vectors(vocab_feature, config.glove_feature,
                               config.feature_trimmed_filename, config.pos_dim)

    # Build and save char vocab
    train = CoNLLDataset(config.train_filename)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.chars_filename)
def main(config):
    # load vocabs
    vocab_words = load_vocab(config.words_filename)
    vocab_chars = load_vocab(config.chars_filename)

    # get processing functions
    processing_word = get_processing_word(vocab_words, vocab_chars,
                                          lowercase=True, chars=True)

    # get pre trained embeddings
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

    # create dataset
    dev = AnnotationDataset(config.dev_filename, processing_word)
    test = AnnotationDataset(config.test_filename, processing_word)
    train = AnnotationDataset(config.train_filename, processing_word)

    print("Num. train: %d" % len(train))
    print("Num. test: %d" % len(test))
    print("Num. dev: %d" % len(dev))

    model = WImpModel(config, embeddings, ntags=config.nclass,
                      nchars=len(vocab_chars))

    # build WImpModel
    model.build_graph()

    # train, evaluate and interact
    model.train(train, dev)
    model.evaluate(test)
def chat(question):
    """In test mode, we don't need to create the backward path."""
    _, enc_vocab = data_utils.load_vocab(
        os.path.join(config.DATA_PATH, "vocab.enc"))
    # `inv_dec_vocab` <type "list">: id2word.
    inv_dec_vocab, _ = data_utils.load_vocab(
        os.path.join(config.DATA_PATH, "vocab.dec"))

    model = ChatBotModel(True, batch_size=1)
    model.build_graph()

    saver = tf.train.Saver()

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        check_restore_parameters(sess, saver)
        output_file = open(os.path.join(config.DATA_PATH, config.TERMINAL_OUTPUT),
                           "a+", encoding="utf-8")
        # Decode from standard input.
        max_length = config.BUCKETS[-1][0]
        print("Welcome to TensorBro. Say something. Enter to exit. Max length is",
              max_length)

        line = question
        if hasattr(line, "decode"):  # If using Python 2
            # FIXME: UnicodeError when deleting Chinese in terminal.
            line = line.decode("utf-8")
        if len(line) > 0 and line[-1] == "\n":
            line = line[:-1]
        if not line:
            pass

        output_file.write("HUMAN ++++ " + line + "\n")
        # Get token-ids for the input sentence.
        token_ids = data_utils.sentence2id(enc_vocab, line)
        if len(token_ids) > max_length:
            print("Max length I can handle is:", max_length)
            # line = _get_user_input()
            pass

        # Which bucket does it belong to?
        bucket_id = find_right_bucket(len(token_ids))
        # Get a 1-element batch to feed the sentence to the model.
        encoder_inputs, decoder_inputs, decoder_masks = data_utils.get_batch(
            [(token_ids, [])], bucket_id, batch_size=1)
        # Get output logits for the sentence.
        _, _, output_logits = run_step(sess, model, encoder_inputs,
                                       decoder_inputs, decoder_masks,
                                       bucket_id, True)
        response = construct_response(output_logits, inv_dec_vocab)
        print(response)
        output_file.write("BOT ++++ " + response + "\n")
        output_file.write("=============================================\n")
        output_file.close()
def __init__(self, config):
    self.config = config

    self.vocab_words = load_vocab(self.config.filename_words)
    self.vocab_tags = load_vocab(self.config.filename_tags)
    self.vocab_chars = load_vocab(self.config.filename_chars)

    # Get pre-trained embeddings
    self.w_embeddings = (get_trimmed_glove_vectors(config.filename_trimmed)
                         if self.config.use_pretrained else None)
def main(config):
    # load vocabs
    vocab_words, idx2words = load_vocab(config.words_filename)
    vocab_tags, _ = load_vocab(config.tags_filename)
    vocab_chars, _ = load_vocab(config.chars_filename)
    vocab_pos, _ = load_vocab(config.pos_filename)

    # get processing functions
    processing_word = get_processing_word(vocab_words, vocab_chars,
                                          lowercase=True, chars=config.chars)
    processing_tag = get_processing_word(vocab_tags, lowercase=False)
    processing_pos = get_processing_word(vocab_pos, lowercase=False)

    # get pre trained embeddings
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)
    embeddings_uni = get_trimmed_glove_vectors(config.uni_trimmed_filename)
    pos_embeddings = get_trimmed_glove_vectors(config.feature_trimmed_filename)
    NE_dic = get_trimmed_glove_vectors(config.trimmed_dic)

    # create dataset
    dev = CoNLLDataset(config.dev_filename, processing_word, processing_tag,
                       processing_pos, config.max_iter)
    train = CoNLLDataset(config.train_filename, processing_word, processing_tag,
                         processing_pos, config.max_iter)

    # build model
    model = NERModel(config, embeddings, embeddings_uni, pos_embeddings,
                     ntags=len(vocab_tags), nchars=len(vocab_chars),
                     vocab_words=idx2words, NE_dic=NE_dic)
    model.build()

    # train, evaluate and interact
    if state == "train":
        model.train(train, dev, vocab_tags)
    elif state == "evaluate":
        model.evaluate(dev, vocab_tags)
    else:  # state == predict
        convert(file)
        t2o("data_format/test_convert.txt", "data_format/test.txt")
        test = CoNLLDataset(config.test_filename, processing_word,
                            processing_tag, processing_pos, config.max_iter)
        model.evaluate(test, vocab_tags)
        tagging("data_format/test_convert.txt")
class nlu():
    # load vocabs
    vocab_words = load_vocab(config.words_filename)
    vocab_tags = load_vocab(config.tags_filename)

    # get pre trained embeddings
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

    # get logger
    # logger = get_logger(config.log_path)

    # build model
    model = NERModel(config, embeddings, ntags=len(vocab_tags), logger=None)
    model.build()

    idx_to_tag = {idx: tag for tag, idx in vocab_tags.items()}

    saver = tf.train.Saver()
    sess = tf.Session()
    saver.restore(sess, config.model_output)
    # model.logger.info("This is an interactive mode, enter a sentence:")

    @staticmethod
    def rec(sentence):
        try:
            # get processing functions
            processing_word = get_processing_word(nlu.vocab_words,
                                                  lowercase=config.lowercase)
            # print character_separation(sentence)[0]
            words_raw = character_separation(sentence)[0].split(' ')
            # for word in words_raw:
            #     if type(word) == str:
            words_raw = [unicode(word, 'utf-8') for word in words_raw]
            # words_raw = [word.decode('utf-8') for word in words_raw]
            #     else:
            #         words_raw = [unicode(word, 'utf-8') for word in words_raw]
            words = map(processing_word, words_raw)
            words = list(words)
            pred_ids, _ = nlu.model.predict_batch(nlu.sess, [words])
            preds = map(lambda idx: nlu.idx_to_tag[idx], list(pred_ids[0]))
            # print(list(preds))
            print_sentence(nlu.model.logger, {"x": words_raw, "y": preds})
            return list(preds)
        except EOFError:
            print("Closing session.")


# nlu.rec('请播放电视剧三生三世十里桃花')
def build_data(config):
    """Procedure to build data.

    Args:
        config: defines attributes needed in the function

    Returns:
        creates vocab files from the datasets
        creates a npz embedding file from trimmed glove vectors
    """
    processing_word = get_processing_word(lowercase=config.lowercase)

    # Generators
    dev = CoNLLDataset(config.dev_filename, processing_word)
    test = CoNLLDataset(config.test_filename, processing_word)
    train = CoNLLDataset(config.train_filename, processing_word)

    # Build Word and Tag vocab
    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.glove_filename)

    vocab = vocab_words & vocab_glove
    vocab.add(UNK)
    vocab.add(NUM)

    # Save vocab
    write_vocab(vocab, config.words_filename)
    write_vocab(vocab_tags, config.tags_filename)

    # Trim GloVe Vectors
    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.glove_filename,
                                 config.trimmed_filename, config.dim)
def __init__(self, args):
    super().__init__()
    train_file = args.train_file
    vocab_file = args.vocab_file

    train_sens = data_utils.load_sentences(train_file, skip_invalid=True)
    word2id, id2word, label2id, id2label = data_utils.load_vocab(
        train_sens, vocab_file)
    data_utils.gen_ids(train_sens, word2id, label2id, 100)

    train_full_tensors = data_utils.make_full_tensors(train_sens)
    raw_x = train_full_tensors[0]
    x_length = train_full_tensors[1]
    x_labels = train_full_tensors[2]

    raw_f = lambda t: id2label[t]
    x_labels_true = np.array(list(map(raw_f, x_labels)))

    n_train = int(len(raw_x) * 1)
    self.train_x, self.test_x = raw_x[:n_train], raw_x[n_train:]
    self.train_length_x, self.test_length_x = x_length[:n_train], x_length[n_train:]
    self.train_y, self.test_y = x_labels[:n_train], x_labels[n_train:]
    self.gt_label = x_labels_true
    self.raw_q = ["".join(i.raw_tokens) for i in train_sens]
def train():
    checkpoint = "../model/checkpoint/model.ckpt"
    data_utils.prepare()
    index_to_char, char_to_index, vocab_size = data_utils.load_vocab()
    epochs = 100

    with tf.Session() as sess:
        model = attention_seq2seq(vocab_size)
        model.build_model()
        sess.run(tf.global_variables_initializer())
        train_summary = tf.summary.FileWriter('../model/summary/',
                                              graph=sess.graph)
        for epoch in range(1, epochs + 1):
            train_set = data_utils.train_set(char_to_index)
            for source_seq, target_seq in train_set:
                encoder_inputs, encoder_inputs_length, decoder_inputs, decoder_inputs_length = \
                    data_utils.prepare_train_batch(source_seq, target_seq)
                _, loss = model.train(
                    sess=sess,
                    encoder_inputs=encoder_inputs,
                    encoder_inputs_length=encoder_inputs_length,
                    decoder_targets=decoder_inputs,
                    decoder_inputs_length=decoder_inputs_length)
            print("epoch={}, loss={}".format(epoch, loss))
            model.merge(sess, train_summary, epoch)

        saver = tf.train.Saver()
        saver.save(sess, save_path=checkpoint)
        print('Model Trained and Saved')
def load(self):
    self.vocab_tags = load_vocab(self.filename_tags)
    self.processing_tag = get_processing_word(self.vocab_tags,
                                              lowercase=False,
                                              allow_unk=False)
    self.ntags = len(self.vocab_tags)
    self.early_stop_metric_sign = -1 if self.stop_direction == 'increase' else 1
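# The following is a minimal sketch of the `get_processing_word` factory used
# throughout these snippets. It is an assumption about its behaviour, not the
# project's actual data_utils code: the returned closure normalises a token,
# maps it (and optionally its characters) to vocabulary ids, and either falls
# back to UNK or raises when `allow_unk` is False. The `_sketch` suffix and
# the $UNK$/$NUM$ markers are illustrative choices.
UNK = "$UNK$"
NUM = "$NUM$"


def get_processing_word_sketch(vocab_words=None, vocab_chars=None,
                               lowercase=False, chars=False, allow_unk=True):
    def f(word):
        # 0. collect character ids before any normalisation
        if vocab_chars is not None and chars:
            char_ids = [vocab_chars[c] for c in word if c in vocab_chars]

        # 1. normalise the token
        if lowercase:
            word = word.lower()
        if word.isdigit():
            word = NUM

        # 2. map the token to its id, handling unknown words
        if vocab_words is not None:
            if word in vocab_words:
                word = vocab_words[word]
            elif allow_unk:
                word = vocab_words[UNK]
            else:
                raise Exception("Unknown word not allowed (allow_unk=False).")

        # 3. return (char ids, word id) when chars are used, else the word id
        if vocab_chars is not None and chars:
            return char_ids, word
        return word

    return f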
def get_optimal_set(K, vocab_file, corpus_dir, polar_seed_file):
    def get_set(polar_seed, K):
        return polar_seed[:K], polar_seed[-K:]

    if os.path.exists(polar_seed_file):
        polar_seed = []
        for line in open(polar_seed_file):
            ps = line.split('\t')
            polar_seed.append(int(ps[1]))
        return get_set(polar_seed, K)

    vocab2idx, vocab_str, vocab_count = data_utils.load_vocab(vocab_file)
    vocab_size = len(vocab_count)
    for i, c in enumerate(vocab_count):
        if c < 5000:
            vocab_size = i
            break
    vocab_str = vocab_str[:vocab_size]

    corpus = data_utils.load_news_corpus(corpus_dir)
    corpus = process_corpus(corpus)

    polar_seed = get_polar_seed(corpus, vocab_size, vocab2idx)

    print 'Saving polar seed in file %s' % polar_seed_file
    with open(polar_seed_file, 'w') as fout:
        for i, polar in polar_seed:
            fout.write("%s\t%d\t%f\n" % (vocab_str[i], i, polar))

    return get_set(polar_seed, K)
def generate():
    random.seed(SEED)
    np.random.seed(SEED)

    vocab_dict, vocab_res = data_utils.load_vocab('./vocab.txt')
    data = data_utils.load_data('data.pkl')
    vocab_size = len(vocab_dict)
    SEQ_LENGTH = data.shape[1]

    generator = Generator(vocab_size, BATCH_SIZE, EMB_DIM, HIDDEN_DIM,
                          SEQ_LENGTH, START_TOKEN)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())

    samples = generator.generate(sess)
    for i in range(int(1)):
        if i > len(samples):
            break
        arr = samples[i]
        poem = ''
        for index in arr:
            if index != data_utils.EOS_ID:
                poem += vocab_res[index]
        print(poem)
def train(self, epoch=25, batch_size=1, learning_rate=0.0002, momentum=0.9,
          decay=0.95, data_dir="data", dataset_name="cnn", vocab_size=1000000):
    if not self.vocab:
        self.vocab, self.rev_vocab = load_vocab(data_dir, dataset_name,
                                                vocab_size)

    self.opt = tf.train.RMSPropOptimizer(learning_rate, decay=decay,
                                         momentum=momentum)

    for epoch_idx in xrange(epoch):
        data_loader = load_dataset(data_dir, dataset_name, vocab_size)

        contexts, questions, answers = [], [], []
        for batch_idx in xrange(batch_size):
            _, context, question, answer, _ = data_loader.next()
            contexts.append(context)
            questions.append(question)
            answers.append(answer)
def build_joint_vocab(config):
    # Common options for all datasets
    processing_word = get_processing_word(lowercase=True)
    vocab_glove = get_glove_vocab(config.filename_glove)

    # Compute and save individual vocab
    v1_words, v1_chars = get_conll2005_vocab(config.conll2005,
                                             processing_word, vocab_glove)
    v2_words, v2_chars = get_conll2003_vocab(config.conll2003,
                                             processing_word, vocab_glove)
    v3_words, v3_chars = get_semcor_vocab(config.semcor,
                                          processing_word, vocab_glove)

    print(" *** Joint vocabulary ***")
    vocab_words = v1_words.union(v2_words, v3_words)
    vocab_chars = v1_chars.union(v2_chars, v3_chars)

    # Save combined vocab
    write_vocab(vocab_words, config.filename_words)
    write_vocab(vocab_chars, config.filename_chars)

    # Trim GloVe Vectors
    vocab = load_vocab(config.filename_words)
    export_trimmed_glove_vectors(vocab, config.filename_glove,
                                 config.filename_trimmed, config.dim_word)
def predit():
    vocab_path = "../vocab.pickle"
    input_sentence = "不是"

    index_to_char, char_to_index, vocab_size = data_utils.load_vocab(vocab_path)

    form_input = []
    for ch in input_sentence:
        try:
            ch = char_to_index[ch]
            form_input.append(ch)
        except KeyError:
            pass

    encoder_inputs, encoder_inputs_length = data_utils.prepare_predict_batch(
        [form_input])

    checkpoint = "../model/checkpoint/model.ckpt-1"
    with tf.Session() as sess:
        model = attention_seq2seq(vocab_size=vocab_size, mode='decode')
        model.build_model()
        saver = tf.train.Saver()
        saver.restore(sess=sess, save_path=checkpoint)

        predicted_ids = model.predict(
            sess=sess,
            encoder_inputs=encoder_inputs,
            encoder_inputs_length=encoder_inputs_length)
        predicted_ids = predicted_ids[0].tolist()
        predicted_ids = predicted_ids[0]
        print(predicted_ids)

        temp = [index_to_char[i] for i in predicted_ids
                if i != data_utils.end_token]
        print(temp)
        print("".join(temp))
def save_polar_optimal(K, vocab_file, corpus_dir, polar_seed_file,
                       polar_optimal_file):
    optimal_p, optimal_n = get_optimal_set(K, vocab_file, corpus_dir,
                                           polar_seed_file)

    corpus = data_utils.load_news_corpus(corpus_dir)
    corpus = process_corpus(corpus)

    vocab2idx, vocab_str, vocab_count = data_utils.load_vocab(vocab_file)
    vocab_size = len(vocab_count)
    for i, c in enumerate(vocab_count):
        if c < 500:
            vocab_size = i
            break
    vocab_str = vocab_str[:vocab_size]

    p_w = get_Pw(corpus, vocab_size)

    print 'Calculating porlar_optimal...'
    porlar_optimal = [0. for _ in xrange(vocab_size)]
    ppw, npw = get_set_Pw(corpus, optimal_p, optimal_n)
    for wi in tqdm(xrange(len(p_w))):
        porlar_optimal[wi] = polar(corpus, optimal_p, optimal_n,
                                   ppw, npw, wi, p_w[wi])
    porlar_optimal = sorted(enumerate(porlar_optimal), key=lambda x: x[1],
                            reverse=True)

    print 'Saving polar optimal in file %s' % polar_optimal_file
    with open(polar_optimal_file, 'w') as fout:
        for i, p in porlar_optimal:
            fout.write("%s\t%d\t%f\n" % (vocab_str[i], i, p))
def build_data(config):
    processing_word = get_processing_word()

    # Generators
    dev = CoNLLDataset(config.dev_filename, processing_word)
    test = CoNLLDataset(config.test_filename, processing_word)
    train = CoNLLDataset(config.train_filename, processing_word)

    # Build Word, Tag and POS vocab
    vocab_words, vocab_tags, vocab_poss = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.glove_filename)

    vocab = vocab_words & vocab_glove
    vocab.add(UNK)
    vocab.add(NUM)

    # Save vocab
    write_vocab(vocab, config.words_filename)
    write_vocab(vocab_tags, config.tags_filename)
    write_vocab(vocab_poss, config.poss_filename)

    # Trim GloVe Vectors
    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.glove_filename,
                                 config.trimmed_filename, config.dim)

    # Build and save char vocab
    train = CoNLLDataset(config.train_filename)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.chars_filename)
def main(config):
    # load vocabs
    vocab_words = load_vocab(config.words_filename)
    vocab_tags = load_vocab(config.tags_filename)
    vocab_chars = load_vocab(config.chars_filename)
    vocab_iob = {"O": 0, "B": 1, "I": 2}
    vocab_type = {"LOC": 0, "PER": 1, "ORG": 2, "MISC": 3}

    # get processing functions
    processing_word = get_processing_word(vocab_words, vocab_chars,
                                          lowercase=True, chars=config.chars)
    processing_tag = get_processing_word(vocab_tags, lowercase=False)
    processing_iob = get_processing_word(vocab_iob, lowercase=False)
    processing_type = get_processing_word(vocab_type, lowercase=False)

    # get pre trained embeddings
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

    # create dataset
    dev = CoNLLDataset(config.dev_filename, processing_word, processing_tag,
                       processing_iob, processing_type, config.max_iter,
                       config.chars)
    test = CoNLLDataset(config.test_filename, processing_word, processing_tag,
                        processing_iob, processing_type, config.max_iter,
                        config.chars)
    train = CoNLLDataset(config.train_filename, processing_word, processing_tag,
                         processing_iob, processing_type, config.max_iter,
                         config.chars)

    model = NERModel(config, embeddings, ntags=len(vocab_tags),
                     nchars=len(vocab_chars), niob=3, ntype=4)
    model.build()

    # train, evaluate and interact
    print vocab_tags
    model.train(train, dev, vocab_tags)
    stime = time.time()
    model.evaluate(test, vocab_tags)
    print time.time() - stime
def chat():
    """In test mode, we don't need to create the backward path."""
    _, enc_vocab = data_utils.load_vocab(
        os.path.join(config.PROCESSED_PATH, 'vocab.enc'))
    inv_dec_vocab, _ = data_utils.load_vocab(
        os.path.join(config.PROCESSED_PATH, 'vocab.dec'))

    model = ChatBotModel(True, batch_size=1)
    model.build_graph()

    saver = tf.train.Saver()

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        check_restore_parameters(sess, saver)
        output_file = open(
            '/Users/EleanorLeung/Documents/CITS4404/chatbot/output_convo.txt',
            'a+')
        # Decode from standard input.
        max_length = config.BUCKETS[-1][0]
        print('Talk to me! Enter to exit. Max length is', max_length)

        while True:
            line = str.encode(get_user_input())
            if len(line) > 0 and line[-1] == '\n':
                line = line[:-1]
            if line == '':
                break
            output_file.write('HUMAN: ' + str(line) + '\n')

            token_ids = data_utils.sentence2id(enc_vocab, line)
            if len(token_ids) > max_length:
                print('Max length I can handle is:', max_length)
                line = get_user_input()
                continue

            bucket_id = find_right_bucket(len(token_ids))
            # Get a 1-element batch to feed the sentence to the model.
            encoder_inputs, decoder_inputs, decoder_masks = data_utils.get_batch(
                [(token_ids, [])], bucket_id, batch_size=1)
            # Get output logits for the sentence.
            _, _, output_logits = run_step(sess, model, encoder_inputs,
                                           decoder_inputs, decoder_masks,
                                           bucket_id, True)
            response = construct_response(output_logits, inv_dec_vocab)
            print(response)
            output_file.write('BOT: ' + response + '\n')

        output_file.write('=============================================\n')
        output_file.close()
def build_data(config, logger):
    """Procedure to build data."""
    processing_word = get_processing_word(lowercase=config.lowercase)

    # Generators
    test = CoNLLDataset(config.test_filename, processing_word)
    dev = CoNLLDataset(config.dev_filename, processing_word)
    train = CoNLLDataset(config.train_filename, processing_word)

    # Build Word and Tag vocab
    print("Build Word and Tag vocab...")
    vocab_words, vocab_poss, vocab_chunks, \
        vocab_aspect_tags, vocab_polarity_tags, vocab_joint_tags = get_vocabs(
            [train, dev, test])
    vocab = vocab_words
    vocab.add(UNK)
    vocab.add(NUM)

    # Save vocab
    print("Dealing words vocab...")
    write_vocab(vocab, config.words_filename)
    print("Dealing poss vocab...")
    write_vocab(vocab_poss, config.poss_filename)

    vocab_chunks = [tags for tags in vocab_chunks]
    if "NO" in vocab_chunks:
        vocab_chunks.remove("NO")
        vocab_chunks.insert(0, "NO")
    else:
        logger.error(">>> vocab_chunks used as mpqa has something wrong!")
    print("Dealing chunks vocab...")
    write_vocab(vocab_chunks, config.chunks_filename)

    vocab_aspect_tags = [tags for tags in vocab_aspect_tags]
    vocab_aspect_tags.remove("O")
    vocab_aspect_tags.insert(0, "O")
    vocab_polarity_tags = [tags for tags in vocab_polarity_tags]
    vocab_polarity_tags.remove("O")
    vocab_polarity_tags.insert(0, "O")
    vocab_joint_tags = [tags for tags in vocab_joint_tags]
    vocab_joint_tags.remove("O")
    vocab_joint_tags.insert(0, "O")

    print("Dealing aspect_tags vocab...")
    write_vocab(vocab_aspect_tags, config.aspect_tags_filename)
    print("Dealing polarity_tags vocab...")
    write_vocab(vocab_polarity_tags, config.polarity_tags_filename)
    print("Dealing joint_tags vocab...")
    write_vocab(vocab_joint_tags, config.joint_tags_filename)

    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.domain_filename,
                                 config.domain_trimmed_filename,
                                 config.dim_domain)
    export_trimmed_glove_vectors(vocab, config.general_filename,
                                 config.general_trimmed_filename,
                                 config.dim_general)
def demo():
    with tf.Session(config=tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=FLAGS.log_device_placement)) as sess:
        # Create model and load parameters.
        model, _ = create_model(sess, forward_only=True)
        nl_vocab, _, _, rev_cm_vocab = data_utils.load_vocab(FLAGS)
        decode_tools.demo(sess, model, nl_vocab, rev_cm_vocab, FLAGS)
def manual_eval(num_eval):
    with tf.Session(config=tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=FLAGS.log_device_placement)) as sess:
        # Create model and load parameters.
        _, model_sig = graph_utils.get_model_signature(FLAGS)
        _, rev_nl_vocab, _, rev_cm_vocab = data_utils.load_vocab(FLAGS)
        _, dev_set, _ = load_data(use_buckets=False)
        eval_tools.manual_eval(model_sig, dev_set, rev_nl_vocab, FLAGS,
                               FLAGS.model_dir, num_eval)
def eval(data_set, model_sig=None, verbose=True):
    with tf.Session(config=tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=FLAGS.log_device_placement)) as sess:
        if model_sig is None:
            _, model_sig = graph_utils.get_model_signature(FLAGS)
        print("evaluate " + model_sig + "...")
        _, rev_nl_vocab, _, rev_cm_vocab = data_utils.load_vocab(FLAGS)
        return eval_tools.eval_set(model_sig, data_set, rev_nl_vocab, FLAGS,
                                   verbose=verbose)
def load(self):
    """Loads vocabulary, processing functions and embeddings."""
    # 1. vocabulary
    self.vocab_words = load_vocab(self.filename_words)
    self.vocab_chars = load_vocab(self.filename_chars)
    self.nwords = len(self.vocab_words)
    self.nchars = len(self.vocab_chars)

    # 2. get processing functions that map str -> id
    self.processing_word = get_processing_word(self.vocab_words,
                                               self.vocab_chars,
                                               lowercase=True,
                                               chars=self.use_chars)

    # 3. get pre-trained embeddings
    self.embeddings = (get_trimmed_glove_vectors(self.filename_trimmed)
                       if self.use_pretrained else None)
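# For reference, a minimal sketch of the two loaders used by most snippets
# above. This is an assumption, not the actual data_utils implementation: it
# presumes the vocab file stores one token per line (the id is the line
# number) and that the trimmed ".npz" file keeps its matrix under an
# "embeddings" key. The `_sketch` names are illustrative.
import numpy as np


def load_vocab_sketch(filename):
    """Map each token to its line number: {word: idx}."""
    vocab = {}
    with open(filename, encoding="utf-8") as f:
        for idx, word in enumerate(f):
            vocab[word.strip()] = idx
    return vocab


def get_trimmed_glove_vectors_sketch(filename):
    """Return the trimmed embedding matrix stored in a compressed npz file."""
    with np.load(filename) as data:
        return data["embeddings"]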
def main(config):
    # load vocabs
    vocab_words = load_vocab(config.words_filename)
    vocab_tags = load_vocab(config.tags_filename)
    vocab_chars = load_vocab(config.chars_filename)
    dictionary = load_vocab("data/types.txt")
    types_dic = collections.OrderedDict([(v, k) for k, v in dictionary.items()])
    vocab_iob = {"O": 0, "B": 1, "I": 2}
    vocab_type = load_vocab(config.types_filename)
    print vocab_type

    # get processing functions
    processing_word = get_processing_word(vocab_words, vocab_chars,
                                          lowercase=True, chars=config.chars)
    processing_tag = get_processing_word(vocab_tags, lowercase=False)
    processing_iob = get_processing_word(vocab_iob, lowercase=False)
    processing_type = get_processing_word(vocab_type, lowercase=False)

    # get pre trained embeddings
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

    # create dataset
    dev = CoNLLDataset(config.dev_filename, processing_word, processing_tag,
                       processing_iob, processing_type, config.max_iter,
                       config.chars)
    test = CoNLLDataset(config.test_filename, processing_word, processing_tag,
                        processing_iob, processing_type, config.max_iter,
                        config.chars)
    train = CoNLLDataset(config.train_filename, processing_word, processing_tag,
                         processing_iob, processing_type, config.max_iter,
                         config.chars)

    ntype = len(vocab_type)
    model = POSmodel(config, embeddings, ntags=len(vocab_tags),
                     nchars=len(vocab_chars), niob=3, ntype=ntype)
    model.build()
    model.train(train, dev, vocab_type)
    model.evaluate(test, vocab_type)
def check_npz():
    vocab = load_vocab("../data/words.txt")
    idx = vocab['硕士']
    with open('../data/polyglot-zh.pkl', 'rb') as f:
        words, embeddings = pickle.load(f, encoding="latin1")
    words = list(words)
    embeddings = list(embeddings)
    word_idx = words.index('硕士')
    # NOTE: `data` is assumed to be the trimmed embedding matrix loaded
    # elsewhere (e.g. from the exported npz file this check verifies).
    return (data[idx] == embeddings[word_idx])
def dopredict():
    """Predict the result for a given file or sentence."""
    parser = argparse.ArgumentParser(description='Text CNN classifier')
    # a trained model must be specified
    parser.add_argument('--path', type=str, default="data/predict/",
                        help='path to the text file (or directory) to predict on')
    parser.add_argument('--model', type=str, default="model/textcnn.model",
                        help='model to load for prediction')
    conf = Config()
    args = parser.parse_args()

    # specify the Field formats
    text_field = data_utils.TextTEXT
    label_field = data_utils.TextLABEL
    text_field.vocab = data_utils.load_vocab("model/text.vocab")
    label_field.vocab = data_utils.load_vocab("model/label.vocab")

    # load and initialise the model
    if os.path.exists(args.model):
        print('Found model file, loading model: {}'.format(args.model))
        cnn = torch.load(args.model)
    else:
        print("Model file not found, exiting")
        sys.exit(-1)

    # if the path is a directory, predict on every file inside it;
    # otherwise predict on the single file
    if os.path.isdir(args.path):
        files = os.listdir(args.path)
        files_path = [args.path + f for f in files]
    else:
        files_path = [args.path]

    # run the predictions
    for file in files_path:
        text, label = model_utils.predict(file, cnn, text_field, label_field,
                                          conf.cuda)
        print('[Path] {}\n[Text] {}\n[Label] {}\n'.format(file, text, label))
    print(f'Predicted {len(files_path)} files in total')
def decode(data_set, construct_model_dir=True, verbose=True):
    with tf.Session(config=tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=FLAGS.log_device_placement)) as sess:
        # Create model and load parameters.
        model, _ = create_model(sess, forward_only=True,
                                construct_model_dir=construct_model_dir)
        _, rev_nl_vocab, _, rev_cm_vocab = data_utils.load_vocab(FLAGS)
        decode_tools.decode_set(sess, model, data_set, rev_nl_vocab,
                                rev_cm_vocab, FLAGS, verbose)
        return model.model_sig
def build_data(config):
    """Procedure to build data.

    Args:
        config: defines attributes needed in the function

    Returns:
        creates vocab files from the datasets
        creates a npz embedding file from trimmed glove vectors
    """
    processing_word = get_processing_word(lowercase=True)

    # clean data
    train_filepath, dev_filepath_a = write_clear_data(
        config.train_filename, build_dev=config.build_dev_from_trainset,
        dev_ratio=config.dev_ratio)
    test_filepath, dev_filepath_b = write_clear_data(
        config.test_filename, build_dev=config.build_dev_from_testset,
        dev_ratio=config.dev_ratio)
    dev_filepath = dev_filepath_a or dev_filepath_b

    # Generators
    dev = Dataset(dev_filepath, processing_word)
    test = Dataset(test_filepath, processing_word)
    train = Dataset(train_filepath, processing_word)

    # Build Word and Tag vocab
    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.glove_filename)

    vocab = vocab_words & vocab_glove
    vocab.add(UNK)
    vocab.add(NUM)

    # Save vocab
    write_vocab(vocab, config.words_filename)
    write_vocab(vocab_tags, config.tags_filename)

    # Trim GloVe Vectors
    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.glove_filename,
                                 config.trimmed_filename, config.dim)

    # Build and save char vocab
    train = Dataset(train_filepath)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.chars_filename)
def __init__(self, root_dir, config, debug=True):
    self.config = config
    self.test_set = tf.gfile.Glob(join(root_dir, "test", "*.question"))
    if debug:
        self.validation_set = tf.gfile.Glob(
            join(root_dir, "validation", "*.question"))
    else:
        self.training_set = tf.gfile.Glob(
            join(root_dir, "training", "*.question"))
    self.vocabulary, self.reverse_vocabulary = load_vocab(
        root_dir, str(Config.vocab_size - 2))
    self.reverse_vocabulary = ['BAR_', 'UNK_'] + self.reverse_vocabulary
    self.pool = Pool(4)
    self.debug = debug
def prepare_model(self, data_dir, dataset_name, vocab_size):
    if not self.vocab:
        self.vocab, self.rev_vocab = load_vocab(data_dir, dataset_name,
                                                vocab_size)
        print(" [*] Loading vocab finished.")
    self.vocab_size = len(self.vocab)

    self.emb = tf.get_variable("emb", [self.vocab_size, self.size])

    # inputs
    self.inputs = tf.placeholder(tf.int32, [self.batch_size, self.max_nsteps])
    embed_inputs = tf.nn.embedding_lookup(self.emb, tf.transpose(self.inputs))
    tf.histogram_summary("embed", self.emb)

    # output states
    _, states = rnn.rnn(self.stacked_cell,
                        tf.unpack(embed_inputs),
                        dtype=tf.float32,
                        initial_state=self.initial_state)
    self.batch_states = tf.pack(states)

    self.nstarts = tf.placeholder(tf.int32, [self.batch_size, 3])
    outputs = tf.pack(
        [tf.slice(self.batch_states, nstarts, [1, 1, self.output_size])
         for idx, nstarts in enumerate(tf.unpack(self.nstarts))])
    self.outputs = tf.reshape(outputs, [self.batch_size, self.output_size])

    self.W = tf.get_variable("W", [self.vocab_size, self.output_size])
    tf.histogram_summary("weights", self.W)
    tf.histogram_summary("output", outputs)

    self.y = tf.placeholder(tf.float32, [self.batch_size, self.vocab_size])
    self.y_ = tf.matmul(self.outputs, self.W, transpose_b=True)

    self.loss = tf.nn.softmax_cross_entropy_with_logits(self.y_, self.y)
    tf.scalar_summary("loss", tf.reduce_mean(self.loss))

    correct_prediction = tf.equal(tf.argmax(self.y, 1), tf.argmax(self.y_, 1))
    self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
    tf.scalar_summary("accuracy", self.accuracy)

    print(" [*] Preparing model finished.")
def build_data(config):
    """Procedure to build data.

    Args:
        config: defines attributes needed in the function

    Returns:
        creates vocab files from the datasets
        creates a npz embedding file from trimmed glove vectors
    """
    processing_word = get_processing_word(lowercase=config.lowercase)

    # Generators
    dev = CoNLLDataset(config.dev_filename, processing_word)
    test = CoNLLDataset(config.test_filename, processing_word)
    train = CoNLLDataset(config.train_filename, processing_word)

    # Build Word and Tag vocab
    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.glove_filename)

    vocab = vocab_words & vocab_glove
    vocab.add(UNK)
    vocab.add(NUM)

    # Save vocab
    write_vocab(vocab, config.words_filename)
    write_vocab(vocab_tags, config.tags_filename)

    # Trim GloVe Vectors
    vocab = load_vocab(config.words_filename)
    export_trimmed_glove_vectors(vocab, config.glove_filename,
                                 config.trimmed_filename, config.dim)

    # Build and save char vocab
    train = CoNLLDataset(config.train_filename)
    vocab_chars = get_char_vocab(train)
    write_vocab(vocab_chars, config.chars_filename)
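# A minimal sketch of the trimming step the build_data procedures above rely
# on. This is an assumption, not the project's own data_utils code: it keeps
# only the GloVe vectors for words in the saved vocab, stores each vector at
# that word's id, and writes the matrix to a compressed npz file. The
# `_sketch` name is illustrative.
import numpy as np


def export_trimmed_glove_vectors_sketch(vocab, glove_filename,
                                        trimmed_filename, dim):
    """Write a [len(vocab), dim] matrix of GloVe vectors to `trimmed_filename`."""
    embeddings = np.zeros([len(vocab), dim])
    with open(glove_filename, encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split(' ')
            word, vector = parts[0], parts[1:]
            if word in vocab:
                embeddings[vocab[word]] = np.asarray(vector, dtype=np.float32)
    np.savez_compressed(trimmed_filename, embeddings=embeddings)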
from data_utils import get_trimmed_glove_vectors, load_vocab, \
    get_processing_word, CoNLLDataset
from model import NERModel
from config import Config


# create instance of config
config = Config()

# load vocabs
vocab_words = load_vocab(config.words_filename)
vocab_tags = load_vocab(config.tags_filename)
vocab_chars = load_vocab(config.chars_filename)

# get processing functions
processing_word = get_processing_word(vocab_words, vocab_chars,
                                      lowercase=True, chars=config.chars)
processing_tag = get_processing_word(vocab_tags, lowercase=False)

# get pre trained embeddings
embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

# create dataset
dev = CoNLLDataset(config.dev_filename, processing_word,
                   processing_tag, config.max_iter)
test = CoNLLDataset(config.test_filename, processing_word,
                    processing_tag, config.max_iter)
train = CoNLLDataset(config.train_filename, processing_word,
                     processing_tag, config.max_iter)

# build model
vocab_words = set()
vocab_tags = set()
vocab_chars = set()

file = open('data/all.txt')
for line in file:
    line = line.strip()
    if len(line) == 0:
        continue
    token, tag = line.split(' ')
    print token, tag
    for c in token:
        vocab_chars.add(c)
    vocab_words.add(token)
    vocab_tags.add(tag)

# Build Word and Tag vocab
vocab_glove = get_glove_vocab(config.glove_filename)
vocab = vocab_words & vocab_glove
vocab.add(UNK)
vocab.add(NUM)

# Save vocabs
write_vocab(vocab, config.words_filename)
write_vocab(vocab_tags, config.tags_filename)
write_vocab(vocab_chars, config.chars_filename)

# Trim GloVe Vectors
vocab = load_vocab(config.words_filename)
export_trimmed_glove_vectors(vocab, config.glove_filename,
                             config.trimmed_filename, config.dim)