def main(_):
    converter = TextConverter(filename=FLAGS.converter_path)
    if os.path.isdir(FLAGS.checkpoint_path):
        FLAGS.checkpoint_path = tf.train.latest_checkpoint(FLAGS.checkpoint_path)

    model = CharRNN(converter.vocab_size, None,
                    sampling=True,
                    lstm_size=FLAGS.lstm_size,
                    num_layers=FLAGS.num_layers,
                    use_embedding=FLAGS.use_embedding,
                    embedding_size=FLAGS.embedding_size)
    model.load(FLAGS.checkpoint_path)

    # start = converter.text_to_arr(FLAGS.seed_for_generating)
    seeds = [
        'var a = fun', 'function a(', 'this.', 'document.', 'window.',
        'var a = document.g', 'var a;', 'jQuery'
    ]
    for seed in seeds:
        start = converter.text_to_arr(seed)
        for i in range(FLAGS.num_to_generate):
            print('Generating: ' + seed + ' -> ' + str(i))
            file_name = str(uuid.uuid1())
            file_path = ('../../BrowserFuzzingData/generated/' + FLAGS.file_type
                         + '/' + file_name + '.' + FLAGS.file_type)
            arr = model.sample(FLAGS.max_length_of_generated, start,
                               converter.vocab_size, converter.word_to_int)
            with open(file_path, 'wb') as f:
                f.write(converter.arr_to_text(arr).encode('utf-8'))

def main(_):
    converter = TextConverter(filename=FLAGS.converter_path)
    if os.path.isdir(FLAGS.checkpoint_path):
        FLAGS.checkpoint_path = tf.train.latest_checkpoint(FLAGS.checkpoint_path)

    model = CharRNN(converter.vocab_size, sampling=True,
                    lstm_size=FLAGS.lstm_size,
                    num_layers=FLAGS.num_layers,
                    use_embedding=FLAGS.use_embedding,
                    embedding_size=FLAGS.embedding_size)
    model.load(FLAGS.checkpoint_path)

    sys.stdout.write("> ")
    sys.stdout.flush()
    start_string = sys.stdin.readline()
    while start_string:
        start = converter.text_to_arr(start_string)
        arr = model.sample(FLAGS.max_length, start, converter.vocab_size)
        print(converter.arr_to_text(arr))
        sys.stdout.write("> ")
        sys.stdout.flush()
        # Read into start_string; the original assigned to an unused `sentence`,
        # so the loop kept resampling the first prompt forever.
        start_string = sys.stdin.readline()

def main(_):
    model_path = os.path.join('model', FLAGS.file_type)
    if not os.path.exists(model_path):
        os.makedirs(model_path)

    # Read and load corpus for training and validation.
    training_corpus, validating_corpus = read_corpus()

    # Build the text converter.
    print('---------------------------- Initializing Text Converter ----------------------------')
    start_time = time.time()
    converter = TextConverter(training_corpus, FLAGS.max_vocab)
    converter.save_to_file(os.path.join(model_path, 'converter.pkl'))
    print('Initialize Text Converter Finished in %.3f Seconds.\n' %
          (time.time() - start_time))

    # Vectorize the corpus contents.
    vectroize_corpus(converter)

    # Build the char-RNN model.
    model = CharRNN(converter.vocab_size,
                    num_seqs=FLAGS.num_seqs,
                    num_steps=FLAGS.num_steps,
                    lstm_size=FLAGS.lstm_size,
                    num_layers=FLAGS.num_layers,
                    learning_rate=FLAGS.learning_rate,
                    train_keep_prob=FLAGS.train_keep_prob,
                    use_embedding=FLAGS.use_embedding,
                    embedding_size=FLAGS.embedding_size)

    # Train the model.
    model.train(FLAGS.max_steps, model_path, FLAGS.validate_every_n_steps,
                FLAGS.log_every_n_steps)

def main(_):
    model_path = os.path.join('model', FLAGS.name)  # directory the model is saved to
    if not os.path.exists(model_path):
        os.makedirs(model_path)
    # codecs.open takes an explicit encoding and decodes to unicode on read
    with codecs.open(FLAGS.input_file, encoding='utf-8') as f:
        text = f.read()  # read the training text
    converter = TextConverter(text, FLAGS.max_vocab)  # build the vocabulary from the text
    converter.save_to_file(os.path.join(model_path, 'converter.pkl'))
    arr = converter.text_to_arr(text)  # encode the text as an integer array
    g = batch_generator(arr, FLAGS.num_seqs, FLAGS.num_steps)  # mini-batch generator
    print(converter.vocab_size)
    model = CharRNN(converter.vocab_size,  # build the model
                    num_seqs=FLAGS.num_seqs,
                    num_steps=FLAGS.num_steps,
                    lstm_size=FLAGS.lstm_size,
                    num_layers=FLAGS.num_layers,
                    learning_rate=FLAGS.learning_rate,
                    train_keep_prob=FLAGS.train_keep_prob,
                    use_embedding=FLAGS.use_embedding,
                    embedding_size=FLAGS.embedding_size)
    model.train(g,  # train
                FLAGS.max_steps,
                model_path,
                FLAGS.save_every_n,
                FLAGS.log_every_n)

def main(_):
    model_path = os.path.join('model', FLAGS.name)
    # print(model_path)
    if not os.path.exists(model_path):
        os.makedirs(model_path)
    with codecs.open(FLAGS.input_file, encoding='utf-8') as f:
        text = f.read()
    converter = TextConverter(text, FLAGS.max_vocab)
    converter.save_to_file(os.path.join(model_path, 'converter.pkl'))
    arr = converter.text_to_arr(text)
    g = batch_generator(arr, FLAGS.num_seq, FLAGS.num_step)
    print(converter.vocab_size)
    model = CharModel(converter.vocab_size,
                      num_seq=FLAGS.num_seq,
                      num_step=FLAGS.num_step,
                      lstm_size=FLAGS.lstm_size,
                      num_layers=FLAGS.num_layers,
                      # learning_rate=FLAGS.learning_rate,
                      train_keep_prob=FLAGS.train_keep_prob,
                      # use_embedding=FLAGS.use_embedding,
                      embedding_size=FLAGS.embedding_size,
                      is_Training=True)
    # model.add_placeholder()
    # model.build_lstm()
    # model.build_loss()
    # model.build_optimizer()
    model.train(g, FLAGS.max_steps, model_path)

def main(_):
    model_path = os.path.join('model', FLAGS.name)
    if not os.path.exists(model_path):
        os.makedirs(model_path)
    with codecs.open(FLAGS.input_file, encoding='utf-8') as f:
        text = f.read()
    converter = TextConverter(text, FLAGS.max_vocab)
    converter.save_to_file(os.path.join(model_path, 'converter.pkl'))
    arr = converter.text_to_arr(text)
    g = batch_generator(arr, FLAGS.num_seqs, FLAGS.num_steps)
    print(converter.vocab_size)
    model = CharRNN(converter.vocab_size,
                    num_seqs=FLAGS.num_seqs,
                    num_steps=FLAGS.num_steps,
                    lstm_size=FLAGS.lstm_size,
                    num_layers=FLAGS.num_layers,
                    learning_rate=FLAGS.learning_rate,
                    train_keep_prob=FLAGS.train_keep_prob,
                    use_embedding=FLAGS.use_embedding,
                    embedding_size=FLAGS.embedding_size)
    model.train(g,
                FLAGS.max_steps,
                model_path,
                FLAGS.save_every_n,
                FLAGS.log_every_n)

def main(_):
    model_path = os.path.join('model', FLAGS.name)  # build the model directory path
    if not os.path.exists(model_path):  # create the directory if it is missing
        os.makedirs(model_path)
    with codecs.open(FLAGS.input_file, encoding='utf-8') as f:
        text = f.read()  # read the whole file as a single string
    converter = TextConverter(text, FLAGS.max_vocab)
    converter.save_to_file(os.path.join(model_path, 'converter.pkl'))
    arr = converter.text_to_arr(text)  # serialize the text into integer ids
    g = batch_generator(arr, FLAGS.num_seqs, FLAGS.num_steps)  # 100, 100
    print(converter.vocab_size)
    # Build the model; num_classes is set to the vocabulary size because the
    # task is to predict the next char.
    model = CharRNN(converter.vocab_size,
                    num_seqs=FLAGS.num_seqs,
                    num_steps=FLAGS.num_steps,
                    lstm_size=FLAGS.lstm_size,
                    num_layers=FLAGS.num_layers,
                    learning_rate=FLAGS.learning_rate,
                    train_keep_prob=FLAGS.train_keep_prob,
                    use_embedding=FLAGS.use_embedding,
                    embedding_size=FLAGS.embedding_size)
    model.train(g,  # train the model
                FLAGS.max_steps,
                model_path,
                FLAGS.save_every_n,
                FLAGS.log_every_n)

def main(_):
    model_path = os.path.join('models', Config.file_name)
    converter = TextConverter(vocab_dir='data/vocabs',
                              max_vocab=Config.vocab_size,
                              seq_length=Config.seq_length)
    print('vocab lens:', converter.vocab_size)

    # Load the last saved checkpoint.
    model = Model(Config)
    checkpoint_path = tf.train.latest_checkpoint(model_path)
    if checkpoint_path:
        model.load(checkpoint_path)

    while True:
        english_speek = input("first line: ")  # prompt was Chinese: 上联 (first line of a couplet)
        english_speek = ' '.join(english_speek)
        english_speek = english_speek.split()
        en_arr, arr_len = converter.text_en_to_arr(english_speek)

        test_g = [np.array([en_arr]), np.array([arr_len])]
        output_ids = model.test(test_g, model_path, converter)
        strs = converter.arr_to_text(output_ids)
        print('second line:', strs)  # prompt was Chinese: 下联 (matching line)

def main(_):
    model_path = os.path.join('model', FLAGS.name)
    if not os.path.exists(model_path):
        os.makedirs(model_path)
    with codecs.open(FLAGS.input_file, encoding='utf-8') as f:
        text = f.read()
    converter = TextConverter(text, FLAGS.max_vocab)
    converter.save_to_file(os.path.join(model_path, 'converter.pkl'))
    arr = converter.text_to_arr(text)
    g = batch_generator(arr, FLAGS.num_seqs, FLAGS.num_steps)
    print(converter.vocab_size)
    model = CharRNN(converter.vocab_size,
                    num_seqs=FLAGS.num_seqs,
                    num_steps=FLAGS.num_steps,
                    lstm_size=FLAGS.lstm_size,
                    num_layers=FLAGS.num_layers,
                    learning_rate=FLAGS.learning_rate,
                    train_keep_prob=FLAGS.train_keep_prob,
                    use_embedding=FLAGS.use_embedding,
                    embedding_size=FLAGS.embedding_size)
    model.train(g,
                FLAGS.max_steps,
                model_path,
                FLAGS.save_every_n,
                FLAGS.log_every_n)

def main(_):
    script_path = os.path.abspath(os.path.dirname(__file__))
    model_path = os.path.join(script_path, 'model', FLAGS.name)
    if not os.path.exists(model_path):
        os.makedirs(model_path)
    with codecs.open(FLAGS.input_file, encoding='utf-8') as f:
        text = f.read()
    print("corpus size " + str(len(text)))
    if os.path.exists(FLAGS.whitelist_file):
        with codecs.open(FLAGS.whitelist_file, encoding='utf-8') as f:
            whitelist = f.read()
        text = remove_non_matching_chars(text, whitelist)
    converter = TextConverter(text, FLAGS.max_vocab)
    converter.save_to_file(os.path.join(model_path, 'converter.pkl'))
    arr = converter.text_to_arr(text)
    g = batch_generator(arr, FLAGS.num_seqs, FLAGS.num_steps)
    model = CharRNN(converter.vocab_size,
                    num_seqs=FLAGS.num_seqs,
                    num_steps=FLAGS.num_steps,
                    lstm_size=FLAGS.lstm_size,
                    num_layers=FLAGS.num_layers,
                    learning_rate=FLAGS.learning_rate,
                    train_keep_prob=FLAGS.train_keep_prob,
                    use_embedding=FLAGS.use_embedding,
                    embedding_size=FLAGS.embedding_size)
    model.train(g,
                FLAGS.max_steps,
                model_path,
                FLAGS.save_every_n,
                FLAGS.log_every_n)

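# The training script above calls remove_non_matching_chars, which is not shown
# in this section. A minimal sketch, assuming the helper simply keeps the
# characters that appear in the whitelist:

def remove_non_matching_chars(text, whitelist):
    """Return `text` with every character absent from `whitelist` removed."""
    allowed = set(whitelist)
    return ''.join(c for c in text if c in allowed)
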
def main(_):
    model_path = os.path.join('models', FLAGS.file_name)
    if os.path.isdir(model_path):
        FLAGS.checkpoint_path = tf.train.latest_checkpoint(model_path)

    converter = TextConverter(filename=os.path.join(model_path, 'converter.pkl'))
    # QAs = converter.load_obj(filename=os.path.join(model_path, 'QAs.pkl'))
    QAs, text = load_origin_data('data/task3_dev.txt')
    testQAs_to_arrs = converter.testQAs_to_arrs(QAs, FLAGS.num_steps)
    test_samples = testQAs_to_arrs

    print('use embedding:', FLAGS.use_embedding)
    print('vocab size:', converter.vocab_size)

    from model3 import Model
    with open(model_path + '/submission.csv', 'w') as file:
        file.write('test_id,result' + '\n')

    batchsize = 1000
    for i in range(0, len(test_samples), batchsize):  # test in batches to avoid running out of memory
        print('>>>>:', i, '/', len(test_samples))
        test_g = test_samples_generator(test_samples[i:i + batchsize])
        model = Model(converter.vocab_size, FLAGS, test=False, embeddings=None)
        model.load(FLAGS.checkpoint_path)
        model.test(test_g, model_path)
    print('finished!')

def main(_):
    model_path = os.path.join('models', Config.file_name)
    vocab_file = os.path.join(model_path, 'vocab_tuples.pkl')

    # Load the test questions.
    sens_tags_test = get_sens_tags('data/test.txt')

    # Data preparation.
    converter = TextConverter(None, vocab_file, max_vocab=Config.vocab_max_size)
    print('vocab size:', converter.vocab_size)

    # Build the test samples.
    test_QA_arrs = converter.QAs_to_arr(sens_tags_test, Config.seq_length)

    # Load the last saved checkpoint.
    model = Model(Config, converter.vocab_size)
    checkpoint_path = tf.train.latest_checkpoint(model_path)
    if checkpoint_path:
        model.load(checkpoint_path)

    # Test.
    print('start to testing...')
    n = len(test_QA_arrs)
    for i in range(n):
        y_pre, y_cos = model.test(test_QA_arrs[i])
        tags = [converter.int_to_tag(id) for id in y_pre[:test_QA_arrs[i][2]]]
        print('\nword / tag / pre')
        for j in range(test_QA_arrs[i][2]):
            print("{} / {} / {}".format(sens_tags_test[i][0][j],
                                        sens_tags_test[i][1][j], tags[j]))

def main(_):
    word_char = 'word'  # 'word' or 'char'
    print('use word or char:', word_char)
    FLAGS.file_name = word_char + '_' + FLAGS.file_name
    print('model_path:', FLAGS.file_name)
    model_path = os.path.join('models', FLAGS.file_name)
    if not os.path.exists(model_path):
        os.makedirs(model_path)

    # Pick the model variant from the trailing digit of the file name.
    if FLAGS.file_name[-1] == '2':
        from model2 import Model
    elif FLAGS.file_name[-1] == '3':
        from model3 import Model
    elif FLAGS.file_name[-1] == '4':
        from model4 import Model
    elif FLAGS.file_name[-1] == '5':
        from model5 import Model
    else:
        from model1 import Model

    data_path, save_path = 'data', 'process_data1'
    converter = TextConverter(word_char, data_path, save_path, FLAGS.num_steps)
    embeddings = converter.embeddings

    if word_char == 'word':
        train_pkl, val_pkl = 'train_word.pkl', 'val_word.pkl'
    if word_char == 'char':
        train_pkl, val_pkl = 'train_char.pkl', 'val_char.pkl'

    train_samples = converter.load_obj(os.path.join(save_path, train_pkl))
    train_g = batch_generator(train_samples, FLAGS.batch_size)
    val_samples = converter.load_obj(os.path.join(save_path, val_pkl))
    val_g = val_samples_generator(val_samples)

    print('use embedding:', FLAGS.use_embedding)
    print('vocab size:', converter.vocab_size)

    model = Model(converter.vocab_size, FLAGS, test=False, embeddings=embeddings)

    # Resume training from the last checkpoint, if any.
    FLAGS.checkpoint_path = tf.train.latest_checkpoint(model_path)
    if FLAGS.checkpoint_path:
        model.load(FLAGS.checkpoint_path)

    model.train(train_g,
                FLAGS.max_steps,
                model_path,
                FLAGS.save_every_n,
                FLAGS.log_every_n,
                val_g)

def test_save_file(self):
    testConverter = TextConverter(text=[
        "We", "are", "accounted", "poor", "citizens,", "the", "patricians",
        "goodare", "accounted", "poor", "citizens,", "the", "patricians", "good"
    ], max_vocab=10)
    testConverter.save_to_file('test.pcl')

def main(_):
    converter = TextConverter(filename=FLAGS.converter_path)
    model = charRNN(converter.vocab_size, train=False)
    model.load(tf.train.latest_checkpoint(FLAGS.checkpoint_path))
    start = converter.text_to_arr(FLAGS.start_string)
    arr = model.generate(FLAGS.max_length, start, converter.vocab_size)
    print(converter.arr_to_text(arr))

def test_batch_generator(self):
    with codecs.open('data/shakespeare.txt', encoding='utf-8') as f:
        text = f.read()
    converter = TextConverter(text, 35000)
    arr = converter.text_to_arr(text)
    g = batch_generator(arr, 32, 50)
    count = 0
    for x, y in g:
        count += 1
        print(count)

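# batch_generator is used throughout these snippets but never defined here. A
# minimal single-pass sketch of the usual char-RNN batching scheme, under the
# assumption that targets are the inputs shifted left by one; training scripts
# often wrap this in a `while True` loop to keep yielding for max_steps:

import numpy as np

def batch_generator(arr, n_seqs, n_steps):
    """Yield (x, y) batches of shape [n_seqs, n_steps]."""
    arr = np.copy(arr)
    batch_size = n_seqs * n_steps
    n_batches = len(arr) // batch_size
    arr = arr[:batch_size * n_batches].reshape((n_seqs, -1))
    for n in range(0, arr.shape[1], n_steps):
        x = arr[:, n:n + n_steps]
        y = np.zeros_like(x)
        y[:, :-1], y[:, -1] = x[:, 1:], x[:, 0]  # y is x rolled left by one step
        yield x, y
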
def model_built(self):
    # (vocab_size, sampling, lstm_size, num_layers, use_embedding, embedding_size)
    FLAGS.start_string = FLAGS.start_string.decode('utf-8')  # Python 2: flag value is a byte string
    self.converter = TextConverter(filename=FLAGS.converter_path)
    if os.path.isdir(FLAGS.checkpoint_path):
        FLAGS.checkpoint_path = tf.train.latest_checkpoint(FLAGS.checkpoint_path)
    self.tfmodel = CharRNN(self.converter.vocab_size, sampling=True,
                           lstm_size=FLAGS.lstm_size,
                           num_layers=FLAGS.num_layers,
                           use_embedding=FLAGS.use_embedding,
                           embedding_size=FLAGS.embedding_size)
    self.tfmodel.load(FLAGS.checkpoint_path)

def test_vocab_size(self):
    testConverter = TextConverter(text=[
        "We", "are", "accounted", "poor", "citizens,", "the", "patricians",
        "goodare", "accounted", "poor", "citizens,", "the", "patricians", "good"
    ], max_vocab=10)
    print(testConverter.vocab_size)
    print(testConverter.int_to_word(4))
    print(testConverter.text_to_arr(['the']))
    print(testConverter.arr_to_text([3, 4]))

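# The two tests above pin down the TextConverter interface without showing the
# class. A minimal sketch consistent with them; the vocabulary policy (keep the
# max_vocab most frequent tokens, reserve the last id for unknown tokens) is an
# assumption, not the repository's actual implementation:

import pickle
from collections import Counter

class TextConverter(object):
    def __init__(self, text=None, max_vocab=5000):
        counts = Counter(text)
        self.vocab = [w for w, _ in counts.most_common(max_vocab)]
        self.word_to_int_table = {w: i for i, w in enumerate(self.vocab)}

    @property
    def vocab_size(self):
        return len(self.vocab) + 1  # +1 for the unknown-token id

    def word_to_int(self, word):
        return self.word_to_int_table.get(word, len(self.vocab))

    def int_to_word(self, index):
        return self.vocab[index] if index < len(self.vocab) else '<unk>'

    def text_to_arr(self, text):
        return [self.word_to_int(w) for w in text]

    def arr_to_text(self, arr):
        return ' '.join(self.int_to_word(i) for i in arr)

    def save_to_file(self, filename):
        with open(filename, 'wb') as f:
            pickle.dump(self.vocab, f)
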
def main(_):
    # FLAGS.start_string = FLAGS.start_string  # .decode('utf-8')
    word_char = 'word'  # 'word' or 'char'
    print('use word or char:', word_char)
    FLAGS.file_name = word_char + '_' + FLAGS.file_name
    print('model_path:', FLAGS.file_name)
    model_path = os.path.join('models', FLAGS.file_name)
    if os.path.isdir(model_path):
        FLAGS.checkpoint_path = tf.train.latest_checkpoint(model_path)

    # Pick the model variant from the trailing digit of the file name.
    if FLAGS.file_name[-1] == '2':
        from model2 import Model
    elif FLAGS.file_name[-1] == '3':
        from model3 import Model
    elif FLAGS.file_name[-1] == '4':
        from model4 import Model
    elif FLAGS.file_name[-1] == '5':
        from model5 import Model
    else:
        from model1 import Model

    data_path, save_path = 'data', 'process_data1'
    converter = TextConverter(word_char, data_path, save_path, FLAGS.num_steps)
    embeddings = converter.embeddings

    if word_char == 'word':
        test_pkl = 'test_word.pkl'
    if word_char == 'char':
        test_pkl = 'test_char.pkl'
    test_samples = converter.load_obj(os.path.join(save_path, test_pkl))

    print('use embedding:', FLAGS.use_embedding)
    print('vocab size:', converter.vocab_size)

    with open(model_path + '/submission.csv', 'w') as file:
        file.write(str('y_pre') + '\n')

    for i in range(0, len(test_samples), 5000):  # test in batches to avoid running out of memory
        print('>>>>:', i, '/', len(test_samples))
        test_g = test_samples_generator(test_samples[i:i + 5000])
        model = Model(converter.vocab_size, FLAGS, test=False, embeddings=embeddings)
        model.load(FLAGS.checkpoint_path)
        model.test(test_g, model_path)
    print('finished!')

def main(_):
    model_path = os.path.join('model', 'en')
    if not os.path.exists(model_path):
        os.makedirs(model_path)
    with open("data/shakespeare.txt") as f:
        text = f.read()
    print("=====>", len(text))
    converter = TextConverter(text)
    converter.save(os.path.join(model_path, "converter.pkl"))
    arr = converter.text_to_arr(text)
    g = batch_generator(arr, batch_size, seq_len, converter=None)
    model = charRNN(converter.vocab_size)
    model.train(g, model_path)

def main(_):
    converter = TextConverter(filename=FLAGS.converter_path)
    if os.path.isdir(FLAGS.checkpoint_path):
        FLAGS.checkpoint_path = tf.train.latest_checkpoint(FLAGS.checkpoint_path)
    model = CharRNN(converter.vocab_size, sampling=True,
                    lstm_size=FLAGS.lstm_size,
                    num_layers=FLAGS.num_layers,
                    use_embedding=FLAGS.use_embedding,
                    embedding_size=FLAGS.embedding_size)
    model.load(FLAGS.checkpoint_path)
    start = converter.text_to_arr(FLAGS.start_string)
    arr = model.sample(FLAGS.max_length, start, converter.vocab_size)
    print(converter.arr_to_text(arr))

def sample():
    with tf.Session() as sess:
        model_path = os.path.join(FLAGS.train_dir, FLAGS.model_name)
        converter = TextConverter(None, FLAGS.max_vocab_size,
                                  os.path.join(model_path, 'converter.pkl'))
        model = create_model(sess, converter.vocab_size, True, model_path)
        sys.stdout.write("> ")
        sys.stdout.flush()
        start_str = sys.stdin.readline().decode('utf-8')
        while start_str:
            start = converter.text_to_arr(start_str)
            samples = [c for c in start]
            initial_state = sess.run(model.initial_state)
            x = np.zeros((1, 1))
            # Feed the prompt through the network one id at a time to warm up
            # the RNN state.
            for c in start:
                x[0, 0] = c
                feed = {model.inputs: x, model.initial_state: initial_state}
                preds, final_state = sess.run(
                    [model.proba_prediction, model.final_state], feed_dict=feed)
                initial_state = final_state
            c = pick_top_n(preds, converter.vocab_size)
            while c == converter.vocab_size - 1:  # resample if we drew the last id (likely <unk>)
                c = pick_top_n(preds, converter.vocab_size)
            samples.append(c)
            # Generate sample_length more ids, feeding each one back in.
            for i in range(FLAGS.sample_length):
                x[0, 0] = c
                feed = {model.inputs: x, model.initial_state: initial_state}
                preds, final_state = sess.run(
                    [model.proba_prediction, model.final_state], feed_dict=feed)
                initial_state = final_state
                c = pick_top_n(preds, converter.vocab_size)
                while c == converter.vocab_size - 1:
                    c = pick_top_n(preds, converter.vocab_size)
                samples.append(c)
            print(converter.arr_to_text(np.array(samples)))
            sys.stdout.write("> ")
            sys.stdout.flush()
            start_str = sys.stdin.readline().decode('utf-8')

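# pick_top_n is assumed by sample() above but not defined in this section. The
# conventional implementation zeroes out everything outside the n most probable
# ids and samples from the renormalized distribution. A sketch, assuming preds
# is a softmax output of shape [1, vocab_size]:

import numpy as np

def pick_top_n(preds, vocab_size, top_n=5):
    """Sample one id from the top_n most probable entries of preds."""
    p = np.squeeze(preds).astype(np.float64)
    p[np.argsort(p)[:-top_n]] = 0  # keep only the top_n probabilities
    p = p / np.sum(p)              # renormalize to a distribution
    return int(np.random.choice(vocab_size, 1, p=p)[0])
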
def main(_):
    FLAGS.start_string = FLAGS.start_string.decode('utf-8')
    converter = TextConverter(filename=FLAGS.converter_path)
    if os.path.isdir(FLAGS.checkpoint_path):
        FLAGS.checkpoint_path = tf.train.latest_checkpoint(FLAGS.checkpoint_path)
    model = CharRNN(converter.vocab_size, sampling=True,
                    lstm_size=FLAGS.lstm_size,
                    num_layers=FLAGS.num_layers,
                    use_embedding=FLAGS.use_embedding,
                    embedding_size=FLAGS.embedding_size)
    model.load(FLAGS.checkpoint_path)
    start = converter.text_to_arr(FLAGS.start_string)
    arr = model.sample(FLAGS.max_length, start, converter.vocab_size)
    print(converter.arr_to_text(arr))

def main(_):
    model_path = os.path.join('model', FLAGS.name)
    print(model_path)
    if not os.path.exists(model_path):
        os.makedirs(model_path)
        path_exist = False
    else:
        path_exist = True

    with codecs.open(FLAGS.input_file, encoding='utf-8') as f:
        text = f.read()
    converter = TextConverter(text, FLAGS.max_vocab)
    converter.save_to_file(os.path.join(model_path, 'converter.pkl'))
    arr = converter.text_to_arr(text)
    g = batch_generator(arr, FLAGS.num_seqs, FLAGS.num_steps)
    print(converter.vocab_size)

    model = CharRNN(converter.vocab_size,
                    num_seqs=FLAGS.num_seqs,
                    num_steps=FLAGS.num_steps,
                    lstm_size=FLAGS.lstm_size,
                    num_layers=FLAGS.num_layers,
                    learning_rate=FLAGS.learning_rate,
                    train_keep_prob=FLAGS.train_keep_prob,
                    use_embedding=FLAGS.use_embedding,
                    embedding_size=FLAGS.embedding_size)

    model_file_path = tf.train.latest_checkpoint(model_path)
    if path_exist:
        # Resume from the newest checkpoint and recover the global step from
        # the checkpoint file names (e.g. "model-12000.index").
        model.load(model_file_path)
        indexes = []
        for dirpath, dirnames, filenames in os.walk(model_path):
            for name in filenames:
                filepath = os.path.join(dirpath, name)
                if filepath.endswith(".index"):
                    indexes.append(int(name[6:-6]))
        indexes.sort()
        last_index = indexes[-1]
        model.step = last_index

    model.train(g,
                FLAGS.max_steps,
                model_path,
                FLAGS.save_every_n,
                FLAGS.log_every_n)

def generate():
    tf.compat.v1.disable_eager_execution()
    converter = TextConverter(filename=FLAGS.converter_path)
    if os.path.isdir(FLAGS.checkpoint_path):
        FLAGS.checkpoint_path = tf.train.latest_checkpoint(FLAGS.checkpoint_path)
    model = CharRNN(converter.vocab_size, sampling=True,
                    lstm_size=FLAGS.lstm_size,
                    num_layers=FLAGS.num_layers,
                    use_embedding=FLAGS.use_embedding,
                    embedding_size=FLAGS.embedding_size)
    model.load(FLAGS.checkpoint_path)
    start = converter.text_to_arr(FLAGS.start_string)
    arr = model.sample(FLAGS.max_length, start, converter.vocab_size)
    return converter.arr_to_text(arr)

def main(_):
    FLAGS.start_string = FLAGS.start_string.decode('utf-8')
    converter = TextConverter(filename=FLAGS.converter_path)  # build the text converter
    if os.path.isdir(FLAGS.checkpoint_path):
        FLAGS.checkpoint_path = tf.train.latest_checkpoint(FLAGS.checkpoint_path)  # pick the newest checkpoint
    model = CharRNN(converter.vocab_size, sampling=True,
                    lstm_size=FLAGS.lstm_size,
                    num_layers=FLAGS.num_layers,
                    use_embedding=FLAGS.use_embedding,
                    embedding_size=FLAGS.embedding_size)
    model.load(FLAGS.checkpoint_path)  # load the model
    start = converter.text_to_arr(FLAGS.start_string)  # convert the input text to ids
    arr = model.sample(FLAGS.max_length, start, converter.vocab_size)  # generated sequence
    print(converter.arr_to_text(arr))

def main(_):
    model_path = os.path.join('models', FLAGS.file_name)
    if not os.path.exists(model_path):
        os.makedirs(model_path)

    converter_pkl = os.path.join(model_path, 'converter.pkl')
    qas_pkl = os.path.join(model_path, 'QAs.pkl')
    # The original condition `exists(a) or exists(b) is False` bound `is False`
    # to the second call only; the intent (per the message below) is to rebuild
    # whenever either file is missing.
    if not (os.path.exists(converter_pkl) and os.path.exists(qas_pkl)):
        print('vocab files not found, creating...')
        QAs, text = load_origin_data('data/task3_train.txt')
        converter = TextConverter(text, 5000)
        converter.save_to_file(converter.vocab, converter_pkl)
        converter.save_to_file(QAs, qas_pkl)
    else:
        converter = TextConverter(filename=converter_pkl)
        QAs = converter.load_obj(filename=qas_pkl)

    QA_arrs = converter.QAs_to_arrs(QAs, FLAGS.num_steps)
    thres = int(len(QA_arrs) * 0.9)
    train_samples = QA_arrs[:thres]
    val_samples = QA_arrs[thres:]
    train_g = batch_generator(train_samples, FLAGS.batch_size)
    val_g = val_samples_generator(val_samples)

    print('use embedding:', FLAGS.use_embedding)
    print('vocab size:', converter.vocab_size)

    from model3 import Model
    model = Model(converter.vocab_size, FLAGS, test=False, embeddings=None)

    # Resume training from the last checkpoint, if any.
    FLAGS.checkpoint_path = tf.train.latest_checkpoint(model_path)
    if FLAGS.checkpoint_path:
        model.load(FLAGS.checkpoint_path)

    model.train(train_g, FLAGS.max_steps, model_path, FLAGS.save_every_n,
                FLAGS.log_every_n, val_g)

def main(_):
    model_path = os.path.join('models', Config.file_name)

    et = TextConverter(text=None, save_dir='models/en_vocab.pkl',
                       max_vocab=Config.en_vocab_size,
                       seq_length=Config.seq_length)
    zt = TextConverter(text=None, save_dir='models/zh_vocab.pkl',
                       max_vocab=Config.zh_vocab_size,
                       seq_length=Config.seq_length + 1)
    # +1 because the decoder sequence is split into input=[:-1] and label=[1:]
    print('english vocab lens:', et.vocab_size)
    print('chinese vocab lens:', zt.vocab_size)

    # Load the last saved checkpoint.
    model = Model(Config)
    checkpoint_path = tf.train.latest_checkpoint(model_path)
    if checkpoint_path:
        model.load(checkpoint_path)

    while True:
        # english_speek = 'what can i help you ?'
        # print('english:', english_speek)
        english_speek = input("english:")
        english_speek = english_speek.split()
        en_arr, arr_len = et.text_to_arr(english_speek)

        test_g = [np.array([en_arr]), np.array([arr_len])]
        output_ids = model.test(test_g, model_path, zt)
        strs = zt.arr_to_text(output_ids)
        print('chinese:', strs)

def main(_):
    model_path = os.path.join('model', Config.file_name)
    if not os.path.exists(model_path):
        os.makedirs(model_path)

    et = TextConverter(text=None, save_dir='model/en_vocab.pkl',
                       max_vocab=Config.en_vocab_size,
                       seq_length=Config.seq_length)
    zt = TextConverter(text=None, save_dir='model/zh_vocab.pkl',
                       max_vocab=Config.zh_vocab_size,
                       seq_length=Config.seq_length + 1)
    # +1 because the decoder sequence is split into input=[:-1] and label=[1:]
    print('english vocab lens:', et.vocab_size)
    print('chinese vocab lens:', zt.vocab_size)

    en_arrs = et.get_en_arrs('data/train.tags.data.en_clear')
    zh_arrs = zt.get_en_arrs('data/train.tags.data.zh_clear')
    train_g = batch_generator(en_arrs, zh_arrs, Config.batch_size)

    # Load the last saved checkpoint.
    model = Model(Config)
    checkpoint_path = tf.train.latest_checkpoint(model_path)
    if checkpoint_path:
        model.load(checkpoint_path)

    print('start to training...')
    model.train(train_g, model_path)

def main(_):
    model_path = os.path.join('models', Config.file_name)
    input_file = 'data/去除2和null.xlsx'  # source workbook (file name kept as-is)
    vocab_file = os.path.join(model_path, 'vocab_label.pkl')

    # Data preparation.
    converter = TextConverter(None, vocab_file,
                              max_vocab=Config.vocab_max_size,
                              seq_length=Config.seq_length)
    print('vocab size:', converter.vocab_size)

    # Load the last saved checkpoint.
    model = Model(Config, converter.vocab_size)
    checkpoint_path = tf.train.latest_checkpoint(model_path)
    if checkpoint_path:
        model.load(checkpoint_path)

    # Build the test library.
    # test_libs = get_excel_libs('data/tianlong_libs.xlsx')  # use the whole library (30k+ entries)
    QAs = get_excel_QAs(input_file)
    thres = int(0.8 * len(QAs))
    test_QAs = QAs[thres:]
    test_libs = [r for q, r, y in test_QAs]  # use the QA responses as the library
    test_libs_arrs = converter.libs_to_arrs(test_libs)

    # Precompute (or reload) the response matrix for the library.
    save_file = checkpoint_path + '_matul_state_QAs.pkl'
    if not os.path.exists(save_file):
        response_matul_state = model.test_to_matul(test_libs_arrs)
        with open(save_file, 'wb') as f:
            pickle.dump(response_matul_state, f)
    else:
        with open(save_file, 'rb') as f:
            response_matul_state = pickle.load(f)

    # Test.
    print('start to testing...')
    QAY = []
    k, n = 0, 0
    for query, y_response, label in test_QAs:
        input_arr, input_len = converter.text_to_arr(query)
        indexs = model.test(input_arr, input_len, response_matul_state)
        responses = converter.index_to_response(indexs, test_libs)
        QAY.append((query, y_response, responses))
        if responses[0] == y_response:
            k += 1
            print(k, '/', n)
        n += 1
    print('accuracy:', k / float(n))

    result_xls = checkpoint_path + '_Q_for_QAs.xls'
    converter.save_to_excel(QAY, result_xls)

class Dianpin(Singleton):
    def __init__(self):
        self.text = ''
        self.tfmodel = None
        self.converter = None

    def model_built(self):
        # (vocab_size, sampling, lstm_size, num_layers, use_embedding, embedding_size)
        FLAGS.start_string = FLAGS.start_string.decode('utf-8')
        self.converter = TextConverter(filename=FLAGS.converter_path)
        if os.path.isdir(FLAGS.checkpoint_path):
            FLAGS.checkpoint_path = tf.train.latest_checkpoint(FLAGS.checkpoint_path)
        self.tfmodel = CharRNN(self.converter.vocab_size, sampling=True,
                               lstm_size=FLAGS.lstm_size,
                               num_layers=FLAGS.num_layers,
                               use_embedding=FLAGS.use_embedding,
                               embedding_size=FLAGS.embedding_size)
        self.tfmodel.load(FLAGS.checkpoint_path)

    def final_predict(self):
        start = self.converter.text_to_arr(FLAGS.start_string)
        arr = self.tfmodel.sample(FLAGS.max_length, start, self.converter.vocab_size)
        return self.converter.arr_to_text(arr)

def main(_):
    model_path = os.path.join('models', Config.file_name)
    if not os.path.exists(model_path):
        os.makedirs(model_path)

    converter = TextConverter(vocab_dir='data/vocabs',
                              max_vocab=Config.vocab_size,
                              seq_length=Config.seq_length)
    print('vocab lens:', converter.vocab_size)

    en_arrs = converter.get_en_arrs('data/train/in.txt')
    de_arrs = converter.get_de_arrs('data/train/out.txt')
    train_g = batch_generator(en_arrs, de_arrs, Config.batch_size)

    # Load the last saved checkpoint.
    model = Model(Config)
    checkpoint_path = tf.train.latest_checkpoint(model_path)
    if checkpoint_path:
        model.load(checkpoint_path)

    print('start to training...')
    model.train(train_g, model_path)