def run_training():
    poems_vector, word_to_int, vocabularies = process_poems(FLAGS.file_path)
    batch_inputs, batch_outputs = generate_batch(FLAGS.batch_size, poems_vector, word_to_int)

    input_data = tf.placeholder(tf.int32, [FLAGS.batch_size, None])
    output_targets = tf.placeholder(tf.int32, [FLAGS.batch_size, None])

    end_points = rnn_model(model='lstm', input_data=input_data, output_data=output_targets,
                           vocab_size=len(vocabularies), rnn_size=128, num_layers=2,
                           batch_size=FLAGS.batch_size,  # must match the placeholder batch dimension
                           learning_rate=0.01)
def run_training():
    if not os.path.exists(os.path.dirname(FLAGS.checkpoints_dir)):
        os.mkdir(os.path.dirname(FLAGS.checkpoints_dir))
    if not os.path.exists(FLAGS.checkpoints_dir):
        os.mkdir(FLAGS.checkpoints_dir)

    poems_vector, word_to_int, vocabularies = process_poems(FLAGS.file_path)
    batches_inputs, batches_outputs = generate_batch(FLAGS.batch_size, poems_vector, word_to_int)

    input_data = tf.placeholder(tf.int32, [FLAGS.batch_size, None])
    output_targets = tf.placeholder(tf.int32, [FLAGS.batch_size, None])

    end_points = rnn_model(model='lstm', input_data=input_data, output_data=output_targets,
                           vocab_size=len(vocabularies), rnn_size=128, num_layers=2,
                           batch_size=FLAGS.batch_size,  # must match the placeholder batch dimension
                           learning_rate=FLAGS.learning_rate)

    saver = tf.train.Saver(tf.global_variables())
    init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
    with tf.Session() as sess:
        # sess = tf_debug.LocalCLIDebugWrapperSession(sess=sess)
        # sess.add_tensor_filter("has_inf_or_nan", tf_debug.has_inf_or_nan)
        sess.run(init_op)

        start_epoch = 0
        checkpoint = tf.train.latest_checkpoint(FLAGS.checkpoints_dir)
        if checkpoint:
            saver.restore(sess, checkpoint)
            print("[INFO] restore from the checkpoint {0}".format(checkpoint))
            start_epoch += int(checkpoint.split('-')[-1])
        print('[INFO] start training...')
        try:
            for epoch in range(start_epoch, FLAGS.epochs):
                n = 0
                n_chunk = len(poems_vector) // FLAGS.batch_size
                for batch in range(n_chunk):
                    loss, _, _ = sess.run(
                        [end_points['total_loss'],
                         end_points['last_state'],
                         end_points['train_op']],
                        feed_dict={input_data: batches_inputs[n],
                                   output_targets: batches_outputs[n]})
                    n += 1
                    print('[INFO] Epoch: %d, batch: %d, training loss: %.6f' % (epoch, batch, loss))
                if epoch % 6 == 0:
                    saver.save(sess, os.path.join(FLAGS.checkpoints_dir, FLAGS.model_prefix), global_step=epoch)
        except KeyboardInterrupt:
            print('[INFO] Interrupted manually, trying to save a checkpoint now...')
            saver.save(sess, os.path.join(FLAGS.checkpoints_dir, FLAGS.model_prefix), global_step=epoch)
            print('[INFO] Last epoch was saved; next run will start from epoch {}.'.format(epoch))
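# All of these variants assume a TF1-style FLAGS object defined at module
# level. A minimal sketch of the definitions they rely on follows; the flag
# names mirror the FLAGS.* attributes used in the code, but the default
# values and file paths here are assumptions, not taken from the original.
import os
import tensorflow as tf

tf.app.flags.DEFINE_integer('batch_size', 64, 'batch size for training')
tf.app.flags.DEFINE_float('learning_rate', 0.01, 'learning rate')
tf.app.flags.DEFINE_string('checkpoints_dir', './checkpoints/', 'checkpoint save path (assumed default)')
tf.app.flags.DEFINE_string('file_path', './data/poems.txt', 'path of the poem corpus (assumed default)')
tf.app.flags.DEFINE_string('model_prefix', 'poems', 'checkpoint file name prefix (assumed default)')
tf.app.flags.DEFINE_integer('epochs', 50, 'number of training epochs (assumed default)')
FLAGS = tf.app.flags.FLAGS


def main(_):
    run_training()


if __name__ == '__main__':
    tf.app.run()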
def run_training():
    # Preprocessing: convert each poem into a vector of integers,
    # returning the character-to-integer map and the vocabulary.
    poems_vector, word_to_int, vocabularies = process_poems(FLAGS.file_path)
    batch_inputs, batch_outputs = generate_batch(FLAGS.batch_size, poems_vector, word_to_int)

    # Write the inputs as placeholders; the second dimension is None because
    # the sequence length varies. The targets feed the cross-entropy loss later.
    input_data = tf.placeholder(tf.int32, [FLAGS.batch_size, None])
    output_targets = tf.placeholder(tf.int32, [FLAGS.batch_size, None])

    # Define the model. LSTM gives better results here; rnn_size is the
    # number of units in each hidden RNN layer.
    end_points = rnn_model(model='lstm', input_data=input_data, output_data=output_targets,
                           vocab_size=len(vocabularies), rnn_size=32, num_layers=2,
                           batch_size=FLAGS.batch_size,  # must match the placeholder batch dimension
                           learning_rate=0.01)
def run_training():
    # Create the checkpoint directory (and its parent) if missing.
    if not os.path.exists(os.path.dirname(FLAGS.checkpoints_dir)):
        os.mkdir(os.path.dirname(FLAGS.checkpoints_dir))
    if not os.path.exists(FLAGS.checkpoints_dir):
        os.mkdir(FLAGS.checkpoints_dir)

    # Preprocess the corpus: returns the poems converted to integer vectors,
    # the character-to-integer map, and the character set.
    poems_vector, word_to_int, vocabularies = process_poems(FLAGS.file_path)
    # Split the integer-encoded poems into batched inputs and targets.
    batches_inputs, batches_outputs = generate_batch(FLAGS.batch_size, poems_vector, word_to_int)

    # Input and target placeholders.
    input_data = tf.placeholder(tf.int32, [FLAGS.batch_size, None])
    output_targets = tf.placeholder(tf.int32, [FLAGS.batch_size, None])

    end_points = rnn_model(model='lstm', input_data=input_data, output_data=output_targets,
                           vocab_size=len(vocabularies), rnn_size=128, num_layers=2,
                           batch_size=FLAGS.batch_size, learning_rate=FLAGS.learning_rate)

    # Saver for writing checkpoints.
    saver = tf.train.Saver(tf.global_variables())
    init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
    with tf.Session() as sess:
        sess.run(init_op)

        start_epoch = 0
        checkpoint = tf.train.latest_checkpoint(FLAGS.checkpoints_dir)
        if checkpoint:
            saver.restore(sess, checkpoint)
            print("[INFO] restore from the checkpoint {0}".format(checkpoint))
            start_epoch += int(checkpoint.split('-')[-1])
        print('[INFO] start training...')
        try:
            for epoch in range(start_epoch, FLAGS.epochs):
                n = 0
                n_chunk = len(poems_vector) // FLAGS.batch_size
                for batch in range(n_chunk):
                    loss, _, _ = sess.run(
                        [end_points['total_loss'],
                         end_points['last_state'],
                         end_points['train_op']],
                        feed_dict={input_data: batches_inputs[n],
                                   output_targets: batches_outputs[n]})
                    n += 1
                    print('[INFO] Epoch: %d, batch: %d, training loss: %.6f' % (epoch, batch, loss))
                if epoch % 6 == 0:
                    # Save under checkpoints_dir so latest_checkpoint can
                    # find it on restart (the original saved to './model/').
                    saver.save(sess, os.path.join(FLAGS.checkpoints_dir, FLAGS.model_prefix), global_step=epoch)
        except KeyboardInterrupt:
            print('[INFO] Interrupted manually, trying to save a checkpoint now...')
            saver.save(sess, os.path.join(FLAGS.checkpoints_dir, FLAGS.model_prefix), global_step=epoch)
            print('[INFO] Last epoch was saved; next run will start from epoch {}.'.format(epoch))
def run_training():
    if not os.path.exists(os.path.dirname(FLAGS.checkpoints_dir)):
        os.mkdir(os.path.dirname(FLAGS.checkpoints_dir))
    if not os.path.exists(FLAGS.checkpoints_dir):
        os.mkdir(FLAGS.checkpoints_dir)

    poems_vector, word_to_int, vocabularies = process_poems(FLAGS.file_path)
    batches_inputs, batches_outputs = generate_batch(FLAGS.batch_size, poems_vector, word_to_int)

    input_data = tf.placeholder(tf.int32, [FLAGS.batch_size, None])
    output_targets = tf.placeholder(tf.int32, [FLAGS.batch_size, None])

    end_points = rnn_model(model='lstm', input_data=input_data, output_data=output_targets,
                           vocab_size=len(vocabularies), rnn_size=128, num_layers=2,
                           batch_size=FLAGS.batch_size,  # must match the placeholder batch dimension
                           learning_rate=FLAGS.learning_rate)

    saver = tf.train.Saver()
    '''
    tf.group(op1, op2): bundles the operations op1 and op2 into a single op.
    For example:
        generator_train_op = tf.train.AdamOptimizer(...).minimize(g_loss)
        discriminator_train_op = tf.train.AdamOptimizer(...).minimize(d_loss)
        train_ops = tf.group(generator_train_op, discriminator_train_op)
        with tf.Session() as sess:
            sess.run(train_ops)
    Running train_ops runs both generator_train_op and discriminator_train_op.
    Note that tf.group() returns an operation, not a value, so running it
    yields None; if you want the resulting values back, use tf.tuple().

    Global variables are the variables in tf.GraphKeys.GLOBAL_VARIABLES,
    the default collection when a variable is created with tf.Variable.
    Local variables are the variables in tf.GraphKeys.LOCAL_VARIABLES, e.g.:
        a = tf.Variable(1, name="a", collections=[tf.GraphKeys.LOCAL_VARIABLES])
    Note: local variables are not stored in the model file by the Saver.
    '''
    init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
    with tf.Session() as sess:
        sess.run(init_op)

        start_epoch = 0
        checkpoint = tf.train.latest_checkpoint(FLAGS.checkpoints_dir)
        if checkpoint:
            saver.restore(sess, checkpoint)
            print('[INFO] restore from the checkpoint %s' % checkpoint)
            start_epoch += int(checkpoint.split('-')[-1])
        print('[INFO] start training...')
        try:
            for epoch in range(start_epoch, FLAGS.epochs):
                n = 0
                n_chunk = len(poems_vector) // FLAGS.batch_size
                for batch in range(n_chunk):
                    loss, _, _ = sess.run(
                        [end_points['total_loss'],
                         end_points['last_state'],
                         end_points['train_op']],
                        feed_dict={input_data: batches_inputs[n],
                                   output_targets: batches_outputs[n]})
                    n += 1
                # Save after every epoch, under checkpoints_dir so that
                # latest_checkpoint can find it on restart.
                saver.save(sess, os.path.join(FLAGS.checkpoints_dir, FLAGS.model_prefix), global_step=epoch)
        except KeyboardInterrupt:
            print('[INFO] Interrupted manually, trying to save a checkpoint now...')
            saver.save(sess, os.path.join(FLAGS.checkpoints_dir, FLAGS.model_prefix), global_step=epoch)
            print('[INFO] Last epoch was saved; next run will start from epoch %d' % epoch)
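# A small runnable sketch of the tf.group() behavior described in the
# docstring above: running the grouped op executes both assignments but
# returns None, while tf.tuple() also returns the resulting values.
import tensorflow as tf

a = tf.Variable(1)
b = tf.Variable(2)
inc_a = tf.assign_add(a, 1)
inc_b = tf.assign_add(b, 1)
both = tf.group(inc_a, inc_b)  # an Operation, not a tensor

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(both))                      # None: tf.group yields no value
    print(sess.run(tf.tuple([inc_a, inc_b])))  # [3, 4]: tf.tuple returns values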
def run_training():
    # Check the checkpoint directory and its parent; create them if missing.
    if not os.path.exists(os.path.dirname(FLAGS.checkpoints_dir)):
        os.mkdir(os.path.dirname(FLAGS.checkpoints_dir))
    if not os.path.exists(FLAGS.checkpoints_dir):
        os.mkdir(FLAGS.checkpoints_dir)

    # Read the poem corpus, obtaining in turn the poems as sequences of
    # numeric IDs, the character-to-ID map, and the list of all characters.
    poems_vector, word_to_int, vocabularies = process_poems(FLAGS.file_path)
    # Read the input and target data in batches.
    batches_inputs, batches_outputs = generate_batch(FLAGS.batch_size, poems_vector, word_to_int)

    # Declare the input and target placeholders.
    input_data = tf.placeholder(tf.int32, [FLAGS.batch_size, None])
    output_targets = tf.placeholder(tf.int32, [FLAGS.batch_size, None])

    # Build the RNN model and get back its end points (loss, state, train op).
    end_points = rnn_model(model='lstm', input_data=input_data, output_data=output_targets,
                           vocab_size=len(vocabularies), rnn_size=128, num_layers=2,
                           batch_size=FLAGS.batch_size,  # must match the placeholder batch dimension
                           learning_rate=FLAGS.learning_rate)

    # Initialize the saver and the session.
    saver = tf.train.Saver(tf.global_variables())
    init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
    with tf.Session() as sess:
        sess.run(init_op)

        start_epoch = 0
        # Load the parameters from the last checkpoint, if one exists.
        checkpoint = tf.train.latest_checkpoint(FLAGS.checkpoints_dir)
        if checkpoint:
            saver.restore(sess, checkpoint)
            print("[INFO] restore from the checkpoint {0}".format(checkpoint))
            # Extract the epoch stored in the checkpoint name and resume
            # training from that epoch.
            start_epoch += int(checkpoint.split('-')[-1])

        # Start training.
        print('[INFO] start training...')
        try:
            for epoch in range(start_epoch, FLAGS.epochs):
                n = 0
                # Number of batches needed to cover one epoch; any remainder
                # at the end is dropped.
                n_chunk = len(poems_vector) // FLAGS.batch_size
                for batch in range(n_chunk):
                    # Run one training step and compute the loss.
                    # batches_inputs[n]: input data of the n-th batch
                    # batches_outputs[n]: target data of the n-th batch
                    loss, _, _ = sess.run(
                        [end_points['total_loss'],
                         end_points['last_state'],
                         end_points['train_op']],
                        feed_dict={input_data: batches_inputs[n],
                                   output_targets: batches_outputs[n]})
                    n += 1
                    print('[INFO] Epoch: %d, batch: %d, training loss: %.6f' % (epoch, batch, loss))
                # Save the model every 6 epochs.
                if epoch % 6 == 0:
                    saver.save(sess, os.path.join(FLAGS.checkpoints_dir, FLAGS.model_prefix), global_step=epoch)
        except KeyboardInterrupt:
            # On manual interrupt, try to save the current parameters.
            print('[INFO] Interrupted manually, trying to save a checkpoint now...')
            saver.save(sess, os.path.join(FLAGS.checkpoints_dir, FLAGS.model_prefix), global_step=epoch)
            print('[INFO] Last epoch was saved; next run will start from epoch {}.'.format(epoch))
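# Why checkpoint.split('-')[-1] recovers the epoch: saver.save(..., global_step=epoch)
# appends "-<global_step>" to the checkpoint prefix, so a hypothetical path
# (the file name below is illustrative, not from the original) looks like:
ckpt = './checkpoints/poems-12'          # "<prefix>-<global_step>"
start_epoch = int(ckpt.split('-')[-1])   # -> 12: resume from epoch 12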
def run_training(): """ 模型训练 :return: None """ #调用process_poems方法预处理数据 并获取唐诗向量数据,汉字映射表,以及词汇表 poems_vector, word_to_int, vocabularies = process_poems(file_path) #调用generate_batch方法获取批处理特征值和目标值 batches_inputs, batches_outputs = generate_batch(batch_size, poems_vector, word_to_int) #batches_inputs和batches_outputs均为505块64行length列 length值不定 #定义批处理的输入数据和输出目标数据的占位tensor 形状为 [64, ?] length长度为不定值 input_data = tf.placeholder(tf.int32, [batch_size, None]) #[64, ?] output_targets = tf.placeholder(tf.int32, [batch_size, None]) #[64, ?] #调用模型返回训练数据 end_points = rnn_model(model='rnn', input_data=input_data, output_data=output_targets, vocab_size=len(vocabularies), rnn_size=128, num_layers=2, batch_size=batch_size, learning_rate=learning_rate) #实例化一个模型保存对象供后续保存模型使用 saver = tf.train.Saver(tf.global_variables()) #创建初始化组合操作op 用于初始化全局变量和局部变量 init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer()) #创建tf会话运行op with tf.Session() as sess: #初始化变量 sess.run(init_op) #定义训练代数初始为第0代 start_epoch = 0 #打印提示训练开始 print('[INFO] 开始训练...') try: #保存训练损失信息 后续写入mongodb数据库 train_losses = [] #循环epoch代进行训练 for epoch in range(start_epoch, epochs): n = 0 #用于保存每个epoch下第几次训练的序号 #计算训练一共有多少个数据块 每64首唐诗一个数据块 n_chunk实质为540,即全部数据包含540个批次 n_chunk = len(poems_vector) // batch_size #当前epoch下进行540次循环训练,每次训练64首唐诗 for batch in range(n_chunk): #运行调用模型返回的op并且传入当前批次的特征值和目标值,即之前定义好的占位张量 #这里只接收返回的损失函数,训练状态以及最小化损失op只运行,无需保存返回值 loss, _, _ = sess.run( [ end_points['total_loss'], end_points['last_state'], end_points['train_op'] ], feed_dict={ input_data: batches_inputs[n], output_targets: batches_outputs[n] }) n += 1 #训练批次序号+1 #打印提示当前epoch序号,batch序号, 当前交叉熵损失 print('[INFO] epoch序号: %d , batch序号: %d , 当前交叉熵损失: %.6f' % (epoch, batch, loss)) #保存当前批次训练信息供后续写入mongoDB train_loss = { "epoch序号": epoch, "batch序号": batch, "当前交叉熵损失": float(loss) } train_losses.append(train_loss) #每训练6个epoch保存一次模型 global_step=epoch表示将epoch序号加入到保存模型文件后缀 if epoch % 6 == 0: saver.save(sess, './model/rnn_model/', global_step=epoch) #4代训练全部完成时打印提示信息 print("[INFO] 训练已全部完成") #将训练信息写入数据库 # 创建一个mongodb连接对象 myclient = pymongo.MongoClient("mongodb://localhost:27017/") # 创建一个数据库 名为train_loss mydb = myclient["train_loss"] # 创建一个集合 mycol = mydb["rnn_train_loss"] # 向集合插入文档 mycol.insert_many(train_losses) #关闭数据库连接 myclient.close() except KeyboardInterrupt: #处理用户中断执行异常 print('[INFO] 训练出现异常中断')