def train():
    # Pipeline:
    #   1. data preprocessing
    #   2. seq2seq
    # ========================================================
    # Prepare the data
    print("train mode.......")
    print('Preparing data')
    if not os.path.exists(FLAGS.model_dir):
        os.makedirs(FLAGS.model_dir)

    # Data preprocessing
    # buckets_dir is the training-data directory.
    # Returns a list of four buckets, each backed by its own database.
    bucket_dbs = data_utils.read_bucket_dbs(FLAGS.buckets_dir)
    # Collect the number of samples held by each bucket
    bucket_sizes = []
    for i in range(len(buckets)):
        # Number of sentence pairs in this bucket
        bucket_size = bucket_dbs[i].size
        bucket_sizes.append(bucket_size)
        print('bucket {} contains {} samples'.format(i, bucket_size))
    # Total number of samples across all buckets
    total_size = sum(bucket_sizes)
    print('{} samples in total'.format(total_size))

    # Build the model and train
    gpu_options = tf.GPUOptions(
        allow_growth=True,  # let TensorFlow grow GPU allocations incrementally
        per_process_gpu_memory_fraction=calc_gpu_fraction(FLAGS.gpu_fraction))
    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                          gpu_options=gpu_options)) as sess:
        # Build the model (each bucket gets its own train op, loss and
        # summary ops, but all four share the same parameters).
        model = create_model(sess, False)
        # Initialize variables and restore any existing checkpoint
        print("Initializing variables and restoring the model.....")
        sess.run(tf.global_variables_initializer())
        ckpt = tf.train.get_checkpoint_state(FLAGS.model_dir)
        if ckpt and ckpt.model_checkpoint_path:
            print("Load old model from:", ckpt.model_checkpoint_path)
            model.saver.restore(sess, ckpt.model_checkpoint_path)
            model.saver.recover_last_checkpoints(ckpt.all_model_checkpoint_paths)
        else:
            print("No existing model found")
        # Cumulative share of samples per bucket
        # (bucket 1, buckets 1+2, buckets 1+2+3, buckets 1+2+3+4)
        buckets_scale = [
            sum(bucket_sizes[:i + 1]) / total_size
            for i in range(len(bucket_sizes))
        ]
        # Start training
        metrics = ' '.join(['\r[{}]', '{:.1f}%', '{}/{}', 'loss={:.3f}', '{}/{}'])
        # bars_max is the width of the progress bar
        bars_max = 20
        writer = tf.summary.FileWriter('log', graph=sess.graph)
        merges = []
        # One merged summary op per bucket (per training op)
        for b_idx in model.bucket_to_summary_list:
            merges.append(tf.summary.merge(model.bucket_to_summary_list[b_idx]))
        print("Starting training.....")
        with tf.device('/gpu:0'):
            for epoch_index in range(1, FLAGS.num_epoch + 1):
                print('Epoch {}:'.format(epoch_index))
                time_start = time.time()  # epoch start time
                epoch_trained = 0         # progress within this epoch
                batch_loss = []
                while True:
                    # Randomly pick a bucket to train on
                    random_number = np.random.random_sample()
                    bucket_id = min([
                        i for i in range(len(buckets_scale))
                        if buckets_scale[i] > random_number
                    ])
                    # Fetch a batch (batch_size, e.g. 16 pairs) from that bucket
                    data, _ = model.get_batch_data(bucket_dbs, bucket_id)
                    encoder_inputs, decoder_inputs, decoder_weights = model.get_batch(
                        bucket_id, data)
                    # Run one training step
                    _, step_loss, summary_merge, output = model.step(
                        sess, encoder_inputs, decoder_inputs, decoder_weights,
                        bucket_id, False, merges[bucket_id])
                    epoch_trained += FLAGS.batch_size
                    batch_loss.append(step_loss)
                    time_now = time.time()
                    time_spend = time_now - time_start
                    time_estimate = time_spend / (epoch_trained / FLAGS.num_per_epoch)
                    percent = min(100, epoch_trained / FLAGS.num_per_epoch * 100)  # cap at 100%
                    # Number of '=' characters to draw, at most bars_max
                    bars = math.floor(percent / 100 * bars_max)
                    # Redraw the progress bar: '=' for done, '-' for remaining
                    sys.stdout.write(
                        metrics.format(
                            '=' * int(bars) + '-' * int(bars_max - bars),
                            percent, epoch_trained, FLAGS.num_per_epoch,
                            np.mean(batch_loss),
                            data_utils.time(time_spend),
                            data_utils.time(time_estimate)))
                    sys.stdout.flush()
                    if summary_merge is not None:
                        writer.add_summary(summary_merge, global_step=epoch_index)
                    if epoch_trained >= FLAGS.num_per_epoch:
                        model.saver.save(sess,
                                         os.path.join(FLAGS.model_dir, FLAGS.model_name),
                                         global_step=epoch_index)
                        break
                print('\n')
        # One final checkpoint before exiting
        model.saver.save(sess, os.path.join(FLAGS.model_dir, FLAGS.model_name))
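# Every variant below selects its training bucket the same way: build a
# cumulative-share table from the bucket sizes, draw a uniform random number,
# and take the first bucket whose cumulative share exceeds the draw, so each
# bucket is sampled in proportion to how much data it holds. A minimal
# standalone sketch of just that sampling step (the bucket sizes here are
# made up for illustration):

import numpy as np

# Hypothetical bucket sizes; in train() these come from bucket_dbs[i].size.
bucket_sizes = [5000, 12000, 8000, 3000]
total_size = sum(bucket_sizes)

# Cumulative shares, here roughly [0.179, 0.607, 0.893, 1.0]
buckets_scale = [sum(bucket_sizes[:i + 1]) / total_size
                 for i in range(len(bucket_sizes))]

def sample_bucket_id():
    """Return a bucket index with probability proportional to its size."""
    random_number = np.random.random_sample()  # uniform draw in [0, 1)
    return min(i for i in range(len(buckets_scale))
               if buckets_scale[i] > random_number)

counts = [0] * len(bucket_sizes)
for _ in range(10000):
    counts[sample_bucket_id()] += 1
print(counts)  # roughly proportional to bucket_sizes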
def train(): """训练模型""" # 准备数据 print("train mode") print('准备数据') if not os.path.exists(FLAGS.model_dir): os.makedirs(FLAGS.model_dir) bucket_dbs = data_utils.read_bucket_dbs(FLAGS.buckets_dir) bucket_sizes = [] for i in range(len(buckets)): bucket_size = bucket_dbs[i].size bucket_sizes.append(bucket_size) print('bucket {} 中有数据 {} 条'.format(i, bucket_size)) total_size = sum(bucket_sizes) print('共有数据 {} 条'.format(total_size)) # 开始建模与训练 with tf.Session(config=tf.ConfigProto(allow_soft_placement=False, gpu_options=gpu_options)) as sess: # 构建模型 model = create_model(sess, False) # 初始化变量 sess.run(tf.initialize_all_variables()) ckpt = tf.train.get_checkpoint_state(FLAGS.model_dir) #print("ckpt path : ", ckpt.model_checkpoint_path) if ckpt != None: print("读取模型 : ", ckpt.model_checkpoint_path) model.saver.restore(sess, ckpt.model_checkpoint_path) else: print("not exist old model") buckets_scale = [ sum(bucket_sizes[:i + 1]) / total_size for i in range(len(bucket_sizes)) ] # 开始训练 metrics = ' '.join( ['\r[{}]', '{:.1f}%', '{}/{}', 'loss={:.3f}', '{}/{}']) bars_max = 20 with tf.device('/gpu:0'): for epoch_index in range(1, FLAGS.num_epoch + 1): print('Epoch {}:'.format(epoch_index)) time_start = time.time() epoch_trained = 0 batch_loss = [] while True: # 选择一个要训练的bucket random_number = np.random.random_sample() bucket_id = min([ i for i in range(len(buckets_scale)) if buckets_scale[i] > random_number ]) data, data_in = model.get_batch_data(bucket_dbs, bucket_id) encoder_inputs, decoder_inputs, decoder_weights = model.get_batch( bucket_dbs, bucket_id, data) _, step_loss, output = model.step(sess, encoder_inputs, decoder_inputs, decoder_weights, bucket_id, False) epoch_trained += FLAGS.batch_size batch_loss.append(step_loss) time_now = time.time() time_spend = time_now - time_start time_estimate = time_spend / (epoch_trained / FLAGS.num_per_epoch) percent = min(100, epoch_trained / FLAGS.num_per_epoch) * 100 bars = math.floor(percent / 100 * bars_max) sys.stdout.write( metrics.format('=' * bars + '-' * (bars_max - bars), percent, epoch_trained, FLAGS.num_per_epoch, np.mean(batch_loss), data_utils.time(time_spend), data_utils.time(time_estimate))) sys.stdout.flush() if epoch_trained >= FLAGS.num_per_epoch: model.saver.save(sess, os.path.join(FLAGS.model_dir, FLAGS.model_name), global_step=epoch_index) break print('\n')
def train(): """训练模型""" # 准备数据 print('准备数据') bucket_dbs = data_utils.read_bucket_dbs(FLAGS.buckets_dir) bucket_sizes = [] for i in range(len(buckets)): bucket_size = bucket_dbs[i].size bucket_sizes.append(bucket_size) print('bucket {} 中有数据 {} 条'.format(i, bucket_size)) total_size = sum(bucket_sizes) print('共有数据 {} 条'.format(total_size)) # 开始建模与训练 with tf.Session() as sess: # 构建模型 model = create_model(sess, False) # 初始化变量 sess.run(tf.initialize_all_variables()) buckets_scale = [ sum(bucket_sizes[:i + 1]) / total_size for i in range(len(bucket_sizes)) ] # 开始训练 metrics = ' '.join([ '\r[{}]', '{:.1f}%', '{}/{}', 'loss={:.3f}', '{}/{}', 'learning rate={:.5f}' ]) bars_max = 20 for epoch_index in range(1, FLAGS.num_epoch + 1): print('Epoch {}:'.format(epoch_index)) time_start = time.time() epoch_trained = 0 batch_loss = [] # previous_losses=[] current_step=0 loss=0 while True: # 选择一个要训练的bucket random_number = np.random.random_sample() bucket_id = min([ i for i in range(len(buckets_scale)) if buckets_scale[i] > random_number ]) data, data_in = model.get_batch_data( bucket_dbs, bucket_id ) encoder_inputs, decoder_inputs, decoder_weights = model.get_batch( bucket_dbs, bucket_id, data ) _, step_loss, output = model.step( sess, encoder_inputs, decoder_inputs, decoder_weights, bucket_id, False ) loss=step_loss/FLAGS.steps_per_checkpoint current_step+=1 if current_step % FLAGS.steps_per_checkpoint == 0: if len(previous_losses)>2 and loss>max(previous_losses[-3:]): sess.run(model.learning_rate_decay_op) previous_losses.append(loss) loss=0 if not os.path.exists(FLAGS.model_dir): os.makedirs(FLAGS.model_dir) model.saver.save(sess, os.path.join(FLAGS.model_dir, FLAGS.model_name)) epoch_trained += FLAGS.batch_size batch_loss.append(step_loss) time_now = time.time() time_spend = time_now - time_start time_estimate = time_spend / (epoch_trained / FLAGS.num_per_epoch) percent = min(100, epoch_trained / FLAGS.num_per_epoch) * 100 bars = math.floor(percent / 100 * bars_max) sys.stdout.write(metrics.format( '=' * bars + '-' * (bars_max - bars), percent, epoch_trained, FLAGS.num_per_epoch, np.mean(batch_loss), data_utils.time(time_spend), data_utils.time(time_estimate),model.learning_rate.eval() )) sys.stdout.flush() if epoch_trained >= FLAGS.num_per_epoch: break print('\n') if not os.path.exists(FLAGS.model_dir): os.makedirs(FLAGS.model_dir) model.saver.save(sess, os.path.join(FLAGS.model_dir, FLAGS.model_name))
def train(): """训练模型""" # 准备数据 print('准备数据') bucket_dbs = data_utils.read_bucket_dbs(FLAGS.buckets_dir) bucket_sizes = [] for i in range(len(buckets)): bucket_size = bucket_dbs[i].size bucket_sizes.append(bucket_size) print('bucket {} 中有数据 {} 条'.format(i, bucket_size)) total_size = sum(bucket_sizes) print('共有数据 {} 条'.format(total_size)) gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.66) config = tf.ConfigProto(gpu_options=gpu_options) # 防止 out of memory config.gpu_options.allocator_type = 'BFC' # 开始建模与训练 with tf.Session() as sess: # 构建模型 model = create_model(sess, False) # 初始化变量 sess.run(tf.initialize_all_variables()) buckets_scale = [ sum(bucket_sizes[:i + 1]) / total_size for i in range(len(bucket_sizes)) ] # 开始训练 metrics = ' '.join([ '\r[{}]', '{:.1f}%', '{}/{}', 'loss={:.3f}', '{}/{}' ]) bars_max = 20 for epoch_index in range(1, FLAGS.num_epoch + 1): print('Epoch {}:'.format(epoch_index)) time_start = time.time() epoch_trained = 0 batch_loss = [] while True: # 选择一个要训练的bucket random_number = np.random.random_sample() bucket_id = min([ i for i in range(len(buckets_scale)) if buckets_scale[i] > random_number ]) data, data_in = model.get_batch_data( bucket_dbs, bucket_id ) encoder_inputs, decoder_inputs, decoder_weights = model.get_batch( bucket_dbs, bucket_id, data ) _, step_loss, output = model.step( sess, encoder_inputs, decoder_inputs, decoder_weights, bucket_id, False ) epoch_trained += FLAGS.batch_size batch_loss.append(step_loss) time_now = time.time() time_spend = time_now - time_start time_estimate = time_spend / (epoch_trained / FLAGS.num_per_epoch) percent = min(100, epoch_trained / FLAGS.num_per_epoch) * 100 bars = math.floor(percent / 100 * bars_max) sys.stdout.write(metrics.format( '=' * bars + '-' * (bars_max - bars), percent, epoch_trained, FLAGS.num_per_epoch, np.mean(batch_loss), data_utils.time(time_spend), data_utils.time(time_estimate) )) sys.stdout.flush() if epoch_trained >= FLAGS.num_per_epoch: break print('\n') if not os.path.exists(FLAGS.model_dir): os.makedirs(FLAGS.model_dir) model.saver.save(sess, os.path.join(FLAGS.model_dir, FLAGS.model_name))
def train():
    # Prepare Headline data.
    print("Preparing Headline data in %s" % FLAGS.data_dir)
    src_train, dest_train, src_dev, dest_dev, _, _ = data_utils.prepare_headline_data(
        FLAGS.data_dir, FLAGS.vocab_size)

    # Device config for CPU usage:
    # config = tf.ConfigProto(device_count={"CPU": 4},   # limit to 4 CPUs
    #                         inter_op_parallelism_threads=1,
    #                         intra_op_parallelism_threads=2)  # n threads parallel for ops
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # Create model.
        print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
        model = create_model(sess, False)

        # Read data into buckets and compute their sizes.
        print("Reading development and training data (limit: %d)."
              % FLAGS.max_train_data_size)
        dev_set = read_data(src_dev, dest_dev)
        train_set = read_data(src_train, dest_train, FLAGS.max_train_data_size)
        train_bucket_sizes = [len(train_set[b]) for b in range(len(buckets))]
        train_total_size = float(sum(train_bucket_sizes))

        # A bucket scale is a list of increasing numbers from 0 to 1 that we'll
        # use to select a bucket. The length of [scale[i], scale[i+1]] is
        # proportional to the size of the i-th training bucket, as used later.
        trainbuckets_scale = [
            sum(train_bucket_sizes[:i + 1]) / train_total_size
            for i in range(len(train_bucket_sizes))
        ]

        # This is the training loop (metrics formats the progress display).
        metrics = ' '.join(
            ['\r{:.1f}%', '{}/{}', 'loss={:.3f}', 'gradients={:.3f}', '{}/{}'])
        bars_max = 20
        for current_step in range(FLAGS.num_epoch):
            print("\n")
            print('Epoch {}:'.format(current_step))
            epoch_trained = 0
            batch_loss = []
            batch_gradients = []
            time_start = time.time()
            # index_sum = 0
            while True:
                # Choose a bucket according to data distribution. We pick a
                # random number in [0, 1] and use the corresponding interval
                # in trainbuckets_scale.
                random_number_01 = np.random.random_sample()
                bucket_id = min([
                    i for i in range(len(trainbuckets_scale))
                    if trainbuckets_scale[i] > random_number_01
                ])
                # Get a batch and make a step.
                encoder_inputs, decoder_inputs, target_weights = model.get_batch(
                    train_set, bucket_id)
                step_gradients, step_loss, _ = model.step(
                    sess, encoder_inputs, decoder_inputs, target_weights,
                    bucket_id, False)
                epoch_trained += FLAGS.batch_size
                batch_loss.append(step_loss)
                batch_gradients.append(step_gradients)
                time_now = time.time()
                time_spend = time_now - time_start
                time_estimate = time_spend / (epoch_trained / FLAGS.num_per_epoch)
                percent = min(100, epoch_trained / FLAGS.num_per_epoch * 100)  # cap at 100%
                # bars = math.floor(percent / 100 * bars_max)
                sys.stdout.write(
                    metrics.format(
                        percent, epoch_trained, FLAGS.num_per_epoch,
                        np.mean(batch_loss),  # mean over the collected batch losses
                        np.mean(batch_gradients),
                        data_utils.time(time_spend),
                        data_utils.time(time_estimate)))
                print("\n")
                sys.stdout.flush()
                # index_sum += 1
                # if index_sum > 4:
                #     sys.exit()
                if FLAGS.num_per_epoch < epoch_trained:
                    break
            # Once in a while, we save checkpoint, print statistics, and run evals.
            if current_step % FLAGS.steps_per_checkpoint == 0:
                # Save checkpoint and zero timer and loss.
                checkpoint_path = os.path.join(FLAGS.train_dir, "headline_large.ckpt")
                model.saver.save(sess, checkpoint_path, global_step=model.global_step)
def train(): """训练模型""" # 准备数据 print('准备数据') #数据预处理有两步:1、decode_conv 2、data_utils #原始数据集不是很好的问答式数据集。用decode_conv处理的数据,假定有ABC三个句子,则处理成两句问答:A:B,B:C,然后都插入到sqlite3里 #生成一个conversion.db文件,然后使用data_utils来进行语句处理,即对这个db文件做进一步处理 #对应四种格式,5_15,10_20,15_25,20_30,分别代表问句和回答句的字数上限。比如5_15即问句不超过5个字且答句不超过15个字。 #这种方式也和命名实体识别的一个性质,是为了能最小padding,进行局部padding,如果有句子太长的,但是不太多,那么可以滤掉。 #因为一般的对话都不会太长 bucket_dbs = data_utils.read_bucket_dbs(FLAGS.buckets_dir) bucket_sizes = [] for i in range(len(buckets)): bucket_size = bucket_dbs[i].size bucket_sizes.append(bucket_size) print('bucket {} 中有数据 {} 条'.format(i, bucket_size)) total_size = sum(bucket_sizes) print('共有数据 {} 条'.format(total_size)) #到这里为止还只是拿到四个bucket里的数据,并统计了一下总的数据条数 # 开始建模与训练 with tf.Session() as sess: #整体流程即:1、创建模型 2、接收数据,并转换成模型可接收的类型 3、放入模型,计算损失 4、更新参数 # 构建模型 model = create_model(sess, False) # 初始化变量 sess.run(tf.global_variables_initializer()) buckets_scale = [ sum(bucket_sizes[:i + 1]) / total_size for i in range(len(bucket_sizes)) ] #i=0,1,2,3==>bucket_sizes[: 1], # 开始训练 metrics = ' '.join( ['\r[{}]', '{:.1f}%', '{}/{}', 'loss={:.3f}', '{}/{}']) bars_max = 20 with tf.device('/gpu:0'): for epoch_index in range(1, FLAGS.num_epoch + 1600): print('Epoch {}:'.format(epoch_index)) time_start = time.time() epoch_trained = 0 batch_loss = [] while True: # 选择一个要训练的bucket random_number = np.random.random_sample() #tmp=[] #for i in range(len(buckets_scale)): #if buckets_scale[i] > random_number: #tmp.append(i) #bucket_id = min(tmp) bucket_id = 1 if random_number <= 0.25 else 2 if random_number > 0.25 and random_number <= 0.5 else 3 if random_number > 0.5 and random_number < 0.75 else 4 bucket_id -= 1 #先选择对应的问答对长度,因为后面无论是padding还是生结果,都是根据这个位数来的 #bucket_id = min([i for i in range(len(buckets_scale)) if buckets_scale[i] > random_number]) #拿出64个问答对,data 和data_in 问答倒转 data, data_in = model.get_batch_data( bucket_dbs, bucket_id) #先获取到问答对和答问对 encoder_inputs, decoder_inputs, decoder_weights = model.get_batch( bucket_dbs, bucket_id, data ) #再得到padding后的encoder_inputs,decoder_inputs和新生成的权重decoder_weights #而这里的encoder_inputs,decoder_inputs都只是对应的字ID信息,而decoder_weights则是1和0组成的,也是和字位置一对一对应 #通过源码可以看出,ID只是初步信息,随机初始化一个embedding是embedding_attention_seq2seq内部会有的 _, step_loss, output = model.step( sess, encoder_inputs, decoder_inputs, decoder_weights, bucket_id, False ) #给定需要喂入的参数,即encoder、decoder、weights以及选择的bucket_id #根据训练和测试状态,获取输出结果。 epoch_trained += FLAGS.batch_size batch_loss.append(step_loss) #为了计算损失用 time_now = time.time() time_spend = time_now - time_start time_estimate = time_spend / (epoch_trained / FLAGS.num_per_epoch) percent = min(100, epoch_trained / FLAGS.num_per_epoch) * 100 bars = math.floor(percent / 100 * bars_max) sys.stdout.write( metrics.format('=' * bars + '-' * (bars_max - bars), percent, epoch_trained, FLAGS.num_per_epoch, np.mean(batch_loss), data_utils.time(time_spend), data_utils.time(time_estimate))) sys.stdout.flush() if epoch_trained >= FLAGS.num_per_epoch: break print('\n') if not os.path.exists(FLAGS.model_dir): os.makedirs(FLAGS.model_dir) if epoch_index % 800 == 0: model.saver.save(sess, os.path.join(FLAGS.model_dir, FLAGS.model_name))
def train(): """训练模型""" # 准备数据 print('准备数据') bucket_dbs = data_utils.read_bucket_dbs(FLAGS.buckets_dir) bucket_sizes = [] for i in range(len(buckets)): bucket_size = bucket_dbs[i].size bucket_sizes.append(bucket_size) print('bucket {} 中有数据 {} 条'.format(i, bucket_size)) total_size = sum(bucket_sizes) print('共有数据 {} 条'.format(total_size)) # 开始建模与训练 with tf.Session() as sess: # 构建模型 model = create_model(sess, False) # 初始化变量 sess.run(tf.initialize_all_variables()) buckets_scale = [ sum(bucket_sizes[:i + 1]) / total_size for i in range(len(bucket_sizes)) ] # 开始训练 metrics = ' '.join([ '\r[{}]', '{:.1f}%', '{}/{}', 'loss={:.3f}', '{}/{}' ]) bars_max = 20 for epoch_index in range(1, FLAGS.num_epoch + 1): print('Epoch {}:'.format(epoch_index)) time_start = time.time() epoch_trained = 0 batch_loss = [] while True: # 选择一个要训练的bucket random_number = np.random.random_sample() bucket_id = min([ i for i in range(len(buckets_scale)) if buckets_scale[i] > random_number ]) data, data_in = model.get_batch_data( bucket_dbs, bucket_id ) encoder_inputs, decoder_inputs, decoder_weights = model.get_batch( bucket_dbs, bucket_id, data ) _, step_loss, output = model.step( sess, encoder_inputs, decoder_inputs, decoder_weights, bucket_id, False ) epoch_trained += FLAGS.batch_size batch_loss.append(step_loss) time_now = time.time() time_spend = time_now - time_start time_estimate = time_spend / (epoch_trained / FLAGS.num_per_epoch) percent = min(100, epoch_trained / FLAGS.num_per_epoch) * 100 bars = math.floor(percent / 100 * bars_max) sys.stdout.write(metrics.format( '=' * bars + '-' * (bars_max - bars), percent, epoch_trained, FLAGS.num_per_epoch, np.mean(batch_loss), data_utils.time(time_spend), data_utils.time(time_estimate) )) sys.stdout.flush() if epoch_trained >= FLAGS.num_per_epoch: break print('\n') if not os.path.exists(FLAGS.model_dir): os.makedirs(FLAGS.model_dir) model.saver.save(sess, os.path.join(FLAGS.model_dir, FLAGS.model_name))
def train(): """訓練模型""" # 准备数据 print('準備數據') bucket_dbs = data_utils.read_bucket_dbs(FLAGS.buckets_dir) bucket_sizes = [] for i in range(len(buckets)): bucket_size = bucket_dbs[i].size bucket_sizes.append(bucket_size) print('bucket {} 中有數據 {} 條'.format(i, bucket_size)) total_size = sum(bucket_sizes) print('共有數據 {} 條'.format(total_size)) # 開始建模 with tf.Session() as sess: # 構建模型 model = create_model(sess, False) # 初始化變量 sess.run(tf.initialize_all_variables()) buckets_scale = [ sum(bucket_sizes[:i + 1]) / total_size for i in range(len(bucket_sizes)) ] # 開始訓練 metrics = ' '.join( ['\r[{}]', '{:.1f}%', '{}/{}', 'loss={:.3f}', '{}/{}']) bars_max = 20 for epoch_index in range(1, FLAGS.num_epoch + 1): print('Epoch {}:'.format(epoch_index)) time_start = time.time() epoch_trained = 0 batch_loss = [] while True: # 選擇一個要訓練的bucket random_number = np.random.random_sample() bucket_id = min([ i for i in range(len(buckets_scale)) if buckets_scale[i] > random_number ]) data, data_in = model.get_batch_data(bucket_dbs, bucket_id) encoder_inputs, decoder_inputs, decoder_weights = model.get_batch( bucket_dbs, bucket_id, data) _, step_loss, output = model.step(sess, encoder_inputs, decoder_inputs, decoder_weights, bucket_id, False) epoch_trained += FLAGS.batch_size batch_loss.append(step_loss) time_now = time.time() time_spend = time_now - time_start time_estimate = time_spend / (epoch_trained / FLAGS.num_per_epoch) percent = min(100, epoch_trained / FLAGS.num_per_epoch) * 100 bars = math.floor(percent / 100 * bars_max) sys.stdout.write( metrics.format('=' * bars + '-' * (bars_max - bars), percent, epoch_trained, FLAGS.num_per_epoch, np.mean(batch_loss), data_utils.time(time_spend), data_utils.time(time_estimate))) sys.stdout.flush() if epoch_trained >= FLAGS.num_per_epoch: break print('\n') if not os.path.exists(FLAGS.model_dir): os.makedirs(FLAGS.model_dir) model.saver.save(sess, os.path.join(FLAGS.model_dir, FLAGS.model_name))
def train(): """训练模型""" print('数据准备中...') bucket_dbs = data_utils.read_bucket_dbs(FLAGS.buckets_dir) bucket_sizes = [] for i in range(len(buckets)): bucket_size = bucket_dbs[i].size bucket_sizes.append(bucket_size) print('bucket {} 中有数据 {} 条'.format(i, bucket_size)) total_size = sum(bucket_sizes) print('共有数据 {} 条'.format(total_size)) with tf.Session() as sess: model = create_model(sess, False) sess.run(tf.global_variables_initializer()) # 计算每个文件数据占比 buckets_scale = [sum(bucket_sizes[:i + 1]) / total_size for i in range(len(bucket_sizes))] # 格式化控制台输出 metrics = ' '.join([ '\r[{}]', '{:.1f}%', '{}/{}', 'loss={:.3f}', '{}/{}' ]) bars_max = 20 with tf.device('/gpu:0'): for epoch_index in range(1, FLAGS.num_epoch + 1600): print('Epoch {}:'.format(epoch_index)) time_start = time.time() epoch_trained = 0 # 每个epoch已经训练的样本数 batch_loss = [] while True: # 随机选择一个要训练的bucket_id random_number = np.random.random_sample() bucket_id = min([i for i in range(len(buckets_scale)) if buckets_scale[i] > random_number]) # 拿出64个问答对 data, data_in 问答倒转 data, data_in = model.get_batch_data( bucket_dbs, bucket_id ) # 将问答对转换为模型训练可接受的格式 # bucket_10_20这个bucket对应的维度为:10*64 20*64 20*64 encoder_inputs, decoder_inputs, decoder_weights = model.get_batch( bucket_dbs, bucket_id, data ) # 训练 _, step_loss, output = model.step( sess, encoder_inputs, decoder_inputs, decoder_weights, bucket_id, False ) epoch_trained += FLAGS.batch_size batch_loss.append(step_loss) time_now = time.time() time_spend = time_now - time_start time_estimate = time_spend / (epoch_trained / FLAGS.num_per_epoch) percent = min(100, epoch_trained / FLAGS.num_per_epoch) * 100 bars = math.floor(percent / 100 * bars_max) sys.stdout.write(metrics.format( '=' * bars + '-' * (bars_max - bars), percent, epoch_trained, FLAGS.num_per_epoch, np.mean(batch_loss), data_utils.time(time_spend), data_utils.time(time_estimate) )) sys.stdout.flush() if epoch_trained >= FLAGS.num_per_epoch: break print('\n') if not os.path.exists(FLAGS.model_dir): os.makedirs(FLAGS.model_dir) if epoch_index%800==0: model.saver.save(sess, os.path.join(FLAGS.model_dir, FLAGS.model_name))