def train_ops(points, covars, labels, true_state, is_training, num_classes, voxel_num, epoch_batch_num):
    ops = {}
    with tf.device('/cpu:0'):
        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0), trainable=False)

        # learning rate schedule
        decay_steps = epoch_batch_num * FLAGS.decay_epoch
        lr = tf.train.exponential_decay(FLAGS.lr_init, global_step, decay_steps,
                                        FLAGS.decay_rate, staircase=True)
        lr = tf.maximum(FLAGS.lr_clip, lr)
        tf.summary.scalar('learning rate', lr)

        # reconstruction loss weight schedule
        decay_steps = epoch_batch_num * FLAGS.recon_decay_epoch
        recon_loss_ratio = tf.train.exponential_decay(1.0, global_step, decay_steps,
                                                      FLAGS.recon_decay_rate, staircase=True)
        tf.summary.scalar('reconstruction loss ratio', recon_loss_ratio)

        opt = tf.train.AdamOptimizer(lr)

        with tf.name_scope('split_data'):
            tower_points = tf.split(points, FLAGS.num_gpus)
            tower_covars = tf.split(covars, FLAGS.num_gpus)
            tower_labels = tf.split(labels, FLAGS.num_gpus)
            tower_true_state = tf.split(true_state, FLAGS.num_gpus)

        reuse = False
        tower_grads = []
        tower_recon_grads = []
        tower_losses, tower_recon_losses = [], []
        tower_logits = []
        tower_voxel_state = []
        for i in range(FLAGS.num_gpus):
            with tf.device('/gpu:{}'.format(i)):
                with tf.name_scope('tower_{}'.format(i)):
                    loss, recon_loss, logits, voxel_state = tower_loss(
                        tower_points[i], tower_covars[i], tower_labels[i],
                        tower_true_state[i], is_training, num_classes, voxel_num, reuse)

                    grad = opt.compute_gradients(loss + (recon_loss * recon_loss_ratio))
                    tower_grads.append(grad)

                    # gradients of the reconstruction branch only
                    all_var = tf.trainable_variables()
                    recon_var = [var for var in all_var
                                 if var.name.startswith('point_mlp')
                                 or var.name.startswith('fc')
                                 or var.name.startswith('voxel_state')]
                    recon_grad = opt.compute_gradients(recon_loss * recon_loss_ratio,
                                                       var_list=recon_var)
                    tower_recon_grads.append(recon_grad)

                    tower_losses.append(loss)
                    tower_recon_losses.append(recon_loss)
                    tower_logits.append(logits)
                    tower_voxel_state.append(voxel_state)

                    update_op = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
                    reuse = True

        avg_grad = average_gradients(tower_grads)
        avg_recon_grad = average_gradients(tower_recon_grads)
        with tf.control_dependencies(update_op):
            apply_grad_op = opt.apply_gradients(avg_grad, global_step=global_step)
            apply_recon_grad_op = opt.apply_gradients(avg_recon_grad)

        summaries = tf.get_collection(tf.GraphKeys.SUMMARIES)
        summary_op = tf.summary.merge(summaries)

        total_loss_op = tf.add_n(tower_losses) / FLAGS.num_gpus
        total_recon_loss_op = tf.add_n(tower_recon_losses) / FLAGS.num_gpus

        logits_op = tf.concat(tower_logits, axis=0)
        preds_op = tf.argmax(logits_op, axis=1)
        correct_num_op = tf.reduce_sum(tf.cast(tf.equal(preds_op, labels), tf.float32))
        voxel_state_op = tf.concat(tower_voxel_state, axis=0)

    ops['total_loss'] = total_loss_op
    ops['total_recon_loss'] = total_recon_loss_op
    ops['apply_grad'] = apply_grad_op
    ops['apply_recon_grad'] = apply_recon_grad_op
    ops['logits'] = logits_op
    ops['preds'] = preds_op
    ops['correct_num'] = correct_num_op
    ops['summary'] = summary_op
    ops['global_step'] = global_step
    ops['voxel_state'] = voxel_state_op

    return ops
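# All of the train_ops variants here rely on an `average_gradients` helper that is not
# shown in this section. A minimal sketch of such a helper, following the standard TF1
# multi-tower pattern (dense gradients only); the project's real implementation may differ:
def average_gradients(tower_grads):
    """Average per-variable gradients across towers.

    tower_grads: one list of (gradient, variable) pairs per GPU, all in the same
    variable order, as returned by Optimizer.compute_gradients().
    """
    average_grads = []
    # zip(*tower_grads) groups the (grad, var) pairs that belong to the same variable
    for grad_and_vars in zip(*tower_grads):
        grads = [tf.expand_dims(g, 0) for g, _ in grad_and_vars if g is not None]
        if not grads:
            # no tower produced a gradient for this variable
            average_grads.append((None, grad_and_vars[0][1]))
            continue
        grad = tf.reduce_mean(tf.concat(grads, axis=0), axis=0)
        # the variable is shared across towers, so the first tower's reference suffices
        average_grads.append((grad, grad_and_vars[0][1]))
    return average_grads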
def train_ops(points, covars, grids, epoch_batch_num):
    ops = {}
    with tf.device('/cpu:0'):
        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0), trainable=False)

        decay_steps = epoch_batch_num * FLAGS.decay_epoch
        lr = tf.train.exponential_decay(FLAGS.lr_init, global_step, decay_steps,
                                        FLAGS.decay_rate, staircase=True)

        # color loss weight: ramps up as the reverse ratio decays
        decay_steps = epoch_batch_num * 15
        reverse_color_ratio = tf.train.exponential_decay(0.99, global_step, decay_steps,
                                                         0.9, staircase=True)
        color_ratio = tf.constant(1.0, tf.float32) - reverse_color_ratio

        tf.summary.scalar('learning rate', lr)
        tf.summary.scalar('color_ratio', color_ratio)

        opt = tf.train.AdamOptimizer(lr)

        tower_points = tf.split(points, FLAGS.num_gpus)
        tower_covars = tf.split(covars, FLAGS.num_gpus)

        reuse = False
        tower_grads = []
        tower_losses = []
        tower_gen_pts = []
        for i in range(FLAGS.num_gpus):
            with tf.device('/gpu:{}'.format(i)):
                with tf.name_scope('tower_{}'.format(i)):
                    loss, gen_pts = tower_loss(tower_points[i], tower_covars[i],
                                               grids, color_ratio, reuse, 3)
                    # print(tf.trainable_variables())
                    grad = opt.compute_gradients(loss, tf.trainable_variables())
                    tower_grads.append(grad)
                    tower_losses.append(loss)
                    tower_gen_pts.append(gen_pts)
                    reuse = True

        avg_grad = average_gradients(tower_grads)
        update_op = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_op):
            apply_grad_op = opt.apply_gradients(avg_grad, global_step=global_step)

        summaries = tf.get_collection(tf.GraphKeys.SUMMARIES)
        summary_op = tf.summary.merge(summaries)

        total_loss_op = tf.add_n(tower_losses)
        gen_pts_op = tf.concat(tower_gen_pts, axis=0)

    ops['total_loss'] = total_loss_op
    ops['apply_grad'] = apply_grad_op
    ops['gen_pts'] = gen_pts_op
    ops['summary'] = summary_op
    ops['global_step'] = global_step

    return ops
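# The color_ratio schedule above ramps from 1 - 0.99 = 0.01 toward 1.0: with
# staircase=True the reverse ratio is multiplied by 0.9 once every
# epoch_batch_num * 15 steps. A small illustrative helper (the name and signature are
# ours, not part of the original code) that reproduces that staircase in plain Python:
def color_ratio_at(step, epoch_batch_num, decay_epoch=15, init=0.99, rate=0.9):
    reverse = init * rate ** (step // (epoch_batch_num * decay_epoch))
    return 1.0 - reverse

# e.g. with 100 batches per epoch: ~0.01 at step 0, ~0.11 after 15 epochs,
# ~0.20 after 30 epochs.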
def train_ops(feats, labels, is_training, epoch_batch_num):
    ops = {}
    with tf.device('/cpu:0'):
        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0), trainable=False)

        decay_steps = epoch_batch_num * FLAGS.decay_epoch
        lr = tf.train.exponential_decay(FLAGS.lr_init, global_step, decay_steps,
                                        FLAGS.decay_rate, staircase=True)
        tf.summary.scalar('learning rate', lr)

        opt = tf.train.AdamOptimizer(lr)

        tower_feats = tf.split(feats, FLAGS.num_gpus)
        tower_labels = tf.split(labels, FLAGS.num_gpus)

        reuse = False
        tower_grads = []
        tower_losses = []
        tower_logits = []
        for i in range(FLAGS.num_gpus):
            with tf.device('/gpu:{}'.format(i)):
                with tf.name_scope('tower_{}'.format(i)):
                    loss, logits = tower_loss(tower_feats[i], tower_labels[i],
                                              FLAGS.num_classes, is_training, reuse)
                    # print(tf.trainable_variables())
                    grad = opt.compute_gradients(loss, tf.trainable_variables())
                    tower_grads.append(grad)
                    tower_losses.append(loss)
                    tower_logits.append(logits)

                    batchnorm_updates = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
                    # print(batchnorm_updates)
                    reuse = True
                    # TODO: will the batch-norm updates be copied to the other GPUs?

        avg_grad = average_gradients(tower_grads)
        with tf.control_dependencies(batchnorm_updates):
            apply_grad_op = opt.apply_gradients(avg_grad, global_step=global_step)

        summaries = tf.get_collection(tf.GraphKeys.SUMMARIES)
        summary_op = tf.summary.merge(summaries)

        preds = tf.argmax(tf.concat(tower_logits, axis=0), axis=1)
        correct_pred_op = tf.reduce_sum(tf.cast(tf.equal(preds, labels), tf.float32))
        total_loss_op = tf.add_n(tower_losses)

    ops['correct_num'] = correct_pred_op
    ops['total_loss'] = total_loss_op
    ops['apply_grad'] = apply_grad_op
    ops['summary'] = summary_op
    ops['global_step'] = global_step
    ops['preds'] = preds

    return ops
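# Regarding the TODO above: calling tf.get_collection(tf.GraphKeys.UPDATE_OPS) with no
# scope argument returns the batch-norm update ops of every tower built so far, not just
# the current one. If only a single tower's updates are wanted (they act on shared
# moving-average variables), the collection can be filtered by that tower's name scope.
# A sketch under that assumption, using the 'tower_{i}' scopes created above:
def tower_update_ops(tower_index):
    # collect only the update ops created under the given tower's name scope
    return tf.get_collection(tf.GraphKeys.UPDATE_OPS, scope='tower_{}'.format(tower_index))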
def train_ops(points, labels, is_training, num_classes, epoch_batch_num):
    ops = {}
    with tf.device('/cpu:0'):
        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0), trainable=False)

        decay_steps = epoch_batch_num * FLAGS.decay_epoch
        lr = tf.train.exponential_decay(FLAGS.lr_init, global_step, decay_steps,
                                        FLAGS.decay_rate, staircase=True)
        lr = tf.maximum(FLAGS.lr_clip, lr)
        tf.summary.scalar('learning rate', lr)

        opt = tf.train.AdamOptimizer(lr)

        reuse = False
        tower_grads = []
        tower_losses = []
        tower_logits = []
        for i in range(FLAGS.num_gpus):
            with tf.device('/gpu:{}'.format(i)):
                with tf.name_scope('tower_{}'.format(i)):
                    # print(points[i], labels[i])
                    loss, logits = tower_loss(points[i], labels[i], is_training,
                                              num_classes, reuse)
                    tower_grads.append(opt.compute_gradients(loss))
                    tower_losses.append(loss)
                    tower_logits.append(tf.squeeze(logits, axis=0))

                    update_op = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
                    reuse = True

        avg_grad = average_gradients(tower_grads)
        with tf.control_dependencies(update_op):
            apply_grad_op = tf.group(opt.apply_gradients(avg_grad, global_step=global_step))

        summaries = tf.get_collection(tf.GraphKeys.SUMMARIES)
        summary_op = tf.summary.merge(summaries)

        total_loss_op = tf.add_n(tower_losses) / FLAGS.num_gpus

        logits_op = tf.concat(tower_logits, axis=0)
        preds_op = tf.argmax(logits_op, axis=1)

        # flatten the per-GPU labels to match the concatenated predictions
        flatten_labels = []
        for i in range(FLAGS.num_gpus):
            flatten_labels.append(tf.squeeze(labels[i], axis=0,
                                             name='squeeze_labels_{}'.format(i)))
        flatten_labels = tf.concat(flatten_labels, axis=0)
        correct_num_op = tf.reduce_sum(tf.cast(tf.equal(preds_op, flatten_labels), tf.float32))

    ops['total_loss'] = total_loss_op
    ops['apply_grad'] = apply_grad_op
    ops['logits'] = logits_op
    ops['preds'] = preds_op
    ops['correct_num'] = correct_num_op
    ops['summary'] = summary_op
    ops['global_step'] = global_step

    return ops
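# A rough sketch of how the ops dict returned by these train_ops functions is typically
# driven from a session loop. The helper name, the `batches` iterable, and the feed-dict
# construction are hypothetical and only illustrate the intended usage of the returned ops:
def run_training_epoch(sess, ops, batches, summary_writer, summary_every=10):
    """Run one pass over `batches`; each element is assumed to be a ready feed_dict."""
    correct = 0.0
    for batch_idx, feed_dict in enumerate(batches):
        _, loss_v, correct_v, summary_v, step_v = sess.run(
            [ops['apply_grad'], ops['total_loss'], ops['correct_num'],
             ops['summary'], ops['global_step']],
            feed_dict=feed_dict)
        correct += correct_v
        if batch_idx % summary_every == 0:
            summary_writer.add_summary(summary_v, step_v)
    return correct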
def train(cfg, logger, model_name):
    # checkpoint directories
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    model_dir = os.path.join(checkpoint_dir, model_name)
    print('[!!!] model name:{}'.format(model_dir))
    logger.info('[!!!] model name:{}'.format(model_dir))
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    ckpt_path = os.path.join(model_dir, model_name)

    best_model_ckpt_dir = os.path.join(model_dir, 'best_model')
    if not os.path.exists(best_model_ckpt_dir):
        os.makedirs(best_model_ckpt_dir)
    best_ckpt_path = os.path.join(best_model_ckpt_dir, 'best_model')

    trick_model_ckpt_dir = os.path.join(model_dir, 'trick_model')
    if not os.path.exists(trick_model_ckpt_dir):
        os.makedirs(trick_model_ckpt_dir)
    trick_ckpt_path = os.path.join(trick_model_ckpt_dir, 'trick_model')

    lr = cfg.lr
    with tf.device('/cpu:0'):
        g_step = tf.Variable(0, dtype=tf.int32, trainable=False, name='g_step')

        assert cfg.opt in ['Adam', 'SGD'], '[!!!] wrong optimizer name'
        if cfg.opt == 'Adam':
            optimizer = tf.train.AdamOptimizer(learning_rate=lr, name='optimizer')
        elif cfg.opt == 'SGD':
            optimizer = tf.train.GradientDescentOptimizer(learning_rate=lr, name='optimizer')
        else:
            print('[!!!] wrong optimizer name')

        small_batch = cfg.batch_size // gpu_nums
        iterator = data_iterator(args.dataset, 'train', cfg, small_batch,
                                 tfrecord_root_dir=tfrecord_root_dir, logger=logger)

        # per-tower gradients and losses
        tower_grads = []
        mae_losses = []
        mse_losses = []
        losses = []
        for i in range(gpu_nums):
            with tf.device('/gpu:{}'.format(i)):
                with tf.name_scope('tower_{}'.format(i)) as scope:
                    images, labels = iterator.get_next()
                    # labels_resized = tf.image.resize_images(labels, [img_rows // fac, img_cols // fac])
                    with tf.variable_scope('model', reuse=(i > 0)):
                        model_b = CSRNet(cfg, images, small_batch, 'b')
                        # [batch, h, w, c]
                        outputs = model_b.output
                    if i == 0:
                        # print the model architecture
                        model_b.full_model.summary()
                        model_b.full_model.summary(print_fn=logger.info)

                    loss = compute_euclidean_distance(outputs, labels)
                    losses.append(loss)
                    mae_loss = compute_mae_error(outputs, labels)
                    mae_losses.append(mae_loss)
                    mse_loss = compute_mse_error(outputs, labels)
                    mse_losses.append(mse_loss)

                    # reuse the variables for the next tower
                    tf.get_variable_scope().reuse_variables()

                    # add summaries
                    summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope)
                    summaries.append(tf.summary.scalar(tensor=loss, name='loss'))
                    summaries.append(tf.summary.scalar(tensor=mae_loss, name='mae_loss'))
                    summaries.append(tf.summary.scalar(tensor=mse_loss, name='mse_loss'))
                    summaries.append(tf.summary.image(tensor=images, name='images'))
                    summaries.append(tf.summary.image(
                        tensor=tf.map_fn(lambda img: colorize(img, cmap='jet'), labels),
                        name='label'))
                    summaries.append(tf.summary.image(
                        tensor=tf.map_fn(lambda img: colorize(img, cmap='jet'),
                                         tf.image.resize_images(outputs, [224, 224])),
                        name='outputs'))

                    if fine_tuned == 1:
                        vars = [var for var in tf.trainable_variables() if "dil" in var.name]
                    else:
                        train_vars = tf.trainable_variables()
                        vars = [var for var in train_vars if "model" in var.name]
                        # equivalent: vars2 = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                        #                                       scope='tower_{}/model'.format(i))

                    grads = optimizer.compute_gradients(loss, var_list=vars)
                    tower_grads.append(grads)

        # average the per-tower losses
        average_loss = average_losses(losses)
        average_mae_loss = average_losses(mae_losses)
        average_mse_loss = average_losses(mse_losses)

        # average the gradients across towers on the CPU
        grads = average_gradients(tower_grads)

        # apply the averaged gradients
        with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
            train_op = optimizer.apply_gradients(grads, global_step=g_step)

        # add histograms for model variables and gradients
        for var in vars:
            summaries.append(tf.summary.histogram('Model/' + var.op.name, var))
        for grad, var in grads:
            if grad is not None:
                summaries.append(tf.summary.histogram('Model/' + var.op.name + '/gradients', grad))

        # create savers
        saver = tf.train.Saver(max_to_keep=1)
        # saver_for_best = tf.train.Saver(var_list=tf.global_variables(), max_to_keep=3)
        saver_for_best = tf.train.Saver(max_to_keep=3)
        if use_trick:
            saver_for_trick = tf.train.Saver(var_list=tf.global_variables(), max_to_keep=1)

        # build the merged summary op
        summary_op = tf.summary.merge(summaries)

        # start the training session
        # "allow_soft_placement" must be True to build towers on GPU,
        # as some of the ops do not have GPU implementations.
        # "log_device_placement" set to True will print device placement.
        config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
        config.gpu_options.allow_growth = True
        if for_remote != 1:
            # fraction of overall GPU memory that each process may allocate
            config.gpu_options.per_process_gpu_memory_fraction = 0.8
        tf_sess = tf.Session(config=config)
        K_B.set_session(tf_sess)

        with K_B.get_session() as sess:
            # summary writer
            summary_path = os.path.join(summary_dir, model_name)
            summary_writer = tf.summary.FileWriter(summary_path, graph=sess.graph)

            init_or_restore(sess, saver, ckpt_path, logger)

            volumes_per_step = small_batch * gpu_nums
            step = -1
            min_train_loss = np.inf
            min_train_loss_step = 0
            patient = 0

            if type(cfg.epochs) is int:
                for epoch in range(cfg.epochs):
                    print("EPOCH: {}".format(epoch + 1))
                    logger.info("EPOCH: {}".format(epoch + 1))
                    sess.run(iterator.initializer)
                    while True:
                        try:
                            step += 1
                            # the validation branch of tf.cond also calls get_next();
                            # avoid exhausting it first, which would break out of the loop early
                            _, average_loss_v, average_mae_loss_v, average_mse_loss_v, g_step_v, summary_str = sess.run(
                                [train_op, average_loss, average_mae_loss, average_mse_loss,
                                 g_step, summary_op])
                            assert (not np.isnan(average_loss_v)), 'Model diverged with loss = NaN'

                            # duration = time.time() - start_time
                            # batch_per_sec = volumes_per_step / duration

                            if step % 10 == 0:
                                print("----- step:{} train loss:{:04f}".format(step, average_loss_v))
                                print("----- step:{} train mae_loss:{:04f}".format(step, average_mae_loss_v))
                                print("----- step:{} train mse_loss:{:04f}".format(step, average_mse_loss_v))
                                logger.info("----- step:{} train loss:{:04f}".format(step, average_loss_v))
                                logger.info("----- step:{} train mae_loss:{:04f}".format(step, average_mae_loss_v))
                                logger.info("----- step:{} train mse_loss:{:04f}".format(step, average_mse_loss_v))
                            if step % 100 == 0:
                                summary_writer.add_summary(summary_str, step)
                            if step % 1000 == 0:
                                saver.save(sess, ckpt_path, global_step=step)
                        except tf.errors.OutOfRangeError:
                            print('train dataset finished')
                            logger.info('train dataset finished')
                            break
                        # catch any other exception
                        except BaseException as e:
                            print('[!!!] An exception occurred: {}'.format(e))

                    if average_loss_v < min_train_loss:
                        min_train_loss = average_loss_v
                        min_train_loss_step = step
                        patient = 0
                        saver_for_best.save(sess, best_ckpt_path, global_step=step)
                        print("update best model, min_train_loss is [{:04f}] in step [{}]".format(
                            average_loss_v, step))
                        logger.info("update best model, min_train_loss is [{:04f}] in step [{}]".format(
                            average_loss_v, step))
                    else:
                        patient += 1
                        if patient >= cfg.patient:
                            print('[!!!] Early stop: train loss has not decreased ' +
                                  'for [{}] epochs, '.format(cfg.patient) +
                                  'min_train_loss is [{:04f}]'.format(min_train_loss) +
                                  ', in step [{}]'.format(min_train_loss_step))
                            print('mae_loss is [{:04f}]'.format(average_mae_loss_v))
                            print('mse_loss is [{:04f}]'.format(average_mse_loss_v))
                            logger.warning('[!!!] Early stop: train loss has not decreased ' +
                                           'for [{}] epochs, '.format(cfg.patient) +
                                           'min_train_loss is [{:04f}]'.format(min_train_loss) +
                                           ', in step [{}]'.format(min_train_loss_step))
                            logger.warning('mae_loss is [{:04f}]'.format(average_mae_loss_v))
                            logger.warning('mse_loss is [{:04f}]'.format(average_mse_loss_v))
                            break

            print('[!!!] model name:{}'.format(model_name))
            logger.info('[!!!] model name:{}'.format(model_name))
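# `init_or_restore` is used in train() above but is not defined in this section. A
# minimal sketch of what such a helper usually does (restore the latest checkpoint if
# one exists, otherwise initialize all variables); the project's real implementation
# may differ:
def init_or_restore(sess, saver, ckpt_path, logger):
    ckpt = tf.train.get_checkpoint_state(os.path.dirname(ckpt_path))
    if ckpt and ckpt.model_checkpoint_path:
        # resume from the most recent checkpoint
        saver.restore(sess, ckpt.model_checkpoint_path)
        logger.info('restored model from {}'.format(ckpt.model_checkpoint_path))
    else:
        # no checkpoint found: start from freshly initialized variables
        sess.run(tf.global_variables_initializer())
        logger.info('no checkpoint found, initialized variables from scratch')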
def train_ops(points, covars, labels, true_state, is_training, num_classes, voxel_num, epoch_batch_num):
    ops = {}
    with tf.device('/cpu:0'):
        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0), trainable=False)

        decay_steps = epoch_batch_num * FLAGS.decay_epoch
        lr = tf.train.exponential_decay(FLAGS.lr_init, global_step, decay_steps,
                                        FLAGS.decay_rate, staircase=True)
        lr = tf.maximum(FLAGS.lr_clip, lr)
        tf.summary.scalar('learning rate', lr)

        opt = tf.train.AdamOptimizer(lr)

        with tf.name_scope('split_data'):
            tower_points = tf.split(points, FLAGS.num_gpus)
            tower_covars = tf.split(covars, FLAGS.num_gpus)
            tower_labels = tf.split(labels, FLAGS.num_gpus)
            tower_true_state = tf.split(true_state, FLAGS.num_gpus)

        reuse = False
        tower_grads = []
        tower_recon_losses = []
        tower_voxel_state = []
        for i in range(FLAGS.num_gpus):
            with tf.device('/gpu:{}'.format(i)):
                with tf.name_scope('tower_{}'.format(i)):
                    recon_loss, voxel_state = tower_loss(
                        tower_points[i], tower_covars[i], tower_labels[i],
                        tower_true_state[i], is_training, num_classes, voxel_num, reuse)

                    grad = opt.compute_gradients(recon_loss)
                    tower_grads.append(grad)
                    tower_recon_losses.append(recon_loss)
                    tower_voxel_state.append(voxel_state)
                    reuse = True

        avg_grad = average_gradients(tower_grads)
        update_op = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_op):
            apply_grad_op = opt.apply_gradients(avg_grad, global_step=global_step)

        summaries = tf.get_collection(tf.GraphKeys.SUMMARIES)
        summary_op = tf.summary.merge(summaries)

        total_recon_loss_op = tf.add_n(tower_recon_losses) / FLAGS.num_gpus
        voxel_state_op = tf.concat(tower_voxel_state, axis=0)

    ops['total_recon_loss'] = total_recon_loss_op
    ops['apply_grad'] = apply_grad_op
    ops['summary'] = summary_op
    ops['global_step'] = global_step
    ops['voxel_state'] = voxel_state_op

    return ops