def generate_tfrecords():
    """
    Build a CityScapesTfIO helper from the global CFG and let its writer
    write the tfrecords.

    :return: None
    """
    cityscapes_io = cityscapes_tf_io.CityScapesTfIO(cfg=CFG)
    tfrecords_writer = cityscapes_io.writer
    tfrecords_writer.write_tfrecords()
def __init__(self):
    """
    Initialize the bisenetv2 trainer.

    Reads the solver / training parameters from the global CFG, opens a
    tensorflow session and builds the whole training graph: input batch,
    model loss and prediction, optional miou metric, learning rate schedule,
    moving average op, train op, loader / saver and summary ops.

    Side effects: an existing tensorboard directory for this model is removed
    and recreated, and the config is dumped into it as a json file.

    :raises ValueError: if CFG.SOLVER.OPTIMIZER is neither 'sgd' nor 'adam'
    """
    # define solver params and dataset
    self._cityscapes_io = cityscapes_tf_io.CityScapesTfIO()
    self._train_dataset = self._cityscapes_io.train_dataset_reader
    self._steps_per_epoch = len(self._train_dataset)

    self._model_name = CFG.MODEL.MODEL_NAME

    self._train_epoch_nums = CFG.TRAIN.EPOCH_NUMS
    self._batch_size = CFG.TRAIN.BATCH_SIZE
    self._snapshot_epoch = CFG.TRAIN.SNAPSHOT_EPOCH
    self._model_save_dir = ops.join(CFG.TRAIN.MODEL_SAVE_DIR, self._model_name)
    self._tboard_save_dir = ops.join(CFG.TRAIN.TBOARD_SAVE_DIR, self._model_name)
    self._enable_miou = CFG.TRAIN.COMPUTE_MIOU.ENABLE
    if self._enable_miou:
        self._record_miou_epoch = CFG.TRAIN.COMPUTE_MIOU.EPOCH
    self._input_tensor_size = [int(tmp / 2) for tmp in CFG.AUG.TRAIN_CROP_SIZE]

    self._init_learning_rate = CFG.SOLVER.LR
    self._moving_ave_decay = CFG.SOLVER.MOVING_AVE_DECAY
    self._momentum = CFG.SOLVER.MOMENTUM
    self._lr_polynimal_decay_power = CFG.SOLVER.LR_POLYNOMIAL_POWER
    self._optimizer_mode = CFG.SOLVER.OPTIMIZER.lower()

    if CFG.TRAIN.RESTORE_FROM_SNAPSHOT.ENABLE:
        self._initial_weight = CFG.TRAIN.RESTORE_FROM_SNAPSHOT.SNAPSHOT_PATH
    else:
        self._initial_weight = None

    # The warm-up start rate is defined unconditionally: tf.cond below calls
    # both branch builders while the graph is constructed, so the warm-up lr
    # helper runs even when warm-up is disabled.
    # NOTE(review): the helper is defined outside this block; presumably it
    # reads this attribute -- confirm.
    self._warmup_init_learning_rate = self._init_learning_rate / 1000.0
    if CFG.TRAIN.WARM_UP.ENABLE:
        self._warmup_epoches = CFG.TRAIN.WARM_UP.EPOCH_NUMS
    else:
        self._warmup_epoches = 0

    # define tensorflow session
    sess_config = tf.ConfigProto(allow_soft_placement=True)
    sess_config.gpu_options.per_process_gpu_memory_fraction = CFG.GPU.GPU_MEMORY_FRACTION
    sess_config.gpu_options.allow_growth = CFG.GPU.TF_ALLOW_GROWTH
    sess_config.gpu_options.allocator_type = 'BFC'
    self._sess = tf.Session(config=sess_config)

    # define graph input tensor
    with tf.variable_scope(name_or_scope='graph_input_node'):
        self._input_src_image, self._input_label_image = self._train_dataset.next_batch(
            batch_size=self._batch_size
        )

    # define model loss
    self._model = bisenet_v2.BiseNetV2(phase='train', cfg=CFG)
    loss_set = self._model.compute_loss(
        input_tensor=self._input_src_image,
        label_tensor=self._input_label_image,
        name='BiseNetV2',
        reuse=False
    )
    # the prediction branch reuses the variables created by compute_loss
    self._prediciton = self._model.inference(
        input_tensor=self._input_src_image,
        name='BiseNetV2',
        reuse=True
    )
    self._loss = loss_set['total_loss']
    self._l2_loss = loss_set['l2_loss']

    # define miou
    if self._enable_miou:
        with tf.variable_scope('miou'):
            pred = tf.reshape(self._prediciton, [-1, ])
            gt = tf.reshape(self._input_label_image, [-1, ])
            # keep only the positions whose label is <= NUM_CLASSES - 1
            indices = tf.squeeze(tf.where(tf.less_equal(gt, CFG.DATASET.NUM_CLASSES - 1)), 1)
            gt = tf.gather(gt, indices)
            pred = tf.gather(pred, indices)
            self._miou, self._miou_update_op = tf.metrics.mean_iou(
                labels=gt,
                predictions=pred,
                num_classes=CFG.DATASET.NUM_CLASSES
            )

    # define learning rate
    with tf.variable_scope('learning_rate'):
        # the step counter is kept as a float variable starting at 1.0
        self._global_step = tf.Variable(1.0, dtype=tf.float32, trainable=False, name='global_step')
        warmup_steps = tf.constant(
            self._warmup_epoches * self._steps_per_epoch, dtype=tf.float32, name='warmup_steps'
        )
        train_steps = tf.constant(
            self._train_epoch_nums * self._steps_per_epoch, dtype=tf.float32, name='train_steps'
        )
        # warm-up lr while global_step < warmup_steps, polynomial decay afterwards
        self._learn_rate = tf.cond(
            pred=self._global_step < warmup_steps,
            true_fn=lambda: self._compute_warmup_lr(warmup_steps=warmup_steps, name='warmup_lr'),
            false_fn=lambda: tf.train.polynomial_decay(
                learning_rate=self._init_learning_rate,
                global_step=self._global_step,
                decay_steps=train_steps,
                end_learning_rate=0.000001,
                power=self._lr_polynimal_decay_power)
        )
        self._learn_rate = tf.identity(self._learn_rate, 'lr')
        global_step_update = tf.assign_add(self._global_step, 1.0)

    # Variables to optimize and to track with the moving average. With
    # FREEZE_BN enabled every variable whose name contains 'beta' or 'gamma'
    # is left out. Computed once and shared by the two sections below.
    if CFG.TRAIN.FREEZE_BN.ENABLE:
        train_var_list = [
            v for v in tf.trainable_variables() if 'beta' not in v.name and 'gamma' not in v.name
        ]
    else:
        train_var_list = tf.trainable_variables()

    # define moving average op
    with tf.variable_scope(name_or_scope='moving_avg'):
        moving_ave_op = tf.train.ExponentialMovingAverage(
            self._moving_ave_decay).apply(train_var_list + tf.moving_average_variables())

    # define training op
    with tf.variable_scope(name_or_scope='train_step'):
        if self._optimizer_mode == 'sgd':
            optimizer = tf.train.MomentumOptimizer(
                learning_rate=self._learn_rate,
                momentum=self._momentum
            )
        elif self._optimizer_mode == 'adam':
            optimizer = tf.train.AdamOptimizer(
                learning_rate=self._learn_rate,
            )
        else:
            raise ValueError('Not support optimizer: {:s}'.format(self._optimizer_mode))
        optimize_op = optimizer.minimize(self._loss, var_list=train_var_list)
        # running the train op triggers the UPDATE_OPS collection, the
        # optimizer step, the global step increment and the moving average
        with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
            with tf.control_dependencies([optimize_op, global_step_update]):
                with tf.control_dependencies([moving_ave_op]):
                    self._train_op = tf.no_op()

    # define saver and loader
    with tf.variable_scope('loader_and_saver'):
        # the loader skips every global variable whose name contains 'lr'
        self._net_var = [vv for vv in tf.global_variables() if 'lr' not in vv.name]
        self._loader = tf.train.Saver(self._net_var)
        self._saver = tf.train.Saver(tf.global_variables(), max_to_keep=5)

    # define summary
    with tf.variable_scope('summary'):
        summary_merge_list = [
            tf.summary.scalar("learn_rate", self._learn_rate),
            tf.summary.scalar("total", self._loss),
            tf.summary.scalar('l2_loss', self._l2_loss)
        ]
        if self._enable_miou:
            # evaluating the miou summary first runs the miou update op
            with tf.control_dependencies([self._miou_update_op]):
                summary_merge_list_with_miou = [
                    tf.summary.scalar("learn_rate", self._learn_rate),
                    tf.summary.scalar("total", self._loss),
                    tf.summary.scalar('l2_loss', self._l2_loss),
                    tf.summary.scalar('miou', self._miou)
                ]
                self._write_summary_op_with_miou = tf.summary.merge(summary_merge_list_with_miou)
        # start from an empty tensorboard directory
        if ops.exists(self._tboard_save_dir):
            shutil.rmtree(self._tboard_save_dir)
        os.makedirs(self._tboard_save_dir, exist_ok=True)
        # keep a json copy of the config next to the tensorboard logs
        model_params_file_save_path = ops.join(self._tboard_save_dir, CFG.TRAIN.MODEL_PARAMS_CONFIG_FILE_NAME)
        with open(model_params_file_save_path, 'w', encoding='utf-8') as f_obj:
            CFG.dump_to_json_file(f_obj)
        self._write_summary_op = tf.summary.merge(summary_merge_list)
        self._summary_writer = tf.summary.FileWriter(self._tboard_save_dir, graph=self._sess.graph)

    LOG.info('Initialize cityscapes bisenetv2 trainner complete')
def __init__(self, cfg):
    """
    Initialize the multi gpu trainer (resnet_fcn.ResNetFCN built under the
    'SFNet' scope).

    Reads the solver / training parameters from cfg, opens a tensorflow
    session and builds the whole graph: one input batch per gpu plus a
    validation batch, per-tower gradients and their average, validation loss,
    moving average op, grouped train op, train / val predictions, optional
    miou metrics, loader / saver and summary ops.

    Side effects: an existing tensorboard directory for this model is removed
    and recreated, and the config is dumped into it as a json file.

    :param cfg: config object providing the MODEL, TRAIN, SOLVER, GPU and
        DATASET sections read below
    :raises ValueError: if TRAIN.MULTI_GPU.CHIEF_DEVICE_INDEX is not a valid
        tower index
    :raises NotImplementedError: if SOLVER.OPTIMIZER is neither 'sgd' nor 'adam'
    """
    self._cfg = cfg
    # define solver params and dataset
    self._cityscapes_io = cityscapes_tf_io.CityScapesTfIO(cfg=cfg)
    self._train_dataset = self._cityscapes_io.train_dataset_reader
    self._val_dataset = self._cityscapes_io.val_dataset_reader
    self._steps_per_epoch = len(self._train_dataset)
    self._val_steps_per_epoch = len(self._val_dataset)

    self._model_name = self._cfg.MODEL.MODEL_NAME

    self._train_epoch_nums = self._cfg.TRAIN.EPOCH_NUMS
    self._batch_size = self._cfg.TRAIN.BATCH_SIZE
    self._val_batch_size = self._cfg.TRAIN.VAL_BATCH_SIZE
    self._snapshot_epoch = self._cfg.TRAIN.SNAPSHOT_EPOCH
    self._model_save_dir = ops.join(self._cfg.TRAIN.MODEL_SAVE_DIR, self._model_name)
    self._tboard_save_dir = ops.join(self._cfg.TRAIN.TBOARD_SAVE_DIR, self._model_name)
    self._enable_miou = self._cfg.TRAIN.COMPUTE_MIOU.ENABLE
    if self._enable_miou:
        self._record_miou_epoch = self._cfg.TRAIN.COMPUTE_MIOU.EPOCH
    self._gpu_devices = self._cfg.TRAIN.MULTI_GPU.GPU_DEVICES
    self._gpu_nums = len(self._gpu_devices)
    self._chief_gpu_index = self._cfg.TRAIN.MULTI_GPU.CHIEF_DEVICE_INDEX
    self._batch_size_per_gpu = int(self._batch_size / self._gpu_nums)

    # Fail early with a clear message: a chief index that matches no tower
    # would leave the batchnorm update list unset and only surface later as
    # an opaque error when the update ops are grouped.
    if not 0 <= self._chief_gpu_index < self._gpu_nums:
        raise ValueError(
            'CHIEF_DEVICE_INDEX {} is out of range for {:d} gpu device(s)'.format(
                self._chief_gpu_index, self._gpu_nums))

    self._init_learning_rate = self._cfg.SOLVER.LR
    self._moving_ave_decay = self._cfg.SOLVER.MOVING_AVE_DECAY
    self._momentum = self._cfg.SOLVER.MOMENTUM
    self._lr_polynimal_decay_power = self._cfg.SOLVER.LR_POLYNOMIAL_POWER
    self._optimizer_mode = self._cfg.SOLVER.OPTIMIZER.lower()

    if self._cfg.TRAIN.RESTORE_FROM_SNAPSHOT.ENABLE:
        self._initial_weight = self._cfg.TRAIN.RESTORE_FROM_SNAPSHOT.SNAPSHOT_PATH
    else:
        self._initial_weight = None

    # The warm-up start rate is defined unconditionally: tf.cond below calls
    # both branch builders while the graph is constructed, so the warm-up lr
    # helper runs even when warm-up is disabled.
    # NOTE(review): the helper is defined outside this block; presumably it
    # reads this attribute -- confirm.
    self._warmup_init_learning_rate = self._init_learning_rate / 1000.0
    if self._cfg.TRAIN.WARM_UP.ENABLE:
        self._warmup_epoches = self._cfg.TRAIN.WARM_UP.EPOCH_NUMS
    else:
        self._warmup_epoches = 0

    # define tensorflow session
    sess_config = tf.ConfigProto(allow_soft_placement=True)
    sess_config.gpu_options.per_process_gpu_memory_fraction = self._cfg.GPU.GPU_MEMORY_FRACTION
    sess_config.gpu_options.allow_growth = self._cfg.GPU.TF_ALLOW_GROWTH
    sess_config.gpu_options.allocator_type = 'BFC'
    self._sess = tf.Session(config=sess_config)

    # define graph input tensor
    with tf.variable_scope(name_or_scope='graph_input_node'):
        self._input_src_image_list = []
        self._input_label_image_list = []
        # one next_batch call per gpu tower
        for _ in range(self._gpu_nums):
            src_imgs, label_imgs = self._train_dataset.next_batch(
                batch_size=self._batch_size_per_gpu)
            self._input_src_image_list.append(src_imgs)
            self._input_label_image_list.append(label_imgs)
        self._val_input_src_image, self._val_input_label_image = self._val_dataset.next_batch(
            batch_size=self._val_batch_size)

    # define model
    self._model = resnet_fcn.ResNetFCN(phase='train', cfg=self._cfg)
    self._val_model = resnet_fcn.ResNetFCN(phase='test', cfg=self._cfg)

    # define average container
    tower_grads = []
    tower_total_loss = []
    tower_l2_loss = []
    batchnorm_updates = None

    # define learning rate
    with tf.variable_scope('learning_rate'):
        # both step counters are kept as float variables starting at 1.0
        self._global_step = tf.Variable(1.0, dtype=tf.float32, trainable=False, name='global_step')
        self._val_global_step = tf.Variable(1.0, dtype=tf.float32, trainable=False, name='val_global_step')
        self._val_global_step_update = tf.assign_add(
            self._val_global_step, 1.0)
        warmup_steps = tf.constant(self._warmup_epoches * self._steps_per_epoch,
                                   dtype=tf.float32,
                                   name='warmup_steps')
        train_steps = tf.constant(self._train_epoch_nums * self._steps_per_epoch,
                                  dtype=tf.float32,
                                  name='train_steps')
        # warm-up lr while global_step < warmup_steps, polynomial decay afterwards
        self._learn_rate = tf.cond(
            pred=self._global_step < warmup_steps,
            true_fn=lambda: self._compute_warmup_lr(
                warmup_steps=warmup_steps, name='warmup_lr'),
            false_fn=lambda: tf.train.polynomial_decay(
                learning_rate=self._init_learning_rate,
                global_step=self._global_step,
                decay_steps=train_steps,
                end_learning_rate=0.000000001,
                power=self._lr_polynimal_decay_power))
        self._learn_rate = tf.identity(self._learn_rate, 'lr')

    # define optimizer
    if self._optimizer_mode == 'sgd':
        optimizer = tf.train.MomentumOptimizer(
            learning_rate=self._learn_rate, momentum=self._momentum)
    elif self._optimizer_mode == 'adam':
        optimizer = tf.train.AdamOptimizer(
            learning_rate=self._learn_rate, )
    else:
        raise NotImplementedError(
            'Not support optimizer: {:s} for now'.format(
                self._optimizer_mode))

    # define distributed train op
    with tf.variable_scope(tf.get_variable_scope()):
        is_network_initialized = False
        for i in range(self._gpu_nums):
            with tf.device('/gpu:{:d}'.format(i)):
                with tf.name_scope('tower_{:d}'.format(i)):
                    input_images = self._input_src_image_list[i]
                    input_labels = self._input_label_image_list[i]
                    tmp_loss, tmp_grads = self._compute_net_gradients(
                        input_images,
                        input_labels,
                        optimizer,
                        is_net_first_initialized=is_network_initialized)
                    is_network_initialized = True

                    # Only use the mean and var in the chief gpu tower to update the parameter
                    # NOTE(review): get_collection is called without a scope,
                    # so it returns every update op registered up to this
                    # point; for a chief index > 0 that presumably includes
                    # ops of earlier towers -- confirm this is intended.
                    if i == self._chief_gpu_index:
                        batchnorm_updates = tf.get_collection(
                            tf.GraphKeys.UPDATE_OPS)

                    tower_grads.append(tmp_grads)
                    tower_total_loss.append(tmp_loss['total_loss'])
                    tower_l2_loss.append(tmp_loss['l2_loss'])
    grads = self._average_gradients(tower_grads)
    self._loss = tf.reduce_mean(tower_total_loss,
                                name='reduce_mean_tower_total_loss')
    self._l2_loss = tf.reduce_mean(tower_l2_loss,
                                   name='reduce_mean_tower_l2_loss')
    # validation loss, requested with reuse=True under the 'SFNet' scope
    ret = self._val_model.compute_loss(
        input_tensor=self._val_input_src_image,
        label_tensor=self._val_input_label_image,
        name='SFNet',
        reuse=True)
    self._val_loss = ret['total_loss']
    self._val_l2_loss = ret['l2_loss']

    # define moving average op
    with tf.variable_scope(name_or_scope='moving_avg'):
        # with FREEZE_BN enabled every variable whose name contains 'beta'
        # or 'gamma' is left out of the moving average
        if self._cfg.TRAIN.FREEZE_BN.ENABLE:
            train_var_list = [
                v for v in tf.trainable_variables()
                if 'beta' not in v.name and 'gamma' not in v.name
            ]
        else:
            train_var_list = tf.trainable_variables()
        moving_ave_op = tf.train.ExponentialMovingAverage(
            self._moving_ave_decay).apply(train_var_list +
                                          tf.moving_average_variables())

    # group all the op needed for training
    batchnorm_updates_op = tf.group(*batchnorm_updates)
    apply_gradient_op = optimizer.apply_gradients(
        grads, global_step=self._global_step)
    self._train_op = tf.group(apply_gradient_op, moving_ave_op,
                              batchnorm_updates_op)

    # define prediction
    self._prediciton = self._model.inference(
        input_tensor=self._input_src_image_list[self._chief_gpu_index],
        name='SFNet',
        reuse=True)
    self._val_prediction = self._val_model.inference(
        input_tensor=self._val_input_src_image, name='SFNet', reuse=True)

    # define miou
    if self._enable_miou:
        with tf.variable_scope('miou'):
            # train miou is computed on the chief tower's batch only
            pred = tf.reshape(self._prediciton, [-1, ])
            gt = tf.reshape(
                self._input_label_image_list[self._chief_gpu_index], [-1, ])
            # keep only the positions whose label is <= NUM_CLASSES - 1
            indices = tf.squeeze(
                tf.where(
                    tf.less_equal(gt, self._cfg.DATASET.NUM_CLASSES - 1)),
                1)
            gt = tf.gather(gt, indices)
            pred = tf.gather(pred, indices)
            self._miou, self._miou_update_op = tf.metrics.mean_iou(
                labels=gt,
                predictions=pred,
                num_classes=self._cfg.DATASET.NUM_CLASSES)

            val_pred = tf.reshape(self._val_prediction, [-1, ])
            val_gt = tf.reshape(self._val_input_label_image, [-1, ])
            indices = tf.squeeze(
                tf.where(
                    tf.less_equal(val_gt,
                                  self._cfg.DATASET.NUM_CLASSES - 1)), 1)
            val_gt = tf.gather(val_gt, indices)
            val_pred = tf.gather(val_pred, indices)
            self._val_miou, self._val_miou_update_op = tf.metrics.mean_iou(
                labels=val_gt,
                predictions=val_pred,
                num_classes=self._cfg.DATASET.NUM_CLASSES)

    # define saver and loader
    with tf.variable_scope('loader_and_saver'):
        # the loader skips every global variable whose name contains 'lr'
        self._net_var = [
            vv for vv in tf.global_variables() if 'lr' not in vv.name
        ]
        self._loader = tf.train.Saver(self._net_var)
        self._saver = tf.train.Saver(max_to_keep=10)

    # define summary
    with tf.variable_scope('summary'):
        summary_merge_list = [
            tf.summary.scalar("learn_rate", self._learn_rate),
            tf.summary.scalar("total_loss", self._loss),
            tf.summary.scalar('l2_loss', self._l2_loss)
        ]
        val_summary_merge_list = [
            tf.summary.scalar('val_total_loss', self._val_loss),
            tf.summary.scalar('val_l2_loss', self._val_l2_loss)
        ]
        if self._enable_miou:
            # evaluating the miou summary first runs the miou update op
            with tf.control_dependencies([self._miou_update_op]):
                summary_merge_list_with_miou = [
                    tf.summary.scalar("learn_rate", self._learn_rate),
                    tf.summary.scalar("total_loss", self._loss),
                    tf.summary.scalar('l2_loss', self._l2_loss),
                    tf.summary.scalar('miou', self._miou)
                ]
                self._write_summary_op_with_miou = tf.summary.merge(
                    summary_merge_list_with_miou)
            # the val miou summary also advances the validation step counter
            with tf.control_dependencies(
                    [self._val_miou_update_op, self._val_global_step_update]):
                val_summary_merge_list_with_miou = [
                    tf.summary.scalar('val_total_loss', self._val_loss),
                    tf.summary.scalar('val_l2_loss', self._val_l2_loss),
                    tf.summary.scalar('val_miou', self._val_miou),
                ]
                self._val_write_summary_op_with_miou = tf.summary.merge(
                    val_summary_merge_list_with_miou)
        # start from an empty tensorboard directory
        if ops.exists(self._tboard_save_dir):
            shutil.rmtree(self._tboard_save_dir)
        os.makedirs(self._tboard_save_dir, exist_ok=True)
        # keep a json copy of the config next to the tensorboard logs
        model_params_file_save_path = ops.join(
            self._tboard_save_dir,
            self._cfg.TRAIN.MODEL_PARAMS_CONFIG_FILE_NAME)
        with open(model_params_file_save_path, 'w',
                  encoding='utf-8') as f_obj:
            self._cfg.dump_to_json_file(f_obj)
        self._write_summary_op = tf.summary.merge(summary_merge_list)
        self._val_write_summary_op = tf.summary.merge(
            val_summary_merge_list)
        self._summary_writer = tf.summary.FileWriter(
            self._tboard_save_dir, graph=self._sess.graph)

    LOG.info(
        'Initialize cityscapes resnet fcn multi gpu trainner complete')