"""BiSeNet segmentation model (TensorFlow 1.x, tf.contrib.slim).

NOTE: the helper-module paths below are assumptions made for this listing;
adjust them to the actual repository layout.
"""
import tensorflow as tf
import tensorflow.contrib.slim as slim

from utils.data_loader import DataLoader            # assumed path
from utils.color_map import colors_dict             # assumed path
from builders import frontend_builder               # assumed path
from builders.layer_builder import (ConvBlock, Upsampling,  # assumed path
                                    AttentionRefinementModule,
                                    AttentionRefinementModule_Custom,
                                    FeatureFusionModule,
                                    FeatureFusionModule_Custom)
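# `hard_swish` and `_make_divisible` are used by the custom spatial path below.
# Minimal sketches are given here, assuming the standard MobileNetV3
# definitions (Howard et al., 2019); the project may instead provide its own.


def hard_swish(x):
    # h-swish(x) = x * ReLU6(x + 3) / 6
    return x * tf.nn.relu6(x + 3.0) / 6.0


def _make_divisible(v, divisor=8, min_value=None):
    """Rounds `v` to the nearest multiple of `divisor`, never dropping below
    90% of the original value. Call sites in this file pass only `v`."""
    if min_value is None:
        min_value = divisor
    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
    if new_v < 0.9 * v:
        new_v += divisor
    return new_v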
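# `DepthSepConv` is a depthwise-separable convolution: a depthwise kxk stage
# followed by a pointwise 1x1 projection. A minimal sketch, assuming
# slim.separable_conv2d with num_outputs=None for the depthwise stage; the
# project's own implementation may differ.


def DepthSepConv(inputs, num_filters, kernel=(3, 3), stride=1, rate=1):
    # Depthwise stage only (num_outputs=None skips the pointwise projection).
    net = slim.separable_conv2d(inputs, None, kernel, depth_multiplier=1,
                                stride=stride, rate=rate, activation_fn=None)
    # Pointwise projection to `num_filters` channels; batch norm and activation
    # are applied by the caller, matching the call sites in build_bisenet_custom.
    return slim.conv2d(net, num_filters, [1, 1], stride=1, activation_fn=None)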
class BiseNet(object):

    def __init__(self, model_config, train_config, num_classes, mode):
        self.model_config = model_config
        self.train_config = train_config
        self.num_classes = num_classes
        self.mode = mode
        assert mode in ['train', 'validation', 'inference', 'test']

        if self.mode == 'train':
            self.data_config = self.train_config['train_data_config']
        elif self.mode == 'validation':
            self.data_config = self.train_config['validation_data_config']
        elif self.mode == 'test':
            self.data_config = self.train_config['test_data_config']
        else:
            # Inference feeds images through a placeholder; no dataset needed.
            self.data_config = None

        self.images = None
        self.images_feed = None
        self.labels = None
        self.net = None
        self.sup1 = None
        self.sup2 = None
        self.init_fn = None
        self.loss = None
        self.total_loss = []
        self.response = None
        self.global_step = None

        if self.data_config is not None:
            # Putting data loading and preprocessing on the CPU is
            # substantially faster.
            with tf.device("/cpu:0"):
                self.dataset = DataLoader(self.data_config,
                                          self.train_config['DataSet'],
                                          self.train_config['dataset_dir'],
                                          self.train_config['class_dict'])

    def build_inputs(self):
        """Input fetching and batching.

        Outputs:
            self.images: image batch of shape [batch, hz, wz, 3]
            self.labels: one-hot label batch of shape [batch, hx, wx, num_classes]
        """
        if self.mode in ['train', 'validation', 'test']:
            self.images, labels = self.dataset.get_one_batch()
            self.labels = tf.one_hot(labels, self.num_classes)
        else:
            self.images_feed = tf.placeholder(shape=[None, None, None, 3],
                                              dtype=tf.uint8,
                                              name='images_input')
            self.images = tf.to_float(self.images_feed) / 255.0

    def is_training(self):
        """Returns true if the model is built for training mode."""
        return self.mode == 'train'

    def setup_global_step(self):
        global_step = tf.Variable(initial_value=0,
                                  name='global_step',
                                  trainable=False,
                                  collections=[tf.GraphKeys.GLOBAL_STEP,
                                               tf.GraphKeys.GLOBAL_VARIABLES])
        self.global_step = global_step
    def build_bisenet_custom(self, reuse=False):
        """Builds the custom (lightweight) BiSeNet model.

        Arguments:
            reuse: Reuse variables or not.

        Returns:
            BiSeNet model.
        """
        ### The spatial path
        ### The number of feature maps for each convolution is not specified
        ### in the paper. It was chosen here to be equal to the number of
        ### feature maps of a classification model at each corresponding stage.
        batch_norm_params = self.model_config['batch_norm_params']
        init_method = self.model_config['conv_config']['init_method']
        down_16x_end_points = self.model_config['net_node']['16xdown:50']
        down_32x_end_points = self.model_config['net_node']['32xdown:25']

        if init_method == 'kaiming_normal':
            initializer = slim.variance_scaling_initializer(factor=2.0,
                                                            mode='FAN_IN',
                                                            uniform=False)
        else:
            initializer = slim.xavier_initializer()

        with tf.variable_scope('spatial_net', reuse=reuse):
            with slim.arg_scope([slim.conv2d], biases_initializer=None,
                                weights_initializer=initializer):
                with slim.arg_scope([slim.batch_norm],
                                    is_training=self.is_training(),
                                    **batch_norm_params):
                    # inference/spatial_net/Conv/Conv2D run 1 average cost
                    # 250.552994 ms, 25.405 %, FlopsRate: 9.064 %
                    # conv2d
                    spatial_net = slim.conv2d(self.images, 16, [3, 3],
                                              stride=[2, 2], activation_fn=None)
                    spatial_net = hard_swish(slim.batch_norm(spatial_net, fused=True))

                    # bneck1
                    exp_size = _make_divisible(16)
                    spatial_net = slim.conv2d(spatial_net, exp_size, [1, 1],
                                              stride=[1, 1], activation_fn=None)
                    spatial_net = slim.batch_norm(spatial_net, fused=True)
                    spatial_net = DepthSepConv(spatial_net, 16, kernel=[3, 3], stride=2)
                    spatial_net = tf.nn.relu(slim.batch_norm(spatial_net, fused=True))

                    # bneck2
                    exp_size = _make_divisible(72)
                    spatial_net = slim.conv2d(spatial_net, exp_size, [1, 1],
                                              stride=[1, 1], activation_fn=None)
                    spatial_net = slim.batch_norm(spatial_net, fused=True)
                    spatial_net = DepthSepConv(spatial_net, 24, kernel=[3, 3], stride=2)
                    spatial_net = tf.nn.relu(slim.batch_norm(spatial_net, fused=True))

                    # bneck3
                    exp_size = _make_divisible(88)
                    spatial_net = slim.conv2d(spatial_net, exp_size, [1, 1],
                                              stride=[1, 1], activation_fn=None)
                    spatial_net = slim.batch_norm(spatial_net, fused=True)
                    spatial_net = DepthSepConv(spatial_net, 24, kernel=[3, 3], stride=1)
                    spatial_net = tf.nn.relu(slim.batch_norm(spatial_net, fused=True))

                    # bneck4
                    exp_size = _make_divisible(96)
                    spatial_net = slim.conv2d(spatial_net, exp_size, [1, 1],
                                              stride=[1, 1], activation_fn=None)
                    spatial_net = slim.batch_norm(spatial_net, fused=True)
                    spatial_net = DepthSepConv(spatial_net, 40, kernel=[3, 3], stride=1)
                    spatial_net = tf.nn.relu(slim.batch_norm(spatial_net, fused=True))

                    # bneck5
                    spatial_net = DepthSepConv(spatial_net, 80, kernel=[3, 3], stride=1)
                    spatial_net = tf.nn.relu(slim.batch_norm(spatial_net, fused=True))

                    # bneck6
                    spatial_net = DepthSepConv(spatial_net, 128, kernel=[3, 3], stride=1)
                    spatial_net = tf.nn.relu(slim.batch_norm(spatial_net, fused=True))

        frontend_config = self.model_config['frontend_config']

        ### Context path
        logits, end_points, frontend_scope, init_fn = frontend_builder.build_frontend(
            self.images, frontend_config, self.is_training(), reuse)

        ### Combining the paths
        with tf.variable_scope('combine_path', reuse=reuse):
            with slim.arg_scope([slim.conv2d], biases_initializer=None,
                                weights_initializer=initializer):
                with slim.arg_scope([slim.batch_norm],
                                    is_training=self.is_training(),
                                    **batch_norm_params):
                    # tail part
                    global_context = tf.reduce_mean(end_points[down_32x_end_points],
                                                    [1, 2], keep_dims=True)
                    global_context = slim.conv2d(global_context, 128, 1, [1, 1],
                                                 activation_fn=None)
                    global_context = tf.nn.relu(slim.batch_norm(global_context, fused=True))
                    ARM_out1 = AttentionRefinementModule_Custom(
                        end_points[down_32x_end_points], n_filters=128)
                    ARM_out2 = AttentionRefinementModule_Custom(
                        end_points[down_16x_end_points], n_filters=128)

                    ARM_out1 = tf.add(ARM_out1, global_context)
                    ARM_out1 = Upsampling(ARM_out1, scale=2)

                    # inference/combine_path/Conv_6/Conv2D run 1 average cost
                    # 23.034000 ms, 2.336 %, FlopsRate: 8.879 %
                    exp_size = _make_divisible(256)
                    ARM_out1 = slim.conv2d(ARM_out1, exp_size, [1, 1],
                                           stride=[1, 1], activation_fn=None)
                    ARM_out1 = slim.batch_norm(ARM_out1, fused=True)
                    ARM_out1 = DepthSepConv(ARM_out1, 128, kernel=[3, 3], stride=1)
                    ARM_out1 = tf.nn.relu(slim.batch_norm(ARM_out1, fused=True))

                    ARM_out2 = tf.add(ARM_out2, ARM_out1)
                    ARM_out2 = Upsampling(ARM_out2, scale=2)

                    # inference/combine_path/Conv_13/Conv2D run 1 average cost
                    # 23.034000 ms, 2.336 %, FlopsRate: 8.879 %
                    exp_size = _make_divisible(256)
                    ARM_out2 = slim.conv2d(ARM_out2, exp_size, [1, 1],
                                           stride=[1, 1], activation_fn=None)
                    ARM_out2 = slim.batch_norm(ARM_out2, fused=True)
                    ARM_out2 = DepthSepConv(ARM_out2, 128, kernel=[3, 3], stride=1)
                    ARM_out2 = tf.nn.relu(slim.batch_norm(ARM_out2, fused=True))
                    context_net = ARM_out2

                    FFM_out = FeatureFusionModule_Custom(input_1=spatial_net,
                                                         input_2=context_net,
                                                         n_filters=256)
                    ARM_out1 = ConvBlock(ARM_out1, n_filters=128, kernel_size=[3, 3])
                    ARM_out2 = ConvBlock(ARM_out2, n_filters=128, kernel_size=[3, 3])

                    exp_size = _make_divisible(128)
                    FFM_out = slim.conv2d(FFM_out, exp_size, [1, 1],
                                          stride=[1, 1], activation_fn=None)
                    FFM_out = slim.batch_norm(FFM_out, fused=True)
                    FFM_out = DepthSepConv(FFM_out, 64, kernel=[3, 3], stride=1)
                    FFM_out = tf.nn.relu(slim.batch_norm(FFM_out, fused=True))

                    # Upsampling + dilation or only Upsampling
                    FFM_out = Upsampling(FFM_out, scale=2)

                    # inference/combine_path/Conv_12/Conv2D run 1 average cost
                    # 32.151001 ms, 3.260 %, FlopsRate: 8.879 %
                    exp_size = _make_divisible(128)
                    FFM_out = slim.conv2d(FFM_out, exp_size, [1, 1],
                                          stride=[1, 1], activation_fn=None)
                    FFM_out = DepthSepConv(FFM_out, 64, kernel=[3, 3], stride=1, rate=2)
                    FFM_out = tf.nn.relu(slim.batch_norm(FFM_out, fused=True))
                    FFM_out = slim.conv2d(FFM_out, self.num_classes, [1, 1],
                                          activation_fn=None, scope='logits')
                    self.net = Upsampling(FFM_out, 4)

                    if self.mode in ['train', 'validation', 'test']:
                        sup1 = slim.conv2d(ARM_out1, self.num_classes, [1, 1],
                                           activation_fn=None, scope='supl1')
                        sup2 = slim.conv2d(ARM_out2, self.num_classes, [1, 1],
                                           activation_fn=None, scope='supl2')
                        self.sup1 = Upsampling(sup1, scale=16)
                        self.sup2 = Upsampling(sup2, scale=8)

        self.init_fn = init_fn
    def build_bisenet(self, reuse=False):
        """Builds the standard BiSeNet model.

        Arguments:
            reuse: Reuse variables or not.

        Returns:
            BiSeNet model.
        """
        ### The spatial path
        ### The number of feature maps for each convolution is not specified
        ### in the paper. It was chosen here to be equal to the number of
        ### feature maps of a classification model at each corresponding stage.
        batch_norm_params = self.model_config['batch_norm_params']
        init_method = self.model_config['conv_config']['init_method']
        down_16x_end_points = self.model_config['net_node']['16xdown:50']
        down_32x_end_points = self.model_config['net_node']['32xdown:25']

        if init_method == 'kaiming_normal':
            initializer = slim.variance_scaling_initializer(factor=2.0,
                                                            mode='FAN_IN',
                                                            uniform=False)
        else:
            initializer = slim.xavier_initializer()

        with tf.variable_scope('spatial_net', reuse=reuse):
            with slim.arg_scope([slim.conv2d], biases_initializer=None,
                                weights_initializer=initializer):
                with slim.arg_scope([slim.batch_norm],
                                    is_training=self.is_training(),
                                    **batch_norm_params):
                    spatial_net = ConvBlock(self.images, n_filters=64,
                                            kernel_size=[7, 7], strides=2)
                    spatial_net = ConvBlock(spatial_net, n_filters=64,
                                            kernel_size=[3, 3], strides=2)
                    spatial_net = ConvBlock(spatial_net, n_filters=64,
                                            kernel_size=[3, 3], strides=2)
                    spatial_net = ConvBlock(spatial_net, n_filters=128,
                                            kernel_size=[1, 1])

        frontend_config = self.model_config['frontend_config']

        ### Context path
        logits, end_points, frontend_scope, init_fn = frontend_builder.build_frontend(
            self.images, frontend_config, self.is_training(), reuse)

        ### Combining the paths
        with tf.variable_scope('combine_path', reuse=reuse):
            with slim.arg_scope([slim.conv2d], biases_initializer=None,
                                weights_initializer=initializer):
                with slim.arg_scope([slim.batch_norm],
                                    is_training=self.is_training(),
                                    **batch_norm_params):
                    # tail part
                    global_context = tf.reduce_mean(end_points[down_32x_end_points],
                                                    [1, 2], keep_dims=True)
                    global_context = slim.conv2d(global_context, 128, 1, [1, 1],
                                                 activation_fn=None)
                    global_context = tf.nn.relu(slim.batch_norm(global_context, fused=True))

                    net_5 = AttentionRefinementModule(end_points[down_32x_end_points],
                                                      n_filters=128)
                    net_4 = AttentionRefinementModule(end_points[down_16x_end_points],
                                                      n_filters=128)

                    net_5 = tf.add(net_5, global_context)
                    net_5 = Upsampling(net_5, scale=2)
                    net_5 = ConvBlock(net_5, n_filters=128, kernel_size=[3, 3])

                    net_4 = tf.add(net_4, net_5)
                    net_4 = Upsampling(net_4, scale=2)
                    net_4 = ConvBlock(net_4, n_filters=128, kernel_size=[3, 3])
                    context_net = net_4

                    net = FeatureFusionModule(input_1=spatial_net,
                                              input_2=context_net, n_filters=256)
                    net_5 = ConvBlock(net_5, n_filters=128, kernel_size=[3, 3])
                    net_4 = ConvBlock(net_4, n_filters=128, kernel_size=[3, 3])
                    net = ConvBlock(net, n_filters=64, kernel_size=[3, 3])

                    # Upsampling + dilation or only Upsampling
                    net = Upsampling(net, scale=2)
                    net = slim.conv2d(net, 64, [3, 3], rate=2,
                                      activation_fn=tf.nn.relu,
                                      biases_initializer=None,
                                      normalizer_fn=slim.batch_norm)
                    net = slim.conv2d(net, self.num_classes, [1, 1],
                                      activation_fn=None, scope='logits')
                    self.net = Upsampling(net, 4)
                    # Alternative: skip the dilated conv and upsample by 8 directly:
                    # net = slim.conv2d(net, self.num_classes, [1, 1],
                    #                   activation_fn=None, scope='logits')
                    # self.net = Upsampling(net, scale=8)

                    if self.mode in ['train', 'validation', 'test']:
                        sup1 = slim.conv2d(net_5, self.num_classes, [1, 1],
                                           activation_fn=None, scope='supl1')
                        sup2 = slim.conv2d(net_4, self.num_classes, [1, 1],
                                           activation_fn=None, scope='supl2')
                        self.sup1 = Upsampling(sup1, scale=16)
                        self.sup2 = Upsampling(sup2, scale=8)
        self.init_fn = init_fn

    def build_loss(self):
        # Main loss on the fused output plus the two auxiliary supervision
        # losses on the upsampled ARM branches.
        loss1 = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(logits=self.net,
                                                    labels=self.labels))
        loss2 = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(logits=self.sup1,
                                                    labels=self.labels))
        loss3 = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(logits=self.sup2,
                                                    labels=self.labels))
        loss = tf.add_n([loss1, loss2, loss3])
        return loss

    def summarize(self):
        """Adds TensorBoard summaries: input image, colored GT and prediction,
        total loss, accuracy, and mean IoU."""
        shape = tf.shape(self.labels)

        # Tensorboard inspection
        tf.summary.image('image', self.images, family=self.mode, max_outputs=1)

        color_map = colors_dict[self.train_config['DataSet']]
        tf.summary.image('GT',
                         tf.reshape(
                             tf.matmul(tf.reshape(self.labels, [-1, self.num_classes]),
                                       color_map),
                             [-1, shape[1], shape[2], 3]),
                         family=self.mode, max_outputs=1)
        tf.summary.image('response',
                         tf.reshape(
                             tf.matmul(
                                 tf.reshape(
                                     tf.one_hot(tf.argmax(self.net, -1),
                                                self.num_classes),
                                     [-1, self.num_classes]),
                                 color_map),
                             [-1, shape[1], shape[2], 3]),
                         family=self.mode, max_outputs=1)
        tf.summary.scalar('total_loss', tf.reduce_mean(self.total_loss),
                          family=self.mode)

        accuracy, accuracy_update = tf.contrib.metrics.streaming_accuracy(
            tf.argmax(self.net, -1), tf.argmax(self.labels, -1))
        mean_IOU, mean_IOU_update = tf.contrib.metrics.streaming_mean_iou(
            predictions=tf.argmax(self.net, -1),
            labels=tf.argmax(self.labels, -1),
            num_classes=self.num_classes)
        with tf.control_dependencies([accuracy_update, mean_IOU_update]):
            tf.summary.scalar('accuracy', accuracy, family=self.mode)
            tf.summary.scalar('mean_IOU', mean_IOU, family=self.mode)

    def predict(self):
        self.response = self.net

    def build(self, num_gpus=1, reuse=False):
        """Creates all ops for training and evaluation."""
        with tf.name_scope(self.mode):
            if self.mode in ['train', 'validation', 'test']:
                for i in range(num_gpus):
                    # Each tower fetches its own batch from the input pipeline.
                    self.build_inputs()
                    with tf.device('/gpu:%d' % i):
                        # First tower has the default name scope.
                        name_scope = ('clone_%d' % i) if i else ''
                        with tf.name_scope(name_scope):
                            with tf.variable_scope(tf.get_variable_scope(),
                                                   reuse=True if i != 0 else None):
                                if self.model_config['use_custom']:
                                    self.build_bisenet_custom(reuse=reuse)
                                else:
                                    self.build_bisenet(reuse=reuse)
                                loss = self.build_loss()
                                self.total_loss.append(loss)
                with tf.device('/cpu:0'):
                    self.summarize()
            else:
                # NOTE: inference always builds the standard variant here.
                self.build_inputs()
                self.build_bisenet(reuse=reuse)
                self.predict()

        if self.is_training():
            self.setup_global_step()
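# A minimal usage sketch (not the project's entry point): builds a single-GPU
# training graph from config dicts loaded elsewhere (e.g. from JSON/YAML).
# All config keys referenced by BiseNet above must be present in the dicts.
def build_training_graph(model_config, train_config, num_classes):
    model = BiseNet(model_config, train_config, num_classes, mode='train')
    model.build(num_gpus=1)
    return model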