def conv2_tran(batch_input, kernel=3, output_channel=64, stride=1, use_bias=True, scope='conv'):
    """Define the convolution transpose building block.

    Args:
        batch_input: NHWC input tensor.
        kernel: Integer width/height of the square transposed-conv window.
        output_channel: Number of output feature channels.
        stride: Upsampling stride.
        use_bias: If False, the bias variable is disabled entirely.
        scope: Variable scope name for the created weights.

    Returns:
        The transposed-convolution output (no activation applied).
    """
    with tf.variable_scope(scope):
        # slim's default biases_initializer is a zeros initializer and passing
        # None disables the bias variable, so the two previously duplicated
        # branches collapse into a single call.
        return slim.conv2d_transpose(
            batch_input,
            output_channel,
            [kernel, kernel],
            stride,
            'SAME',
            data_format='NHWC',
            activation_fn=None,
            biases_initializer=tf.zeros_initializer() if use_bias else None,
        )
def conv2_tran(batch_input, kernel=3, output_channel=64, stride=1, use_bias=True, scope='conv'):
    """Convolution-transpose building block with Glorot-uniform weights.

    Args:
        batch_input: NHWC input tensor.
        kernel: An integer specifying the width and height of the 2D
            convolution window.
        output_channel: Number of output feature channels.
        stride: Upsampling stride.
        use_bias: If False, the bias variable is disabled entirely.
        scope: Variable scope name for the created weights.

    Returns:
        The transposed-convolution output (no activation applied).
    """
    with tf.variable_scope(scope):
        # The two previously duplicated branches differed only in
        # biases_initializer; slim's default is a zeros initializer, so an
        # explicit zeros/None conditional is behaviorally identical.
        return slim.conv2d_transpose(
            batch_input,
            output_channel,
            [kernel, kernel],
            stride,
            'SAME',
            data_format='NHWC',
            activation_fn=None,
            weights_initializer=tf.glorot_uniform_initializer(),
            biases_initializer=tf.zeros_initializer() if use_bias else None,
        )
def ConvUpscaleBlock(inputs, n_filters, kernel_size=[3, 3], scale=2):
    """
    Basic deconv block for GCN
    Apply Transposed Convolution for feature map upscaling

    Bug fix: the original ignored both `kernel_size` and `scale`,
    hard-coding [3, 3] and stride 2. Both parameters are now honored;
    the defaults preserve the previous behavior for existing callers.
    """
    net = slim.conv2d_transpose(inputs, n_filters, kernel_size=kernel_size,
                                stride=[scale, scale], activation_fn=None)
    return net
def deconv2d(input_, output_dim, ks=4, s=2, stddev=0.02, name="deconv2d"):
    """Bias-free transposed 2-D convolution with truncated-normal weights.

    The layer is created with trainable=False, i.e. its weights are frozen
    (e.g. for loading pretrained values). No activation is applied.
    (Removed a dead commented-out duplicate of the same call.)
    """
    with tf.variable_scope(name):
        return slim.conv2d_transpose(input_, output_dim, ks, s, padding='SAME',
                                     activation_fn=None,
                                     weights_initializer=tf.truncated_normal_initializer(stddev=stddev),
                                     biases_initializer=None,
                                     trainable=False)
def ConvUpscaleBlock(inputs, n_filters, kernel_size=[3, 3], scale=2):
    """
    Basic conv transpose block for Encoder-Decoder upsampling
    Apply successivly Transposed Convolution, BatchNormalization, ReLU nonlinearity

    Bug fix: the original ignored the `kernel_size` parameter (hard-coded
    [3, 3]); it is now forwarded to the layer. The default preserves the
    previous behavior for existing callers.
    """
    net = slim.conv2d_transpose(inputs, n_filters, kernel_size=kernel_size,
                                stride=[scale, scale], activation_fn=None)
    net = tf.nn.relu(slim.batch_norm(net, fused=True))
    return net
def prediction_layer(cfg, input, name, num_outputs):
    """Build a single transposed-conv prediction head under `name`/block4."""
    regularizer = tf.keras.regularizers.l2(0.5 * (cfg.weight_decay))
    shared_layer_args = dict(padding='SAME',
                             activation_fn=None,
                             normalizer_fn=None,
                             weights_regularizer=regularizer)
    with slim.arg_scope([slim.conv2d, slim.conv2d_transpose], **shared_layer_args):
        with tf.compat.v1.variable_scope(name):
            return slim.conv2d_transpose(input, num_outputs,
                                         kernel_size=[3, 3],
                                         stride=cfg.deconvolutionstride,
                                         scope='block4')
def generator(inputs, channel=32, num_blocks=4, name='generator', reuse=False):
    # Image-to-image generator: two stride-2 conv downsamples, `num_blocks`
    # residual blocks at 1/4 resolution, then two transposed-conv upsamples
    # back to a 3-channel output. No normalization; LeakyReLU activations.
    # NOTE(review): slim layers here rely on implicit auto-generated scopes,
    # so the call order must not change.
    with tf.compat.v1.variable_scope(name, reuse=reuse):
        x = slim.convolution2d(inputs, channel, [7, 7], activation_fn=None)
        x = tf.nn.leaky_relu(x)
        # Downsample 1: H/2 x W/2, channels doubled.
        x = slim.convolution2d(x, channel * 2, [3, 3], stride=2, activation_fn=None)
        x = slim.convolution2d(x, channel * 2, [3, 3], activation_fn=None)
        x = tf.nn.leaky_relu(x)
        # Downsample 2: H/4 x W/4, channels doubled again.
        x = slim.convolution2d(x, channel * 4, [3, 3], stride=2, activation_fn=None)
        x = slim.convolution2d(x, channel * 4, [3, 3], activation_fn=None)
        x = tf.nn.leaky_relu(x)
        # Bottleneck residual blocks.
        for idx in range(num_blocks):
            x = resblock(x, out_channel=channel * 4, name='block_{}'.format(idx))
        # Upsample back to H/2 x W/2.
        x = slim.conv2d_transpose(x, channel * 2, [3, 3], stride=2, activation_fn=None)
        x = slim.convolution2d(x, channel * 2, [3, 3], activation_fn=None)
        x = tf.nn.leaky_relu(x)
        # Upsample back to H x W.
        x = slim.conv2d_transpose(x, channel, [3, 3], stride=2, activation_fn=None)
        x = slim.convolution2d(x, channel, [3, 3], activation_fn=None)
        x = tf.nn.leaky_relu(x)
        # Final projection to RGB; output is unbounded (clipping disabled).
        x = slim.convolution2d(x, 3, [7, 7], activation_fn=None)
        #x = tf.clip_by_value(x, -0.999999, 0.999999)
        return x
def conv_transpose_block(inputs, n_filters, strides=2, filter_size=[3, 3], dropout_p=0.0):
    """
    Basic conv transpose block for Encoder-Decoder upsampling
    Apply successivly Transposed Convolution, BatchNormalization, ReLU nonlinearity
    Dropout (if dropout_p > 0) on the inputs

    Bug fix: the original ignored the `filter_size` parameter (kernel was
    hard-coded to [3, 3]); it is now forwarded to the layer. The default
    preserves the previous behavior for existing callers.
    """
    conv = slim.conv2d_transpose(inputs, n_filters, kernel_size=filter_size,
                                 stride=[strides, strides])
    out = tf.nn.relu(slim.batch_norm(conv, fused=True))
    if dropout_p != 0.0:
        # slim.dropout takes a keep probability, hence the complement.
        out = slim.dropout(out, keep_prob=(1.0 - dropout_p))
    return out
def deconv2d(input_, output_dim, ks=4, s=2, stddev=0.02, name="deconv2d"):
    """Bias-free transposed convolution with truncated-normal weight init."""
    weight_init = tf.compat.v1.truncated_normal_initializer(stddev=stddev)
    with tf.compat.v1.variable_scope(name):
        return slim.conv2d_transpose(input_,
                                     output_dim,
                                     ks,
                                     s,
                                     padding="SAME",
                                     activation_fn=None,
                                     weights_initializer=weight_init,
                                     biases_initializer=None)
def prediction_layer(cfg, input, name, num_outputs):
    """Transposed-conv prediction head (fixed stride 2) under `name`/block4."""
    shared_args = {
        "padding": "SAME",
        "activation_fn": None,
        "normalizer_fn": None,
        "weights_regularizer": slim.l2_regularizer(cfg["weight_decay"]),
    }
    with slim.arg_scope([slim.conv2d, slim.conv2d_transpose], **shared_args):
        with tf.compat.v1.variable_scope(name):
            return slim.conv2d_transpose(
                input, num_outputs, kernel_size=[3, 3], stride=2, scope="block4"
            )
def TransitionUp(block_to_upsample, skip_connection, n_filters_keep, scope=None):
    """
    Transition Up for FC-DenseNet
    Performs upsampling on block_to_upsample by a factor 2 and concatenates it with the skip_connection
    """
    with tf.name_scope(scope) as sc:
        # 2x learned upsampling, then channel-wise concat with the skip path.
        upsampled = slim.conv2d_transpose(block_to_upsample,
                                          n_filters_keep,
                                          kernel_size=[3, 3],
                                          stride=[2, 2],
                                          activation_fn=None)
        return tf.concat([upsampled, skip_connection], axis=-1)
def resize_and_crop(net, scale, height, width):
    """Function to resize with conv2d_transpose and crop to [height, width] dimensions."""
    # Fixed (non-trainable) upsampling kernel of size 2*scale.
    up_kernel = get_upsampling_weight(kernel_size=scale * 2)
    net = slim.conv2d_transpose(net, 1, scale * 2, stride=scale,
                                weights_initializer=tf.constant_initializer(value=up_kernel),
                                biases_initializer=tf.constant_initializer(0.),
                                activation_fn=None,
                                normalizer_fn=None,
                                trainable=False)
    _, out_h, out_w, _ = net.shape.as_list()
    # Only crop/pad when the upsampled size differs from the target.
    if (out_h, out_w) != (height, width):
        net = tf.image.resize_with_crop_or_pad(net, height, width)
    return net
def predict(self, features, num_predictions_per_location=1):
    """Performs keypoint prediction.

    Args:
      features: A float tensor of shape [batch_size, height, width,
        channels] containing features for a batch of images.
      num_predictions_per_location: Int containing number of predictions
        per location.

    Returns:
      instance_masks: A float tensor of shape
        [batch_size, 1, num_keypoints, heatmap_height, heatmap_width].

    Raises:
      ValueError: If num_predictions_per_location is not 1.
    """
    if num_predictions_per_location != 1:
        raise ValueError('Only num_predictions_per_location=1 is supported')
    with slim.arg_scope(self._conv_hyperparams_fn()):
        # Stack of 3x3 convs (conv_1 .. conv_N) followed by a 2x2 learned
        # upsampling producing one channel per keypoint.
        net = slim.conv2d(features, self._keypoint_prediction_conv_depth,
                          [3, 3], scope='conv_1')
        for layer_index in range(2, self._keypoint_prediction_num_conv_layers + 1):
            net = slim.conv2d(net, self._keypoint_prediction_conv_depth,
                              [3, 3], scope='conv_%d' % layer_index)
        net = slim.conv2d_transpose(net, self._num_keypoints, [2, 2],
                                    scope='deconv1')
    heatmaps_mask = tf.image.resize_bilinear(
        net, [self._keypoint_heatmap_height, self._keypoint_heatmap_width],
        align_corners=True, name='upsample')
    # NCHW-style reordering plus a singleton "instance" dimension.
    return tf.expand_dims(tf.transpose(heatmaps_mask, perm=[0, 3, 1, 2]),
                          axis=1, name='KeypointPredictor')
def upsample(net, num_outputs, kernel_size, method='nn_upsample_conv'):
    """Upsamples the given inputs.

    Args:
      net: A `Tensor` of size [batch_size, height, width, filters].
      num_outputs: The number of output filters.
      kernel_size: A list of 2 scalars or a 1x2 `Tensor` indicating the scale,
        relative to the inputs, of the output dimensions. For example, if kernel
        size is [2, 3], then the output height and width will be twice and three
        times the input size.
      method: The upsampling method: 'nn_upsample_conv' or 'conv2d_transpose'.

    Returns:
      An `Tensor` which was upsampled using the specified method.

    Raises:
      ValueError: if `method` is not recognized.
    """
    # Validate up front so both branches below are known-good.
    if method not in ('nn_upsample_conv', 'conv2d_transpose'):
        raise ValueError('Unknown method: [%s]' % method)
    input_shape = tf.shape(input=net)
    in_height = input_shape[1]
    in_width = input_shape[2]
    if method == 'nn_upsample_conv':
        # Nearest-neighbor resize followed by a regular 4x4 convolution.
        net = tf.image.resize(
            net, [kernel_size[0] * in_height, kernel_size[1] * in_width],
            method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)
        net = slim.conv2d(net, num_outputs, [4, 4], activation_fn=None)
    else:
        # Learned upsampling via a strided transposed convolution.
        net = slim.conv2d_transpose(net, num_outputs, [4, 4],
                                    stride=kernel_size, activation_fn=None)
    return net
def disp_net(tgt_image, is_training=True):
    """Multi-scale disparity encoder-decoder (DispNet-style).

    Args:
        tgt_image: 4-D NHWC image tensor with static height/width.
        is_training: Unused here; kept for interface compatibility.

    Returns:
        A pair `([disp1, disp2, disp3, disp4], end_points)` where the
        disparities are sigmoid outputs scaled by DISP_SCALING and offset by
        MIN_DISP at full, 1/2, 1/4 and 1/8 resolution, and `end_points` is
        the name of the collection holding intermediate layer outputs.
    """
    # Static input size is needed for the fixed-size bilinear resizes below.
    H = tgt_image.get_shape()[1].value
    W = tgt_image.get_shape()[2].value
    with tf.variable_scope('depth_net') as sc:
        end_points = sc.original_name_scope + '_end_points'
        with slim.arg_scope([slim.conv2d, slim.conv2d_transpose],
                            normalizer_fn=None,
                            weights_regularizer=slim.l2_regularizer(0.05),
                            activation_fn=tf.nn.relu,
                            outputs_collections=end_points):
            # Encoder: 7 stride-2 stages, each followed by a stride-1 refinement.
            cnv1 = slim.conv2d(tgt_image, 32, [7, 7], stride=2, scope='cnv1')
            cnv1b = slim.conv2d(cnv1, 32, [7, 7], stride=1, scope='cnv1b')
            cnv2 = slim.conv2d(cnv1b, 64, [5, 5], stride=2, scope='cnv2')
            cnv2b = slim.conv2d(cnv2, 64, [5, 5], stride=1, scope='cnv2b')
            cnv3 = slim.conv2d(cnv2b, 128, [3, 3], stride=2, scope='cnv3')
            cnv3b = slim.conv2d(cnv3, 128, [3, 3], stride=1, scope='cnv3b')
            cnv4 = slim.conv2d(cnv3b, 256, [3, 3], stride=2, scope='cnv4')
            cnv4b = slim.conv2d(cnv4, 256, [3, 3], stride=1, scope='cnv4b')
            cnv5 = slim.conv2d(cnv4b, 512, [3, 3], stride=2, scope='cnv5')
            cnv5b = slim.conv2d(cnv5, 512, [3, 3], stride=1, scope='cnv5b')
            cnv6 = slim.conv2d(cnv5b, 512, [3, 3], stride=2, scope='cnv6')
            cnv6b = slim.conv2d(cnv6, 512, [3, 3], stride=1, scope='cnv6b')
            cnv7 = slim.conv2d(cnv6b, 512, [3, 3], stride=2, scope='cnv7')
            cnv7b = slim.conv2d(cnv7, 512, [3, 3], stride=1, scope='cnv7b')
            # Decoder with encoder skip connections.
            upcnv7 = slim.conv2d_transpose(cnv7b, 512, [3, 3], stride=2, scope='upcnv7')
            # There might be dimension mismatch due to uneven down/up-sampling
            upcnv7 = resize_like(upcnv7, cnv6b)
            i7_in = tf.concat([upcnv7, cnv6b], axis=3)
            icnv7 = slim.conv2d(i7_in, 512, [3, 3], stride=1, scope='icnv7')
            upcnv6 = slim.conv2d_transpose(icnv7, 512, [3, 3], stride=2, scope='upcnv6')
            upcnv6 = resize_like(upcnv6, cnv5b)
            i6_in = tf.concat([upcnv6, cnv5b], axis=3)
            icnv6 = slim.conv2d(i6_in, 512, [3, 3], stride=1, scope='icnv6')
            upcnv5 = slim.conv2d_transpose(icnv6, 256, [3, 3], stride=2, scope='upcnv5')
            upcnv5 = resize_like(upcnv5, cnv4b)
            i5_in = tf.concat([upcnv5, cnv4b], axis=3)
            icnv5 = slim.conv2d(i5_in, 256, [3, 3], stride=1, scope='icnv5')
            upcnv4 = slim.conv2d_transpose(icnv5, 128, [3, 3], stride=2, scope='upcnv4')
            i4_in = tf.concat([upcnv4, cnv3b], axis=3)
            icnv4 = slim.conv2d(i4_in, 128, [3, 3], stride=1, scope='icnv4')
            disp4 = DISP_SCALING * slim.conv2d(icnv4, 1, [3, 3], stride=1,
                                               activation_fn=tf.sigmoid,
                                               normalizer_fn=None,
                                               scope='disp4') + MIN_DISP
            # Fix: np.int was removed in NumPy 1.24+; it was a plain alias of
            # the builtin int, so int() is an exact replacement.
            disp4_up = tf.image.resize_bilinear(
                disp4, [int(H / 4), int(W / 4)])
            upcnv3 = slim.conv2d_transpose(icnv4, 64, [3, 3], stride=2, scope='upcnv3')
            i3_in = tf.concat([upcnv3, cnv2b, disp4_up], axis=3)
            icnv3 = slim.conv2d(i3_in, 64, [3, 3], stride=1, scope='icnv3')
            disp3 = DISP_SCALING * slim.conv2d(icnv3, 1, [3, 3], stride=1,
                                               activation_fn=tf.sigmoid,
                                               normalizer_fn=None,
                                               scope='disp3') + MIN_DISP
            disp3_up = tf.image.resize_bilinear(
                disp3, [int(H / 2), int(W / 2)])
            upcnv2 = slim.conv2d_transpose(icnv3, 32, [3, 3], stride=2, scope='upcnv2')
            i2_in = tf.concat([upcnv2, cnv1b, disp3_up], axis=3)
            icnv2 = slim.conv2d(i2_in, 32, [3, 3], stride=1, scope='icnv2')
            disp2 = DISP_SCALING * slim.conv2d(icnv2, 1, [3, 3], stride=1,
                                               activation_fn=tf.sigmoid,
                                               normalizer_fn=None,
                                               scope='disp2') + MIN_DISP
            disp2_up = tf.image.resize_bilinear(disp2, [H, W])
            upcnv1 = slim.conv2d_transpose(icnv2, 16, [3, 3], stride=2, scope='upcnv1')
            i1_in = tf.concat([upcnv1, disp2_up], axis=3)
            icnv1 = slim.conv2d(i1_in, 16, [3, 3], stride=1, scope='icnv1')
            disp1 = DISP_SCALING * slim.conv2d(icnv1, 1, [3, 3], stride=1,
                                               activation_fn=tf.sigmoid,
                                               normalizer_fn=None,
                                               scope='disp1') + MIN_DISP
            # I erased the old conversion using collection_to_dict
            return [disp1, disp2, disp3, disp4], end_points
def prediction_layers(
    self,
    features,
    end_points,
    input_shape,
    scope="pose",
    reuse=None,
):
    """Attach the pose-prediction decoder(s) on top of the backbone.

    Two modes, selected by cfg['multi_stage']:
      * MuNet: fuses two intermediate ResNet banks with the final features,
        then runs a multi-stage (OpenPose-like) heatmap/PAF decoder.
      * Dual-fusion: one intermediate bank concatenated with upsampled
        backbone features, then the parent class's prediction heads.

    Args:
        features: Final backbone feature map.
        end_points: Dict of intermediate backbone activations.
        input_shape: Static input shape (N, H, W, C).
        scope: Variable scope name for the prediction heads.
        reuse: TF1 variable-reuse flag.

    Returns:
        Dict of output tensors (e.g. part_pred, locref, pairwise_pred, ...).
    """
    net_type = self.cfg['net_type']
    if self.cfg[
            'multi_stage']:  # MuNet! (multi_stage decoder + multi_fusion)
        # Defining multi_fusion backbone
        num_layers = re.findall("resnet_([0-9]*)", net_type)[0]
        layer_name = ("resnet_v1_{}".format(num_layers) +
                      "/block{}/unit_{}/bottleneck_v1")
        mid_pt_block1 = layer_name.format(1, 3)
        mid_pt_block2 = layer_name.format(2, 3)
        # Backbone output is stride 16; s8/s4 grids are 2x and 4x larger.
        final_dims = tf.math.ceil(
            tf.divide(input_shape[1:3], tf.convert_to_tensor(16)))
        interim_dims_s8 = tf.scalar_mul(2, final_dims)
        interim_dims_s8 = tf.cast(interim_dims_s8, tf.int32)
        interim_dims_s4 = tf.scalar_mul(2, interim_dims_s8)
        interim_dims_s4 = tf.cast(interim_dims_s4, tf.int32)
        bank_1 = end_points[mid_pt_block1]
        bank_2 = end_points[mid_pt_block2]
        bank_2_s8 = tf.compat.v1.image.resize_images(
            bank_2, interim_dims_s8)
        bank_1_s4 = tf.compat.v1.image.resize_images(
            bank_1, interim_dims_s4)
        with slim.arg_scope(
                [slim.conv2d],
                padding="SAME",
                normalizer_fn=slim.layers.batch_norm,
                activation_fn=tf.nn.relu,
                weights_regularizer=slim.l2_regularizer(
                    self.cfg["weight_decay"]),
        ):
            # Bring both banks down to stride 16 with 128 channels each.
            with tf.compat.v1.variable_scope("decoder_filters"):
                bank_2_s16 = slim.conv2d(
                    bank_2_s8,
                    512,
                    kernel_size=[3, 3],
                    stride=2,
                    scope="decoder_parallel_1",
                )
                bank_2_s16 = slim.conv2d(
                    bank_2_s16,
                    128,
                    kernel_size=[1, 1],
                    stride=1,
                    scope="decoder_parallel_2",
                )
                bank_1_s8 = slim.conv2d(
                    bank_1_s4,
                    256,
                    kernel_size=[3, 3],
                    stride=2,
                    scope="decoder_parallel_3",
                )
                bank_1_s16 = slim.conv2d(
                    bank_1_s8,
                    256,
                    kernel_size=[3, 3],
                    stride=2,
                    scope="decoder_parallel_4",
                )
                bank_1_s16 = slim.conv2d(
                    bank_1_s16,
                    128,
                    kernel_size=[1, 1],
                    stride=1,
                    scope="decoder_parallel_5",
                )
        with slim.arg_scope(
                [slim.conv2d_transpose],
                padding="SAME",
                normalizer_fn=None,
                weights_regularizer=slim.l2_regularizer(
                    self.cfg["weight_decay"]),
        ):
            # Upsample the fused features to the configured output stride.
            with tf.compat.v1.variable_scope("upsampled_features"):
                concat_3_s16 = tf.concat(
                    [bank_1_s16, bank_2_s16, features], 3)
                if self.cfg["stride"] == 8:
                    net = concat_3_s16
                elif self.cfg["stride"] == 4:
                    upsampled_features_2x = slim.conv2d_transpose(
                        concat_3_s16,
                        self.cfg.get("bank3", 128),
                        kernel_size=[3, 3],
                        stride=2,
                        scope="block3",
                    )
                    net = upsampled_features_2x
                elif self.cfg["stride"] == 2:
                    upsampled_features_2x = slim.conv2d_transpose(
                        concat_3_s16,
                        self.cfg.get("bank3", 128),
                        kernel_size=[3, 3],
                        stride=2,
                        scope="block3",
                    )
                    upsampled_features_4x = slim.conv2d_transpose(
                        upsampled_features_2x,
                        self.cfg.get("bank5", 128),
                        kernel_size=[3, 3],
                        stride=2,
                        scope="block4",
                    )
                    net = upsampled_features_4x
        out = {}
        # Attaching multi-stage decoder
        with tf.compat.v1.variable_scope(scope, reuse=reuse):
            stage1_hm_out = prediction_layer(
                self.cfg,
                net,
                "part_pred_s1",
                self.cfg["num_joints"] + self.cfg.get("num_idchannel", 0),
            )
            if self.cfg["location_refinement"]:
                out["locref"] = prediction_layer(
                    self.cfg, net, "locref_pred", self.cfg["num_joints"] * 2)
            if (self.cfg["pairwise_predict"]
                    and "multi-animal" not in self.cfg["dataset_type"]):
                out["pairwise_pred"] = prediction_layer(
                    self.cfg,
                    net,
                    "pairwise_pred",
                    self.cfg["num_joints"] * (self.cfg["num_joints"] - 1) * 2,
                )
            if (self.cfg["partaffinityfield_predict"]
                    and "multi-animal" in self.cfg["dataset_type"]):
                # Shared feature tensor re-fed into every refinement stage.
                feature = slim.conv2d_transpose(net,
                                                self.cfg.get("bank3", 128),
                                                kernel_size=[3, 3],
                                                stride=2)
                stage1_paf_out = prediction_layer(
                    self.cfg, net, "pairwise_pred_s1",
                    self.cfg["num_limbs"] * 2)
                stage2_in = tf.concat(
                    [stage1_hm_out, stage1_paf_out, feature], 3)
                stage_input = stage2_in
                stage_paf_output = stage1_paf_out
                stage_hm_output = stage1_hm_out
                # Stages 2-4 iteratively refine heatmaps/PAFs; heatmaps from
                # stage > 2 are residual on the previous stage's output.
                for i in range(2, 5):
                    pre_stage_paf_output = stage_paf_output
                    pre_stage_hm_output = stage_hm_output
                    stage_paf_output = prediction_layer_stage(
                        self.cfg,
                        stage_input,
                        f"pairwise_pred_s{i}",
                        self.cfg["num_limbs"] * 2,
                    )
                    stage_hm_output = prediction_layer_stage(
                        self.cfg,
                        stage_input,
                        f"part_pred_s{i}",
                        self.cfg["num_joints"] +
                        self.cfg.get("num_idchannel", 0),
                    )
                    if i > 2:
                        # stage_paf_output = stage_paf_output + pre_stage_paf_output
                        stage_hm_output = stage_hm_output + pre_stage_hm_output
                    stage_input = tf.concat(
                        [stage_hm_output, stage_paf_output, feature], 3)
                out["part_pred"] = prediction_layer_stage(
                    self.cfg,
                    stage_input,
                    "part_pred",
                    self.cfg["num_joints"] + self.cfg.get("num_idchannel", 0),
                )
                out["pairwise_pred"] = prediction_layer_stage(
                    self.cfg, stage_input, "pairwise_pred",
                    self.cfg["num_limbs"] * 2)
            if self.cfg["intermediate_supervision"]:
                interm_name = layer_name.format(
                    3, self.cfg["intermediate_supervision_layer"])
                block_interm_out = end_points[interm_name]
                out["part_pred_interm"] = prediction_layer(
                    self.cfg,
                    block_interm_out,
                    "intermediate_supervision",
                    self.cfg["num_joints"] + self.cfg.get("num_idchannel", 0),
                )
    else:
        # dual fusion net (for stride 4 experiments)
        if "resnet" in net_type:
            num_layers = re.findall("resnet_([0-9]*)", net_type)[0]
            layer_name = "resnet_v1_{}/block{}/unit_{}/bottleneck_v1"
            mid_pt = layer_name.format(num_layers, 2, 3)
        elif "mobilenet" in net_type:
            mid_pt = "layer_7"
        elif "efficientnet" in net_type:
            mid_pt = f"block_{parallel_layers[net_type.split('-')[1]]}"
        else:
            raise ValueError(f"Unknown network of type {net_type}")
        final_dims = tf.math.ceil(
            tf.divide(input_shape[1:3], tf.convert_to_tensor(value=16)))
        interim_dims = tf.scalar_mul(2, final_dims)
        interim_dims = tf.cast(interim_dims, tf.int32)
        bank_3 = end_points[mid_pt]
        bank_3 = tf.image.resize(bank_3, interim_dims)
        with slim.arg_scope(
                [slim.conv2d],
                padding="SAME",
                normalizer_fn=None,
                weights_regularizer=tf.keras.regularizers.l2(
                    0.5 * (self.cfg['weight_decay'])),
        ):
            with tf.compat.v1.variable_scope("decoder_filters"):
                bank_3 = slim.conv2d(bank_3,
                                     self.cfg.get('bank3', 128),
                                     1,
                                     scope="decoder_parallel_1")
        with slim.arg_scope(
                [slim.conv2d_transpose],
                padding="SAME",
                normalizer_fn=None,
                weights_regularizer=tf.keras.regularizers.l2(
                    0.5 * (self.cfg['weight_decay'])),
        ):
            with tf.compat.v1.variable_scope("upsampled_features"):
                upsampled_features = slim.conv2d_transpose(
                    features,
                    self.cfg.get('bank5', 128),
                    kernel_size=[3, 3],
                    stride=2,
                    scope="block4",
                )
        net = tf.concat([bank_3, upsampled_features], 3)
        # Parent class builds the standard prediction heads on the fused net.
        out = super(PoseMultiNet, self).prediction_layers(
            net,
            scope,
            reuse,
        )
        with tf.compat.v1.variable_scope(scope, reuse=reuse):
            if self.cfg[
                    'intermediate_supervision'] and "efficientnet" not in net_type:
                if "mobilenet" in net_type:
                    feat = end_points[
                        f"layer_{self.cfg['intermediate_supervision_layer']}"]
                elif "resnet" in net_type:
                    layer_name = "resnet_v1_{}/block{}/unit_{}/bottleneck_v1"
                    num_layers = re.findall("resnet_([0-9]*)", net_type)[0]
                    interm_name = layer_name.format(
                        num_layers, 3,
                        self.cfg['intermediate_supervision_layer'])
                    feat = end_points[interm_name]
                else:
                    return out
                pred_layer = out["part_pred_interm"] = prediction_layer(
                    self.cfg,
                    feat,
                    "intermediate_supervision",
                    self.cfg['num_joints'] + self.cfg.get("num_idchannel", 0),
                )
                out["part_pred_interm"] = pred_layer
    return out
def rnn_depth_net_encoderlstm_wpose(current_input, hidden_state, is_training=True):
    """Recurrent depth + pose network: a conv encoder with a ConvLSTM cell at
    every scale, a 6-DoF pose head at the bottleneck, and a conv-transpose
    decoder with skip connections that regresses a depth map.

    Args:
        current_input: NHWC image tensor for the current time step.
        hidden_state: Sequence of 7 ConvLSTM states (one per encoder scale).
        is_training: Controls batch-norm statistics updates.

    Returns:
        (depth, pose_final, new_hidden_states): depth is a sigmoid map offset
        by 0.01 (strictly positive); pose_final is [-1, 6] scaled by 0.01;
        new_hidden_states is the list of updated ConvLSTM states.
    """
    batch_norm_params = {'is_training': is_training, 'decay': 0.99}
    H = current_input.get_shape()[1]
    W = current_input.get_shape()[2]
    # AUTO_REUSE lets the net be unrolled over time steps with shared weights.
    with tf.compat.v1.variable_scope('rnn_depth_net', reuse=tf.compat.v1.AUTO_REUSE) as sc:
        with slim.arg_scope([slim.conv2d, slim.conv2d_transpose],
                            normalizer_fn=slim.batch_norm,
                            normalizer_params=batch_norm_params,
                            weights_regularizer=tf.keras.regularizers.l2(
                                0.5 * (0.005)),
                            activation_fn=tf.nn.leaky_relu):
            # Encoder: each stride-2 conv is followed by a ConvLSTM cell
            # (the commented-out lines are the non-recurrent alternatives).
            cnv1 = slim.conv2d(current_input, 32, [3, 3], stride=2, scope='cnv1')
            cnv1b, hidden1 = convLSTM(cnv1, hidden_state[0], 32, [3, 3], scope='cnv1_lstm')
            #cnv1b = slim.conv2d(cnv1, 32, [3, 3], rate=2, stride=1, scope='cnv1b')
            cnv2 = slim.conv2d(cnv1b, 64, [3, 3], stride=2, scope='cnv2')
            cnv2b, hidden2 = convLSTM(cnv2, hidden_state[1], 64, [3, 3], scope='cnv2_lstm')
            #cnv2b = slim.conv2d(cnv2, 64, [3, 3], rate=2, stride=1, scope='cnv2b')
            cnv3 = slim.conv2d(cnv2b, 128, [3, 3], stride=2, scope='cnv3')
            cnv3b, hidden3 = convLSTM(cnv3, hidden_state[2], 128, [3, 3], scope='cnv3_lstm')
            #cnv3b = slim.conv2d(cnv3, 128, [3, 3], rate=2, stride=1, scope='cnv3b')
            cnv4 = slim.conv2d(cnv3b, 256, [3, 3], stride=2, scope='cnv4')
            cnv4b, hidden4 = convLSTM(cnv4, hidden_state[3], 256, [3, 3], scope='cnv4_lstm')
            #cnv4b = slim.conv2d(cnv4, 256, [3, 3], rate=2, stride=1, scope='cnv4b')
            cnv5 = slim.conv2d(cnv4b, 256, [3, 3], stride=2, scope='cnv5')
            cnv5b, hidden5 = convLSTM(cnv5, hidden_state[4], 256, [3, 3], scope='cnv5_lstm')
            #cnv5b = slim.conv2d(cnv5, 256, [3, 3], rate=2, stride=1, scope='cnv5b')
            cnv6 = slim.conv2d(cnv5b, 256, [3, 3], stride=2, scope='cnv6')
            cnv6b, hidden6 = convLSTM(cnv6, hidden_state[5], 256, [3, 3], scope='cnv6_lstm')
            #cnv6b = slim.conv2d(cnv6, 256, [3, 3], rate=2, stride=1, scope='cnv6b')
            cnv7 = slim.conv2d(cnv6b, 512, [3, 3], stride=2, scope='cnv7')
            cnv7b, hidden7 = convLSTM(cnv7, hidden_state[6], 512, [3, 3], scope='cnv7_lstm')
            #cnv7b = slim.conv2d(cnv7, 512, [3, 3], rate=2, stride=1, scope='cnv7b')
            # Pose head: spatial global average of a 1x1 conv, scaled output.
            with tf.compat.v1.variable_scope('pose'):
                pose_pred = slim.conv2d(cnv7b, 6, 1, 1, normalizer_fn=None, activation_fn=None)
                pose_avg = tf.reduce_mean(input_tensor=pose_pred, axis=[1, 2])
                pose_final = tf.reshape(pose_avg, [-1, 6]) * 0.01
            # Decoder with encoder skip connections.
            upcnv7 = slim.conv2d_transpose(cnv7b, 256, [3, 3], stride=2, scope='upcnv7')
            # There might be dimension mismatch due to uneven down/up-sampling
            upcnv7 = resize_like(upcnv7, cnv6b)
            i7_in = tf.concat([upcnv7, cnv6b], axis=3)
            icnv7 = slim.conv2d(i7_in, 256, [3, 3], stride=1, scope='icnv7')
            #icnv7, hidden8= convLSTM(i7_in, hidden_state[7], 256, [3, 3], scope='icnv7_lstm')
            upcnv6 = slim.conv2d_transpose(icnv7, 128, [3, 3], stride=2, scope='upcnv6')
            upcnv6 = resize_like(upcnv6, cnv5b)
            i6_in = tf.concat([upcnv6, cnv5b], axis=3)
            icnv6 = slim.conv2d(i6_in, 128, [3, 3], stride=1, scope='icnv6')
            #icnv6, hidden9= convLSTM(i6_in, hidden_state[8], 128, [3, 3], scope='icnv6_lstm')
            upcnv5 = slim.conv2d_transpose(icnv6, 128, [3, 3], stride=2, scope='upcnv5')
            upcnv5 = resize_like(upcnv5, cnv4b)
            i5_in = tf.concat([upcnv5, cnv4b], axis=3)
            icnv5 = slim.conv2d(i5_in, 128, [3, 3], stride=1, scope='icnv5')
            #icnv5, hidden10 = convLSTM(i5_in, hidden_state[9], 128, [3, 3], scope='icnv5_lstm')
            upcnv4 = slim.conv2d_transpose(icnv5, 128, [3, 3], stride=2, scope='upcnv4')
            upcnv4 = resize_like(upcnv4, cnv3b)
            i4_in = tf.concat([upcnv4, cnv3b], axis=3)
            icnv4 = slim.conv2d(i4_in, 128, [3, 3], stride=1, scope='icnv4')
            #icnv4, hidden11 = convLSTM(i4_in, hidden_state[10], 128, [3, 3], scope='icnv4_lstm')
            upcnv3 = slim.conv2d_transpose(icnv4, 64, [3, 3], stride=2, scope='upcnv3')
            upcnv3 = resize_like(upcnv3, cnv2b)
            i3_in = tf.concat([upcnv3, cnv2b], axis=3)
            icnv3 = slim.conv2d(i3_in, 64, [3, 3], stride=1, scope='icnv3')
            #icnv3, hidden12 = convLSTM(i3_in, hidden_state[11], 64, [3, 3], scope='icnv3_lstm')
            upcnv2 = slim.conv2d_transpose(icnv3, 32, [3, 3], stride=2, scope='upcnv2')
            upcnv2 = resize_like(upcnv2, cnv1b)
            i2_in = tf.concat([upcnv2, cnv1b], axis=3)
            icnv2 = slim.conv2d(i2_in, 32, [3, 3], stride=1, scope='icnv2')
            #icnv2, hidden13 = convLSTM(i2_in, hidden_state[12], 32, [3, 3], scope='icnv2_lstm')
            #import pdb;pdb.set_trace()
            upcnv1 = slim.conv2d_transpose(icnv2, 16, [3, 3], stride=2, scope='upcnv1')
            #icnv1, hidden14 = convLSTM(upcnv1, hidden_state[13], 16, [3, 3], scope='icnv1_lstm')
            icnv1 = slim.conv2d(upcnv1, 16, [3, 3], stride=1, scope='icnv1')
            # The 0.01 offset keeps the predicted depth strictly positive.
            depth = slim.conv2d(icnv1, 1, [3, 3], stride=1, activation_fn=tf.sigmoid,
                                normalizer_fn=None, scope='disp1') + 0.01
            return depth, pose_final, [
                hidden1, hidden2, hidden3, hidden4, hidden5, hidden6, hidden7
            ]
def build_net(self):
    """Assemble the segmentation graph: input placeholders, a frozen
    pretrained backbone (VGG16 or ResNet-50), a dilated-convolution bridge,
    an upsampling decoder with skip connections, and BCE/dice/IoU metrics.

    Side effects: populates self.input_x/input_y, self.output, self.pred,
    the loss/metric attributes, and the variable bookkeeping lists.
    """
    def up_conv_block(input, output_dim, base_dim):
        # 1x1 reduce -> 2x learned upsample -> 1x1 expand.
        block = slim.conv2d(input, base_dim, [1, 1])
        block = slim.convolution2d_transpose(block, base_dim, [3, 3], stride=2)
        block = slim.conv2d(block, output_dim, [1, 1])
        return block

    settings = Settings()
    dim_x, dim_y = settings.dim_x, settings.dim_y
    self.input_x = tf.placeholder(tf.float32, [None, dim_x, dim_y, 3], 'input_x')
    self.input_y = tf.placeholder(tf.float32, [None, dim_x, dim_y], 'input_y')
    input_y_exp = tf.expand_dims(self.input_y, -1)
    # Frozen (is_training=False) pretrained backbone.
    if settings.pretrained_model == 'VGG16':
        with slim.arg_scope(vgg.vgg_arg_scope()):
            net, endpoints = vgg.vgg_16(self.input_x,
                                        global_pool=False,
                                        is_training=False,
                                        spatial_squeeze=False)
    elif settings.pretrained_model == 'RES50':
        with slim.arg_scope(resnet_v1.resnet_arg_scope()):
            net, endpoints = resnet_v1.resnet_v1_50(self.input_x,
                                                    global_pool=False,
                                                    is_training=False)
    else:
        raise ValueError('pretrained model type {} not recognised.'.format(
            settings.pretrained_model))
    pretrained_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
    '''
    with tf.variable_scope('dummy_out'):
        dummy_loss = tf.reduce_mean(net)
    print_endpoints(endpoints, 'endpoints.txt')
    '''
    # Bridge: chained dilated 3x3 convs (rates 1, 2, 4, 8), outputs summed.
    with tf.variable_scope('bridge'):
        bridge_output = []
        net = slim.conv2d(net, 512, [3, 3])
        bridge_output.append(net)
        net = tf.layers.conv2d(net, 512, [3, 3], dilation_rate=(2, 2),
                               activation=tf.nn.relu, padding='same')
        bridge_output.append(net)
        net = tf.layers.conv2d(net, 512, [3, 3], dilation_rate=(4, 4),
                               activation=tf.nn.relu, padding='same')
        bridge_output.append(net)
        net = tf.layers.conv2d(net, 512, [3, 3], dilation_rate=(8, 8),
                               activation=tf.nn.relu, padding='same')
        bridge_output.append(net)
        net = tf.add_n(bridge_output)
        # print('bridge output')
        # print(net)
    # Decoder: upsample and add skip connections.
    # NOTE(review): skips are hard-wired to resnet_v1_50 endpoint names and
    # would KeyError under the VGG16 backbone — confirm intended.
    with tf.variable_scope('decoder'):
        net = up_conv_block(net, 1024, 256)
        bridged = endpoints['resnet_v1_50/block3/unit_5/bottleneck_v1']
        net = net + bridged  # 64, 64, 1024
        net = up_conv_block(net, 512, 128)
        bridged = endpoints['resnet_v1_50/block2/unit_3/bottleneck_v1']
        net = net + bridged  # 128, 128, 512
        net = up_conv_block(net, 256, 64)
        bridged = endpoints['resnet_v1_50/block1/unit_2/bottleneck_v1']
        net = net + bridged  # 256, 256, 256
        net = up_conv_block(net, 64, 16)
        # 512, 512, 64
        net = slim.conv2d_transpose(net, 32, [4, 4], stride=2)
        # 1024, 1024, 32
        net = slim.conv2d(net, 1, [3, 3], activation_fn=None)
    with tf.variable_scope('metrics'):
        # Raw logits and sigmoid probabilities.
        self.output = net
        self.pred = tf.nn.sigmoid(net)
        self.bce_loss = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(labels=input_y_exp,
                                                    logits=self.output))
        pred_flat = tf.layers.flatten(self.pred)
        y_flat = tf.layers.flatten(input_y_exp)
        # Soft dice coefficient on the (non-thresholded) probabilities.
        self.dice_coeff = 2 * tf.reduce_sum(pred_flat * y_flat) / \
            (tf.reduce_sum(pred_flat) + tf.reduce_sum(y_flat))
        self.dice_loss = 1 - self.dice_coeff
        w1, w2 = settings.bce_dice_weights
        self.dice_bce_loss = w1 * self.bce_loss + w2 * self.dice_loss
        # Thresholded binary masks for IoU.
        self.bin_pred = tf.cast(self.pred > settings.threshold, tf.int32)
        input_y_bin = tf.cast(input_y_exp > settings.threshold, tf.int32)
        self.debug_x_sum = tf.reduce_sum(self.pred)
        self.debug_x_bin_sum = tf.reduce_sum(self.bin_pred)
        self.debug_y_sum = tf.reduce_sum(input_y_exp)
        self.iou = tf.reduce_sum(self.bin_pred * input_y_bin) / \
            (tf.reduce_sum(
                tf.cast((self.bin_pred + input_y_bin) > 0, tf.int32)
            ))
        tf.summary.scalar('bce_loss', self.bce_loss)
        tf.summary.scalar('dice_coeff', self.dice_coeff)
        tf.summary.scalar('bce_dice_loss', self.dice_bce_loss)
        # Regularization
        # NOTE(review): this takes l2_loss of the scalar weight *setting*
        # itself, not of the trainable variables — looks like a bug; the
        # commented-out apply_regularization below is probably the intent.
        self.l2_loss = tf.nn.l2_loss(settings.l2_weight)
        # self.l2_loss = tf.contrib.layers.apply_regularization(
        #     regularizer=tf.contrib.layers.l2_regularizer(settings.l2_weight),
        #     weights_list=tf.trainable_variables()
        # )
        self.dice_bce_l2_loss = self.dice_bce_loss + self.l2_loss
        tf.summary.scalar('l2_loss', self.l2_loss)
        tf.summary.scalar('dice_bce_l2_loss', self.dice_bce_l2_loss)
    self.pretrained_variables = pretrained_variables
    self.pretrained_endpoints = endpoints
    # Only the bridge and decoder are trained; the backbone stays frozen.
    self.trainable_variables = []
    self.trainable_variables.extend(
        tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='bridge'))
    self.trainable_variables.extend(
        tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='decoder'))
def generator(self, inputs, reuse=False, scope='g_net'):
    """Multi-scale (coarse-to-fine) deblurring generator.

    Runs `self.n_levels` passes over progressively larger resolutions,
    feeding each level's prediction back in as input to the next level;
    when self.args.model == 'lstm' a ConvLSTM state is carried across
    levels at 1/4 resolution.

    Returns:
        List of per-level predictions, coarsest first.
    """
    n, h, w, c = inputs.get_shape().as_list()
    if self.args.model == 'lstm':
        with tf.compat.v1.variable_scope('LSTM'):
            # ConvLSTM runs at the encoder bottleneck (1/4 resolution).
            cell = BasicConvLSTMCell([h / 4, w / 4], [3, 3], 128)
            rnn_state = cell.zero_state(batch_size=self.batch_size, dtype=tf.float32)
    x_unwrap = []
    with tf.compat.v1.variable_scope(scope, reuse=reuse):
        with slim.arg_scope(
            [slim.conv2d, slim.conv2d_transpose],
                activation_fn=tf.nn.relu,
                padding='SAME',
                normalizer_fn=None,
                weights_initializer=tf.keras.initializers.GlorotUniform(),
                biases_initializer=tf.constant_initializer(0.0)):
            inp_pred = inputs
            # NOTE(review): xrange is Python-2-only; under Python 3 this must
            # be range — confirm the targeted interpreter.
            for i in xrange(self.n_levels):
                # Coarsest level first: scale < 1 shrinks the working size.
                scale = self.scale**(self.n_levels - i - 1)
                hi = int(round(h * scale))
                wi = int(round(w * scale))
                # Blurry input plus the previous level's prediction, both at
                # this level's size; gradients are cut through the recurrence.
                inp_blur = tf.image.resize(inputs, [hi, wi])
                inp_pred = tf.stop_gradient(
                    tf.image.resize(inp_pred, [hi, wi]))
                inp_all = tf.concat([inp_blur, inp_pred], axis=3, name='inp')
                if self.args.model == 'lstm':
                    rnn_state = tf.image.resize(rnn_state, [hi // 4, wi // 4])
                # encoder
                conv1_1 = slim.conv2d(inp_all, 32, [5, 5], scope='enc1_1')
                conv1_2 = ResnetBlock(conv1_1, 32, 5, scope='enc1_2')
                conv1_3 = ResnetBlock(conv1_2, 32, 5, scope='enc1_3')
                conv1_4 = ResnetBlock(conv1_3, 32, 5, scope='enc1_4')
                conv2_1 = slim.conv2d(conv1_4, 64, [5, 5], stride=2, scope='enc2_1')
                conv2_2 = ResnetBlock(conv2_1, 64, 5, scope='enc2_2')
                conv2_3 = ResnetBlock(conv2_2, 64, 5, scope='enc2_3')
                conv2_4 = ResnetBlock(conv2_3, 64, 5, scope='enc2_4')
                conv3_1 = slim.conv2d(conv2_4, 128, [5, 5], stride=2, scope='enc3_1')
                conv3_2 = ResnetBlock(conv3_1, 128, 5, scope='enc3_2')
                conv3_3 = ResnetBlock(conv3_2, 128, 5, scope='enc3_3')
                conv3_4 = ResnetBlock(conv3_3, 128, 5, scope='enc3_4')
                if self.args.model == 'lstm':
                    deconv3_4, rnn_state = cell(conv3_4, rnn_state)
                else:
                    deconv3_4 = conv3_4
                # decoder
                deconv3_3 = ResnetBlock(deconv3_4, 128, 5, scope='dec3_3')
                deconv3_2 = ResnetBlock(deconv3_3, 128, 5, scope='dec3_2')
                deconv3_1 = ResnetBlock(deconv3_2, 128, 5, scope='dec3_1')
                deconv2_4 = slim.conv2d_transpose(deconv3_1, 64, [4, 4], stride=2, scope='dec2_4')
                # Additive skip connection from the matching encoder stage.
                cat2 = deconv2_4 + conv2_4
                deconv2_3 = ResnetBlock(cat2, 64, 5, scope='dec2_3')
                deconv2_2 = ResnetBlock(deconv2_3, 64, 5, scope='dec2_2')
                deconv2_1 = ResnetBlock(deconv2_2, 64, 5, scope='dec2_1')
                deconv1_4 = slim.conv2d_transpose(deconv2_1, 32, [4, 4], stride=2, scope='dec1_4')
                cat1 = deconv1_4 + conv1_4
                deconv1_3 = ResnetBlock(cat1, 32, 5, scope='dec1_3')
                deconv1_2 = ResnetBlock(deconv1_3, 32, 5, scope='dec1_2')
                deconv1_1 = ResnetBlock(deconv1_2, 32, 5, scope='dec1_1')
                inp_pred = slim.conv2d(deconv1_1, self.chns, [5, 5], activation_fn=None, scope='dec1_0')
                if i >= 0:
                    x_unwrap.append(inp_pred)
                if i == 0:
                    # Share weights across the remaining pyramid levels.
                    tf.compat.v1.get_variable_scope().reuse_variables()
    return x_unwrap
def generator(inputs,
              depth=64,
              final_size=32,
              num_outputs=3,
              is_training=True,
              reuse=None,
              scope='Generator',
              fused_batch_norm=False):
    """Generator network for DCGAN.

    Construct generator network from inputs to the final endpoint.

    Args:
        inputs: A tensor with any size N. [batch_size, N]
        depth: Number of channels in last deconvolution layer.
        final_size: The shape of the final output.
        num_outputs: Number of output features. For images, this is the
            number of channels.
        is_training: whether is training or not.
        reuse: Whether or not the network has its variables should be
            reused. scope must be given to be reused.
        scope: Optional variable_scope.
        fused_batch_norm: If `True`, use a faster, fused implementation of
            batch norm.

    Returns:
        logits: the pre-softmax activations, a tensor of size
            [batch_size, 32, 32, channels]
        end_points: a dictionary from components of the network to their
            activation.

    Raises:
        ValueError: If `inputs` is not 2-dimensional.
        ValueError: If `final_size` isn't a power of 2 or is less than 8.
    """
    normalizer_fn = slim.batch_norm
    normalizer_fn_args = {
        'is_training': is_training,
        'zero_debias_moving_mean': True,
        'fused': fused_batch_norm,
    }
    inputs.get_shape().assert_has_rank(2)
    if log(final_size, 2) != int(log(final_size, 2)):
        raise ValueError('`final_size` (%i) must be a power of 2.' %
                         final_size)
    if final_size < 8:
        # Message matches the `< 8` check: 8 itself is accepted.
        raise ValueError('`final_size` (%i) must be at least 8.' % final_size)

    end_points = {}
    # Each deconv layer doubles spatial size; the first produces 4x4.
    num_layers = int(log(final_size, 2)) - 1
    with tf.compat.v1.variable_scope(scope, values=[inputs],
                                     reuse=reuse) as scope:
        with slim.arg_scope([normalizer_fn], **normalizer_fn_args):
            with slim.arg_scope([slim.conv2d_transpose],
                                normalizer_fn=normalizer_fn,
                                stride=2,
                                kernel_size=4):
                # Treat the input vector as a 1x1 spatial map.
                net = tf.expand_dims(tf.expand_dims(inputs, 1), 1)

                # First upscaling is different because it takes the input
                # vector: stride 1 with VALID padding expands 1x1 -> 4x4.
                current_depth = depth * 2**(num_layers - 1)
                scope = 'deconv1'
                net = slim.conv2d_transpose(net, current_depth, stride=1,
                                            padding='VALID', scope=scope)
                end_points[scope] = net

                for i in range(2, num_layers):
                    scope = 'deconv%i' % (i)
                    current_depth = depth * 2**(num_layers - i)
                    net = slim.conv2d_transpose(net, current_depth,
                                                scope=scope)
                    end_points[scope] = net

                # Last layer has different normalizer and activation.
                scope = 'deconv%i' % (num_layers)
                net = slim.conv2d_transpose(net, depth, normalizer_fn=None,
                                            activation_fn=None, scope=scope)
                end_points[scope] = net

                # Convert to proper channels.
                scope = 'logits'
                logits = slim.conv2d(net,
                                     num_outputs,
                                     normalizer_fn=None,
                                     activation_fn=None,
                                     kernel_size=1,
                                     stride=1,
                                     padding='VALID',
                                     scope=scope)
                end_points[scope] = logits

                logits.get_shape().assert_has_rank(4)
                logits.get_shape().assert_is_compatible_with(
                    [None, final_size, final_size, num_outputs])

                return logits, end_points
def deconv(self, x, num_out_layers, kernel_size, scale):
    """Upsample `x` with a transposed convolution, then crop borders.

    The input is padded by one pixel on each spatial side before the
    transposed convolution; the resulting border is sliced off afterwards.
    """
    # Pad height and width by 1 pixel on every side (batch/channel untouched).
    padded = tf.pad(x, [[0, 0], [1, 1], [1, 1], [0, 0]])
    upsampled = slim.conv2d_transpose(padded, num_out_layers, kernel_size,
                                      scale, 'SAME')
    # Drop the border produced by the pre-padding after upsampling.
    return upsampled[:, 3:-1, 3:-1, :]
def cyclegan_upsample(net, num_outputs, stride, method='conv2d_transpose',
                      pad_mode='REFLECT', align_corners=False):
    """Upsamples the given inputs.

    Args:
      net: A Tensor of size [batch_size, height, width, filters].
      num_outputs: The number of output filters.
      stride: A list of 2 scalars or a 1x2 Tensor indicating the scale,
        relative to the inputs, of the output dimensions. For example, if
        kernel size is [2, 3], then the output height and width will be twice
        and three times the input size.
      method: The upsampling method: 'nn_upsample_conv',
        'bilinear_upsample_conv', or 'conv2d_transpose'.
      pad_mode: mode for tf.pad, one of "CONSTANT", "REFLECT", or
        "SYMMETRIC".
      align_corners: option for method, 'bilinear_upsample_conv'. If true,
        the centers of the 4 corner pixels of the input and output tensors
        are aligned, preserving the values at the corner pixels.

    Returns:
      A Tensor which was upsampled using the specified method.

    Raises:
      ValueError: if `method` is not recognized.
    """
    with tf.variable_scope('upconv'):
        # Dynamic spatial size: works for inputs whose height/width are
        # unknown at graph-construction time.
        net_shape = tf.shape(input=net)
        height = net_shape[1]
        width = net_shape[2]

        # Reflection pad by 1 in spatial dimensions (axes 1, 2 = h, w) to
        # make a 3x3 'valid' convolution produce an output with the same
        # dimension as the input.
        spatial_pad_1 = np.array([[0, 0], [1, 1], [1, 1], [0, 0]])

        if method == 'nn_upsample_conv':
            # Nearest-neighbor resize followed by a padded 3x3 conv.
            net = tf.image.resize(
                net, [stride[0] * height, stride[1] * width],
                method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)
            net = tf.pad(tensor=net, paddings=spatial_pad_1, mode=pad_mode)
            net = slim.conv2d(net, num_outputs, kernel_size=[3, 3],
                              padding='valid')
        elif method == 'bilinear_upsample_conv':
            # Bilinear resize followed by a padded 3x3 conv; `align_corners`
            # only matters for this branch.
            net = tf.image.resize_bilinear(
                net, [stride[0] * height, stride[1] * width],
                align_corners=align_corners)
            net = tf.pad(tensor=net, paddings=spatial_pad_1, mode=pad_mode)
            net = slim.conv2d(net, num_outputs, kernel_size=[3, 3],
                              padding='valid')
        elif method == 'conv2d_transpose':
            # This corrects 1 pixel offset for images with even width and
            # height.
            # conv2d is left aligned and conv2d_transpose is right aligned
            # for even sized images (while doing 'SAME' padding).
            # Note: This doesn't reflect actual model in paper.
            net = slim.conv2d_transpose(
                net, num_outputs, kernel_size=[3, 3], stride=stride,
                padding='valid')
            # Drop the first row and column to undo the alignment offset.
            net = net[:, 1:, 1:, :]
        else:
            raise ValueError('Unknown method: [%s]' % method)

        return net
def deconv(x, *args, pad=1, **kwargs):
    """Transposed convolution preceded by explicit padding.

    Forces VALID padding on the slim conv ops so that the `padding` helper
    fully controls the border handling; extra args/kwargs are forwarded to
    `slim.conv2d_transpose`.
    """
    with slim.arg_scope([slim.conv2d, slim.conv2d_transpose],
                        padding='VALID'):
        padded = padding(x, pad)
        return slim.conv2d_transpose(padded, *args, **kwargs)
def pose_exp_net(tgt_image, src_image_stack, do_exp=True, is_training=True):
    """Predict relative camera poses and (optionally) explainability masks.

    Args:
        tgt_image: Target frame tensor, NHWC.
        src_image_stack: Source frames concatenated along channels
            (3 channels per source frame).
        do_exp: If True, also build the explainability-mask decoder.
        is_training: Unused here; kept for interface compatibility.

    Returns:
        pose_final: [batch, num_source, 6] pose predictions
            (scaled by 0.01 — empirically this facilitates training).
        masks: List [mask1, mask2, mask3, mask4] of multi-scale
            explainability logits, or all None when `do_exp` is False.
        end_points_collection: Name of the collection holding layer outputs.
    """
    inputs = tf.concat([tgt_image, src_image_stack], axis=3)
    num_source = int(src_image_stack.get_shape()[3].value // 3)
    with tf.variable_scope('pose_exp_net') as sc:
        # Fix: this name was previously bound to `end_points` while the
        # arg_scope below referenced the undefined `end_points_collection`,
        # raising NameError at graph-construction time.
        end_points_collection = sc.original_name_scope + '_end_points'
        with slim.arg_scope([slim.conv2d, slim.conv2d_transpose],
                            normalizer_fn=None,
                            weights_regularizer=slim.l2_regularizer(0.05),
                            activation_fn=tf.nn.relu,
                            outputs_collections=end_points_collection):
            # cnv1 to cnv5b are shared between pose and explainability
            # prediction
            cnv1 = slim.conv2d(inputs, 16, [7, 7], stride=2, scope='cnv1')
            cnv2 = slim.conv2d(cnv1, 32, [5, 5], stride=2, scope='cnv2')
            cnv3 = slim.conv2d(cnv2, 64, [3, 3], stride=2, scope='cnv3')
            cnv4 = slim.conv2d(cnv3, 128, [3, 3], stride=2, scope='cnv4')
            cnv5 = slim.conv2d(cnv4, 256, [3, 3], stride=2, scope='cnv5')
            # Pose specific layers
            with tf.variable_scope('pose'):
                cnv6 = slim.conv2d(cnv5, 256, [3, 3], stride=2, scope='cnv6')
                cnv7 = slim.conv2d(cnv6, 256, [3, 3], stride=2, scope='cnv7')
                pose_pred = slim.conv2d(cnv7, 6 * num_source, [1, 1],
                                        scope='pred', stride=1,
                                        normalizer_fn=None,
                                        activation_fn=None)
                pose_avg = tf.reduce_mean(pose_pred, [1, 2])
                # Empirically we found that scaling by a small constant
                # facilitates training.
                pose_final = 0.01 * tf.reshape(pose_avg,
                                               [-1, num_source, 6])
            # Exp mask specific layers
            if do_exp:
                with tf.variable_scope('exp'):
                    upcnv5 = slim.conv2d_transpose(cnv5, 256, [3, 3],
                                                   stride=2, scope='upcnv5')
                    upcnv4 = slim.conv2d_transpose(upcnv5, 128, [3, 3],
                                                   stride=2, scope='upcnv4')
                    mask4 = slim.conv2d(upcnv4, num_source * 2, [3, 3],
                                        stride=1, scope='mask4',
                                        normalizer_fn=None,
                                        activation_fn=None)
                    upcnv3 = slim.conv2d_transpose(upcnv4, 64, [3, 3],
                                                   stride=2, scope='upcnv3')
                    mask3 = slim.conv2d(upcnv3, num_source * 2, [3, 3],
                                        stride=1, scope='mask3',
                                        normalizer_fn=None,
                                        activation_fn=None)
                    upcnv2 = slim.conv2d_transpose(upcnv3, 32, [5, 5],
                                                   stride=2, scope='upcnv2')
                    mask2 = slim.conv2d(upcnv2, num_source * 2, [5, 5],
                                        stride=1, scope='mask2',
                                        normalizer_fn=None,
                                        activation_fn=None)
                    upcnv1 = slim.conv2d_transpose(upcnv2, 16, [7, 7],
                                                   stride=2, scope='upcnv1')
                    mask1 = slim.conv2d(upcnv1, num_source * 2, [7, 7],
                                        stride=1, scope='mask1',
                                        normalizer_fn=None,
                                        activation_fn=None)
            else:
                mask1 = None
                mask2 = None
                mask3 = None
                mask4 = None
            # Return the collection name itself; the old conversion via
            # convert_collection_to_dict was intentionally removed.
            return pose_final, [mask1, mask2, mask3, mask4], \
                end_points_collection
def run(self, inputs, trainable=True):
    """Runs model.

    Builds the FlowNetSD graph: a contracting conv stack over the two
    concatenated input images followed by a deconv refinement network that
    predicts optical flow at multiple scales.

    Args:
        inputs: Dict with "input_a" and "input_b" image tensors (NHWC);
            static height/width must be known.
        trainable: Whether the network's variables receive gradients.

    Returns:
        Dict of multi-scale flow predictions plus "flow", the finest
        prediction scaled and bilinearly resized to the input resolution.
    """
    _, height, width, _ = inputs["input_a"].shape.as_list()
    with tf.variable_scope("FlowNetSD"):
        # Both frames are stacked along channels and processed jointly.
        concat_inputs = tf.concat([inputs["input_a"], inputs["input_b"]],
                                  axis=3)
        with slim.arg_scope(
                [slim.conv2d, slim.conv2d_transpose],
                # Only backprop this network if trainable.
                trainable=trainable,
                # He (aka MSRA) weight initialization.
                weights_initializer=slim.variance_scaling_initializer(),
                activation_fn=leaky_relu,
                # We will do our own padding to match the original Caffe
                # code.
                padding="VALID"):
            weights_regularizer = slim.l2_regularizer(WEIGHT_DECAY)
            with slim.arg_scope([slim.conv2d],
                                weights_regularizer=weights_regularizer):
                # Contracting part: stride-2 convs halve resolution at each
                # conv{1..6}; `pad` is the project's explicit-padding helper.
                conv0 = slim.conv2d(pad(concat_inputs), 64, 3, scope="conv0")
                conv1 = slim.conv2d(pad(conv0), 64, 3, stride=2,
                                    scope="conv1")
                conv1_1 = slim.conv2d(pad(conv1), 128, 3, scope="conv1_1")
                conv2 = slim.conv2d(pad(conv1_1), 128, 3, stride=2,
                                    scope="conv2")
                conv2_1 = slim.conv2d(pad(conv2), 128, 3, scope="conv2_1")
                conv3 = slim.conv2d(pad(conv2_1), 256, 3, stride=2,
                                    scope="conv3")
                conv3_1 = slim.conv2d(pad(conv3), 256, 3, scope="conv3_1")
                conv4 = slim.conv2d(pad(conv3_1), 512, 3, stride=2,
                                    scope="conv4")
                conv4_1 = slim.conv2d(pad(conv4), 512, 3, scope="conv4_1")
                conv5 = slim.conv2d(pad(conv4_1), 512, 3, stride=2,
                                    scope="conv5")
                conv5_1 = slim.conv2d(pad(conv5), 512, 3, scope="conv5_1")
                conv6 = slim.conv2d(pad(conv5_1), 1024, 3, stride=2,
                                    scope="conv6")
                conv6_1 = slim.conv2d(pad(conv6), 1024, 3, scope="conv6_1")

                # START: Refinement Network.
                with slim.arg_scope([slim.conv2d_transpose],
                                    biases_initializer=None):
                    # At each scale: predict flow, deconv features, upsample
                    # the coarser flow, then concat with the matching
                    # encoder features (skip connection).
                    predict_flow6 = slim.conv2d(pad(conv6_1), 2, 3,
                                                scope="predict_flow6",
                                                activation_fn=None)
                    deconv5 = antipad(
                        slim.conv2d_transpose(conv6_1, 512, 4, stride=2,
                                              scope="deconv5"))
                    upsample_flow6to5 = antipad(
                        slim.conv2d_transpose(predict_flow6, 2, 4, stride=2,
                                              scope="upsample_flow6to5",
                                              activation_fn=None))
                    concat5 = tf.concat(
                        [conv5_1, deconv5, upsample_flow6to5], axis=3)
                    # SD variant adds an `interconv` smoothing layer before
                    # each flow prediction.
                    interconv5 = slim.conv2d(pad(concat5), 512, 3,
                                             activation_fn=None,
                                             scope="interconv5")
                    predict_flow5 = slim.conv2d(pad(interconv5), 2, 3,
                                                scope="predict_flow5",
                                                activation_fn=None)

                    deconv4 = antipad(
                        slim.conv2d_transpose(concat5, 256, 4, stride=2,
                                              scope="deconv4"))
                    upsample_flow5to4 = antipad(
                        slim.conv2d_transpose(predict_flow5, 2, 4, stride=2,
                                              scope="upsample_flow5to4",
                                              activation_fn=None))
                    concat4 = tf.concat(
                        [conv4_1, deconv4, upsample_flow5to4], axis=3)
                    interconv4 = slim.conv2d(pad(concat4), 256, 3,
                                             activation_fn=None,
                                             scope="interconv4")
                    predict_flow4 = slim.conv2d(pad(interconv4), 2, 3,
                                                scope="predict_flow4",
                                                activation_fn=None)

                    deconv3 = antipad(
                        slim.conv2d_transpose(concat4, 128, 4, stride=2,
                                              scope="deconv3"))
                    upsample_flow4to3 = antipad(
                        slim.conv2d_transpose(predict_flow4, 2, 4, stride=2,
                                              scope="upsample_flow4to3",
                                              activation_fn=None))
                    concat3 = tf.concat(
                        [conv3_1, deconv3, upsample_flow4to3], axis=3)
                    interconv3 = slim.conv2d(pad(concat3), 128, 3,
                                             activation_fn=None,
                                             scope="interconv3")
                    predict_flow3 = slim.conv2d(pad(interconv3), 2, 3,
                                                scope="predict_flow3",
                                                activation_fn=None)

                    deconv2 = antipad(
                        slim.conv2d_transpose(concat3, 64, 4, stride=2,
                                              scope="deconv2"))
                    upsample_flow3to2 = antipad(
                        slim.conv2d_transpose(predict_flow3, 2, 4, stride=2,
                                              scope="upsample_flow3to2",
                                              activation_fn=None))
                    concat2 = tf.concat(
                        [conv2, deconv2, upsample_flow3to2], axis=3)
                    interconv2 = slim.conv2d(pad(concat2), 64, 3,
                                             activation_fn=None,
                                             scope="interconv2")
                    predict_flow2 = slim.conv2d(pad(interconv2), 2, 3,
                                                scope="predict_flow2",
                                                activation_fn=None)
                # END: Refinement Network.

                # 0.05 converts the raw prediction to the flow magnitude
                # scale used by this variant — presumably matches the
                # original FlowNetSD training; confirm against the weights.
                flow = predict_flow2 * 0.05
                flow = tf.image.resize_bilinear(flow,
                                                tf.stack([height, width]),
                                                align_corners=True)

                return {
                    "predict_flow6": predict_flow6,
                    "predict_flow5": predict_flow5,
                    "predict_flow4": predict_flow4,
                    "predict_flow3": predict_flow3,
                    "predict_flow2": predict_flow2,
                    "flow": flow,
                }
def run(self, inputs, trainable=True):
    """Runs model.

    Builds the FlowNet2 fusion graph: runs the frozen FlowNetCSS and
    FlowNetSD sub-networks, derives warp-error features from each flow
    estimate, and fuses everything with a small encoder/decoder to produce
    the final flow.

    Args:
        inputs: Dict with "input_a" and "input_b" image tensors (NHWC);
            static height/width must be known.
        trainable: Whether the fusion network's variables receive gradients
            (the sub-networks are always run with trainable=False).

    Returns:
        Dict with "predict_flow0" (raw fused prediction) and "flow"
        (bilinearly resized to the input resolution).
    """
    _, height, width, _ = inputs["input_a"].shape.as_list()
    with tf.variable_scope("FlowNet2"):
        # Forward pass through FlowNetCSS and FlowNetSD with weights frozen.
        net_css_predictions = self.net_css.run(inputs, trainable=False)
        net_sd_predictions = self.net_sd.run(inputs, trainable=False)

        # Per-pixel flow magnitudes for both candidate flows.
        sd_flow_norm = channel_norm(net_sd_predictions["flow"])
        css_flow_norm = channel_norm(net_css_predictions["flow"])

        # Brightness error after warping input_b with each candidate flow —
        # a cue for which estimate is better at each pixel.
        flow_warp_sd = flow_warp(inputs["input_b"],
                                 net_sd_predictions["flow"])
        img_diff_sd = inputs["input_a"] - flow_warp_sd
        img_diff_sd_norm = channel_norm(img_diff_sd)

        flow_warp_css = flow_warp(inputs["input_b"],
                                  net_css_predictions["flow"])
        img_diff_css = inputs["input_a"] - flow_warp_css
        img_diff_css_norm = channel_norm(img_diff_css)

        # Fusion input: image, both flows, their norms and warp errors.
        input_to_fusion = tf.concat([
            inputs["input_a"], net_sd_predictions["flow"],
            net_css_predictions["flow"], sd_flow_norm, css_flow_norm,
            img_diff_sd_norm, img_diff_css_norm
        ], axis=3)

        # Fusion Network.
        with slim.arg_scope(
                [slim.conv2d, slim.conv2d_transpose],
                # Only backprop this network if trainable.
                trainable=trainable,
                # He (aka MSRA) weight initialization.
                weights_initializer=slim.variance_scaling_initializer(),
                activation_fn=leaky_relu,
                # We will do our own padding to match the original Caffe
                # code.
                padding="VALID"):
            weights_regularizer = slim.l2_regularizer(WEIGHT_DECAY)
            with slim.arg_scope([slim.conv2d],
                                weights_regularizer=weights_regularizer):
                # Shallow contracting part (two stride-2 levels only).
                fuse_conv0 = slim.conv2d(pad(input_to_fusion), 64, 3,
                                         scope="fuse_conv0")
                fuse_conv1 = slim.conv2d(pad(fuse_conv0), 64, 3, stride=2,
                                         scope="fuse_conv1")
                fuse_conv1_1 = slim.conv2d(pad(fuse_conv1), 128, 3,
                                           scope="fuse_conv1_1")
                fuse_conv2 = slim.conv2d(pad(fuse_conv1_1), 128, 3,
                                         stride=2, scope="fuse_conv2")
                fuse_conv2_1 = slim.conv2d(pad(fuse_conv2), 128, 3,
                                           scope="fuse_conv2_1")

                # Expanding part with skip connections, mirroring the
                # FlowNet refinement scheme at two scales.
                predict_flow2 = slim.conv2d(pad(fuse_conv2_1), 2, 3,
                                            scope="predict_flow2",
                                            activation_fn=None)
                fuse_deconv1 = antipad(
                    slim.conv2d_transpose(fuse_conv2_1, 32, 4, stride=2,
                                          scope="fuse_deconv1"))
                fuse_upsample_flow2to1 = antipad(
                    slim.conv2d_transpose(predict_flow2, 2, 4, stride=2,
                                          scope="fuse_upsample_flow2to1",
                                          activation_fn=None))
                concat1 = tf.concat(
                    [fuse_conv1_1, fuse_deconv1, fuse_upsample_flow2to1],
                    axis=3)
                fuse_interconv1 = slim.conv2d(pad(concat1), 32, 3,
                                              activation_fn=None,
                                              scope="fuse_interconv1")
                predict_flow1 = slim.conv2d(pad(fuse_interconv1), 2, 3,
                                            scope="predict_flow1",
                                            activation_fn=None)

                fuse_deconv0 = antipad(
                    slim.conv2d_transpose(concat1, 16, 4, stride=2,
                                          scope="fuse_deconv0"))
                fuse_upsample_flow1to0 = antipad(
                    slim.conv2d_transpose(predict_flow1, 2, 4, stride=2,
                                          scope="fuse_upsample_flow1to0",
                                          activation_fn=None))
                concat0 = tf.concat(
                    [fuse_conv0, fuse_deconv0, fuse_upsample_flow1to0],
                    axis=3)
                fuse_interconv0 = slim.conv2d(pad(concat0), 16, 3,
                                              activation_fn=None,
                                              scope="fuse_interconv0")
                predict_flow0 = slim.conv2d(pad(fuse_interconv0), 2, 3,
                                            activation_fn=None,
                                            scope="predict_flow0")

                flow = tf.image.resize_bilinear(predict_flow0,
                                                tf.stack([height, width]),
                                                align_corners=True)

                return {
                    "predict_flow0": predict_flow0,
                    "flow": flow,
                }
def disp_net(target_image, is_training=True):
    """Predict inverse of depth from a single image.

    Encoder-decoder with skip connections; produces disparity (inverse
    depth) at four scales, each passed through a sigmoid and affinely
    scaled by DISP_SCALING / MIN_DISP.

    Args:
        target_image: 4-D image tensor, NHWC; static height/width known.
        is_training: Batch-norm training flag (used only if FLAGS.use_bn).

    Returns:
        [disp1, disp2, disp3, disp4]: Disparity maps, finest first.
        end_points: Dict mapping layer names to their output tensors.
    """
    batch_norm_params = {'is_training': is_training}
    h = target_image.get_shape()[1]
    w = target_image.get_shape()[2]
    inputs = target_image
    with tf.compat.v1.variable_scope('depth_net') as sc:
        end_points_collection = sc.original_name_scope + '_end_points'
        normalizer_fn = slim.batch_norm if FLAGS.use_bn else None
        normalizer_params = batch_norm_params if FLAGS.use_bn else None
        with slim.arg_scope([slim.conv2d, slim.conv2d_transpose],
                            normalizer_fn=normalizer_fn,
                            normalizer_params=normalizer_params,
                            weights_regularizer=tf.keras.regularizers.l2(
                                0.5 * (WEIGHT_REG)),
                            activation_fn=tf.nn.relu,
                            outputs_collections=end_points_collection):
            # Encoder: seven stride-2 stages, each followed by a stride-1
            # conv at the same width.
            cnv1 = slim.conv2d(inputs, 32, [7, 7], stride=2, scope='cnv1')
            cnv1b = slim.conv2d(cnv1, 32, [7, 7], stride=1, scope='cnv1b')
            cnv2 = slim.conv2d(cnv1b, 64, [5, 5], stride=2, scope='cnv2')
            cnv2b = slim.conv2d(cnv2, 64, [5, 5], stride=1, scope='cnv2b')
            cnv3 = slim.conv2d(cnv2b, 128, [3, 3], stride=2, scope='cnv3')
            cnv3b = slim.conv2d(cnv3, 128, [3, 3], stride=1, scope='cnv3b')
            cnv4 = slim.conv2d(cnv3b, 256, [3, 3], stride=2, scope='cnv4')
            cnv4b = slim.conv2d(cnv4, 256, [3, 3], stride=1, scope='cnv4b')
            cnv5 = slim.conv2d(cnv4b, 512, [3, 3], stride=2, scope='cnv5')
            cnv5b = slim.conv2d(cnv5, 512, [3, 3], stride=1, scope='cnv5b')
            cnv6 = slim.conv2d(cnv5b, 512, [3, 3], stride=2, scope='cnv6')
            cnv6b = slim.conv2d(cnv6, 512, [3, 3], stride=1, scope='cnv6b')
            cnv7 = slim.conv2d(cnv6b, 512, [3, 3], stride=2, scope='cnv7')
            cnv7b = slim.conv2d(cnv7, 512, [3, 3], stride=1, scope='cnv7b')

            # Decoder with skip connections from the encoder.
            up7 = slim.conv2d_transpose(cnv7b, 512, [3, 3], stride=2,
                                        scope='upcnv7')
            # There might be dimension mismatch due to uneven
            # down/up-sampling.
            up7 = _resize_like(up7, cnv6b)
            i7_in = tf.concat([up7, cnv6b], axis=3)
            icnv7 = slim.conv2d(i7_in, 512, [3, 3], stride=1, scope='icnv7')

            up6 = slim.conv2d_transpose(icnv7, 512, [3, 3], stride=2,
                                        scope='upcnv6')
            up6 = _resize_like(up6, cnv5b)
            i6_in = tf.concat([up6, cnv5b], axis=3)
            icnv6 = slim.conv2d(i6_in, 512, [3, 3], stride=1, scope='icnv6')

            up5 = slim.conv2d_transpose(icnv6, 256, [3, 3], stride=2,
                                        scope='upcnv5')
            up5 = _resize_like(up5, cnv4b)
            i5_in = tf.concat([up5, cnv4b], axis=3)
            icnv5 = slim.conv2d(i5_in, 256, [3, 3], stride=1, scope='icnv5')

            up4 = slim.conv2d_transpose(icnv5, 128, [3, 3], stride=2,
                                        scope='upcnv4')
            i4_in = tf.concat([up4, cnv3b], axis=3)
            icnv4 = slim.conv2d(i4_in, 128, [3, 3], stride=1, scope='icnv4')
            disp4 = (slim.conv2d(icnv4, 1, [3, 3], stride=1,
                                 activation_fn=tf.sigmoid,
                                 normalizer_fn=None,
                                 scope='disp4') * DISP_SCALING + MIN_DISP)
            # np.int was removed in NumPy 1.24; builtin integer division
            # gives the same truncated size.
            disp4_up = tf.image.resize(
                disp4, [int(h) // 4, int(w) // 4],
                method=tf.image.ResizeMethod.BILINEAR)

            up3 = slim.conv2d_transpose(icnv4, 64, [3, 3], stride=2,
                                        scope='upcnv3')
            i3_in = tf.concat([up3, cnv2b, disp4_up], axis=3)
            icnv3 = slim.conv2d(i3_in, 64, [3, 3], stride=1, scope='icnv3')
            disp3 = (slim.conv2d(icnv3, 1, [3, 3], stride=1,
                                 activation_fn=tf.sigmoid,
                                 normalizer_fn=None,
                                 scope='disp3') * DISP_SCALING + MIN_DISP)
            disp3_up = tf.image.resize(
                disp3, [int(h) // 2, int(w) // 2],
                method=tf.image.ResizeMethod.BILINEAR)

            up2 = slim.conv2d_transpose(icnv3, 32, [3, 3], stride=2,
                                        scope='upcnv2')
            i2_in = tf.concat([up2, cnv1b, disp3_up], axis=3)
            icnv2 = slim.conv2d(i2_in, 32, [3, 3], stride=1, scope='icnv2')
            disp2 = (slim.conv2d(icnv2, 1, [3, 3], stride=1,
                                 activation_fn=tf.sigmoid,
                                 normalizer_fn=None,
                                 scope='disp2') * DISP_SCALING + MIN_DISP)
            disp2_up = tf.image.resize(
                disp2, [h, w], method=tf.image.ResizeMethod.BILINEAR)

            up1 = slim.conv2d_transpose(icnv2, 16, [3, 3], stride=2,
                                        scope='upcnv1')
            i1_in = tf.concat([up1, disp2_up], axis=3)
            icnv1 = slim.conv2d(i1_in, 16, [3, 3], stride=1, scope='icnv1')
            disp1 = (slim.conv2d(icnv1, 1, [3, 3], stride=1,
                                 activation_fn=tf.sigmoid,
                                 normalizer_fn=None,
                                 scope='disp1') * DISP_SCALING + MIN_DISP)

            end_points = slim.utils.convert_collection_to_dict(
                end_points_collection)
            return [disp1, disp2, disp3, disp4], end_points
def run(self, inputs, trainable=True):
    """Runs model.

    Builds the FlowNetC graph: Siamese feature extraction over the two
    input images, a correlation layer matching them, a contracting conv
    stack, and a deconv refinement network predicting multi-scale flow.

    Args:
        inputs: Dict with "input_a" and "input_b" image tensors (NHWC);
            static height/width must be known.
        trainable: Whether the network's variables receive gradients.

    Returns:
        Dict of multi-scale flow predictions plus "flow", the finest
        prediction scaled and bilinearly resized to the input resolution.
    """
    _, height, width, _ = inputs["input_a"].shape.as_list()
    with tf.variable_scope("FlowNetC"):
        with slim.arg_scope(
                [slim.conv2d, slim.conv2d_transpose],
                # Only backprop this network if trainable.
                trainable=trainable,
                # He (aka MSRA) weight initialization.
                weights_initializer=slim.variance_scaling_initializer(),
                activation_fn=leaky_relu,
                # We will do our own padding to match the original Caffe
                # code.
                padding="VALID"):
            weights_regularizer = slim.l2_regularizer(WEIGHT_DECAY)
            with slim.arg_scope([slim.conv2d],
                                weights_regularizer=weights_regularizer):
                with slim.arg_scope([slim.conv2d], stride=2):
                    # Siamese towers: identical scopes with reuse=True make
                    # both images share the same conv weights.
                    conv_a_1 = slim.conv2d(pad(inputs["input_a"], 3), 64, 7,
                                           scope="conv1")
                    conv_a_2 = slim.conv2d(pad(conv_a_1, 2), 128, 5,
                                           scope="conv2")
                    conv_a_3 = slim.conv2d(pad(conv_a_2, 2), 256, 5,
                                           scope="conv3")

                    conv_b_1 = slim.conv2d(pad(inputs["input_b"], 3), 64, 7,
                                           scope="conv1", reuse=True)
                    conv_b_2 = slim.conv2d(pad(conv_b_1, 2), 128, 5,
                                           scope="conv2", reuse=True)
                    conv_b_3 = slim.conv2d(pad(conv_b_2, 2), 256, 5,
                                           scope="con3"
                                           if False else "conv3",
                                           reuse=True)

                # Compute cross correlation with leaky relu activation.
                cc = correlation(conv_a_3, conv_b_3, 1, 20, 1, 2, 20)
                cc_relu = leaky_relu(cc)

                # Combine cross correlation results with convolution of
                # feature map A.
                net_a_conv = slim.conv2d(conv_a_3, 32, 1,
                                         scope="conv_redir")
                # Concatenate along the channels axis.
                net = tf.concat([net_a_conv, cc_relu], axis=3)

                conv3_1 = slim.conv2d(pad(net), 256, 3, scope="conv3_1")
                with slim.arg_scope([slim.conv2d],
                                    num_outputs=512, kernel_size=3):
                    conv4 = slim.conv2d(pad(conv3_1), stride=2,
                                        scope="conv4")
                    conv4_1 = slim.conv2d(pad(conv4), scope="conv4_1")
                    conv5 = slim.conv2d(pad(conv4_1), stride=2,
                                        scope="conv5")
                    conv5_1 = slim.conv2d(pad(conv5), scope="conv5_1")
                conv6 = slim.conv2d(pad(conv5_1), 1024, 3, stride=2,
                                    scope="conv6")
                conv6_1 = slim.conv2d(pad(conv6), 1024, 3, scope="conv6_1")

                # START: Refinement Network.
                with slim.arg_scope([slim.conv2d_transpose],
                                    biases_initializer=None):
                    # At each scale: predict flow, deconv features, upsample
                    # coarser flow, concat with matching encoder features.
                    predict_flow6 = slim.conv2d(pad(conv6_1), 2, 3,
                                                scope="predict_flow6",
                                                activation_fn=None)
                    deconv5 = antipad(
                        slim.conv2d_transpose(conv6_1, 512, 4, stride=2,
                                              scope="deconv5"))
                    upsample_flow6to5 = antipad(
                        slim.conv2d_transpose(predict_flow6, 2, 4, stride=2,
                                              scope="upsample_flow6to5",
                                              activation_fn=None))
                    concat5 = tf.concat(
                        [conv5_1, deconv5, upsample_flow6to5], axis=3)

                    predict_flow5 = slim.conv2d(pad(concat5), 2, 3,
                                                scope="predict_flow5",
                                                activation_fn=None)
                    deconv4 = antipad(
                        slim.conv2d_transpose(concat5, 256, 4, stride=2,
                                              scope="deconv4"))
                    upsample_flow5to4 = antipad(
                        slim.conv2d_transpose(predict_flow5, 2, 4, stride=2,
                                              scope="upsample_flow5to4",
                                              activation_fn=None))
                    concat4 = tf.concat(
                        [conv4_1, deconv4, upsample_flow5to4], axis=3)

                    predict_flow4 = slim.conv2d(pad(concat4), 2, 3,
                                                scope="predict_flow4",
                                                activation_fn=None)
                    deconv3 = antipad(
                        slim.conv2d_transpose(concat4, 128, 4, stride=2,
                                              scope="deconv3"))
                    upsample_flow4to3 = antipad(
                        slim.conv2d_transpose(predict_flow4, 2, 4, stride=2,
                                              scope="upsample_flow4to3",
                                              activation_fn=None))
                    concat3 = tf.concat(
                        [conv3_1, deconv3, upsample_flow4to3], axis=3)

                    predict_flow3 = slim.conv2d(pad(concat3), 2, 3,
                                                scope="predict_flow3",
                                                activation_fn=None)
                    deconv2 = antipad(
                        slim.conv2d_transpose(concat3, 64, 4, stride=2,
                                              scope="deconv2"))
                    upsample_flow3to2 = antipad(
                        slim.conv2d_transpose(predict_flow3, 2, 4, stride=2,
                                              scope="upsample_flow3to2",
                                              activation_fn=None))
                    # Skip connection comes from tower A at this scale.
                    concat2 = tf.concat(
                        [conv_a_2, deconv2, upsample_flow3to2], axis=3)

                    predict_flow2 = slim.conv2d(pad(concat2), 2, 3,
                                                scope="predict_flow2",
                                                activation_fn=None)
                # END: Refinement Network.

                # 20.0 restores the flow magnitude scale — presumably
                # matches the original FlowNetC training normalization;
                # confirm against the pretrained weights.
                flow = predict_flow2 * 20.0
                flow = tf.image.resize_bilinear(flow,
                                                tf.stack([height, width]),
                                                align_corners=True)

                return {
                    "predict_flow6": predict_flow6,
                    "predict_flow5": predict_flow5,
                    "predict_flow4": predict_flow4,
                    "predict_flow3": predict_flow3,
                    "predict_flow2": predict_flow2,
                    "flow": flow,
                }