def Prediction_network(self):
    RGB_inputs = tf.keras.Input(shape=(self.inputH, self.inputW, self.channel))
    Noise_inputs = tf.keras.Input(shape=(self.inputH, self.inputW, self.channel))
    RGB_outputs = self.RGB_net(RGB_inputs)
    Noise_outputs = self.Noise_net(Noise_inputs)
    # Fuse the two streams with compact bilinear pooling
    cbp = compact_bilinear_pooling_layer(RGB_outputs, Noise_outputs, 256)
    cbp_flat = tf.keras.layers.Flatten()(cbp)
    fc_1 = tf.keras.layers.Dense(256, activation="relu",
                                 kernel_initializer='he_normal')(cbp_flat)
    fc_1_dropout = tf.keras.layers.Dropout(0.5)(fc_1)
    fc_2 = tf.keras.layers.Dense(256, activation="relu",
                                 kernel_initializer='he_normal')(fc_1_dropout)
    fc_2_dropout = tf.keras.layers.Dropout(0.5)(fc_2)
    outputs = tf.keras.layers.Dense(2, activation="softmax")(fc_2_dropout)
    model = tf.keras.Model(inputs=[RGB_inputs, Noise_inputs], outputs=outputs)
    return model
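# A minimal usage sketch for the two-stream model above (hedged: `builder`
# stands in for whatever object defines Prediction_network, RGB_net and
# Noise_net; the optimizer, loss, and batch names are illustrative only).
model = builder.Prediction_network()
model.compile(optimizer=tf.keras.optimizers.Adam(1e-4),
              loss='categorical_crossentropy',
              metrics=['accuracy'])
# Two aligned inputs: an RGB batch and its noise-residual counterpart.
model.fit([rgb_batch, noise_batch], onehot_labels, epochs=10)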
def configure_inference_and_loss(inputs, labels, net, arg_scope, num_classes,
                                 is_training):
    with slim.arg_scope(arg_scope):
        # Load inception_v3 as model 1
        with tf.variable_scope('model_1'):
            _, end_points_1 = nets_factory.networks_map[net](
                inputs=inputs, num_classes=num_classes,
                is_training=is_training)
        # Load inception_v3 as model 2
        with tf.variable_scope('model_2'):
            _, end_points_2 = nets_factory.networks_map[net](
                inputs=inputs, num_classes=num_classes,
                is_training=is_training)

    # Add compact bilinear pooling layer over the two 'Mixed_7c' maps
    height, width = end_points_1['Mixed_7c'].get_shape().as_list()[1:3]
    output_dim = 16000
    with tf.variable_scope('Compact_Pooling'):
        net = compact_bilinear_pooling_layer(end_points_1['Mixed_7c'],
                                             end_points_2['Mixed_7c'],
                                             output_dim, sum_pool=False)
        net = tf.reshape(net, [-1, height, width, output_dim])

    # Add logits output
    with slim.arg_scope(arg_scope):
        with slim.arg_scope([slim.batch_norm, slim.dropout],
                            is_training=is_training):
            with tf.variable_scope('Compact_Pooling/Logits'):
                net = slim.conv2d(net, 512, [3, 3], padding='SAME',
                                  scope='Conv2d_1b_3x3')
                kernel_size = [height, width]
                net = slim.avg_pool2d(
                    net, kernel_size, padding='VALID',
                    scope='AvgPool_1a_{}x{}'.format(*kernel_size))
                tf.summary.histogram('pre_logits', net)
                net = slim.dropout(net, scope='Dropout_1b',
                                   is_training=is_training)
                logits = slim.conv2d(net, num_classes, [1, 1],
                                     activation_fn=None, normalizer_fn=None,
                                     scope='Conv2d_1c_1x1')
    logits = tf.squeeze(logits, [1, 2], name='SpatialSqueeze')
    predictions = slim.softmax(logits, scope='Predictions')
    loss = tf.losses.softmax_cross_entropy(logits=logits,
                                           onehot_labels=labels,
                                           label_smoothing=0., weights=1.0)
    return predictions, loss
def _configure_loss_function(inputs, labels, net, arg_scope, num_classes,
                             dropout_keep_prob, label_smoothing, is_training):
    with slim.arg_scope(arg_scope):
        _, end_points = nets_factory.networks_map[net](
            inputs=inputs,
            num_classes=num_classes,
            dropout_keep_prob=dropout_keep_prob,
            is_training=is_training)

    # Add compact bilinear pooling layer (self-bilinear on 'Mixed_7c')
    height, width = end_points['Mixed_7c'].get_shape().as_list()[1:3]
    output_dim = 16000
    with tf.variable_scope('Compact_Pooling'):
        net = compact_bilinear_pooling_layer(end_points['Mixed_7c'],
                                             end_points['Mixed_7c'],
                                             output_dim, sum_pool=False)
        net = tf.reshape(net, [-1, height, width, output_dim])

    # Add logits output
    with slim.arg_scope(arg_scope):
        with slim.arg_scope([slim.batch_norm, slim.dropout],
                            is_training=is_training):
            with tf.variable_scope('Compact_Pooling/Logits'):
                net = slim.conv2d(net, 512, [3, 3], padding='SAME',
                                  scope='Conv2d_1b_3x3')
                kernel_size = [height, width]
                net = slim.avg_pool2d(
                    net, kernel_size, padding='VALID',
                    scope='AvgPool_1a_{}x{}'.format(*kernel_size))
                tf.summary.histogram('pre_logits', net)
                net = slim.dropout(net, keep_prob=dropout_keep_prob,
                                   scope='Dropout_1b')
                logits = slim.conv2d(net, num_classes, [1, 1],
                                     activation_fn=None, normalizer_fn=None,
                                     scope='Conv2d_1c_1x1')
    logits = tf.squeeze(logits, [1, 2], name='SpatialSqueeze')

    # Add loss functions (auxiliary head weighted 0.4, main head 1.0)
    tf.losses.softmax_cross_entropy(logits=end_points['AuxLogits'],
                                    onehot_labels=labels,
                                    label_smoothing=label_smoothing,
                                    weights=0.4, scope='aux_loss')
    tf.losses.softmax_cross_entropy(logits=logits, onehot_labels=labels,
                                    label_smoothing=label_smoothing,
                                    weights=1.0)
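# Note: both softmax_cross_entropy calls above register their terms in
# TensorFlow's loss collection instead of returning a kept value. A minimal
# sketch of recovering the combined objective afterwards (standard TF1/slim
# idiom; the optimizer choice and argument values are illustrative, not
# from the source):
_configure_loss_function(inputs, labels, net='inception_v3',
                         arg_scope=arg_scope, num_classes=num_classes,
                         dropout_keep_prob=0.8, label_smoothing=0.1,
                         is_training=True)
# Sums the aux loss (weight 0.4), the main loss (weight 1.0), and any
# regularization losses contributed by the arg_scope.
total_loss = tf.losses.get_total_loss()
train_op = tf.train.AdamOptimizer(1e-4).minimize(total_loss)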
        # Tail of a NumPy reference implementation of bilinear pooling;
        # the enclosing def and loop header are elided in this excerpt.
        output[n, ...] = np.outer(bottom1_flat[n],
                                  bottom2_flat[n]).reshape(-1)
    output = output.reshape((batch_size, height, width, output_dim))
    if sum_pool:
        output = np.sum(output, axis=(1, 2))
    return output

# Input and output tensors
# Input channels need to be specified for shape inference
input_dim1 = 2048
input_dim2 = 2048
output_dim = 16000
bottom1 = tf.placeholder(tf.float32, [None, None, None, input_dim1])
bottom2 = tf.placeholder(tf.float32, [None, None, None, input_dim2])
top = compact_bilinear_pooling_layer(bottom1, bottom2, output_dim,
                                     sum_pool=True)
grad = tf.gradients(top, [bottom1, bottom2])

def cbp(bottom1_value, bottom2_value):
    sess = tf.get_default_session()
    return sess.run(top, feed_dict={bottom1: bottom1_value,
                                    bottom2: bottom2_value})

def cbp_with_grad(bottom1_value, bottom2_value):
    sess = tf.get_default_session()
    return sess.run([top] + grad, feed_dict={bottom1: bottom1_value,
                                             bottom2: bottom2_value})
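# The helpers above assume a default session is active. A minimal sketch of
# calling them (batch and spatial sizes are illustrative):
with tf.Session() as sess:
    x_val = np.random.rand(2, 7, 7, input_dim1).astype(np.float32)
    y_val = np.random.rand(2, 7, 7, input_dim2).astype(np.float32)
    pooled = cbp(x_val, y_val)  # shape (2, output_dim), sum-pooled over space
    # Returns [top, d(top)/d(bottom1), d(top)/d(bottom2)]
    pooled_and_grads = cbp_with_grad(x_val, y_val)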
def cbp(x1, x2, in_size, dim, size):
    v = compact_bilinear_pooling_layer(x1, x2, in_size, dim, sum_pool=True)
    # Sum pooling yields a [batch, dim] descriptor; fix the static shape
    v.set_shape([size, dim])
    return v
def vgg_16_cbcnn(input_shape, no_classes, bilinear_output_dim, sum_pool=True,
                 weight_decay_constant=5e-4, multi_label=False,
                 weights_path=None):
    weights_regularizer = regularizers.l2(weight_decay_constant)

    # Input layer
    img_input = Input(shape=input_shape, name='spectr_input')

    # Block 1
    x = Conv2D(64, (3, 3), activation='relu', padding='same',
               name='block1_conv1',
               kernel_regularizer=weights_regularizer)(img_input)
    x = Conv2D(64, (3, 3), activation='relu', padding='same',
               name='block1_conv2',
               kernel_regularizer=weights_regularizer)(x)
    x = MaxPooling2D((2, 2), strides=(2, 2), name='block1_pool')(x)

    # Block 2
    x = Conv2D(128, (3, 3), activation='relu', padding='same',
               name='block2_conv1',
               kernel_regularizer=weights_regularizer)(x)
    x = Conv2D(128, (3, 3), activation='relu', padding='same',
               name='block2_conv2',
               kernel_regularizer=weights_regularizer)(x)
    x = MaxPooling2D((2, 2), strides=(2, 2), name='block2_pool')(x)

    # Block 3
    x = Conv2D(256, (3, 3), activation='relu', padding='same',
               name='block3_conv1',
               kernel_regularizer=weights_regularizer)(x)
    x = Conv2D(256, (3, 3), activation='relu', padding='same',
               name='block3_conv2',
               kernel_regularizer=weights_regularizer)(x)
    x = Conv2D(256, (3, 3), activation='relu', padding='same',
               name='block3_conv3',
               kernel_regularizer=weights_regularizer)(x)
    x = MaxPooling2D((2, 2), strides=(2, 2), name='block3_pool')(x)

    # Block 4
    x = Conv2D(512, (3, 3), activation='relu', padding='same',
               name='block4_conv1',
               kernel_regularizer=weights_regularizer)(x)
    x = Conv2D(512, (3, 3), activation='relu', padding='same',
               name='block4_conv2',
               kernel_regularizer=weights_regularizer)(x)
    x = Conv2D(512, (3, 3), activation='relu', padding='same',
               name='block4_conv3',
               kernel_regularizer=weights_regularizer)(x)
    x = MaxPooling2D((2, 2), strides=(2, 2), name='block4_pool')(x)

    # Block 5
    x = Conv2D(512, (3, 3), activation='relu', padding='same',
               name='block5_conv1',
               kernel_regularizer=weights_regularizer)(x)
    x = Conv2D(512, (3, 3), activation='relu', padding='same',
               name='block5_conv2',
               kernel_regularizer=weights_regularizer)(x)
    x = Conv2D(512, (3, 3), activation='relu', padding='same',
               name='block5_conv3',
               kernel_regularizer=weights_regularizer)(x)

    # Merge using the compact bilinear method (self-bilinear on the last
    # conv map). The global sum pooling is applied explicitly below, so the
    # layer itself must not sum-pool as well.
    cbp = compact_bilinear_pooling_layer(x, x, bilinear_output_dim,
                                         sum_pool=False)
    if sum_pool:
        # TensorFlow layout: axes 1 and 2 are spatial, axis 3 is channels
        x = Lambda(lambda t: K.sum(t, axis=[1, 2]))(cbp)
    else:
        x = cbp

    # Signed square root and L2-normalize the pooled descriptor
    x = Lambda(lambda t: K.sign(t) * K.sqrt(K.abs(t)))(x)
    x = Lambda(lambda t: K.l2_normalize(t, axis=-1))(x)

    # Final dense layer
    final_activation = 'sigmoid' if multi_label else 'softmax'
    x = Dense(no_classes, activation=final_activation, name='softmax_layer',
              kernel_regularizer=weights_regularizer)(x)

    # Put together input and output to form the model
    model = Model(inputs=[img_input], outputs=[x])
    if weights_path:
        model.load_weights(weights_path, by_name=True)
    return model
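# A minimal construction sketch for the CB-CNN above (hedged: the input
# shape, class count, output dimension, and training settings are
# illustrative, not from the source):
model = vgg_16_cbcnn(input_shape=(224, 224, 3), no_classes=10,
                     bilinear_output_dim=8192, sum_pool=True)
model.compile(optimizer='adam', loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()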
def init_opt(self):
    '''Build the multi-GPU training graph and the two optimizers.'''
    self.app_encoder = AppEncoder()
    with tf.device('/cpu:0'):
        self.g_lr = tf.placeholder(tf.float32, [],
                                   name='generator_learning_rate')
        self.d_lr = tf.placeholder(tf.float32, [],
                                   name='discriminator_learning_rate')
        g_opt = tf.train.AdamOptimizer(self.g_lr, beta1=0.5)
        d_opt = tf.train.AdamOptimizer(self.d_lr, beta1=0.5)

    self.models = []
    self.num_gpu = 2
    for gpu_id in range(self.num_gpu):
        with tf.device('/gpu:%d' % gpu_id):
            with tf.name_scope('tower_%d' % gpu_id):
                with tf.variable_scope('cpu_variables', reuse=gpu_id > 0):
                    image = tf.placeholder(
                        tf.float32, [self.batch_size, 224, 224, 3],
                        name='image')
                    text = tf.placeholder(
                        tf.float32, [self.batch_size, 1024], name='text')
                    label = tf.placeholder(
                        tf.float32, [self.batch_size, 200], name='label')

                    with tf.variable_scope('Inception', reuse=gpu_id > 0):
                        image_app_7x7, image_app = \
                            self.app_encoder.build(image)
                    image_app = tf.tile(image_app, [1, 7, 7, 1])
                    # Fuse local (7x7) and tiled global appearance features
                    # with compact bilinear pooling
                    image_app_cbp = tf.reshape(
                        cbp.compact_bilinear_pooling_layer(
                            image_app_7x7, image_app, 800,
                            sum_pool=False, sequential=False),
                        (self.batch_size, 7, 7, 800))
                    image_app_cbp /= tf.reduce_max(
                        image_app_cbp, axis=3, keep_dims=True)

                    text_fc_weights = tf.get_variable(
                        'text_fc_weight', [1024, 7 * 7 * 800],
                        initializer=tf.truncated_normal_initializer(
                            stddev=0.01))
                    text_fc_bias = tf.get_variable(
                        'text_fc_bias', [7 * 7 * 800],
                        initializer=tf.constant_initializer(0.0))
                    text_app = tf.nn.bias_add(
                        tf.matmul(text, text_fc_weights), text_fc_bias)
                    text_app = tf.reshape(text_app, [-1, 7, 7, 800])
                    text_app /= tf.reduce_max(text_app, axis=3,
                                              keep_dims=True)

                    with pt.defaults_scope(phase=pt.Phase.train):
                        with tf.variable_scope('g_net', reuse=gpu_id > 0):
                            fake_t2t = self.model.get_generator(
                                image_app_cbp)
                        with tf.variable_scope('g_net', reuse=True):
                            fake_s2t = self.model.get_generator(text_app)

                        with tf.variable_scope('Inception', reuse=True):
                            fake_s2t_app_7x7, fake_s2t_app = \
                                self.app_encoder.build(fake_s2t)
                            fake_t2t_app_7x7, fake_t2t_app = \
                                self.app_encoder.build(fake_t2t)
                        fake_s2t_app = tf.tile(fake_s2t_app, [1, 7, 7, 1])
                        fake_t2t_app = tf.tile(fake_t2t_app, [1, 7, 7, 1])
                        fake_s2t_app_cbp = tf.reshape(
                            cbp.compact_bilinear_pooling_layer(
                                fake_s2t_app_7x7, fake_s2t_app, 800,
                                sum_pool=False, sequential=False),
                            (self.batch_size, 7, 7, 800))
                        fake_s2t_app_cbp /= tf.reduce_max(
                            fake_s2t_app_cbp, axis=3, keep_dims=True)
                        fake_t2t_app_cbp = tf.reshape(
                            cbp.compact_bilinear_pooling_layer(
                                fake_t2t_app_7x7, fake_t2t_app, 800,
                                sum_pool=False, sequential=False),
                            (self.batch_size, 7, 7, 800))
                        fake_t2t_app_cbp /= tf.reduce_max(
                            fake_t2t_app_cbp, axis=3, keep_dims=True)
                        fake_s2t_app_pool = tf.nn.avg_pool(
                            fake_s2t_app_cbp, [1, 7, 7, 1], [1, 1, 1, 1],
                            padding='VALID')

                        fc_weights = tf.get_variable(
                            'fc_weight', [800, 200],
                            initializer=tf.truncated_normal_initializer(
                                stddev=0.01))
                        fc_bias = tf.get_variable(
                            'fc_bias', [200],
                            initializer=tf.constant_initializer(0.0))
                        image_fc = tf.reshape(
                            tf.nn.avg_pool(image_app_cbp, [1, 7, 7, 1],
                                           [1, 1, 1, 1], padding='VALID'),
                            (-1, 800))
                        image_fc = tf.nn.bias_add(
                            tf.matmul(image_fc, fc_weights), fc_bias)
                        fake_t2t_fc = tf.reshape(
                            tf.nn.avg_pool(fake_t2t_app_cbp, [1, 7, 7, 1],
                                           [1, 1, 1, 1], padding='VALID'),
                            (-1, 800))
                        fake_t2t_fc = tf.nn.bias_add(
                            tf.matmul(fake_t2t_fc, fc_weights), fc_bias)

                        real_logit = self.model.get_discriminator(image)
                        fake_s2t_logit = self.model.get_discriminator(
                            fake_s2t)
                        fake_t2t_logit = self.model.get_discriminator(
                            fake_t2t)

                    # Discriminator losses: real / fake-from-text /
                    # fake-from-image form a three-way classification
                    real_d_loss = tf.nn.softmax_cross_entropy_with_logits(
                        logits=real_logit,
                        labels=tf.constant(
                            [[1.0, 0.0, 0.0]] * self.batch_size))
                    real_d_loss = tf.reduce_mean(real_d_loss)
                    fake_s2t_d_loss = \
                        tf.nn.softmax_cross_entropy_with_logits(
                            logits=fake_s2t_logit,
                            labels=tf.constant(
                                [[0.0, 1.0, 0.0]] * self.batch_size))
                    fake_s2t_d_loss = tf.reduce_mean(fake_s2t_d_loss)
                    fake_t2t_d_loss = \
                        tf.nn.softmax_cross_entropy_with_logits(
                            logits=fake_t2t_logit,
                            labels=tf.constant(
                                [[0.0, 0.0, 1.0]] * self.batch_size))
                    fake_t2t_d_loss = tf.reduce_mean(fake_t2t_d_loss)
                    d_loss = real_d_loss + \
                        (fake_s2t_d_loss + fake_t2t_d_loss) / 2.

                    # Generator losses: both fakes should look real
                    fake_s2t_g_loss = \
                        tf.nn.softmax_cross_entropy_with_logits(
                            logits=fake_s2t_logit,
                            labels=tf.constant(
                                [[1.0, 0.0, 0.0]] * self.batch_size))
                    fake_s2t_g_loss = tf.reduce_mean(fake_s2t_g_loss)
                    fake_t2t_g_loss = \
                        tf.nn.softmax_cross_entropy_with_logits(
                            logits=fake_t2t_logit,
                            labels=tf.constant(
                                [[1.0, 0.0, 0.0]] * self.batch_size))
                    fake_t2t_g_loss = tf.reduce_mean(fake_t2t_g_loss)

                    # Feature / classification consistency losses
                    f_t_loss = tf.nn.softmax_cross_entropy_with_logits(
                        logits=image_fc, labels=label)
                    f_t_loss = tf.reduce_mean(f_t_loss)
                    f_t2t_loss = tf.nn.softmax_cross_entropy_with_logits(
                        logits=fake_t2t_fc, labels=label)
                    f_t2t_loss = tf.reduce_mean(f_t2t_loss)
                    f_s2t_loss = tf.abs(fake_s2t_app_pool - text_app)
                    f_s2t_loss = tf.reduce_mean(f_s2t_loss)
                    debug_loss1 = f_t_loss
                    debug_loss2 = f_t2t_loss
                    debug_loss3 = f_s2t_loss
                    f_loss = (f_t_loss + f_t2t_loss + f_s2t_loss) / 3.
                    g_loss = (fake_s2t_g_loss + fake_t2t_g_loss) / 2. + \
                        10. * f_loss

                    t_vars = tf.trainable_variables()
                    g_train_vars = []
                    d_train_vars = []
                    for var in t_vars:
                        if var.name.startswith('d_'):
                            d_train_vars.append(var)
                        else:
                            g_train_vars.append(var)
                    d_grad = d_opt.compute_gradients(
                        d_loss, var_list=d_train_vars)
                    g_grad = g_opt.compute_gradients(
                        g_loss, var_list=g_train_vars)
                    self.models.append(
                        (image, text, label, fake_s2t, fake_t2t, g_loss,
                         d_loss, g_grad, d_grad, debug_loss1, debug_loss2,
                         debug_loss3))
                    print('build model on gpu tower done')

    # Average losses and gradients across towers
    (_, _, _, _, _, tower_g_loss, tower_d_loss, tower_g_grad, tower_d_grad,
     loss1, loss2, loss3) = zip(*self.models)
    self.aver_d_loss = tf.reduce_mean(tower_d_loss)
    self.aver_g_loss = tf.reduce_mean(tower_g_loss)
    self.d_op = d_opt.apply_gradients(average_gradients(tower_d_grad))
    self.g_op = g_opt.apply_gradients(average_gradients(tower_g_grad))
    self.loss1 = tf.reduce_mean(loss1)
    self.loss2 = tf.reduce_mean(loss2)
    self.loss3 = tf.reduce_mean(loss3)
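# A rough sketch of one training step for this multi-tower graph (hedged:
# `trainer` is the object built above, `sess` an active session, and
# `next_batch()` a hypothetical data loader; learning rates illustrative):
feeds = {trainer.g_lr: 2e-4, trainer.d_lr: 2e-4}
for model_tuple in trainer.models:
    image_ph, text_ph, label_ph = model_tuple[:3]
    img_batch, txt_batch, lbl_batch = next_batch()  # hypothetical loader
    feeds.update({image_ph: img_batch, text_ph: txt_batch,
                  label_ph: lbl_batch})
_, d_loss_val = sess.run([trainer.d_op, trainer.aver_d_loss],
                         feed_dict=feeds)
_, g_loss_val = sess.run([trainer.g_op, trainer.aver_g_loss],
                         feed_dict=feeds)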
def network_fn(images):
    with slim.arg_scope(arg_scope):
        frames_per_video = 1  # same for single-image datasets
        if images.get_shape().ndims == 5:
            im_shape = images.get_shape().as_list()
            frames_per_video = im_shape[1]
            images = tf.reshape(
                images, [-1, im_shape[-3], im_shape[-2], im_shape[-1]])

        # Main network function
        kwargs = {}
        if cfg.NET.DROPOUT >= 0:  # if -1, just ignore it and use nw default
            kwargs['dropout_keep_prob'] = (1 - cfg.NET.DROPOUT)
        logits, end_points = func(images, num_classes,
                                  is_training=is_training,
                                  train_top_bn=cfg.NET.TRAIN_TOP_BN,
                                  **kwargs)

        # rgirdhar: add another end point for heatmap prediction
        try:
            last_conv = end_points[last_conv_map[name]]
        except KeyError:
            raise ValueError(
                'End point {} not found. Choose from: {}'.format(
                    last_conv_map[name], ' '.join(end_points)))
        random_normal = lambda stddev: tf.random_normal_initializer(
            0.0, stddev)
        with slim.arg_scope([slim.dropout], is_training=is_training,
                            keep_prob=0.2 if cfg.NET.DROPOUT < 0 else
                            (1.0 - cfg.NET.DROPOUT)):
            with tf.variable_scope('PoseLogits'):
                last_conv_pose_name = getattr(
                    cfg.NET.LAST_CONV_MAP_FOR_POSE, name)
                last_conv_pose = end_points[last_conv_pose_name]
                pose_pre_logits = slim.conv2d(
                    last_conv_pose, 768, [1, 1],
                    weights_initializer=random_normal(0.001),
                    activation_fn=tf.nn.relu,
                    normalizer_fn=None,
                    biases_initializer=tf.zeros_initializer(),
                    padding='SAME', scope='ExtraConv2d_1x1')
                pose_logits = slim.conv2d(
                    pose_pre_logits, num_pose_keypoints, [1, 1],
                    activation_fn=None, normalizer_fn=None,
                    scope='Conv2d_1c_1x1')
                end_points['PoseLogits'] = pose_logits

            if cfg.NET.USE_POSE_ATTENTION_LOGITS:
                with tf.variable_scope('PoseAttention'):
                    # Use the pose prediction as an attention map over the
                    # features. Step 1: split pose logits over channels
                    pose_logits_parts = tf.split(
                        pose_logits,
                        pose_logits.get_shape().as_list()[-1],
                        axis=pose_logits.get_shape().ndims - 1)
                    part_logits = []
                    # Allows choosing which pose dims to use as heatmaps
                    parts_to_use = pose_logits_parts
                    if cfg.NET.USE_POSE_ATTENTION_LOGITS_DIMS != [-1]:
                        parts_to_use = (np.array(pose_logits_parts)[
                            cfg.NET.USE_POSE_ATTENTION_LOGITS_DIMS]).tolist()
                    tf.logging.info(
                        'Using {} parts for pose attention logits'.format(
                            len(parts_to_use)))
                    for part in parts_to_use:
                        part_logits.append(
                            tf.reduce_mean(part * last_conv, axis=[1, 2],
                                           keep_dims=True))
                    if cfg.NET.USE_POSE_ATTENTION_LOGITS_AVGED_HMAP:
                        part_logits.append(
                            tf.reduce_mean(
                                last_conv * tf.reduce_mean(
                                    pose_logits, axis=-1, keep_dims=True),
                                axis=[1, 2], keep_dims=True))
                    part_logits.append(
                        tf.reduce_mean(last_conv, axis=[1, 2],
                                       keep_dims=True))
                    net = tf.concat(part_logits, axis=-1)
                    net = slim.dropout(net)
                    logits = slim.conv2d(
                        net, num_classes, [1, 1],
                        weights_initializer=random_normal(0.001),
                        biases_initializer=tf.zeros_initializer(),
                        activation_fn=None, normalizer_fn=None)
            elif cfg.NET.USE_POSE_LOGITS_DIRECTLY:
                with tf.variable_scope('ActionFromPose'):
                    net = tf.reduce_mean(pose_pre_logits, axis=[1, 2],
                                         keep_dims=True)
                    net = slim.conv2d(
                        net, 768, [1, 1], normalizer_fn=None,
                        weights_initializer=random_normal(0.001),
                        biases_initializer=tf.zeros_initializer())
                    if cfg.NET.USE_POSE_LOGITS_DIRECTLY_PLUS_LOGITS:
                        net = tf.concat([
                            net,
                            tf.reduce_mean(last_conv, axis=[1, 2],
                                           keep_dims=True)
                        ], axis=-1)
                    net = slim.dropout(net)
                    logits = slim.conv2d(
                        net, num_classes, [1, 1],
                        weights_initializer=random_normal(0.001),
                        biases_initializer=tf.zeros_initializer(),
                        activation_fn=None, normalizer_fn=None)
            elif cfg.NET.USE_POSE_LOGITS_DIRECTLY_v2:
                with tf.variable_scope('ActionFromPose_v2'):
                    net = tf.concat([pose_pre_logits, last_conv], axis=-1)
                    if cfg.NET.USE_POSE_LOGITS_DIRECTLY_v2_EXTRA_LAYER:
                        net = tf.nn.relu(net)
                        net = slim.conv2d(
                            net, net.get_shape().as_list()[-1], [1, 1],
                            weights_initializer=random_normal(0.001),
                            biases_initializer=tf.zeros_initializer())
                    net = tf.reduce_mean(net, axis=[1, 2], keep_dims=True)
                    net = slim.dropout(net)
                    logits = slim.conv2d(
                        net, num_classes, [1, 1],
                        weights_initializer=random_normal(0.001),
                        biases_initializer=tf.zeros_initializer(),
                        activation_fn=None, normalizer_fn=None)
            elif cfg.NET.USE_COMPACT_BILINEAR_POOLING:
                # Self compact bilinear pooling over the last conv map
                last_conv_shape = last_conv.get_shape().as_list()
                net = compact_bilinear_pooling_layer(
                    last_conv, last_conv, last_conv_shape[-1])
                net.set_shape([last_conv_shape[0], last_conv_shape[-1]])
                net = tf.expand_dims(tf.expand_dims(net, 1), 1)
                net = slim.dropout(net)
                logits = slim.conv2d(
                    net, num_classes, [1, 1],
                    weights_initializer=random_normal(0.001),
                    biases_initializer=tf.zeros_initializer(),
                    activation_fn=None, normalizer_fn=None)
            elif cfg.NET.USE_POSE_PRELOGITS_BASED_ATTENTION:
                with tf.variable_scope('PosePrelogitsBasedAttention'):
                    # If the following is set, just train on top of image
                    # features; don't add the prelogits at all. This was
                    # useful as pose seemed to not help with it at all.
                    if cfg.NET.USE_POSE_PRELOGITS_BASED_ATTENTION_SINGLE_LAYER_ATT:
                        net = last_conv
                    else:
                        net = pose_pre_logits
                    # Multiple maps didn't seem to help, so nMaps was kept
                    # at 1; for the NIPS 2017 rebuttal, per-class attention
                    # was added back behind the config flag below.
                    nMaps = (num_classes if
                             cfg.NET.USE_POSE_PRELOGITS_BASED_ATTENTION_PER_CLASS
                             else 1)
                    all_att_logits = []
                    for rank_id in range(
                            cfg.NET.USE_POSE_PRELOGITS_BASED_ATTENTION_RANK):
                        scope_name = 'Conv2d_PrePose_Attn'
                        if rank_id >= 1:
                            scope_name += str(rank_id)
                        net = slim.conv2d(
                            net, nMaps, [1, 1],
                            weights_initializer=random_normal(0.001),
                            biases_initializer=tf.zeros_initializer(),
                            activation_fn=None, normalizer_fn=None,
                            scope=scope_name)
                        all_att_logits.append(net)
                    if len(all_att_logits) > 1:
                        attention_logits = tf.stack(all_att_logits, axis=-1)
                    else:
                        attention_logits = all_att_logits[0]
                    if cfg.NET.USE_POSE_PRELOGITS_BASED_ATTENTION_SOFTMAX_ATT:
                        # Bring the channels earlier to make softmax easier
                        attention_logits = tf.transpose(
                            attention_logits, [0, 3, 1, 2])
                        att_shape = attention_logits.get_shape().as_list()
                        attention_logits = tf.reshape(
                            attention_logits, [att_shape[0], att_shape[1], -1])
                        attention_logits = tf.nn.softmax(attention_logits)
                        attention_logits = tf.reshape(
                            attention_logits, att_shape)
                        attention_logits = tf.transpose(
                            attention_logits, [0, 2, 3, 1])
                    if cfg.NET.USE_POSE_PRELOGITS_BASED_ATTENTION_RELU_ATT:
                        attention_logits = tf.nn.relu(attention_logits)
                    end_points['PosePrelogitsBasedAttention'] = \
                        attention_logits
                    if cfg.NET.USE_POSE_PRELOGITS_BASED_ATTENTION_WITH_POSE_FEAT:
                        if cfg.NET.USE_POSE_PRELOGITS_BASED_ATTENTION_WITH_POSE_FEAT_2LAYER:
                            pose_logits = slim.conv2d(
                                pose_logits,
                                pose_logits.get_shape()[-1], [1, 1],
                                weights_initializer=random_normal(0.001),
                                biases_initializer=tf.zeros_initializer())
                        last_conv = tf.concat([last_conv, pose_logits],
                                              axis=-1)
                    last_conv = slim.dropout(last_conv)
                    # Top-down attention
                    all_logits = []
                    for _ in range(
                            cfg.NET.USE_POSE_PRELOGITS_BASED_ATTENTION_RANK):
                        logits = slim.conv2d(
                            last_conv, num_classes, [1, 1],
                            weights_initializer=random_normal(0.001),
                            biases_initializer=tf.zeros_initializer(),
                            activation_fn=None, normalizer_fn=None)
                        all_logits.append(logits)
                    if len(all_logits) > 1:
                        logits = tf.stack(all_logits, axis=-1)
                    else:
                        logits = all_logits[0]
                    end_points['TopDownAttention'] = logits
                    # Weight the class logits by the attention maps and pool
                    # over space (vectorized form of per-map unstacking)
                    logits = tf.reduce_mean(attention_logits * logits,
                                            axis=[1, 2], keep_dims=True)
                    if logits.get_shape().ndims == 5:  # i.e. rank was > 1
                        logits = tf.reduce_sum(logits, axis=-1)

        # This just protects against the case where none of the above ran
        # and we got the original logits from the network, which have
        # already been squeezed, or in case of VGG-16, passed through fc
        # layers.
        if logits.get_shape().ndims > 2:
            logits = tf.squeeze(logits, [1, 2], name='SpatialSqueeze')
        end_points['Logits'] = logits
        if frames_per_video > 1:
            with tf.name_scope('FramePooling'):
                # for now stick with avg pool
                end_points['logits_beforePool'] = logits
                old_logits = logits
                logits = tf.stack([
                    el for el in tf.split(
                        old_logits,
                        int(old_logits.get_shape().as_list()[0] /
                            frames_per_video))
                ])
                if cfg.NET.USE_TEMPORAL_ATT:
                    with tf.variable_scope('TemporalAttention'):
                        logits = tf.expand_dims(logits, axis=-2)  # [bs, 3, 1, nc]
                        logits_att = slim.conv2d(
                            logits, 1, [1, 1],
                            weights_initializer=random_normal(0.001),
                            biases_initializer=tf.constant_initializer(
                                1.0 / logits.get_shape().as_list()[1]),
                            activation_fn=None, normalizer_fn=None)
                        logits = logits * logits_att
                        logits = tf.squeeze(logits, axis=-2)
                        end_points['TemporalAttention'] = logits_att
                logits = tf.reduce_mean(logits, axis=1)
    return logits, end_points
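# network_fn closes over arg_scope, func, cfg, num_classes, etc., so it is
# presumably produced by a factory. A rough usage sketch under that
# assumption (`get_network_fn` is a hypothetical name for that factory;
# shapes are illustrative):
videos = tf.placeholder(tf.float32, [8, 3, 224, 224, 3])  # 8 videos, 3 frames
network_fn = get_network_fn('inception_v3', num_classes=40, is_training=True)
logits, end_points = network_fn(videos)
# logits: [8, num_classes] after per-video frame pooling;
# end_points['PoseLogits'] holds the keypoint heatmap head.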
                              padding='SAME')
    mnist_relu2 = tf.nn.relu(
        tf.nn.bias_add(mnist_conv2, f_biases['conv_2_biases']))
    mnist_pool2 = tf.nn.max_pool(mnist_relu2, ksize=[1, 2, 2, 1],
                                 strides=[1, 2, 2, 1], padding='SAME')
    # Global average pool, tiled back to 7x7 so it can be fused with the
    # local features via compact bilinear pooling
    mnist_gpool = tf.nn.avg_pool(mnist_pool2, ksize=[1, 7, 7, 1],
                                 strides=[1, 1, 1, 1], padding='VALID')
    mnist_gpool = tf.tile(mnist_gpool, [1, 7, 7, 1])
    mnist_cbp = tf.reshape(
        cbp.compact_bilinear_pooling_layer(
            mnist_pool2, mnist_gpool, 128,
            sum_pool=False, sequential=False),
        [batch_size, 7, 7, 128]) / 10000.
    mnist_flatten = tf.reshape(mnist_cbp, [batch_size, 7 * 7 * 128])
    mnist_fc1 = tf.nn.relu(
        tf.matmul(mnist_flatten, f_weights['fc1_weights']) +
        f_biases['fc1_biases'])
    mnist_fc2 = tf.nn.relu(
        tf.matmul(mnist_fc1, f_weights['fc2_weights']) +
        f_biases['fc2_biases'])

with tf.variable_scope("LeNet", reuse=True):
    svhn_conv1 = tf.nn.conv2d(svhn, f_weights['conv_1_weights'],
                              strides=[1, 1, 1, 1], padding='SAME')
    svhn_relu1 = tf.nn.relu(
    # Tail of the NumPy reference implementation of bilinear pooling; the
    # enclosing def is elided in this excerpt.
    for n in xrange(len(output)):
        output[n, ...] = np.outer(bottom1_flat[n],
                                  bottom2_flat[n]).reshape(-1)
    output = output.reshape((batch_size, height, width, output_dim))
    if sum_pool:
        output = np.sum(output, axis=(1, 2))
    return output

# Input and output tensors
# Input channels need to be specified for shape inference
input_dim1 = 2048
input_dim2 = 2048
output_dim = 16000
bottom1 = tf.placeholder(tf.float32, [None, None, None, input_dim1])
bottom2 = tf.placeholder(tf.float32, [None, None, None, input_dim2])
top = compact_bilinear_pooling_layer(bottom1, bottom2, output_dim,
                                     sum_pool=True)

def cbp(bottom1_value, bottom2_value):
    sess = tf.get_default_session()
    return sess.run(top, feed_dict={bottom1: bottom1_value,
                                    bottom2: bottom2_value})

def run_kernel_approximation_test(batch_size, height, width):
    # Input values
    x = np.random.rand(batch_size, height, width,
                       input_dim1).astype(np.float32)
    y = np.random.rand(batch_size, height, width,
                       input_dim2).astype(np.float32)
    z = np.random.rand(batch_size, height, width,
                       input_dim1).astype(np.float32)
    w = np.random.rand(batch_size, height, width,
                       input_dim2).astype(np.float32)

    # Compact bilinear pooling results
    cbp_xy = cbp(x, y)
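    # The excerpt cuts off before the actual comparison. The point of the
    # kernel-approximation test in Gao et al.'s Compact Bilinear Pooling is
    # that inner products of the sketched features approximate inner
    # products of the exact outer-product features. A hedged sketch of how
    # the check might continue, assuming the elided NumPy reference
    # function above is named `bp`:
    cbp_zw = cbp(z, w)
    # Exact bilinear pooling results from the NumPy reference
    bp_xy = bp(x, y)
    bp_zw = bp(z, w)
    # Count Sketch property: <CBP(x,y), CBP(z,w)> ~= <bp(x,y), bp(z,w)>
    approx = np.sum(cbp_xy * cbp_zw, axis=-1)
    exact = np.sum(bp_xy * bp_zw, axis=-1)
    print('relative error:', np.abs(approx - exact) / np.abs(exact))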