    def Prediction_network(self):
        RGB_inputs = tf.keras.Input(shape=(self.inputH, self.inputW,
                                           self.channel))
        Noise_inputs = tf.keras.Input(shape=(self.inputH, self.inputW,
                                             self.channel))

        RGB_outputs = self.RGB_net(RGB_inputs)
        Noise_outputs = self.Noise_net(Noise_inputs)

        cbp = compact_bilinear_pooling_layer(RGB_outputs, Noise_outputs, 256)
        cbp_flat = tf.keras.layers.Flatten()(cbp)
        fc_1 = tf.keras.layers.Dense(256,
                                     activation="relu",
                                     kernel_initializer='he_normal')(cbp_flat)
        fc_1_dropout = tf.keras.layers.Dropout(0.5)(fc_1)
        fc_2 = tf.keras.layers.Dense(
            256, activation="relu",
            kernel_initializer='he_normal')(fc_1_dropout)
        fc_2_dropout = tf.keras.layers.Dropout(0.5)(fc_2)

        outputs = tf.keras.layers.Dense(2, activation="softmax")(fc_2_dropout)

        model = tf.keras.Model(inputs=[RGB_inputs, Noise_inputs],
                               outputs=outputs)
        return model
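
A minimal usage sketch for the two-stream model above; `detector` stands in for an instance of the surrounding class, and the training settings are illustrative assumptions, not part of the original code:

model = detector.Prediction_network()
model.compile(optimizer=tf.keras.optimizers.Adam(1e-4),
              loss='categorical_crossentropy',
              metrics=['accuracy'])
# Two parallel inputs: the RGB image and its noise residual
# model.fit([rgb_batch, noise_batch], onehot_labels, batch_size=32)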
Code example #2
def configure_inference_and_loss(inputs, labels, net, arg_scope, num_classes,
                                 is_training):
    with slim.arg_scope(arg_scope):
        # Load inception_v3 as model 1
        with tf.variable_scope('model_1'):
            _, end_points_1 = nets_factory.networks_map[net](
                inputs=inputs,
                num_classes=num_classes,
                is_training=is_training)

        # Load inception_v3 as model 2
        with tf.variable_scope('model_2'):
            _, end_points_2 = nets_factory.networks_map[net](
                inputs=inputs,
                num_classes=num_classes,
                is_training=is_training)
    # Add Compact Pooling Layer
    height, width = end_points_1['Mixed_7c'].get_shape().as_list()[1:3]
    output_dim = 16000
    with tf.variable_scope('Compact_Pooling'):
        net = compact_bilinear_pooling_layer(end_points_1['Mixed_7c'],
                                             end_points_2['Mixed_7c'],
                                             output_dim,
                                             sum_pool=False)
        net = tf.reshape(net, [-1, height, width, output_dim])

    # Add Logits output
    with slim.arg_scope(arg_scope):
        with slim.arg_scope([slim.batch_norm, slim.dropout],
                            is_training=is_training):
            with tf.variable_scope('Compact_Pooling/Logits'):
                net = slim.conv2d(net,
                                  512, [3, 3],
                                  padding='SAME',
                                  scope='Conv2d_1b_3x3')
                kernel_size = [height, width]
                net = slim.avg_pool2d(
                    net,
                    kernel_size,
                    padding='VALID',
                    scope='AvgPool_1a_{}x{}'.format(*kernel_size))
                tf.summary.histogram('pre_logits', net)
                net = slim.dropout(net,
                                   scope='Dropout_1b',
                                   is_training=is_training)
                logits = slim.conv2d(net,
                                     num_classes, [1, 1],
                                     activation_fn=None,
                                     normalizer_fn=None,
                                     scope='Conv2d_1c_1x1')
                logits = tf.squeeze(logits, [1, 2], name='SpatialSqueeze')
                predictions = slim.softmax(logits, scope='Predictions')

    loss = tf.losses.softmax_cross_entropy(logits=logits,
                                           onehot_labels=labels,
                                           label_smoothing=0.,
                                           weights=1.0)
    return predictions, loss
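
A hedged sketch of how this function could be wired up; nets_factory here is TF-Slim's model zoo module (which provides networks_map and arg_scopes_map), and the shapes and class count are assumptions:

num_classes = 200
arg_scope = nets_factory.arg_scopes_map['inception_v3'](weight_decay=4e-5)
images = tf.placeholder(tf.float32, [None, 299, 299, 3])
labels = tf.placeholder(tf.float32, [None, num_classes])
predictions, loss = configure_inference_and_loss(
    images, labels, 'inception_v3', arg_scope, num_classes, is_training=True)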
Code example #3
def _configure_loss_function(inputs, labels, net, arg_scope, num_classes, 
                             dropout_keep_prob, label_smoothing, is_training):
    with slim.arg_scope(arg_scope):
        _, end_points = nets_factory.networks_map[net](
            inputs=inputs,
            num_classes=num_classes,
            dropout_keep_prob=dropout_keep_prob,
            is_training=is_training
        )
    # Add Compact Pooling Layer
    height, width = end_points['Mixed_7c'].get_shape().as_list()[1:3]
    output_dim = 16000
    with tf.variable_scope('Compact_Pooling'):
        net = compact_bilinear_pooling_layer(end_points['Mixed_7c'], end_points['Mixed_7c'], output_dim, sum_pool=False)
        net = tf.reshape(net, [-1, height, width, output_dim])

    # Add Logits output
    with slim.arg_scope(arg_scope):
        with slim.arg_scope([slim.batch_norm, slim.dropout], is_training=is_training):
            with tf.variable_scope('Compact_Pooling/Logits'):
                net = slim.conv2d(net, 512, [3, 3], padding='SAME', scope='Conv2d_1b_3x3')
                kernel_size = [height, width]
                net = slim.avg_pool2d(net, kernel_size, padding='VALID',
                                      scope='AvgPool_1a_{}x{}'.format(*kernel_size))
                tf.summary.histogram('pre_logits', net)
                net = slim.dropout(net, keep_prob=dropout_keep_prob, scope='Dropout_1b')
                logits = slim.conv2d(net, num_classes, [1, 1], activation_fn=None,
                                     normalizer_fn=None, scope='Conv2d_1c_1x1')
                logits = tf.squeeze(logits, [1, 2], name='SpatialSqueeze')


    # Add loss function
    tf.losses.softmax_cross_entropy(logits=end_points['AuxLogits'], onehot_labels=labels,
                                    label_smoothing=label_smoothing, weights=0.4, scope='aux_loss')

    tf.losses.softmax_cross_entropy(logits=logits, onehot_labels=labels, 
                                    label_smoothing=label_smoothing, weights=1.0)
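
Note that, unlike code example #2, this variant returns nothing: both cross-entropy terms are only registered in TensorFlow's loss collection, so the caller would typically recover the objective with the standard tf.losses API (a sketch):

total_loss = tf.losses.get_total_loss()  # main loss + aux loss + regularizers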
Code example #4
import numpy as np
import tensorflow as tf

# Assumed import; the layer ships in the reference compact_bilinear_pooling
# module.
from compact_bilinear_pooling import compact_bilinear_pooling_layer


# NumPy reference implementation of full (uncompressed) bilinear pooling,
# used to sanity-check the compact layer. The opening of the function is
# reconstructed from context, since the excerpt begins mid-loop.
def bp(bottom1, bottom2, sum_pool=True):
    assert bottom1.shape[:3] == bottom2.shape[:3]
    batch_size, height, width = bottom1.shape[:3]
    output_dim = bottom1.shape[-1] * bottom2.shape[-1]

    bottom1_flat = bottom1.reshape((-1, bottom1.shape[-1]))
    bottom2_flat = bottom2.reshape((-1, bottom2.shape[-1]))
    output = np.empty((batch_size * height * width, output_dim), np.float32)
    for n in range(len(output)):
        output[n, ...] = np.outer(bottom1_flat[n], bottom2_flat[n]).reshape(-1)
    output = output.reshape((batch_size, height, width, output_dim))

    if sum_pool:
        output = np.sum(output, axis=(1, 2))
    return output


# Input and output tensors
# Input channels need to be specified for shape inference
input_dim1 = 2048
input_dim2 = 2048
output_dim = 16000
bottom1 = tf.placeholder(tf.float32, [None, None, None, input_dim1])
bottom2 = tf.placeholder(tf.float32, [None, None, None, input_dim2])
top = compact_bilinear_pooling_layer(bottom1, bottom2, output_dim, sum_pool=True)
grad = tf.gradients(top, [bottom1, bottom2])


def cbp(bottom1_value, bottom2_value):
    sess = tf.get_default_session()
    return sess.run(top, feed_dict={bottom1: bottom1_value,
                                    bottom2: bottom2_value})


def cbp_with_grad(bottom1_value, bottom2_value):
    sess = tf.get_default_session()
    return sess.run([top] + grad, feed_dict={bottom1: bottom1_value,
                                             bottom2: bottom2_value})
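
A short hedged driver for the helpers above; the session handling and random inputs are illustrative. A tf.Session used as a context manager also installs itself as the default session, which is what cbp and cbp_with_grad rely on:

with tf.Session():
    x = np.random.rand(2, 7, 7, input_dim1).astype(np.float32)
    y = np.random.rand(2, 7, 7, input_dim2).astype(np.float32)
    pooled = cbp(x, y)  # shape (2, output_dim)
    pooled_again, grad1, grad2 = cbp_with_grad(x, y)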

Code example #5
def cbp(x1, x2, in_size, dim, size):
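    # The pooled tensor comes back with an undefined static shape, so the
    # batch size and output dimension are pinned back on with set_shape.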
    v = compact_bilinear_pooling_layer(x1, x2, in_size, dim, sum_pool=True)
    v.set_shape([size, dim])

    return v
Code example #6
def vgg_16_cbcnn(input_shape,
                 no_classes,
                 bilinear_output_dim,
                 sum_pool=True,
                 weight_decay_constant=5e-4,
                 multi_label=False,
                 weights_path=None):

    weights_regularizer = regularizers.l2(weight_decay_constant)

    # Input layer
    img_input = Input(shape=input_shape, name='spectr_input')

    # Block 1
    x = Conv2D(64, (3, 3),
               activation='relu',
               padding='same',
               name='block1_conv1',
               kernel_regularizer=weights_regularizer)(img_input)
    x = Conv2D(64, (3, 3),
               activation='relu',
               padding='same',
               name='block1_conv2',
               kernel_regularizer=weights_regularizer)(x)
    x = MaxPooling2D((2, 2), strides=(2, 2), name='block1_pool')(x)

    # Block 2
    x = Conv2D(128, (3, 3),
               activation='relu',
               padding='same',
               name='block2_conv1',
               kernel_regularizer=weights_regularizer)(x)
    x = Conv2D(128, (3, 3),
               activation='relu',
               padding='same',
               name='block2_conv2',
               kernel_regularizer=weights_regularizer)(x)
    x = MaxPooling2D((2, 2), strides=(2, 2), name='block2_pool')(x)

    # Block 3
    x = Conv2D(256, (3, 3),
               activation='relu',
               padding='same',
               name='block3_conv1',
               kernel_regularizer=weights_regularizer)(x)
    x = Conv2D(256, (3, 3),
               activation='relu',
               padding='same',
               name='block3_conv2',
               kernel_regularizer=weights_regularizer)(x)
    x = Conv2D(256, (3, 3),
               activation='relu',
               padding='same',
               name='block3_conv3',
               kernel_regularizer=weights_regularizer)(x)
    x = MaxPooling2D((2, 2), strides=(2, 2), name='block3_pool')(x)

    # Block 4
    x = Conv2D(512, (3, 3),
               activation='relu',
               padding='same',
               name='block4_conv1',
               kernel_regularizer=weights_regularizer)(x)
    x = Conv2D(512, (3, 3),
               activation='relu',
               padding='same',
               name='block4_conv2',
               kernel_regularizer=weights_regularizer)(x)
    x = Conv2D(512, (3, 3),
               activation='relu',
               padding='same',
               name='block4_conv3',
               kernel_regularizer=weights_regularizer)(x)
    x = MaxPooling2D((2, 2), strides=(2, 2), name='block4_pool')(x)

    # Block 5
    x = Conv2D(512, (3, 3),
               activation='relu',
               padding='same',
               name='block5_conv1',
               kernel_regularizer=weights_regularizer)(x)
    x = Conv2D(512, (3, 3),
               activation='relu',
               padding='same',
               name='block5_conv2',
               kernel_regularizer=weights_regularizer)(x)
    x = Conv2D(512, (3, 3),
               activation='relu',
               padding='same',
               name='block5_conv3',
               kernel_regularizer=weights_regularizer)(x)

    # Merge the block5 features with themselves using the compact bilinear
    # method; spatial pooling is applied below so the sum_pool flag is
    # honoured
    cbp = compact_bilinear_pooling_layer(x,
                                         x,
                                         bilinear_output_dim,
                                         sum_pool=False)

    # If sum_pool=True, do a global sum pooling; in TF's NHWC layout,
    # axes 1 and 2 are the spatial dimensions
    if sum_pool:
        x = Lambda(lambda x: K.sum(x, axis=[1, 2]))(cbp)
    else:
        x = cbp

    # Sign sqrt and L2 normalize result
    x = Lambda(lambda x: K.sign(x) * K.sqrt(K.abs(x)))(x)
    x = Lambda(lambda x: K.l2_normalize(x, axis=-1))(x)

    # final dense layer
    if not multi_label:
        final_activation = 'softmax'
    else:
        final_activation = 'sigmoid'
    x = Dense(no_classes,
              activation=final_activation,
              name='softmax_layer',
              kernel_regularizer=weights_regularizer)(x)

    # Put together input and output to form model
    model = Model(inputs=[img_input], outputs=[x])
    if weights_path:
        model.load_weights(weights_path, by_name=True)
    return model
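
A hedged instantiation of the bilinear VGG model above; the input shape, class count and bilinear output dimension are illustrative assumptions:

model = vgg_16_cbcnn(input_shape=(224, 224, 3),
                     no_classes=28,
                     bilinear_output_dim=8192,
                     sum_pool=True)
model.compile(optimizer='adam', loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()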
Code example #7
    def init_opt(self):
        '''Build the multi-GPU training graph: losses, gradients and update ops.'''

        self.app_encoder = AppEncoder()

        with tf.device('/cpu:0'):
            self.g_lr = tf.placeholder(tf.float32, [],
                                       name='generator_learning_rate')
            self.d_lr = tf.placeholder(tf.float32, [],
                                       name='discriminator_learning_rate')

            g_opt = tf.train.AdamOptimizer(self.g_lr, beta1=0.5)
            d_opt = tf.train.AdamOptimizer(self.d_lr, beta1=0.5)
            self.models = []
            self.num_gpu = 2

            for gpu_id in range(self.num_gpu):
                with tf.device('/gpu:%d' % gpu_id):
                    with tf.name_scope('tower_%d' % gpu_id):
                        with tf.variable_scope('cpu_variables',
                                               reuse=gpu_id > 0):
                            image = tf.placeholder(
                                tf.float32, [self.batch_size, 224, 224, 3],
                                name='image')

                            text = tf.placeholder(tf.float32,
                                                  [self.batch_size, 1024],
                                                  name='text')

                            label = tf.placeholder(tf.float32,
                                                   [self.batch_size, 200],
                                                   name="label")

                            with tf.variable_scope("Inception",
                                                   reuse=gpu_id > 0):
                                image_app_7x7, image_app =\
                                    self.app_encoder.build(image)
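                            # Broadcast the global appearance vector over the
                            # 7x7 grid, fuse it with the spatial features via
                            # compact bilinear pooling, and rescale each
                            # location by its channel-wise max.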
                            image_app = tf.tile(image_app, [1, 7, 7, 1])

                            image_app_cbp = tf.reshape(
                                cbp.compact_bilinear_pooling_layer(
                                    image_app_7x7,
                                    image_app,
                                    800,
                                    sum_pool=False,
                                    sequential=False),
                                (self.batch_size, 7, 7, 800))
                            image_app_cbp /= tf.reduce_max(image_app_cbp,
                                                           axis=3,
                                                           keep_dims=True)

                            text_fc_weights = tf.get_variable(
                                "text_fc_weight", [1024, 7 * 7 * 800],
                                initializer=tf.truncated_normal_initializer(
                                    stddev=0.01))
                            text_fc_bias = tf.get_variable(
                                "text_fc_bias", [7 * 7 * 800],
                                initializer=tf.constant_initializer(0.0))
                            text_app = tf.nn.bias_add(
                                tf.matmul(text, text_fc_weights), text_fc_bias)
                            text_app = tf.reshape(text_app, [-1, 7, 7, 800])
                            text_app /= tf.reduce_max(text_app,
                                                      axis=3,
                                                      keep_dims=True)

                            with pt.defaults_scope(phase=pt.Phase.train):
                                with tf.variable_scope("g_net",
                                                       reuse=gpu_id > 0):
                                    fake_t2t = self.model.get_generator(
                                        image_app_cbp)
                                with tf.variable_scope("g_net", reuse=True):
                                    fake_s2t = self.model.get_generator(
                                        text_app)

                            with tf.variable_scope("Inception", reuse=True):
                                fake_s2t_app_7x7, fake_s2t_app =\
                                        self.app_encoder.build(fake_s2t)
                                fake_t2t_app_7x7, fake_t2t_app = \
                                        self.app_encoder.build(fake_t2t)

                            fake_s2t_app = tf.tile(fake_s2t_app, [1, 7, 7, 1])
                            fake_t2t_app = tf.tile(fake_t2t_app, [1, 7, 7, 1])

                            fake_s2t_app_cbp = tf.reshape(
                                cbp.compact_bilinear_pooling_layer(
                                    fake_s2t_app_7x7,
                                    fake_s2t_app,
                                    800,
                                    sum_pool=False,
                                    sequential=False),
                                (self.batch_size, 7, 7, 800))
                            fake_s2t_app_cbp /= tf.reduce_max(fake_s2t_app_cbp,
                                                              axis=3,
                                                              keep_dims=True)
                            fake_t2t_app_cbp = tf.reshape(
                                cbp.compact_bilinear_pooling_layer(
                                    fake_t2t_app_7x7,
                                    fake_t2t_app,
                                    800,
                                    sum_pool=False,
                                    sequential=False),
                                (self.batch_size, 7, 7, 800))
                            fake_t2t_app_cbp /= tf.reduce_max(fake_t2t_app_cbp,
                                                              axis=3,
                                                              keep_dims=True)

                            fake_s2t_app_pool = tf.nn.avg_pool(
                                fake_s2t_app_cbp, [1, 7, 7, 1], [1, 1, 1, 1],
                                padding='VALID')

                            fc_weights = tf.get_variable(
                                "fc_weight", [800, 200],
                                initializer=tf.truncated_normal_initializer(
                                    stddev=0.01))
                            fc_bias = tf.get_variable(
                                "fc_bias", [200],
                                initializer=tf.constant_initializer(0.0))

                            image_fc = tf.reshape(
                                tf.nn.avg_pool(image_app_cbp, [1, 7, 7, 1],
                                               [1, 1, 1, 1],
                                               padding='VALID'), (-1, 800))
                            image_fc = tf.nn.bias_add(
                                tf.matmul(image_fc, fc_weights), fc_bias)
                            fake_t2t_fc = tf.reshape(
                                tf.nn.avg_pool(fake_t2t_app_cbp, [1, 7, 7, 1],
                                               [1, 1, 1, 1],
                                               padding='VALID'), (-1, 800))
                            fake_t2t_fc = tf.nn.bias_add(
                                tf.matmul(fake_t2t_fc, fc_weights), fc_bias)

                            real_logit = self.model.get_discriminator(image)
                            fake_s2t_logit = self.model.get_discriminator(
                                fake_s2t)
                            fake_t2t_logit = self.model.get_discriminator(
                                fake_t2t)

                            real_d_loss = tf.nn.softmax_cross_entropy_with_logits(
                                logits=real_logit,
                                labels=tf.constant(
                                    [[1.0, 0.0, 0.0]] * self.batch_size))
                            real_d_loss = tf.reduce_mean(real_d_loss)

                            fake_s2t_d_loss = tf.nn.softmax_cross_entropy_with_logits(
                                logits=fake_s2t_logit,
                                labels=tf.constant(
                                    [[0.0, 1.0, 0.0]] * self.batch_size))
                            fake_s2t_d_loss = tf.reduce_mean(fake_s2t_d_loss)

                            fake_t2t_d_loss = tf.nn.softmax_cross_entropy_with_logits(
                                logits=fake_t2t_logit,
                                labels=tf.constant(
                                    [[0.0, 0.0, 1.0]] * self.batch_size))
                            fake_t2t_d_loss = tf.reduce_mean(fake_t2t_d_loss)

                            d_loss = (real_d_loss +
                                      (fake_s2t_d_loss + fake_t2t_d_loss) / 2.)

                            fake_s2t_g_loss = tf.nn.softmax_cross_entropy_with_logits(
                                logits=fake_s2t_logit,
                                labels=tf.constant(
                                    [[1.0, 0.0, 0.0]] * self.batch_size))
                            fake_s2t_g_loss = tf.reduce_mean(fake_s2t_g_loss)

                            fake_t2t_g_loss = tf.nn.softmax_cross_entropy_with_logits(
                                logits=fake_t2t_logit,
                                labels=tf.constant(
                                    [[1.0, 0.0, 0.0]] * self.batch_size))
                            fake_t2t_g_loss = tf.reduce_mean(fake_t2t_g_loss)

                            f_t_loss = tf.nn.softmax_cross_entropy_with_logits(
                                logits=image_fc, labels=label)
                            f_t_loss = tf.reduce_mean(f_t_loss)
                            f_t2t_loss = tf.nn.softmax_cross_entropy_with_logits(
                                logits=fake_t2t_fc, labels=label)
                            f_t2t_loss = tf.reduce_mean(f_t2t_loss)
                            f_s2t_loss = tf.abs(fake_s2t_app_pool - text_app)
                            f_s2t_loss = tf.reduce_mean(f_s2t_loss)

                            debug_loss1 = f_t_loss
                            debug_loss2 = f_t2t_loss
                            debug_loss3 = f_s2t_loss

                            f_loss = (f_t_loss + f_t2t_loss + f_s2t_loss) / 3.

                            g_loss = ((fake_s2t_g_loss + fake_t2t_g_loss) / 2. +
                                      10. * f_loss)

                            t_vars = tf.trainable_variables()
                            g_train_vars = []
                            d_train_vars = []

                            for var in t_vars:
                                if var.name.startswith('d_'):
                                    d_train_vars.append(var)
                                else:
                                    g_train_vars.append(var)

                            d_grad = d_opt.compute_gradients(
                                d_loss, var_list=d_train_vars)
                            g_grad = g_opt.compute_gradients(
                                g_loss, var_list=g_train_vars)

                            self.models.append(
                                (image, text, label, fake_s2t, fake_t2t,
                                 g_loss, d_loss, g_grad, d_grad, debug_loss1,
                                 debug_loss2, debug_loss3))

            print('build model on gpu tower done')
            (_, _, _, _, _, tower_g_loss, tower_d_loss, tower_g_grad,
             tower_d_grad, loss1, loss2, loss3) = zip(*self.models)

            self.aver_d_loss = tf.reduce_mean(tower_d_loss)
            self.aver_g_loss = tf.reduce_mean(tower_g_loss)
            self.d_op = d_opt.apply_gradients(average_gradients(tower_d_grad))
            self.g_op = g_opt.apply_gradients(average_gradients(tower_g_grad))

            self.loss1 = tf.reduce_mean(loss1)
            self.loss2 = tf.reduce_mean(loss2)
            self.loss3 = tf.reduce_mean(loss3)
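
The snippet calls average_gradients without defining it. Below is a minimal sketch in the style of TensorFlow's classic multi-tower CIFAR-10 example; this is an assumption about the helper the project uses, not its actual code:

def average_gradients(tower_grads):
    # tower_grads holds one [(grad, var), ...] list per GPU tower
    average_grads = []
    for grad_and_vars in zip(*tower_grads):
        # grad_and_vars pairs the same variable's gradient across all towers
        grads = [tf.expand_dims(g, 0) for g, _ in grad_and_vars]
        grad = tf.reduce_mean(tf.concat(grads, 0), 0)
        average_grads.append((grad, grad_and_vars[0][1]))
    return average_grads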
Code example #8
    def network_fn(images):
        with slim.arg_scope(arg_scope):
            frames_per_video = 1  # same for single image datasets
            if images.get_shape().ndims == 5:
                im_shape = images.get_shape().as_list()
                frames_per_video = im_shape[1]
                images = tf.reshape(
                    images, [-1, im_shape[-3], im_shape[-2], im_shape[-1]])

            # Main Network Function
            kwargs = {}
            if cfg.NET.DROPOUT >= 0:  # if negative, ignore and use the network default
                kwargs['dropout_keep_prob'] = (1 - cfg.NET.DROPOUT)
            logits, end_points = func(images,
                                      num_classes,
                                      is_training=is_training,
                                      train_top_bn=cfg.NET.TRAIN_TOP_BN,
                                      **kwargs)

            # rgirdhar: add another end point for heatmap prediction
            try:
                last_conv = end_points[last_conv_map[name]]
            except KeyError:
                raise ValueError(
                    'End point {} not found. Choose from: {}'.format(
                        last_conv_map[name], ' '.join(end_points)))
            random_normal = lambda stddev: tf.random_normal_initializer(
                0.0, stddev)

            with slim.arg_scope([slim.dropout],
                                is_training=is_training,
                                keep_prob=0.2 if cfg.NET.DROPOUT < 0 else
                                (1.0 - cfg.NET.DROPOUT)):
                with tf.variable_scope('PoseLogits'):
                    last_conv_pose_name = getattr(
                        cfg.NET.LAST_CONV_MAP_FOR_POSE, name)
                    last_conv_pose = end_points[last_conv_pose_name]
                    pose_pre_logits = slim.conv2d(
                        last_conv_pose,
                        768, [1, 1],
                        weights_initializer=random_normal(0.001),
                        activation_fn=tf.nn.relu,
                        normalizer_fn=None,
                        biases_initializer=tf.zeros_initializer(),
                        padding='SAME',
                        scope='ExtraConv2d_1x1')
                    pose_logits = slim.conv2d(pose_pre_logits,
                                              num_pose_keypoints, [1, 1],
                                              activation_fn=None,
                                              normalizer_fn=None,
                                              scope='Conv2d_1c_1x1')
                    end_points['PoseLogits'] = pose_logits

                if cfg.NET.USE_POSE_ATTENTION_LOGITS:
                    with tf.variable_scope('PoseAttention'):
                        # use the pose prediction as an attention map to get the features
                        # step1: split pose logits over channels
                        pose_logits_parts = tf.split(
                            pose_logits,
                            pose_logits.get_shape().as_list()[-1],
                            axis=pose_logits.get_shape().ndims - 1)
                        part_logits = []
                        # allows to choose which dimension of pose to use for heatmaps
                        parts_to_use = pose_logits_parts
                        if cfg.NET.USE_POSE_ATTENTION_LOGITS_DIMS != [-1]:
                            parts_to_use = (np.array(pose_logits_parts)[
                                cfg.NET.USE_POSE_ATTENTION_LOGITS_DIMS]
                                            ).tolist()
                        tf.logging.info(
                            'Using {} parts for pose attention logits'.format(
                                len(parts_to_use)))
                        for part in parts_to_use:
                            part_logits.append(
                                tf.reduce_mean(part * last_conv,
                                               axis=[1, 2],
                                               keep_dims=True))
                        if cfg.NET.USE_POSE_ATTENTION_LOGITS_AVGED_HMAP:
                            part_logits.append(
                                tf.reduce_mean(last_conv * tf.reduce_mean(
                                    pose_logits, axis=-1, keep_dims=True),
                                               axis=[1, 2],
                                               keep_dims=True))
                        part_logits.append(
                            tf.reduce_mean(last_conv,
                                           axis=[1, 2],
                                           keep_dims=True))
                        net = tf.concat(part_logits, axis=-1)
                        net = slim.dropout(net)
                        logits = slim.conv2d(
                            net,
                            num_classes, [1, 1],
                            weights_initializer=random_normal(0.001),
                            biases_initializer=tf.zeros_initializer(),
                            activation_fn=None,
                            normalizer_fn=None)
                elif cfg.NET.USE_POSE_LOGITS_DIRECTLY:
                    with tf.variable_scope('ActionFromPose'):
                        net = tf.reduce_mean(pose_pre_logits,
                                             axis=[1, 2],
                                             keep_dims=True)
                        net = slim.conv2d(
                            net,
                            768, [1, 1],
                            normalizer_fn=None,
                            weights_initializer=random_normal(0.001),
                            biases_initializer=tf.zeros_initializer())
                        if cfg.NET.USE_POSE_LOGITS_DIRECTLY_PLUS_LOGITS:
                            net = tf.concat([
                                net,
                                tf.reduce_mean(
                                    last_conv, axis=[1, 2], keep_dims=True)
                            ],
                                            axis=-1)
                        net = slim.dropout(net)
                        logits = slim.conv2d(
                            net,
                            num_classes, [1, 1],
                            weights_initializer=random_normal(0.001),
                            biases_initializer=tf.zeros_initializer(),
                            activation_fn=None,
                            normalizer_fn=None)
                elif cfg.NET.USE_POSE_LOGITS_DIRECTLY_v2:
                    with tf.variable_scope('ActionFromPose_v2'):
                        net = tf.concat([pose_pre_logits, last_conv], axis=-1)
                        if cfg.NET.USE_POSE_LOGITS_DIRECTLY_v2_EXTRA_LAYER:
                            net = tf.nn.relu(net)
                            net = slim.conv2d(
                                net,
                                net.get_shape().as_list()[-1], [1, 1],
                                weights_initializer=random_normal(0.001),
                                biases_initializer=tf.zeros_initializer())
                        net = tf.reduce_mean(net, axis=[1, 2], keep_dims=True)
                        net = slim.dropout(net)
                        logits = slim.conv2d(
                            net,
                            num_classes, [1, 1],
                            weights_initializer=random_normal(0.001),
                            biases_initializer=tf.zeros_initializer(),
                            activation_fn=None,
                            normalizer_fn=None)
                elif cfg.NET.USE_COMPACT_BILINEAR_POOLING:
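                    # Fuse the last conv map with itself (self-CBP), pin the
                    # statically-unknown output shape, and restore NHWC rank
                    # so the 1x1 conv classifier below can consume it.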
                    last_conv_shape = last_conv.get_shape().as_list()
                    net = compact_bilinear_pooling_layer(
                        last_conv, last_conv, last_conv_shape[-1])
                    net.set_shape([last_conv_shape[0], last_conv_shape[-1]])
                    net = tf.expand_dims(tf.expand_dims(net, 1), 1)
                    net = slim.dropout(net)
                    logits = slim.conv2d(
                        net,
                        num_classes, [1, 1],
                        weights_initializer=random_normal(0.001),
                        biases_initializer=tf.zeros_initializer(),
                        activation_fn=None,
                        normalizer_fn=None)
                elif cfg.NET.USE_POSE_PRELOGITS_BASED_ATTENTION:
                    with tf.variable_scope('PosePrelogitsBasedAttention'):
                        # If the following is set, just train on top of image features,
                        # don't add the prelogits at all. This was useful as pose seemed to
                        # not help with it at all.
                        if cfg.NET.USE_POSE_PRELOGITS_BASED_ATTENTION_SINGLE_LAYER_ATT:
                            net = last_conv
                        else:
                            net = pose_pre_logits
                        # nMaps = num_classes if cfg.NET.USE_POSE_PRELOGITS_BASED_ATTENTION_PER_CLASS else 1
                        # For simplicity, since multiple maps doesn't seem to help, I'm
                        # not allowing that to keep the following code simple.
                        # nMaps = 1
                        # For NIPS2017 rebuttal, they wanted to see nums with per-class
                        # attention, so doing that too
                        nMaps = num_classes if cfg.NET.USE_POSE_PRELOGITS_BASED_ATTENTION_PER_CLASS else 1
                        all_att_logits = []
                        for rank_id in range(
                                cfg.NET.USE_POSE_PRELOGITS_BASED_ATTENTION_RANK
                        ):
                            scope_name = 'Conv2d_PrePose_Attn'
                            if rank_id >= 1:
                                scope_name += str(rank_id)
                            net = slim.conv2d(
                                net,
                                nMaps, [1, 1],
                                weights_initializer=random_normal(0.001),
                                biases_initializer=tf.zeros_initializer(),
                                activation_fn=None,
                                normalizer_fn=None,
                                scope=scope_name)
                            all_att_logits.append(net)
                        if len(all_att_logits) > 1:
                            attention_logits = tf.stack(all_att_logits,
                                                        axis=-1)
                        else:
                            attention_logits = all_att_logits[0]

                        if cfg.NET.USE_POSE_PRELOGITS_BASED_ATTENTION_SOFTMAX_ATT:
                            # bring the number of channels earlier to make softmax easier
                            attention_logits = tf.transpose(
                                attention_logits, [0, 3, 1, 2])
                            att_shape = attention_logits.get_shape().as_list()
                            attention_logits = tf.reshape(
                                attention_logits,
                                [att_shape[0], att_shape[1], -1])
                            attention_logits = tf.nn.softmax(attention_logits)
                            attention_logits = tf.reshape(
                                attention_logits, att_shape)
                            attention_logits = tf.transpose(
                                attention_logits, [0, 2, 3, 1])
                        if cfg.NET.USE_POSE_PRELOGITS_BASED_ATTENTION_RELU_ATT:
                            attention_logits = tf.nn.relu(attention_logits)
                        end_points[
                            'PosePrelogitsBasedAttention'] = attention_logits

                        if cfg.NET.USE_POSE_PRELOGITS_BASED_ATTENTION_WITH_POSE_FEAT:
                            if cfg.NET.USE_POSE_PRELOGITS_BASED_ATTENTION_WITH_POSE_FEAT_2LAYER:
                                pose_logits = slim.conv2d(
                                    pose_logits,
                                    pose_logits.get_shape()[-1], [1, 1],
                                    weights_initializer=random_normal(0.001),
                                    biases_initializer=tf.zeros_initializer())
                            last_conv = tf.concat([last_conv, pose_logits],
                                                  axis=-1)
                        last_conv = slim.dropout(last_conv)
                        # Top-down attention
                        all_logits = []
                        for _ in range(
                                cfg.NET.USE_POSE_PRELOGITS_BASED_ATTENTION_RANK
                        ):
                            logits = slim.conv2d(
                                last_conv,
                                num_classes, [1, 1],
                                weights_initializer=random_normal(0.001),
                                biases_initializer=tf.zeros_initializer(),
                                activation_fn=None,
                                normalizer_fn=None)
                            all_logits.append(logits)
                        if len(all_logits) > 1:
                            logits = tf.stack(all_logits, axis=-1)
                        else:
                            logits = all_logits[0]
                        end_points['TopDownAttention'] = logits

                        # attended_feats = []
                        # for attention_logit in tf.unstack(attention_logits, axis=-1):
                        #   attended_feats.append(tf.reduce_mean(
                        #     tf.expand_dims(attention_logit, axis=-1) * logits,
                        #     axis=[1,2],
                        #     keep_dims=True))
                        # attended_feat = tf.stack(attended_feats, axis=-1)
                        # # Since only 1 attention map (asserted above)
                        # logits = attended_feat[..., 0]

                        # better way to do the above:
                        logits = tf.reduce_mean(attention_logits * logits,
                                                axis=[1, 2],
                                                keep_dims=True)
                        if logits.get_shape().ndims == 5:
                            # i.e. rank was > 1
                            logits = tf.reduce_sum(logits, axis=-1)

                        # if nMaps == 1:
                        #   # remove the extra dimension that is added for multi-class
                        #   # attention case
                        #   attended_feat = attended_feat[..., 0]
                        #   logits = slim.conv2d(attended_feat, num_classes, [1, 1],
                        #                        weights_initializer=random_normal(0.001),
                        #                        biases_initializer=tf.zeros_initializer(),
                        #                        activation_fn=None,
                        #                        normalizer_fn=None)
                        # else:
                        #   logits = tf.concat([
                        #     slim.conv2d(el, 1, [1, 1],
                        #                 weights_initializer=random_normal(0.001),
                        #                 biases_initializer=tf.zeros_initializer(),
                        #                 activation_fn=None,
                        #                 normalizer_fn=None) for el in
                        #     tf.unstack(attended_feat, axis=-1)], axis=-1)
                # This is just to protect against the case where I don't do any of the
                # above and get the original logits from the network, which has already
                # been squeezed, or in case of vgg 16, passed through fc layers
                if logits.get_shape().ndims > 2:
                    logits = tf.squeeze(logits, [1, 2], name='SpatialSqueeze')
                end_points['Logits'] = logits

            if frames_per_video > 1:
                with tf.name_scope('FramePooling'):
                    # for now stick with avg pool
                    end_points['logits_beforePool'] = logits
                    old_logits = logits
                    logits = tf.stack([
                        el for el in tf.split(
                            old_logits,
                            int(old_logits.get_shape().as_list()[0] /
                                frames_per_video))
                    ])
                    if cfg.NET.USE_TEMPORAL_ATT:
                        with tf.variable_scope('TemporalAttention'):
                            logits = tf.expand_dims(logits,
                                                    axis=-2)  #[bs, 3, 1, nc]
                            logits_att = slim.conv2d(
                                logits,
                                1, [1, 1],
                                weights_initializer=random_normal(0.001),
                                biases_initializer=tf.constant_initializer(
                                    1.0 / logits.get_shape().as_list()[1]),
                                activation_fn=None,
                                normalizer_fn=None)
                            logits = logits * logits_att
                            logits = tf.squeeze(logits, axis=-2)
                            end_points['TemporalAttention'] = logits_att
                    logits = tf.reduce_mean(logits, axis=1)
            return logits, end_points
Code example #9
File: train_icml.py Project: yunyikristy/nips2019
    # The excerpt begins mid-function; the second conv is reconstructed by
    # analogy with the svhn branch below (the name mnist_pool1 is assumed).
    mnist_conv2 = tf.nn.conv2d(mnist_pool1,
                               f_weights['conv_2_weights'],
                               strides=[1, 1, 1, 1],
                               padding='SAME')
    mnist_relu2 = tf.nn.relu(
        tf.nn.bias_add(mnist_conv2, f_biases['conv_2_biases']))
    mnist_pool2 = tf.nn.max_pool(mnist_relu2,
                                 ksize=[1, 2, 2, 1],
                                 strides=[1, 2, 2, 1],
                                 padding='SAME')

    mnist_gpool = tf.nn.avg_pool(mnist_pool2,
                                 ksize=[1, 7, 7, 1],
                                 strides=[1, 1, 1, 1],
                                 padding='VALID')
    mnist_gpool = tf.tile(mnist_gpool, [1, 7, 7, 1])
    mnist_cbp = tf.reshape(
        cbp.compact_bilinear_pooling_layer(
            mnist_pool2, mnist_gpool, 128, sum_pool=False, sequential=False),
        [batch_size, 7, 7, 128]) / 10000.
    mnist_flatten = tf.reshape(mnist_cbp, [batch_size, 7 * 7 * 128])
    mnist_fc1 = tf.nn.relu(
        tf.matmul(mnist_flatten, f_weights['fc1_weights']) +
        f_biases['fc1_biases'])
    mnist_fc2 = tf.nn.relu(
        tf.matmul(mnist_fc1, f_weights['fc2_weights']) +
        f_biases['fc2_biases'])

with tf.variable_scope("LeNet", reuse=True):
    svhn_conv1 = tf.nn.conv2d(svhn,
                              f_weights['conv_1_weights'],
                              strides=[1, 1, 1, 1],
                              padding='SAME')
    # Truncated in the excerpt; completed by analogy with the mnist branch.
    svhn_relu1 = tf.nn.relu(
        tf.nn.bias_add(svhn_conv1, f_biases['conv_1_biases']))


# Kernel-approximation check for the compact bilinear layer; it reuses the
# placeholders, dims and cbp() helper shown in code example #4.
def run_kernel_approximation_test(batch_size, height, width):
    # Input values
    x = np.random.rand(batch_size, height, width, input_dim1).astype(np.float32)
    y = np.random.rand(batch_size, height, width, input_dim2).astype(np.float32)

    z = np.random.rand(batch_size, height, width, input_dim1).astype(np.float32)
    w = np.random.rand(batch_size, height, width, input_dim2).astype(np.float32)
    
    # Compact Bilinear Pooling results
    cbp_xy = cbp(x, y)