def content_extractor(self, images, reuse=False):
     # images: (batch, 32, 32, 3) or (batch, 32, 32, 1)
     
     if images.get_shape()[3] == 1:
         # For the MNIST dataset, replicate the grayscale channel three times.
         images = tf.image.grayscale_to_rgb(images)
     
     with tf.variable_scope('content_extractor', reuse=reuse):
         with slim.arg_scope([slim.conv2d], padding='SAME', activation_fn=None,
                              stride=2,  weights_initializer=tf.contrib.layers.xavier_initializer()):
             with slim.arg_scope([slim.batch_norm], decay=0.95, center=True, scale=True, 
                                 activation_fn=tf.nn.relu, is_training=(self.mode=='train' or self.mode=='pretrain')):
                 
                 net = slim.conv2d(images, 64, [3, 3], scope='conv1')   # (batch_size, 16, 16, 64)
                 net = slim.batch_norm(net, scope='bn1')
                 net = slim.conv2d(net, 128, [3, 3], scope='conv2')     # (batch_size, 8, 8, 128)
                 net = slim.batch_norm(net, scope='bn2')
                 net = slim.conv2d(net, 256, [3, 3], scope='conv3')     # (batch_size, 4, 4, 256)
                 net = slim.batch_norm(net, scope='bn3')
                 net = slim.conv2d(net, 128, [4, 4], padding='VALID', scope='conv4')   # (batch_size, 1, 1, 128)
                 net = slim.batch_norm(net, activation_fn=tf.nn.tanh, scope='bn4')
                 if self.mode == 'pretrain':
                     net = slim.conv2d(net, 10, [1, 1], padding='VALID', scope='out')
                     net = slim.flatten(net)
                 return net
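A hedged usage sketch: this method lives on a model object (DTN-style) whose `mode` attribute drives the batch-norm flags above, so the wiring below assumes a hypothetical `model` instance.

images = tf.placeholder(tf.float32, [None, 32, 32, 3], name='images')
model.mode = 'pretrain'
logits = model.content_extractor(images)                 # (batch_size, 10)
model.mode = 'train'
features = model.content_extractor(images, reuse=True)   # (batch_size, 1, 1, 128)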
def LResnet50E_IR(images, keep_probability, 
             phase_train=True, bottleneck_layer_size=512, 
             weight_decay=0.0, reuse=None):
    '''
    Conv naming:
    conv[conv_layer]_[block_index]_[block_layer_index]

    For ResNet-50, n_units=[3,4,14,3]; treating one unit per stage as the
    dimension-reduction layer, the repeated units are n_units=[2,3,13,2].
    '''
    with tf.variable_scope('Conv1'):
        net = slim.conv2d(images,64,scope='Conv1_pre')
        net = slim.batch_norm(net,scope='Conv1_bn')
    with tf.variable_scope('Conv2'):
        net = resface_block(net,64,stride=2,dim_match=False,scope='Conv2_pre')
        net = slim.repeat(net,2,resface_block,64,1,True,scope='Conv2_main')
    with tf.variable_scope('Conv3'):
        net = resface_block(net,128,stride=2,dim_match=False,scope='Conv3_pre')
        net = slim.repeat(net,3,resface_block,128,1,True,scope='Conv3_main')
    with tf.variable_scope('Conv4'):
        net = resface_block(net,256,stride=2,dim_match=False,scope='Conv4_pre')
        net = slim.repeat(net,13,resface_block,256,1,True,scope='Conv4_main')
    with tf.variable_scope('Conv5'):
        net = resface_block(net,512,stride=2,dim_match=False,scope='Conv5_pre')
        net = slim.repeat(net,2,resface_block,512,1,True,scope='Conv5_main')

    with tf.variable_scope('Logits'):
        net = slim.batch_norm(net,activation_fn=None,scope='bn1')
        net = slim.dropout(net, keep_probability, is_training=phase_train,scope='Dropout')        
        net = slim.flatten(net)
    
    net = slim.fully_connected(net, bottleneck_layer_size, biases_initializer=tf.contrib.layers.xavier_initializer(), scope='fc1')
    net = slim.batch_norm(net, activation_fn=None, scope='Bottleneck')

    return net,''
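A usage sketch under stated assumptions: the slim.conv2d calls here (and inside resface_block below) pass no kernel_size, so the net is meant to run under an arg_scope that supplies one; 112x112 RGB crops are an assumed input size typical for this family of face nets, and the empty-string second return value merely preserves a facenet-style two-value interface.

face_images = tf.placeholder(tf.float32, [None, 112, 112, 3], name='face_images')
with slim.arg_scope([slim.conv2d], kernel_size=3, activation_fn=None,
                    weights_initializer=tf.contrib.layers.xavier_initializer()):
    with slim.arg_scope([slim.batch_norm], is_training=True):
        prelogits, _ = LResnet50E_IR(face_images, keep_probability=0.8,
                                     phase_train=True, bottleneck_layer_size=512)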
Example #3
  def _depthwise_separable_conv(inputs,
                                num_pwc_filters,
                                width_multiplier,
                                sc,
                                downsample=False):
    """ Helper function to build the depth-wise separable convolution layer.
    """
    num_pwc_filters = round(num_pwc_filters * width_multiplier)
    _stride = 2 if downsample else 1

    # skip pointwise by setting num_outputs=None
    depthwise_conv = slim.separable_convolution2d(inputs,
                                                  num_outputs=None,
                                                  stride=_stride,
                                                  depth_multiplier=1,
                                                  kernel_size=[3, 3],
                                                  scope=sc+'/depthwise_conv')

    bn = slim.batch_norm(depthwise_conv, scope=sc+'/dw_batch_norm')
    pointwise_conv = slim.convolution2d(bn,
                                        num_pwc_filters,
                                        kernel_size=[1, 1],
                                        scope=sc+'/pointwise_conv')
    bn = slim.batch_norm(pointwise_conv, scope=sc+'/pw_batch_norm')
    return bn
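A hedged call; like the DS-CNN builder later in this listing, this helper assumes enclosing arg_scopes that configure slim.batch_norm (notably its is_training flag).

inputs = tf.placeholder(tf.float32, [None, 32, 32, 16])
with slim.arg_scope([slim.batch_norm], is_training=True, activation_fn=tf.nn.relu):
    out = _depthwise_separable_conv(inputs, num_pwc_filters=64,
                                    width_multiplier=1.0, sc='ds1',
                                    downsample=True)   # (None, 16, 16, 64)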
def resface_block(lower_input,output_channels,stride,dim_match=True,scope=None):
    with tf.variable_scope(scope):
        net = slim.batch_norm(lower_input, activation_fn=None,scope='bn1')
        net = slim.conv2d(net, output_channels)
        net = slim.batch_norm(net,scope='bn2')
        net = slim.conv2d(net, output_channels,stride=stride)
        net = slim.batch_norm(net, activation_fn=None,scope='bn3')

        if dim_match:
            short_cut = lower_input
        else:
            short_cut = slim.conv2d(lower_input, output_channels, stride=2, kernel_size=1)
            short_cut = slim.batch_norm(short_cut, activation_fn=None,scope='shortcut_bn')
        return short_cut + net
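Note that the slim.conv2d calls inside resface_block pass no kernel_size, so the block is meant to run under an arg_scope that supplies one; a minimal sketch of that assumption:

x = tf.placeholder(tf.float32, [None, 56, 56, 64])
with slim.arg_scope([slim.conv2d], kernel_size=3, activation_fn=None):
    with slim.arg_scope([slim.batch_norm], is_training=True, activation_fn=tf.nn.relu):
        y = resface_block(x, 128, stride=2, dim_match=False, scope='block1')   # (None, 28, 28, 128)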
    def generator(self, inputs, reuse=False):
        # inputs: (batch, 1, 1, 128)
        with tf.variable_scope('generator', reuse=reuse):
            with slim.arg_scope([slim.conv2d_transpose], padding='SAME', activation_fn=None,           
                                 stride=2, weights_initializer=tf.contrib.layers.xavier_initializer()):
                with slim.arg_scope([slim.batch_norm], decay=0.95, center=True, scale=True, 
                                     activation_fn=tf.nn.relu, is_training=(self.mode=='train')):

                    net = slim.conv2d_transpose(inputs, 512, [4, 4], padding='VALID', scope='conv_transpose1')   # (batch_size, 4, 4, 512)
                    net = slim.batch_norm(net, scope='bn1')
                    net = slim.conv2d_transpose(net, 256, [3, 3], scope='conv_transpose2')  # (batch_size, 8, 8, 256)
                    net = slim.batch_norm(net, scope='bn2')
                    net = slim.conv2d_transpose(net, 128, [3, 3], scope='conv_transpose3')  # (batch_size, 16, 16, 128)
                    net = slim.batch_norm(net, scope='bn3')
                    net = slim.conv2d_transpose(net, 1, [3, 3], activation_fn=tf.nn.tanh, scope='conv_transpose4')   # (batch_size, 32, 32, 1)
                    return net
def generator(tensor):
    reuse = len([t for t in tf.global_variables() if t.name.startswith('generator')]) > 0
    print(tensor.get_shape())
    with variable_scope.variable_scope('generator', reuse=reuse):
        tensor = slim.fully_connected(tensor, 1024)
        tensor = slim.batch_norm(tensor, activation_fn=tf.nn.relu)
        tensor = slim.fully_connected(tensor, 7 * 7 * 128)
        tensor = slim.batch_norm(tensor, activation_fn=tf.nn.relu)
        tensor = tf.reshape(tensor, [-1, 7, 7, 128])
        tensor = slim.conv2d_transpose(tensor, 64, kernel_size=[4, 4], stride=2, activation_fn=None)
        print('gen', tensor.get_shape())
        tensor = slim.batch_norm(tensor, activation_fn=tf.nn.relu)
        tensor = slim.conv2d_transpose(tensor, 1, kernel_size=[4, 4], stride=2, activation_fn=tf.nn.sigmoid)
    return tensor
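The two stride-2 transposed convolutions upsample the 7x7x128 tensor to MNIST-sized 28x28x1 outputs; a hedged call (the 100-dim latent is an assumption):

z = tf.random_normal([64, 100])
fake = generator(z)   # (64, 28, 28, 1), values in (0, 1) from the final sigmoid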
 def discriminator(self, images, reuse=False):
     # images: (batch, 32, 32, 1)
     with tf.variable_scope('discriminator', reuse=reuse):
         with slim.arg_scope([slim.conv2d], padding='SAME', activation_fn=None,
                              stride=2,  weights_initializer=tf.contrib.layers.xavier_initializer()):
             with slim.arg_scope([slim.batch_norm], decay=0.95, center=True, scale=True, 
                                 activation_fn=tf.nn.relu, is_training=(self.mode=='train')):
                 
                 net = slim.conv2d(images, 128, [3, 3], activation_fn=tf.nn.relu, scope='conv1')   # (batch_size, 16, 16, 128)
                 net = slim.batch_norm(net, scope='bn1')
                 net = slim.conv2d(net, 256, [3, 3], scope='conv2')   # (batch_size, 8, 8, 256)
                 net = slim.batch_norm(net, scope='bn2')
                 net = slim.conv2d(net, 512, [3, 3], scope='conv3')   # (batch_size, 4, 4, 512)
                 net = slim.batch_norm(net, scope='bn3')
                 net = slim.conv2d(net, 1, [4, 4], padding='VALID', scope='conv4')   # (batch_size, 1, 1, 1)
                 net = slim.flatten(net)
                 return net
Example #8
    def forward(self, reshaped_input):
        """Forward pass of a Soft-DBoW block.

        Args:
        reshaped_input: If your input has shape
        'batch_size' x 'max_samples' x 'feature_size',
        it should be reshaped to
        'batch_size*max_samples' x 'feature_size'
        by performing:
        reshaped_input = tf.reshape(input, [-1, features_size])

        Returns:
        bof: the pooled vector of size: 'batch_size' x 'output_dim'
        """


        cluster_weights = tf.get_variable("cluster_weights",
          [self.feature_size, self.cluster_size],
          initializer = tf.random_normal_initializer(
              stddev=1 / math.sqrt(self.feature_size)))
        
        activation = tf.matmul(reshaped_input, cluster_weights)
        
        if self.add_batch_norm:
          activation = slim.batch_norm(
              activation,
              center=True,
              scale=True,
              is_training=self.is_training,
              scope="cluster_bn")
        else:
          cluster_biases = tf.get_variable("cluster_biases",
            [self.cluster_size],
            initializer = tf.random_normal_initializer(
                stddev=1 / math.sqrt(self.feature_size)))
          activation += cluster_biases

        activation = tf.nn.softmax(activation)

        activation = tf.reshape(activation,
                [-1, self.max_samples, self.cluster_size])

        bof = tf.reduce_sum(activation,1)
        bof = tf.nn.l2_normalize(bof,1)

        hidden1_weights = tf.get_variable("hidden1_weights",
          [self.cluster_size, self.output_dim],
          initializer=tf.random_normal_initializer(
              stddev=1 / math.sqrt(self.cluster_size)))
           
        bof = tf.matmul(bof, hidden1_weights)

        if self.gating:
            bof = super(self.__class__, self).context_gating(bof)
      
        return bof
Example #9
def batch_norm(x, train, data_format='NHWC', name=None, act=lrelu, epsilon=1e-5, momentum=0.9):
    return slim.batch_norm(x,
                        decay=momentum,
                        updates_collections=None,
                        epsilon=epsilon,
                        scale=True,
                        fused=True,
                        is_training=train,
                        activation_fn=act,
                        data_format=data_format,
                        scope=name)
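This thin wrapper defaults activation_fn to an lrelu helper assumed to be defined elsewhere in the file; a hedged call:

x = tf.placeholder(tf.float32, [None, 64, 64, 32])
is_train = tf.placeholder(tf.bool, name='is_train')
y = batch_norm(x, train=is_train, name='bn1')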
    def forward(self,reshaped_input):


        cluster_weights = tf.get_variable("cluster_weights",
              [self.feature_size, self.cluster_size],
              initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(self.feature_size)))
       
        tf.summary.histogram("cluster_weights", cluster_weights)
        activation = tf.matmul(reshaped_input, cluster_weights)
        
        if self.add_batch_norm:
          activation = slim.batch_norm(
              activation,
              center=True,
              scale=True,
              is_training=self.is_training,
              scope="cluster_bn")
        else:
          cluster_biases = tf.get_variable("cluster_biases",
            [self.cluster_size],
            initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(self.feature_size)))
          tf.summary.histogram("cluster_biases", cluster_biases)
          activation += cluster_biases
        
        activation = tf.nn.softmax(activation)
        tf.summary.histogram("cluster_output", activation)

        activation = tf.reshape(activation, [-1, self.max_frames, self.cluster_size])

        a_sum = tf.reduce_sum(activation,-2,keep_dims=True)

        cluster_weights2 = tf.get_variable("cluster_weights2",
            [1,self.feature_size, self.cluster_size],
            initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(self.feature_size)))
        
        a = tf.multiply(a_sum,cluster_weights2)
        
        activation = tf.transpose(activation,perm=[0,2,1])
        
        reshaped_input = tf.reshape(reshaped_input,[-1,self.max_frames,self.feature_size])
        vlad = tf.matmul(activation,reshaped_input)
        vlad = tf.transpose(vlad,perm=[0,2,1])
        vlad = tf.subtract(vlad,a)
        

        vlad = tf.nn.l2_normalize(vlad,1)

        vlad = tf.reshape(vlad,[-1,self.cluster_size*self.feature_size])
        vlad = tf.nn.l2_normalize(vlad,1)

        return vlad
Example #11
  def _depthwise_separable_conv(inputs,
                                num_pwc_filters,
                                sc,
                                kernel_size,
                                stride):
    """ Helper function to build the depth-wise separable convolution layer.
    """

    # skip pointwise by setting num_outputs=None
    depthwise_conv = slim.separable_convolution2d(inputs,
                                                  num_outputs=None,
                                                  stride=stride,
                                                  depth_multiplier=1,
                                                  kernel_size=kernel_size,
                                                  scope=sc+'/depthwise_conv')

    bn = slim.batch_norm(depthwise_conv, scope=sc+'/dw_batch_norm')
    pointwise_conv = slim.convolution2d(bn,
                                        num_pwc_filters,
                                        kernel_size=[1, 1],
                                        scope=sc+'/pointwise_conv')
    bn = slim.batch_norm(pointwise_conv, scope=sc+'/pw_batch_norm')
    return bn
    def forward(self, reshaped_input):

        feature_size = self.feature_size
        cluster_size = self.cluster_size
        add_batch_norm = self.add_batch_norm
        max_frames = self.max_frames
        is_training = self.is_training

        cluster_weights = tf.get_variable("cluster_weights",
          [feature_size, cluster_size],
          initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(feature_size)))
        
        tf.summary.histogram("cluster_weights", cluster_weights)
        activation = tf.matmul(reshaped_input, cluster_weights)
        
        if add_batch_norm:
          activation = slim.batch_norm(
              activation,
              center=True,
              scale=True,
              is_training=is_training,
              scope="cluster_bn")
        else:
          cluster_biases = tf.get_variable("cluster_biases",
            [cluster_size],
            initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(feature_size)))
          tf.summary.histogram("cluster_biases", cluster_biases)
          activation += cluster_biases

        # NOTE: the original compared the `activation` tensor itself to a string,
        # which is always False; it presumably meant a configured activation type
        # (assumed here to be an `activation` attribute on self).
        if getattr(self, 'activation', None) == 'glu':
            # Gated linear unit: the first half of the clusters carries the
            # signal, the second half produces the gates.
            gates = tf.sigmoid(activation[:, cluster_size // 2:])
            activation = tf.multiply(activation[:, :cluster_size // 2], gates)

        elif getattr(self, 'activation', None) == 'relu':
            activation = tf.nn.relu6(activation)
        
        tf.summary.histogram("cluster_output", activation)

        activation = tf.reshape(activation, [-1, max_frames, cluster_size])

        avg_activation = utils.FramePooling(activation, 'average')
        avg_activation = tf.nn.l2_normalize(avg_activation,1)

        max_activation = utils.FramePooling(activation, 'max')
        max_activation = tf.nn.l2_normalize(max_activation,1)
        
        return tf.concat([avg_activation,max_activation],1)
Example #13
 def batchnorm(self, layer, inp):
     if not self.var:
         temp = (inp - layer.w['moving_mean'])
         temp /= (np.sqrt(layer.w['moving_variance']) + 1e-5)
         temp *= layer.w['gamma']
         return temp
     else:
         args = dict({
             'center' : False, 'scale' : True,
             'epsilon': 1e-5, 'scope' : self.scope,
             'updates_collections' : None,
             'is_training': layer.h['is_training'],
             'param_initializers': layer.w
             })
         return slim.batch_norm(inp, **args)
    def forward(self,reshaped_input):


        cluster_weights = tf.get_variable("cluster_weights",
              [self.feature_size, self.cluster_size],
              initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(self.feature_size)))
       
        activation = tf.matmul(reshaped_input, cluster_weights)
        
        if self.add_batch_norm:
          activation = slim.batch_norm(
              activation,
              center=True,
              scale=True,
              is_training=self.is_training,
              scope="cluster_bn")
        else:
          cluster_biases = tf.get_variable("cluster_biases",
            [self.cluster_size],
            initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(self.feature_size)))
          activation += cluster_biases
        
        activation = tf.nn.softmax(activation)

        activation = tf.reshape(activation, [-1, self.max_frames, self.cluster_size])

        gate_weights = tf.get_variable("gate_weights",
            [1, self.cluster_size,self.feature_size],
            initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(self.feature_size)))
        
        gate_weights = tf.sigmoid(gate_weights)

        activation = tf.transpose(activation,perm=[0,2,1])
        
        reshaped_input = tf.reshape(reshaped_input,[-1,self.max_frames,self.feature_size])

        vlagd = tf.matmul(activation,reshaped_input)
        vlagd = tf.multiply(vlagd,gate_weights)

        vlagd = tf.transpose(vlagd,perm=[0,2,1])
        
        vlagd = tf.nn.l2_normalize(vlagd,1)

        vlagd = tf.reshape(vlagd,[-1,self.cluster_size*self.feature_size])
        vlagd = tf.nn.l2_normalize(vlagd,1)

        return vlagd
    def forward(self, reshaped_input):

        feature_size = self.feature_size
        cluster_size = self.cluster_size
        add_batch_norm = self.add_batch_norm
        max_frames = self.max_frames
        is_training = self.is_training
        max_pool = self.max_pool

        cluster_weights = tf.get_variable("cluster_weights",
          [feature_size, cluster_size],
          initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(feature_size)))
        
        tf.summary.histogram("cluster_weights", cluster_weights)
        activation = tf.matmul(reshaped_input, cluster_weights)
        
        if add_batch_norm:
          activation = slim.batch_norm(
              activation,
              center=True,
              scale=True,
              is_training=is_training,
              scope="cluster_bn")
        else:
          cluster_biases = tf.get_variable("cluster_biases",
            [cluster_size],
            initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(feature_size)))
          tf.summary.histogram("cluster_biases", cluster_biases)
          activation += cluster_biases

        activation = tf.nn.softmax(activation)

        activation = tf.reshape(activation, [-1, max_frames, cluster_size])

        activation_sum = tf.reduce_sum(activation,1)
        activation_sum = tf.nn.l2_normalize(activation_sum,1)

        if max_pool:
            activation_max = tf.reduce_max(activation,1)
            activation_max = tf.nn.l2_normalize(activation_max,1)
            activation = tf.concat([activation_sum,activation_max],1)
        else:
            activation = activation_sum
        
        return activation
Example #16
def custom_residual_block(x, neurons, kernel_size, stride, name, is_training,
                          wt_decay=0.0001, use_residual=True,
                          residual_stride_conv=True, conv_fn=slim.conv2d,
                          batch_norm_param=None):
  
  # batch norm x and relu
  init_var = np.sqrt(2.0/(kernel_size**2)/neurons)
  with arg_scope([conv_fn], 
                 weights_regularizer=slim.l2_regularizer(wt_decay),
                 weights_initializer=tf.random_normal_initializer(stddev=init_var),
                 biases_initializer=tf.zeros_initializer()): 
    
    if batch_norm_param is None:
      batch_norm_param = {'center': True, 'scale': False, 
                          'activation_fn':tf.nn.relu, 
                          'is_training': is_training}
    
    y = slim.batch_norm(x, scope=name+'_bn', **batch_norm_param)

    y = conv_fn(y, num_outputs=neurons, kernel_size=kernel_size, stride=stride,
                activation_fn=None, scope=name+'_1',
                normalizer_fn=slim.batch_norm,
                normalizer_params=batch_norm_param)
    
    y = conv_fn(y, num_outputs=neurons, kernel_size=kernel_size,
                    stride=1, activation_fn=None, scope=name+'_2')

    if use_residual:
      if stride != 1 or x.get_shape().as_list()[-1] != neurons:
        batch_norm_param_ = dict(batch_norm_param)
        batch_norm_param_['activation_fn'] = None
        x = conv_fn(x, num_outputs=neurons, kernel_size=1,
                        stride=stride if residual_stride_conv else 1,
                        activation_fn=None, scope=name+'_0_1x1',
                        normalizer_fn=slim.batch_norm,
                        normalizer_params=batch_norm_param_)
        if not residual_stride_conv:
          x = slim.avg_pool2d(x, 1, stride=stride, scope=name+'_0_avg')
  
      y = tf.add(x, y, name=name+'_add')
    
    return y
Example #17
def batchnorm(bottom, is_train, num_reference, epsilon=1e-3, decay=0.999, name=None):
    """ virtual batch normalization (poor man's version)
    the first half is the true batch, the second half is the reference batch.
    When num_reference = 0, it is just typical batch normalization.  
    To use virtual batch normalization in test phase, "update_popmean.py" needed to be executed first 
    (in order to store the mean and variance of the reference batch into pop_mean and pop_variance of batchnorm.)
    """

    batch_size = bottom.get_shape().as_list()[0]
    inst_size = batch_size - num_reference
    instance_weight = np.ones([batch_size])

    if inst_size > 0:
        reference_weight = 1.0 - (1.0 / ( num_reference + 1.0))
        instance_weight[0:inst_size] = 1.0 - reference_weight
        instance_weight[inst_size:] = reference_weight
    else:
        decay = 0.0

    return slim.batch_norm(bottom, activation_fn=None, is_training=is_train, decay=decay, epsilon=epsilon, scale=True, scope=name, batch_weights=instance_weight)
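A hedged call: with a static batch of 32 whose last 16 samples form the reference batch, the weights computed above down-weight the reference half (batch_weights requires the batch size to be known statically).

bottom = tf.placeholder(tf.float32, [32, 64, 64, 3])
normalized = batchnorm(bottom, is_train=True, num_reference=16, name='vbn1')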
Example #18
    def context_gating(self, input_layer):
        """Context Gating

        Args:
        input_layer: Input layer in the following shape:
        'batch_size' x 'number_of_activation'

        Returns:
        activation: gated layer in the following shape:
        'batch_size' x 'number_of_activation'
        """

        input_dim = input_layer.get_shape().as_list()[1] 
        
        gating_weights = tf.get_variable("gating_weights",
          [input_dim, input_dim],
          initializer = tf.random_normal_initializer(
          stddev=1 / math.sqrt(input_dim)))
        
        gates = tf.matmul(input_layer, gating_weights)
 
        if self.add_batch_norm:
          gates = slim.batch_norm(
              gates,
              center=True,
              scale=True,
              is_training=self.is_training,
              scope="gating_bn")
        else:
          gating_biases = tf.get_variable("gating_biases",
            [input_dim],
            initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(input_dim)))
          gates += gating_biases

        gates = tf.sigmoid(gates)

        activation = tf.multiply(input_layer,gates)

        return activation
Example #19
    def forward(self, reshaped_input):
        """Forward pass of a NetRVLAD block.

        Args:
        reshaped_input: If your input has shape
        'batch_size' x 'max_samples' x 'feature_size',
        it should be reshaped to
        'batch_size*max_samples' x 'feature_size'
        by performing:
        reshaped_input = tf.reshape(input, [-1, features_size])

        Returns:
        vlad: the pooled vector of size: 'batch_size' x 'output_dim'
        """


        cluster_weights = tf.get_variable("cluster_weights",
              [self.feature_size, self.cluster_size],
              initializer = tf.random_normal_initializer(
                  stddev=1 / math.sqrt(self.feature_size)))
       
        activation = tf.matmul(reshaped_input, cluster_weights)
        
        if self.add_batch_norm:
          activation = slim.batch_norm(
              activation,
              center=True,
              scale=True,
              is_training=self.is_training,
              scope="cluster_bn")
        else:
          cluster_biases = tf.get_variable("cluster_biases",
            [self.cluster_size],
            initializer = tf.random_normal_initializer(
                stddev=1 / math.sqrt(self.feature_size)))
          tf.summary.histogram("cluster_biases", cluster_biases)
          activation += cluster_biases
        
        activation = tf.nn.softmax(activation)

        activation = tf.reshape(activation,
                [-1, self.max_samples, self.cluster_size])
       
        activation = tf.transpose(activation,perm=[0,2,1])
        
        reshaped_input = tf.reshape(reshaped_input,[-1,
            self.max_samples, self.feature_size])
        vlad = tf.matmul(activation,reshaped_input)
        
        vlad = tf.transpose(vlad,perm=[0,2,1])
        vlad = tf.nn.l2_normalize(vlad,1)

        vlad = tf.reshape(vlad,[-1,self.cluster_size*self.feature_size])
        vlad = tf.nn.l2_normalize(vlad,1)

        hidden1_weights = tf.get_variable("hidden1_weights",
          [self.cluster_size*self.feature_size, self.output_dim],
          initializer=tf.random_normal_initializer(
              stddev=1 / math.sqrt(self.cluster_size)))
           
        vlad = tf.matmul(vlad, hidden1_weights)

        if self.gating:
            vlad = super(self.__class__, self).context_gating(vlad)


        return vlad
Example #20
    def forward(self, reshaped_input):

        cluster_weights = tf.get_variable(
            "cluster_weights", [self.feature_size, self.cluster_size],
            initializer=tf.random_normal_initializer(
                stddev=1 / math.sqrt(self.feature_size)))
        cluster_weights_vlad = tf.get_variable(
            "cluster_weights_vlad", [self.feature_size, self.cluster_size],
            initializer=tf.random_normal_initializer(
                stddev=1 / math.sqrt(self.feature_size)))

        activation = tf.matmul(
            reshaped_input,
            cluster_weights)  # None (None * max_frames) x cluster_size
        activation_vlad = tf.matmul(
            reshaped_input,
            cluster_weights_vlad)  # None (None * max_frames) x cluster_size

        activation = slim.batch_norm(activation,
                                     center=True,
                                     scale=True,
                                     is_training=self.is_training,
                                     scope="cluster_bn")
        activation_vlad = slim.batch_norm(activation_vlad,
                                          center=True,
                                          scale=True,
                                          is_training=self.is_training,
                                          scope="cluster_bn_vlad")

        activation = tf.nn.softmax(activation)
        activation_vlad = tf.nn.softmax(activation_vlad)

        activation = tf.reshape(activation,
                                [-1, self.max_frames, self.cluster_size
                                 ])  # None x max_frames x cluster_size
        activation_vlad = tf.reshape(activation_vlad,
                                     [-1, self.max_frames, self.cluster_size
                                      ])  # None x max_frames x cluster_size

        ### only to vlad ###
        a_sum = tf.reduce_sum(activation_vlad, -2,
                              keep_dims=True)  # None x 1 x cluster_size
        cluster_weights2 = tf.get_variable(
            "cluster_weights_vlad_2",
            [1, self.feature_size, self.cluster_size],
            initializer=tf.random_normal_initializer(
                stddev=1 / math.sqrt(self.feature_size)
            ))  # 1 x feature_size x cluster_size
        a = tf.multiply(a_sum,
                        cluster_weights2)  # None x feature_size x cluster_size
        ### only to vlad ###

        activation = tf.transpose(activation,
                                  perm=[0, 2,
                                        1])  # None x cluster_size x max_frame
        activation_vlad = tf.transpose(
            activation_vlad, perm=[0, 2, 1])  # None x cluster_size x max_frame

        reshaped_input = tf.reshape(reshaped_input,
                                    [-1, self.max_frames, self.feature_size
                                     ])  # None x max_frame x feature_size

        ### only to light ###
        lightvlad = tf.matmul(
            activation, reshaped_input)  # None x cluster_size x feature_size
        lightvlad = tf.transpose(lightvlad,
                                 perm=[0, 2, 1
                                       ])  # None x feature_size x cluster_size
        ### only to light ###

        ### only to vlad ###
        vlad = tf.matmul(activation_vlad,
                         reshaped_input)  # None x cluster_size x feature_size
        vlad = tf.transpose(vlad,
                            perm=[0, 2,
                                  1])  # None x feature_size x cluster_size
        vlad = tf.subtract(vlad, a)
        ### only to vlad ###

        vlad_final = vlad + lightvlad

        vlad_final = tf.nn.l2_normalize(vlad_final, 1)
        vlad_final = tf.reshape(vlad_final,
                                [-1, self.cluster_size * self.feature_size])
        vlad_final = tf.nn.l2_normalize(vlad_final, 1)

        return vlad_final
  def create_model(self,
                   model_input,
                   vocab_size,
                   num_frames,
                   iterations=None,
                   add_batch_norm=None,
                   sample_random_frames=None,
                   cluster_size=None,
                   hidden_size=None,
                   is_training=True,
                   **unused_params):
    iterations = iterations or FLAGS.iterations
    add_batch_norm = add_batch_norm or FLAGS.netvlad_add_batch_norm
    random_frames = sample_random_frames or FLAGS.sample_random_frames
    cluster_size = cluster_size or FLAGS.netvlad_cluster_size
    hidden1_size = hidden_size or FLAGS.netvlad_hidden_size
    relu = FLAGS.netvlad_relu
    dimred = FLAGS.netvlad_dimred
    gating = FLAGS.gating
    remove_diag = FLAGS.gating_remove_diag
    lightvlad = FLAGS.lightvlad
    vlagd = FLAGS.vlagd

    num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
    if random_frames:
      model_input = utils.SampleRandomFrames(model_input, num_frames,
                                             iterations)
    else:
      model_input = utils.SampleRandomSequence(model_input, num_frames,
                                               iterations)
    

    max_frames = model_input.get_shape().as_list()[1]
    feature_size = model_input.get_shape().as_list()[2]
    reshaped_input = tf.reshape(model_input, [-1, feature_size])

    if lightvlad:
      video_NetVLAD = LightVLAD(1024, max_frames, cluster_size, add_batch_norm, is_training)
      audio_NetVLAD = LightVLAD(128, max_frames, cluster_size // 2, add_batch_norm, is_training)
    elif vlagd:
      video_NetVLAD = NetVLAGD(1024, max_frames, cluster_size, add_batch_norm, is_training)
      audio_NetVLAD = NetVLAGD(128, max_frames, cluster_size // 2, add_batch_norm, is_training)
    else:
      video_NetVLAD = NetVLAD(1024, max_frames, cluster_size, add_batch_norm, is_training)
      audio_NetVLAD = NetVLAD(128, max_frames, cluster_size // 2, add_batch_norm, is_training)

  
    if add_batch_norm:# and not lightvlad:
      reshaped_input = slim.batch_norm(
          reshaped_input,
          center=True,
          scale=True,
          is_training=is_training,
          scope="input_bn")

    with tf.variable_scope("video_VLAD"):
        vlad_video = video_NetVLAD.forward(reshaped_input[:,0:1024]) 

    with tf.variable_scope("audio_VLAD"):
        vlad_audio = audio_NetVLAD.forward(reshaped_input[:,1024:])

    vlad = tf.concat([vlad_video, vlad_audio],1)

    vlad_dim = vlad.get_shape().as_list()[1] 
    hidden1_weights = tf.get_variable("hidden1_weights",
      [vlad_dim, hidden1_size],
      initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(cluster_size)))
       
    activation = tf.matmul(vlad, hidden1_weights)

    if add_batch_norm and relu:
      activation = slim.batch_norm(
          activation,
          center=True,
          scale=True,
          is_training=is_training,
          scope="hidden1_bn")

    else:
      hidden1_biases = tf.get_variable("hidden1_biases",
        [hidden1_size],
        initializer = tf.random_normal_initializer(stddev=0.01))
      tf.summary.histogram("hidden1_biases", hidden1_biases)
      activation += hidden1_biases
   
    if relu:
      activation = tf.nn.relu6(activation)
   

    if gating:
        gating_weights = tf.get_variable("gating_weights_2",
          [hidden1_size, hidden1_size],
          initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(hidden1_size)))
        
        gates = tf.matmul(activation, gating_weights)
 
        if remove_diag:
            # removes the diagonal coefficients
            diagonals = tf.matrix_diag_part(gating_weights)
            gates = gates - tf.multiply(diagonals,activation)

       
        if add_batch_norm:
          gates = slim.batch_norm(
              gates,
              center=True,
              scale=True,
              is_training=is_training,
              scope="gating_bn")
        else:
          gating_biases = tf.get_variable("gating_biases",
            [hidden1_size],  # biases must match the gate width
            initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(hidden1_size)))
          gates += gating_biases

        gates = tf.sigmoid(gates)

        activation = tf.multiply(activation,gates)

    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_classifier_model)


    return aggregated_model().create_model(
        model_input=activation,
        vocab_size=vocab_size,
        is_training=is_training,
        **unused_params)
  def create_model(self,
                   model_input,
                   vocab_size,
                   num_frames,
                   iterations=None,
                   add_batch_norm=None,
                   sample_random_frames=None,
                   cluster_size=None,
                   hidden_size=None,
                   is_training=True,
                   **unused_params):
    iterations = iterations or FLAGS.iterations
    add_batch_norm = add_batch_norm or FLAGS.netvlad_add_batch_norm
    random_frames = sample_random_frames or FLAGS.sample_random_frames
    cluster_size = cluster_size or FLAGS.fv_cluster_size
    hidden1_size = hidden_size or FLAGS.fv_hidden_size
    relu = FLAGS.fv_relu
    gating = FLAGS.gating

    num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
    if random_frames:
      model_input = utils.SampleRandomFrames(model_input, num_frames,
                                             iterations)
    else:
      model_input = utils.SampleRandomSequence(model_input, num_frames,
                                               iterations)
    max_frames = model_input.get_shape().as_list()[1]
    feature_size = model_input.get_shape().as_list()[2]
    reshaped_input = tf.reshape(model_input, [-1, feature_size])
    tf.summary.histogram("input_hist", reshaped_input)

    video_NetFV = NetFV(1024, max_frames, cluster_size, add_batch_norm, is_training)
    audio_NetFV = NetFV(128, max_frames, cluster_size // 2, add_batch_norm, is_training)


    if add_batch_norm:
      reshaped_input = slim.batch_norm(
          reshaped_input,
          center=True,
          scale=True,
          is_training=is_training,
          scope="input_bn")

    with tf.variable_scope("video_FV"):
        fv_video = video_NetFV.forward(reshaped_input[:,0:1024]) 

    with tf.variable_scope("audio_FV"):
        fv_audio = audio_NetFV.forward(reshaped_input[:,1024:])

    fv = tf.concat([fv_video, fv_audio],1)

    fv_dim = fv.get_shape().as_list()[1] 
    hidden1_weights = tf.get_variable("hidden1_weights",
      [fv_dim, hidden1_size],
      initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(cluster_size)))
    
    activation = tf.matmul(fv, hidden1_weights)

    if add_batch_norm and relu:
      activation = slim.batch_norm(
          activation,
          center=True,
          scale=True,
          is_training=is_training,
          scope="hidden1_bn")
    else:
      hidden1_biases = tf.get_variable("hidden1_biases",
        [hidden1_size],
        initializer = tf.random_normal_initializer(stddev=0.01))
      tf.summary.histogram("hidden1_biases", hidden1_biases)
      activation += hidden1_biases
   
    if relu:
      activation = tf.nn.relu6(activation)

    if gating:
        gating_weights = tf.get_variable("gating_weights_2",
          [hidden1_size, hidden1_size],
          initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(hidden1_size)))
        
        gates = tf.matmul(activation, gating_weights)
        
        if add_batch_norm:
          gates = slim.batch_norm(
              gates,
              center=True,
              scale=True,
              is_training=is_training,
              scope="gating_bn")
        else:
          gating_biases = tf.get_variable("gating_biases",
            [hidden1_size],  # biases must match the gate width
            initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(hidden1_size)))
          gates += gating_biases

        gates = tf.sigmoid(gates)

        activation = tf.multiply(activation,gates)


    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_classifier_model)

    return aggregated_model().create_model(
        model_input=activation,
        vocab_size=vocab_size,
        is_training=is_training,
        **unused_params)
def bottleneck(inputs,
               depth,
               depth_bottleneck,
               stride,
               rate=1,
               residual_mask=None,
               scope=None):
    with tf.variable_scope(scope, 'bottleneck_v2', [inputs]) as sc:
        flops = 0

        depth_in = slim.utils.last_dimension(inputs.get_shape(), min_rank=4)
        preact = slim.batch_norm(inputs,
                                 activation_fn=tf.nn.relu,
                                 scope='preact')
        if depth == depth_in:
            shortcut = resnet_utils.subsample(inputs, stride, 'shortcut')
        else:
            shortcut, current_flops = flopsometer.conv2d(preact,
                                                         depth, [1, 1],
                                                         stride=stride,
                                                         normalizer_fn=None,
                                                         activation_fn=None,
                                                         scope='shortcut')
            flops += current_flops

        if residual_mask is not None:
            # Max-pooling trick only works correctly when stride is 1.
            # We assume that stride=2 happens in the first layer where
            # residual_mask is None.
            assert stride == 1
            diluted_residual_mask = slim.max_pool2d(residual_mask, [3, 3],
                                                    stride=1,
                                                    padding='SAME')
        else:
            diluted_residual_mask = None

        residual, current_flops = flopsometer.conv2d(
            preact,
            depth_bottleneck, [1, 1],
            stride=1,
            output_mask=diluted_residual_mask,
            scope='conv1')
        flops += current_flops

        residual, current_flops = flopsometer.conv2d_same(
            residual,
            depth_bottleneck,
            3,
            stride,
            rate=rate,
            output_mask=residual_mask,
            scope='conv2')
        flops += current_flops

        residual, current_flops = flopsometer.conv2d(residual,
                                                     depth, [1, 1],
                                                     stride=1,
                                                     normalizer_fn=None,
                                                     activation_fn=None,
                                                     output_mask=residual_mask,
                                                     scope='conv3')
        flops += current_flops

        if residual_mask is not None:
            residual *= residual_mask

        outputs = shortcut + residual

        return outputs, flops
Example #24
    def create_model(self,
                     model_input,
                     vocab_size,
                     num_frames,
                     iterations=None,
                     add_batch_norm=None,
                     sample_random_frames=None,
                     cluster_size=None,
                     hidden_size=None,
                     is_training=True,
                     **unused_params):
        if is_training:
            iterations = iterations or DBoFConfig.train_iterations
        else:
            iterations = iterations or DBoFConfig.eval_iterations
        add_batch_norm = add_batch_norm or DBoFConfig.dbof_add_batch_norm
        random_frames = sample_random_frames or DBoFConfig.sample_random_frames
        cluster_size = cluster_size or DBoFConfig.dbof_cluster_size
        hidden1_size = hidden_size or DBoFConfig.dbof_hidden_size

        num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
        if random_frames:
            model_input = utils.SampleRandomFrames(model_input, num_frames,
                                                   iterations)
        else:
            model_input = utils.SampleRandomSequence(model_input, num_frames,
                                                     iterations)
        max_frames = model_input.get_shape().as_list()[1]
        feature_size = model_input.get_shape().as_list()[2]
        reshaped_input = tf.reshape(model_input, [-1, feature_size])
        tf.summary.histogram("input_hist", reshaped_input)

        if add_batch_norm:
            reshaped_input = slim.batch_norm(reshaped_input,
                                             center=True,
                                             scale=True,
                                             is_training=is_training,
                                             scope="input_bn")

        cluster_weights = tf.Variable(
            tf.random_normal([feature_size, cluster_size],
                             stddev=1 / math.sqrt(feature_size)))
        tf.summary.histogram("cluster_weights", cluster_weights)
        activation = tf.matmul(reshaped_input, cluster_weights)
        if add_batch_norm:
            activation = slim.batch_norm(activation,
                                         center=True,
                                         scale=True,
                                         is_training=is_training,
                                         scope="cluster_bn")
        else:
            cluster_biases = tf.Variable(
                tf.random_normal([cluster_size],
                                 stddev=1 / math.sqrt(feature_size)))
            tf.summary.histogram("cluster_biases", cluster_biases)
            activation += cluster_biases
        activation = tf.nn.relu6(activation)
        tf.summary.histogram("cluster_output", activation)

        activation = tf.reshape(activation, [-1, max_frames, cluster_size])
        activation = utils.FramePooling(activation,
                                        DBoFConfig.dbof_pooling_method)

        hidden1_weights = tf.Variable(
            tf.random_normal([cluster_size, hidden1_size],
                             stddev=1 / math.sqrt(cluster_size)))
        tf.summary.histogram("hidden1_weights", hidden1_weights)
        activation = tf.matmul(activation, hidden1_weights)
        if add_batch_norm:
            activation = slim.batch_norm(activation,
                                         center=True,
                                         scale=True,
                                         is_training=is_training,
                                         scope="hidden1_bn")
        else:
            hidden1_biases = tf.Variable(
                tf.random_normal([hidden1_size], stddev=0.01))
            tf.summary.histogram("hidden1_biases", hidden1_biases)
            activation += hidden1_biases
        activation = tf.nn.relu6(activation)
        tf.summary.histogram("hidden1_output", activation)

        aggregated_model = getattr(video_level_models,
                                   DBoFConfig.video_level_classifier_model)
        return aggregated_model().create_model(model_input=activation,
                                               vocab_size=vocab_size,
                                               **unused_params)
Example #25
def create_ds_cnn_model(fingerprint_input, model_settings, model_size_info,
                        is_training):
    """Builds a model with depthwise separable convolutional neural network
    Model definition is based on https://arxiv.org/abs/1704.04861 and
    Tensorflow implementation: https://github.com/Zehaos/MobileNet
    model_size_info: defines number of layers, followed by the DS-Conv layer
      parameters in the order {number of conv features, conv filter height, 
      width and stride in y,x dir.} for each of the layers. 
    Note that first layer is always regular convolution, but the remaining 
      layers are all depthwise separable convolutions.
    """
    def ds_cnn_arg_scope(weight_decay=0):
        """Defines the default ds_cnn argument scope.
        Args:
          weight_decay: The weight decay to use for regularizing the model.
        Returns:
          An `arg_scope` to use for the DS-CNN model.
        """
        with slim.arg_scope(
            [slim.convolution2d, slim.separable_convolution2d],
                weights_initializer=slim.initializers.xavier_initializer(),
                biases_initializer=slim.init_ops.zeros_initializer(),
                weights_regularizer=slim.l2_regularizer(weight_decay)) as sc:
            return sc

    def _depthwise_separable_conv(inputs, num_pwc_filters, sc, kernel_size,
                                  stride):
        """ Helper function to build the depth-wise separable convolution layer.
        """

        # skip pointwise by setting num_outputs=None
        depthwise_conv = slim.separable_convolution2d(inputs,
                                                      num_outputs=None,
                                                      stride=stride,
                                                      depth_multiplier=1,
                                                      kernel_size=kernel_size,
                                                      scope=sc +
                                                      '/depthwise_conv')

        bn = slim.batch_norm(depthwise_conv, scope=sc + '/dw_batch_norm')
        pointwise_conv = slim.convolution2d(bn,
                                            num_pwc_filters,
                                            kernel_size=[1, 1],
                                            scope=sc + '/pointwise_conv')
        bn = slim.batch_norm(pointwise_conv, scope=sc + '/pw_batch_norm')
        return bn

    if is_training:
        dropout_prob = tf.placeholder(tf.float32, name='dropout_prob')

    label_count = model_settings['label_count']
    input_frequency_size = model_settings['dct_coefficient_count']
    input_time_size = model_settings['spectrogram_length']

    fingerprint_4d = tf.reshape(fingerprint_input,
                                [-1, input_time_size, input_frequency_size, 1])

    t_dim = input_time_size
    f_dim = input_frequency_size

    # Extract model dimensions from model_size_info
    num_layers = model_size_info[0]
    conv_feat = [None] * num_layers
    conv_kt = [None] * num_layers
    conv_kf = [None] * num_layers
    conv_st = [None] * num_layers
    conv_sf = [None] * num_layers
    i = 1
    for layer_no in range(0, num_layers):
        conv_feat[layer_no] = model_size_info[i]
        i += 1
        conv_kt[layer_no] = model_size_info[i]
        i += 1
        conv_kf[layer_no] = model_size_info[i]
        i += 1
        conv_st[layer_no] = model_size_info[i]
        i += 1
        conv_sf[layer_no] = model_size_info[i]
        i += 1

    scope = 'DS-CNN'
    with tf.variable_scope(scope) as sc:
        end_points_collection = sc.name + '_end_points'
        with slim.arg_scope(
            [slim.convolution2d, slim.separable_convolution2d],
                activation_fn=None,
                weights_initializer=slim.initializers.xavier_initializer(),
                biases_initializer=slim.init_ops.zeros_initializer(),
                outputs_collections=[end_points_collection]):
            with slim.arg_scope([slim.batch_norm],
                                is_training=is_training,
                                decay=0.96,
                                updates_collections=None,
                                activation_fn=tf.nn.relu):
                for layer_no in range(0, num_layers):
                    if layer_no == 0:
                        net = slim.convolution2d(fingerprint_4d, conv_feat[layer_no], \
                                                 [conv_kt[layer_no], conv_kf[layer_no]],
                                                 stride=[conv_st[layer_no], conv_sf[layer_no]], padding='SAME',
                                                 scope='conv_1')
                        net = slim.batch_norm(net, scope='conv_1/batch_norm')
                    else:
                        net = _depthwise_separable_conv(net, conv_feat[layer_no], \
                                                        kernel_size=[conv_kt[layer_no], conv_kf[layer_no]], \
                                                        stride=[conv_st[layer_no], conv_sf[layer_no]],
                                                        sc='conv_ds_' + str(layer_no))
                    t_dim = math.ceil(t_dim / float(conv_st[layer_no]))
                    f_dim = math.ceil(f_dim / float(conv_sf[layer_no]))

                net = slim.avg_pool2d(net, [t_dim, f_dim], scope='avg_pool')

        net = tf.squeeze(net, [1, 2], name='SpatialSqueeze')
        logits = slim.fully_connected(net,
                                      label_count,
                                      activation_fn=None,
                                      scope='fc1')

    if is_training:
        return logits, dropout_prob
    else:
        return logits
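A hedged example of the model_size_info layout the docstring describes: the layer count first, then five numbers per layer (the sizes below are illustrative, not a tuned configuration).

# 5 layers; per layer: num_features, kernel height, kernel width, stride y, stride x
model_size_info = [5,
                   64, 10, 4, 2, 2,   # conv_1 (regular convolution)
                   64, 3, 3, 1, 1,    # conv_ds_1
                   64, 3, 3, 1, 1,    # conv_ds_2
                   64, 3, 3, 1, 1,    # conv_ds_3
                   64, 3, 3, 1, 1]    # conv_ds_4
logits, dropout_prob = create_ds_cnn_model(fingerprint_input, model_settings,
                                           model_size_info, is_training=True)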
Example #26
    def forward(self, reshaped_input):
        """Forward pass of a NetVLAD block.

        Args:
        reshaped_input: If your input has shape
        'batch_size' x 'max_samples' x 'feature_size',
        it should be reshaped to
        'batch_size*max_samples' x 'feature_size'
        by performing:
        reshaped_input = tf.reshape(input, [-1, features_size])

        Returns:
        vlad: the pooled vector of size: 'batch_size' x 'output_dim'
        """

        cluster_weights = tf.get_variable(
            "cluster_weights", [self.feature_size, self.cluster_size],
            initializer=tf.random_normal_initializer(
                stddev=1 / math.sqrt(self.feature_size)))

        activation = tf.matmul(reshaped_input, cluster_weights)

        # activation = tf.contrib.layers.batch_norm(activation,
        #         center=True, scale=True,
        #         is_training=self.is_training,
        #         scope='cluster_bn')

        # activation = slim.batch_norm(
        #       activation,
        #       center=True,
        #       scale=True,
        #       is_training=self.is_training,
        #       scope="cluster_bn")

        if self.add_batch_norm:
            activation = slim.batch_norm(activation,
                                         center=True,
                                         scale=True,
                                         is_training=self.is_training,
                                         scope="cluster_bn",
                                         fused=False)
        else:
            cluster_biases = tf.get_variable(
                "cluster_biases", [self.cluster_size],
                initializer=tf.random_normal_initializer(
                    stddev=1 / math.sqrt(self.feature_size)))
            activation += cluster_biases

        activation = tf.nn.softmax(activation)

        activation = tf.reshape(activation,
                                [-1, self.max_samples, self.cluster_size])

        a_sum = tf.reduce_sum(activation, -2, keep_dims=True)

        cluster_weights2 = tf.get_variable(
            "cluster_weights2", [1, self.feature_size, self.cluster_size],
            initializer=tf.random_normal_initializer(
                stddev=1 / math.sqrt(self.feature_size)))

        a = tf.multiply(a_sum, cluster_weights2)

        activation = tf.transpose(activation, perm=[0, 2, 1])

        reshaped_input = tf.reshape(reshaped_input,
                                    [-1, self.max_samples, self.feature_size])

        vlad = tf.matmul(activation, reshaped_input)
        vlad = tf.transpose(vlad, perm=[0, 2, 1])
        vlad = tf.subtract(vlad, a)

        vlad = tf.nn.l2_normalize(vlad, 1)

        vlad = tf.reshape(vlad, [-1, self.cluster_size * self.feature_size])
        vlad = tf.nn.l2_normalize(vlad, 1)

        hidden1_weights = tf.get_variable(
            "hidden1_weights",
            [self.cluster_size * self.feature_size, self.output_dim],
            initializer=tf.random_normal_initializer(
                stddev=1 / math.sqrt(self.cluster_size)))

        ##Tried using dropout
        #vlad=tf.layers.dropout(vlad,rate=0.5,training=self.is_training)

        vlad = tf.matmul(vlad, hidden1_weights)

        ##Added a batch norm
        vlad = tf.contrib.layers.batch_norm(vlad,
                                            center=True,
                                            scale=True,
                                            is_training=self.is_training,
                                            scope='bn')

        if self.gating:
            vlad = super(self.__class__, self).context_gating(vlad)

        return vlad
Example #27
    def create_model(self,
                     model_input,
                     vocab_size,
                     num_frames,
                     iterations=None,
                     add_batch_norm=None,
                     sample_random_frames=None,
                     cluster_size=None,
                     hidden_size=None,
                     is_training=True,
                     **unused_params):
        iterations = 300
        add_batch_norm = True
        random_frames = True
        cluster_size = 2048
        hidden1_size = 1024
        fc_dimred = True
        relu = False
        max_pool = False

        num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
        if random_frames:
            model_input = utils.SampleRandomFrames(model_input, num_frames,
                                                   iterations)
        else:
            model_input = utils.SampleRandomSequence(model_input, num_frames,
                                                     iterations)
        max_frames = model_input.get_shape().as_list()[1]
        feature_size = model_input.get_shape().as_list()[2]
        reshaped_input = tf.reshape(model_input, [-1, feature_size])
        tf.summary.histogram("input_hist", reshaped_input)

        video_Dbof = GatedDBoF(1024, max_frames, cluster_size, max_pool,
                               add_batch_norm, is_training)
        audio_Dbof = SoftDBoF(128, max_frames, cluster_size // 8, max_pool,
                              add_batch_norm, is_training)

        if add_batch_norm:
            reshaped_input = slim.batch_norm(reshaped_input,
                                             center=True,
                                             scale=True,
                                             is_training=is_training,
                                             scope="input_bn")

        with tf.variable_scope("video_DBOF"):
            dbof_video = video_Dbof.forward(reshaped_input[:, 0:1024])

        with tf.variable_scope("audio_DBOF"):
            dbof_audio = audio_Dbof.forward(reshaped_input[:, 1024:])

        dbof = tf.concat([dbof_video, dbof_audio], 1)

        dbof_dim = dbof.get_shape().as_list()[1]

        if fc_dimred:
            hidden1_weights = tf.get_variable(
                "hidden1_weights", [dbof_dim, hidden1_size],
                initializer=tf.random_normal_initializer(
                    stddev=1 / math.sqrt(cluster_size)))
            tf.summary.histogram("hidden1_weights", hidden1_weights)
            activation = tf.matmul(dbof, hidden1_weights)

            if add_batch_norm and relu:
                activation = slim.batch_norm(activation,
                                             center=True,
                                             scale=True,
                                             is_training=is_training,
                                             scope="hidden1_bn")
            else:
                hidden1_biases = tf.get_variable(
                    "hidden1_biases", [hidden1_size],
                    initializer=tf.random_normal_initializer(stddev=0.01))
                tf.summary.histogram("hidden1_biases", hidden1_biases)
                activation += hidden1_biases

            if relu:
                activation = tf.nn.relu6(activation)
            tf.summary.histogram("hidden1_output", activation)
        else:
            activation = dbof

        aggregated_model = getattr(video_level_models,
                                   'willow_MoeModel_moe4_noGP')

        return aggregated_model().create_model(model_input=activation,
                                               vocab_size=vocab_size,
                                               is_training=is_training,
                                               **unused_params)
Exemple #28
0
    def data_bn_layer(self, x, in_channels):
        # in_channels is unused; slim.batch_norm infers the channel count from x.
        if self.data_bn:
            return slim.batch_norm(x)
        else:
            return x
Exemple #29
0
def nm(x):
    # Learned blend of the raw activation and its batch-normalized version;
    # w0 and w1 are trainable scalars initialized to pass x through unchanged.
    w0 = tf.Variable(1.0, name='w0')
    w1 = tf.Variable(0.0, name='w1')
    return w0 * x + w1 * slim.batch_norm(x)
Exemple #30
0
  def create_model(self,
                   model_input,
                   vocab_size,
                   is_training,
                   num_mixtures=None,
                   l2_penalty=1e-8,
                   **unused_params):
    """Creates a Mixture of (Logistic) Experts model.
     It also includes the possibility of gating the probabilities

     The model consists of a per-class softmax distribution over a
     configurable number of logistic classifiers. One of the classifiers in the
     mixture is not trained, and always predicts 0.

    Args:
      model_input: 'batch_size' x 'num_features' matrix of input features.
      vocab_size: The number of classes in the dataset.
      is_training: Is this the training phase?
      num_mixtures: The number of mixtures (excluding a dummy 'expert' that
        always predicts the non-existence of an entity).
      l2_penalty: How much to penalize the squared magnitudes of parameter
        values.
    Returns:
      A dictionary with a tensor containing the probability predictions of the
      model in the 'predictions' key. The dimensions of the tensor are
      batch_size x num_classes.
    """
    num_mixtures = num_mixtures or FLAGS.moe_num_mixtures
    low_rank_gating = FLAGS.moe_low_rank_gating
    l2_penalty = FLAGS.moe_l2
    gating_probabilities = FLAGS.moe_prob_gating
    gating_input = FLAGS.moe_prob_gating_input

    input_size = model_input.get_shape().as_list()[1]
    remove_diag = FLAGS.gating_remove_diag

    if low_rank_gating == -1:
        gate_activations = slim.fully_connected(
            model_input,
            vocab_size * (num_mixtures + 1),
            activation_fn=None,
            biases_initializer=None,
            weights_regularizer=slim.l2_regularizer(l2_penalty),
            scope="gates")
    else:
        gate_activations1 = slim.fully_connected(
            model_input,
            low_rank_gating,
            activation_fn=None,
            biases_initializer=None,
            weights_regularizer=slim.l2_regularizer(l2_penalty),
            scope="gates1")
        gate_activations = slim.fully_connected(
            gate_activations1,
            vocab_size * (num_mixtures + 1),
            activation_fn=None,
            biases_initializer=None,
            weights_regularizer=slim.l2_regularizer(l2_penalty),
            scope="gates2")


    expert_activations = slim.fully_connected(
        model_input,
        vocab_size * num_mixtures,
        activation_fn=None,
        weights_regularizer=slim.l2_regularizer(l2_penalty),
        scope="experts")

    gating_distribution = tf.nn.softmax(tf.reshape(
        gate_activations,
        [-1, num_mixtures + 1]))  # (Batch * #Labels) x (num_mixtures + 1)
    expert_distribution = tf.nn.sigmoid(tf.reshape(
        expert_activations,
        [-1, num_mixtures]))  # (Batch * #Labels) x num_mixtures

    probabilities_by_class_and_batch = tf.reduce_sum(
        gating_distribution[:, :num_mixtures] * expert_distribution, 1)
    probabilities = tf.reshape(probabilities_by_class_and_batch,
                                     [-1, vocab_size])

    if gating_probabilities:
        if gating_input == 'prob':
            gating_weights = tf.get_variable("gating_prob_weights",
              [vocab_size, vocab_size],
              initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(vocab_size)))
            gates = tf.matmul(probabilities, gating_weights)
        else:
            gating_weights = tf.get_variable("gating_prob_weights",
              [input_size, vocab_size],
              initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(vocab_size)))
 
            gates = tf.matmul(model_input, gating_weights)
        
        if remove_diag:
            # removes the diagonal coefficients
            diagonals = tf.matrix_diag_part(gating_weights)
            gates = gates - tf.multiply(diagonals, probabilities)

        gates = slim.batch_norm(
              gates,
              center=True,
              scale=True,
              is_training=is_training,
              scope="gating_prob_bn")

        gates = tf.sigmoid(gates)

        probabilities = tf.multiply(probabilities,gates)


    return {"predictions": probabilities}
Exemple #31
0
def main(args):

    #network = importlib.import_module(args.model_def)

    subdir = datetime.strftime(datetime.now(), '%Y%m%d-%H%M%S')
    log_dir = os.path.join(os.path.expanduser(args.logs_base_dir), subdir)
    if not os.path.isdir(
            log_dir):  # Create the log directory if it doesn't exist
        os.makedirs(log_dir)
    model_dir = os.path.join(os.path.expanduser(args.models_base_dir), subdir)
    if not os.path.isdir(
            model_dir):  # Create the model directory if it doesn't exist
        os.makedirs(model_dir)

    # Store some git revision info in a text file in the log directory
    src_path, _ = os.path.split(os.path.realpath(__file__))
    utils.store_revision_info(src_path, log_dir, ' '.join(sys.argv))

    np.random.seed(seed=args.seed)

    train_set = utils.get_dataset(args.data_dir)
    #train_set = utils.dataset_from_list(args.data_dir,args.list_file)
    nrof_classes = len(train_set)
    print('nrof_classes: ', nrof_classes)
    image_list, label_list = utils.get_image_paths_and_labels(train_set)
    print('total images: ', len(image_list))
    image_list = np.array(image_list)
    label_list = np.array(label_list, dtype=np.int32)

    dataset_size = len(image_list)
    single_batch_size = args.people_per_batch * args.images_per_person
    indices = list(range(dataset_size))
    np.random.shuffle(indices)

    def _sample_people_softmax(x):
        global softmax_ind
        if softmax_ind >= dataset_size:
            np.random.shuffle(indices)
            softmax_ind = 0
        true_num_batch = min(single_batch_size, dataset_size - softmax_ind)

        sample_paths = image_list[indices[softmax_ind:softmax_ind +
                                          true_num_batch]]
        sample_labels = label_list[indices[softmax_ind:softmax_ind +
                                           true_num_batch]]

        softmax_ind += true_num_batch

        return (np.array(sample_paths), np.array(sample_labels,
                                                 dtype=np.int32))

    def _sample_people(x):
        '''Sample people with tf.data, so we can use map transforms and prefetch.
        '''

        image_paths, num_per_class = sample_people(
            train_set, args.people_per_batch * (args.num_gpus - 1),
            args.images_per_person)
        labels = []
        for i in range(len(num_per_class)):
            labels.extend([i] * num_per_class[i])
        return (np.array(image_paths), np.array(labels, dtype=np.int32))

    def _parse_function(filename, label):
        file_contents = tf.read_file(filename)
        image = tf.image.decode_image(file_contents, channels=3)
        #image = tf.image.decode_jpeg(file_contents, channels=3)
        print(image.shape)

        if args.random_crop:
            print('Using random crop')
            image = tf.random_crop(image,
                                   [args.image_height, args.image_width, 3])
        else:
            print('Not using random crop')
            #image.set_shape((args.image_size, args.image_size, 3))
            image.set_shape((None, None, 3))
            image = tf.image.resize_images(image,
                                           size=(args.image_height,
                                                 args.image_width))
            #print(image.shape)
        if args.random_flip:
            image = tf.image.random_flip_left_right(image)

        #pylint: disable=no-member
        #image.set_shape((args.image_size, args.image_size, 3))
        image.set_shape((args.image_height, args.image_width, 3))
        if debug:
            image = tf.cast(image, tf.float32)
        else:
            image = tf.cast(image, tf.float32)
            image = tf.subtract(image, 127.5)
            image = tf.div(image, 128.)
            #image = tf.image.per_image_standardization(image)
        return image, label

    print('Model directory: %s' % model_dir)
    print('Log directory: %s' % log_dir)
    if args.pretrained_model:
        print('Pre-trained model: %s' %
              os.path.expanduser(args.pretrained_model))

    with tf.Graph().as_default():
        tf.set_random_seed(args.seed)
        global_step = tf.Variable(0, trainable=False, name='global_step')

        # Placeholder for the learning rate
        learning_rate_placeholder = tf.placeholder(tf.float32,
                                                   name='learning_rate')

        phase_train_placeholder = tf.placeholder(tf.bool, name='phase_train')

        # the input batches are produced by the tf.data pipeline below
        with tf.device("/cpu:0"):

            softmax_dataset = tf_data.Dataset.range(args.epoch_size *
                                                    args.max_nrof_epochs * 100)
            softmax_dataset = softmax_dataset.map(lambda x: tf.py_func(
                _sample_people_softmax, [x], [tf.string, tf.int32]))
            softmax_dataset = softmax_dataset.flat_map(_from_tensor_slices)
            softmax_dataset = softmax_dataset.map(_parse_function,
                                                  num_parallel_calls=8)
            softmax_dataset = softmax_dataset.batch(args.num_gpus *
                                                    single_batch_size)
            softmax_iterator = softmax_dataset.make_initializable_iterator()
            softmax_next_element = softmax_iterator.get_next()
            softmax_next_element[0].set_shape(
                (args.num_gpus * single_batch_size, args.image_height,
                 args.image_width, 3))
            softmax_next_element[1].set_shape(args.num_gpus *
                                              single_batch_size)
            batch_image_split = tf.split(softmax_next_element[0],
                                         args.num_gpus)
            batch_label_split = tf.split(softmax_next_element[1],
                                         args.num_gpus)

        learning_rate = tf.train.exponential_decay(
            learning_rate_placeholder,
            global_step,
            args.learning_rate_decay_epochs * args.epoch_size,
            args.learning_rate_decay_factor,
            staircase=True)
        tf.summary.scalar('learning_rate', learning_rate)

        print('Using optimizer: {}'.format(args.optimizer))
        if args.optimizer == 'ADAGRAD':
            opt = tf.train.AdagradOptimizer(learning_rate)
        elif args.optimizer == 'MOM':
            opt = tf.train.MomentumOptimizer(learning_rate, 0.9)
        elif args.optimizer == 'ADAM':
            opt = tf.train.AdamOptimizer(learning_rate,
                                         beta1=0.9,
                                         beta2=0.999,
                                         epsilon=0.1)
        else:
            raise Exception("Not supported optimizer: {}".format(
                args.optimizer))
        tower_losses = []
        tower_cross = []
        tower_dist = []
        tower_reg = []
        for i in range(args.num_gpus):
            with tf.device("/gpu:" + str(i)):
                with tf.name_scope("tower_" + str(i)) as scope:
                    with slim.arg_scope([slim.model_variable, slim.variable],
                                        device="/cpu:0"):
                        with tf.variable_scope(
                                tf.get_variable_scope()) as var_scope:
                            reuse = False if i == 0 else True
                            #with slim.arg_scope(resnet_v2.resnet_arg_scope(args.weight_decay)):
                            #prelogits, end_points = resnet_v2.resnet_v2_50(batch_image_split[i],is_training=True,
                            #        output_stride=16,num_classes=args.embedding_size,reuse=reuse)
                            #prelogits, end_points = network.inference(batch_image_split[i], args.keep_probability,
                            #    phase_train=phase_train_placeholder, bottleneck_layer_size=args.embedding_size,
                            #    weight_decay=args.weight_decay, reuse=reuse)
                            if args.network == 'sphere_network':
                                prelogits = network.infer(
                                    batch_image_split[i], args.embedding_size)
                                print(prelogits)
                            elif args.network == 'resface':
                                prelogits, _ = resface.inference(
                                    batch_image_split[i],
                                    1.0,
                                    bottleneck_layer_size=args.embedding_size,
                                    weight_decay=args.weight_decay,
                                    reuse=reuse)
                            elif args.network == 'inception_net':
                                prelogits, endpoints = inception_net.inference(
                                    batch_image_split[i],
                                    1,
                                    phase_train=True,
                                    bottleneck_layer_size=args.embedding_size,
                                    weight_decay=args.weight_decay,
                                    reuse=reuse)
                                print(prelogits)

                            elif args.network == 'resnet_v2':
                                with slim.arg_scope(
                                        resnet_v2.resnet_arg_scope(
                                            args.weight_decay)):
                                    prelogits, end_points = resnet_v2.resnet_v2_50(
                                        batch_image_split[i],
                                        is_training=True,
                                        output_stride=16,
                                        num_classes=args.embedding_size,
                                        reuse=reuse)
                                    prelogits = tf.squeeze(prelogits,
                                                           axis=[1, 2])
                            elif args.network == 'mobilenet':
                                prelogits, net_points = mobilenet.inference(
                                    batch_image_split[i],
                                    bottleneck_layer_size=args.embedding_size,
                                    phase_train=True,
                                    weight_decay=args.weight_decay,
                                    reuse=reuse)

                            else:
                                raise Exception(
                                    "Not supported network: {}".format(
                                        args.network))
                            if args.fc_bn:

                                prelogits = slim.batch_norm(
                                    prelogits,
                                    is_training=True,
                                    decay=0.997,
                                    epsilon=1e-5,
                                    scale=True,
                                    updates_collections=tf.GraphKeys.UPDATE_OPS,
                                    reuse=reuse,
                                    scope='softmax_bn')
                            if args.loss_type == 'softmax':
                                cross_entropy_mean = utils.softmax_loss(
                                    prelogits, batch_label_split[i],
                                    len(train_set), args.weight_decay, reuse)
                                regularization_losses = tf.get_collection(
                                    tf.GraphKeys.REGULARIZATION_LOSSES)
                                tower_cross.append(cross_entropy_mean)
                                #loss = cross_entropy_mean + args.weight_decay*tf.add_n(regularization_losses)
                                loss = cross_entropy_mean + tf.add_n(
                                    regularization_losses)
                                #tower_dist.append(0)
                                #tower_cross.append(cross_entropy_mean)
                                #tower_th.append(0)
                                tower_losses.append(loss)
                                tower_reg.append(regularization_losses)
                            elif args.loss_type == 'cosface':
                                label_reshape = tf.reshape(
                                    batch_label_split[i], [single_batch_size])
                                label_reshape = tf.cast(
                                    label_reshape, tf.int64)
                                coco_loss = utils.cos_loss(prelogits,
                                                           label_reshape,
                                                           len(train_set),
                                                           reuse,
                                                           alpha=args.alpha,
                                                           scale=args.scale)
                                #scatter_loss, _ = facenet.coco_loss(prelogits,label_reshape, len(train_set),reuse,alpha=args.alpha,scale=args.scale)
                                #coco_loss = scatter_loss['loss_total']
                                regularization_losses = tf.get_collection(
                                    tf.GraphKeys.REGULARIZATION_LOSSES)
                                if args.network == 'sphere_network':
                                    print(
                                        'reg loss using weight_decay * tf.add_n'
                                    )
                                    reg_loss = args.weight_decay * tf.add_n(
                                        regularization_losses)
                                else:
                                    print('reg loss using tf.add_n')
                                    reg_loss = tf.add_n(
                                        regularization_losses)
                                loss = coco_loss + reg_loss

                                tower_losses.append(loss)
                                tower_reg.append(reg_loss)

                            #loss = tf.add_n([cross_entropy_mean] + regularization_losses, name='total_loss')
                            tf.get_variable_scope().reuse_variables()
        total_loss = tf.reduce_mean(tower_losses)
        total_reg = tf.reduce_mean(tower_reg)
        losses = {}
        losses['total_loss'] = total_loss
        losses['total_reg'] = total_reg

        grads = opt.compute_gradients(total_loss,
                                      tf.trainable_variables(),
                                      colocate_gradients_with_ops=True)
        apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = tf.group(apply_gradient_op)

        save_vars = [
            var for var in tf.global_variables()
            if 'Adagrad' not in var.name and 'global_step' not in var.name
        ]

        #saver = tf.train.Saver(tf.trainable_variables(), max_to_keep=3)
        saver = tf.train.Saver(save_vars, max_to_keep=3)

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.summary.merge_all()

        # Start running operations on the Graph.
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=args.gpu_memory_fraction)
        sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options,
                                                allow_soft_placement=True))

        # Initialize variables
        sess.run(tf.global_variables_initializer(),
                 feed_dict={phase_train_placeholder: True})
        sess.run(tf.local_variables_initializer(),
                 feed_dict={phase_train_placeholder: True})

        #sess.run(iterator.initializer)
        sess.run(softmax_iterator.initializer)

        summary_writer = tf.summary.FileWriter(log_dir, sess.graph)
        coord = tf.train.Coordinator()
        tf.train.start_queue_runners(coord=coord, sess=sess)

        with sess.as_default():
            #pdb.set_trace()

            if args.pretrained_model:
                print('Restoring pretrained model: %s' % args.pretrained_model)
                saver.restore(sess, os.path.expanduser(args.pretrained_model))

            # Training and validation loop
            epoch = 0
            while epoch < args.max_nrof_epochs:
                step = sess.run(global_step, feed_dict=None)
                epoch = step // args.epoch_size
                if debug:
                    debug_train(args, sess, train_set, epoch,
                                image_batch_gather, enqueue_op,
                                batch_size_placeholder, image_batch_split,
                                image_paths_split, num_per_class_split,
                                image_paths_placeholder,
                                image_paths_split_placeholder,
                                labels_placeholder, labels_batch,
                                num_per_class_placeholder,
                                num_per_class_split_placeholder, len(gpus))
                # Train for one epoch
                train(args, sess, epoch, learning_rate_placeholder,
                      phase_train_placeholder, global_step, losses, train_op,
                      summary_op, summary_writer,
                      args.learning_rate_schedule_file)

                # Save variables and the metagraph if it doesn't exist already
                save_variables_and_metagraph(sess, saver, summary_writer,
                                             model_dir, subdir, step)

    return model_dir
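
Note that `_from_tensor_slices` is used in the dataset pipeline above but not defined in this snippet. A plausible one-line definition, assuming it simply unpacks the (paths, labels) arrays returned by `tf.py_func` into one dataset element per image (using the `tf_data` alias from the snippet):

def _from_tensor_slices(tensors_x, tensors_y):
    # Hypothetical helper: turns the two py_func output arrays into a
    # dataset of (path, label) pairs, one element per image.
    return tf_data.Dataset.from_tensor_slices((tensors_x, tensors_y))
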
Exemple #32
0
def inference(inputs, num_classes, n):
    """ResNet for CIFAR-style inputs; total weighted layers: 6n + 2.
    :param inputs: batch of input images
    :param num_classes: number of output classes
    :param n: number of residual units per stage
    :return: class logits
    """
    with slim.arg_scope(arg_scope()):
        net = slim.batch_norm(inputs)
        net = slim.conv2d(net, 16, [3, 3])
        with tf.variable_scope('residual_block1'):
            for i in range(n):
                with tf.variable_scope('residual_block1_%d' % i):
                    res = net
                    net = slim.batch_norm(net)
                    net = slim.conv2d(net, 16, [3, 3])
                    net = slim.batch_norm(net)
                    net = slim.conv2d(net, 16, [3, 3])
                    net = net + res

        with tf.variable_scope('residual_block2'):
            for i in range(n):
                with tf.variable_scope('residual_block2_%d' % i):
                    res = net
                    net = slim.batch_norm(net)
                    if i == 0:
                        net = slim.conv2d(net, 32, [3, 3], stride=2)
                    else:
                        net = slim.conv2d(net, 32, [3, 3])
                    net = slim.batch_norm(net)
                    net = slim.conv2d(net, 32, [3, 3])
                    if i == 0:
                        res = slim.avg_pool2d(res, [2, 2])
                        net = net + tf.pad(res,
                                           [[0, 0], [0, 0], [0, 0], [8, 8]])
                    else:
                        net = net + res

        with tf.variable_scope('residual_block3'):
            for i in range(n):
                with tf.variable_scope('residual_block3_%d' % i):
                    res = net
                    net = slim.batch_norm(net)
                    if i == 0:
                        net = slim.conv2d(net, 64, [3, 3], stride=2)
                    else:
                        net = slim.conv2d(net, 64, [3, 3])
                    net = slim.batch_norm(net)
                    net = slim.conv2d(net, 64, [3, 3])
                    if i == 0:
                        res = slim.avg_pool2d(res, [2, 2])
                        net = net + tf.pad(res,
                                           [[0, 0], [0, 0], [0, 0], [16, 16]])
                    else:
                        net = net + res
                    net = slim.batch_norm(net)
        assert net.get_shape().as_list()[1:] == [8, 8, 64]
        with tf.variable_scope('fully_connected'):
            net = tf.reduce_mean(net, [1, 2])
            net = slim.flatten(net)
            logits = slim.fully_connected(net, num_classes)
            # net = slim.dropout(net, keep_prob=0.9, scope='dropout')
        return logits
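
For reference, the 6n + 2 count in the docstring gives the familiar CIFAR ResNets. A usage sketch, assuming 32x32x3 inputs and the surrounding `tensorflow as tf` import:

# n = 3 -> 6*3 + 2 = 20 weighted layers (ResNet-20); n = 9 -> ResNet-56.
images = tf.placeholder(tf.float32, [None, 32, 32, 3])
logits = inference(images, num_classes=10, n=3)
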
Exemple #33
0
    def forward(self, reshaped_input):
        """Forward pass of a NetFV block.

        Args:
        reshaped_input: If your input is in that form:
        'batch_size' x 'max_samples' x 'feature_size'
        It should be reshaped in the following form:
        'batch_size*max_samples' x 'feature_size'
        by performing:
        reshaped_input = tf.reshape(input, [-1, features_size])

        Returns:
        fv: the pooled vector of size: 'batch_size' x 'output_dim'
        """

        cluster_weights = tf.get_variable("cluster_weights",
          [self.feature_size, self.cluster_size],
          initializer = tf.random_normal_initializer(
              stddev=1 / math.sqrt(self.feature_size)))
     
        covar_weights = tf.get_variable("covar_weights",
          [self.feature_size, self.cluster_size],
          initializer = tf.random_normal_initializer(
              mean=1.0, stddev=1 /math.sqrt(self.feature_size)))
      
        covar_weights = tf.square(covar_weights)
        eps = tf.constant([1e-6])
        covar_weights = tf.add(covar_weights,eps)

        activation = tf.matmul(reshaped_input, cluster_weights)
        if self.add_batch_norm:
          activation = slim.batch_norm(
              activation,
              center=True,
              scale=True,
              is_training=self.is_training,
              scope="cluster_bn")
        else:
          cluster_biases = tf.get_variable("cluster_biases",
            [self.cluster_size],
            initializer = tf.random_normal_initializer(
                stddev=1 / math.sqrt(self.feature_size)))
          activation += cluster_biases
        
        activation = tf.nn.softmax(activation)

        activation = tf.reshape(activation,
                [-1, self.max_samples, self.cluster_size])

        a_sum = tf.reduce_sum(activation,-2,keep_dims=True)

        cluster_weights2 = tf.get_variable("cluster_weights2",
                [1,self.feature_size, self.cluster_size],
                initializer = tf.random_normal_initializer(
                    stddev=1 / math.sqrt(self.feature_size)))

        a = tf.multiply(a_sum,cluster_weights2)
        
        activation = tf.transpose(activation,perm=[0,2,1])
        
        reshaped_input = tf.reshape(reshaped_input,
                [-1,self.max_samples,self.feature_size])
        fv1 = tf.matmul(activation,reshaped_input)
        
        fv1 = tf.transpose(fv1,perm=[0,2,1])

        # computing second order FV
        a2 = tf.multiply(a_sum,tf.square(cluster_weights2)) 

        b2 = tf.multiply(fv1,cluster_weights2) 
        fv2 = tf.matmul(activation,tf.square(reshaped_input)) 
     
        fv2 = tf.transpose(fv2,perm=[0,2,1])
        fv2 = tf.add_n([a2,fv2,tf.scalar_mul(-2,b2)])

        fv2 = tf.divide(fv2,tf.square(covar_weights))
        fv2 = tf.subtract(fv2,a_sum)

        fv2 = tf.reshape(fv2,[-1,self.cluster_size*self.feature_size])
      
        fv2 = tf.nn.l2_normalize(fv2,1)
        fv2 = tf.reshape(fv2,[-1,self.cluster_size*self.feature_size])
        fv2 = tf.nn.l2_normalize(fv2,1)

        fv1 = tf.subtract(fv1,a)
        fv1 = tf.divide(fv1,covar_weights) 

        fv1 = tf.nn.l2_normalize(fv1,1)
        fv1 = tf.reshape(fv1,[-1,self.cluster_size*self.feature_size])
        fv1 = tf.nn.l2_normalize(fv1,1)

        fv = tf.concat([fv1,fv2],1)
 
        hidden1_weights = tf.get_variable("hidden1_weights",
          [2*self.cluster_size*self.feature_size, self.output_dim],
          initializer=tf.random_normal_initializer(
              stddev=1 / math.sqrt(self.cluster_size)))
           
        fv = tf.matmul(fv, hidden1_weights)

        if self.gating:
            fv = super(self.__class__, self).context_gating(fv)
 
        return fv 
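
The second-order term above relies on the expansion sum_i a_ik (x_i - mu_k)^2 = mu_k^2 sum_i a_ik + sum_i a_ik x_i^2 - 2 mu_k sum_i a_ik x_i, which is exactly what the a2 + fv2 - 2*b2 combination computes before dividing by the squared covariances. A small NumPy check of that identity (toy sizes, illustrative names):

import numpy as np

N, D, K = 5, 3, 2
a = np.random.rand(N, K)                   # soft assignments a_ik
x = np.random.randn(N, D)                  # inputs x_i
mu = np.random.randn(D, K)                 # cluster centers mu_k

s1 = np.einsum('nk,nd->dk', a, x)          # sum_i a_ik * x_id
s2 = np.einsum('nk,nd->dk', a, x ** 2)     # sum_i a_ik * x_id**2
expanded = a.sum(0) * mu ** 2 + s2 - 2 * mu * s1
direct = np.einsum('nk,ndk->dk', a, (x[:, :, None] - mu) ** 2)
assert np.allclose(direct, expanded)
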
def batch_norm_fn(x):
    return slim.batch_norm(x, scope=tf.get_variable_scope().name + "/bn")
  def create_model(self,
                   model_input,
                   vocab_size,
                   num_frames,
                   iterations=None,
                   add_batch_norm=None,
                   sample_random_frames=None,
                   cluster_size=None,
                   hidden_size=None,
                   is_training=True,
                   **unused_params):
    iterations = iterations or FLAGS.iterations
    add_batch_norm = add_batch_norm or FLAGS.netvlad_add_batch_norm
    random_frames = sample_random_frames or FLAGS.sample_random_frames
    cluster_size = cluster_size or FLAGS.fv_cluster_size
    hidden1_size = hidden_size or FLAGS.fv_hidden_size
    relu = FLAGS.fv_relu
    gating = FLAGS.gating

    num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
    if random_frames:
      model_input = utils.SampleRandomFrames(model_input, num_frames,
                                             iterations)
    else:
      model_input = utils.SampleRandomSequence(model_input, num_frames,
                                               iterations)
    max_frames = model_input.get_shape().as_list()[1]
    feature_size = model_input.get_shape().as_list()[2]
    reshaped_input = tf.reshape(model_input, [-1, feature_size])
    tf.summary.histogram("input_hist", reshaped_input)

    video_NetFV = NetFV(1024, max_frames, cluster_size, add_batch_norm, is_training)
    audio_NetFV = NetFV(128, max_frames, cluster_size // 2, add_batch_norm, is_training)


    if add_batch_norm:
      reshaped_input = slim.batch_norm(
          reshaped_input,
          center=True,
          scale=True,
          is_training=is_training,
          scope="input_bn")

    with tf.variable_scope("video_FV"):
        fv_video = video_NetFV.forward(reshaped_input[:,0:1024]) 

    with tf.variable_scope("audio_FV"):
        fv_audio = audio_NetFV.forward(reshaped_input[:,1024:])

    fv = tf.concat([fv_video, fv_audio],1)

    fv_dim = fv.get_shape().as_list()[1] 
    hidden1_weights = tf.get_variable("hidden1_weights",
      [fv_dim, hidden1_size],
      initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(cluster_size)))
    
    activation = tf.matmul(fv, hidden1_weights)

    if add_batch_norm and relu:
      activation = slim.batch_norm(
          activation,
          center=True,
          scale=True,
          is_training=is_training,
          scope="hidden1_bn")
    else:
      hidden1_biases = tf.get_variable("hidden1_biases",
        [hidden1_size],
        initializer = tf.random_normal_initializer(stddev=0.01))
      tf.summary.histogram("hidden1_biases", hidden1_biases)
      activation += hidden1_biases
   
    if relu:
      activation = tf.nn.relu6(activation)

    if gating:
        gating_weights = tf.get_variable("gating_weights_2",
          [hidden1_size, hidden1_size],
          initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(hidden1_size)))
        
        gates = tf.matmul(activation, gating_weights)
        
        if add_batch_norm:
          gates = slim.batch_norm(
              gates,
              center=True,
              scale=True,
              is_training=is_training,
              scope="gating_bn")
        else:
          gating_biases = tf.get_variable("gating_biases",
            [hidden1_size],
            initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(hidden1_size)))
          gates += gating_biases

        gates = tf.sigmoid(gates)

        activation = tf.multiply(activation,gates)


    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_classifier_model)

    return aggregated_model().create_model(
        model_input=activation,
        vocab_size=vocab_size,
        is_training=is_training,
        **unused_params)
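
The gating block at the end is the context-gating pattern that recurs throughout these models: a learned linear map of the activation, squashed to [0, 1], re-weights the activation itself. A NumPy sketch with toy sizes (batch norm omitted, names illustrative):

import numpy as np

batch, hidden1_size = 4, 8
activation = np.random.randn(batch, hidden1_size)
W = np.random.randn(hidden1_size, hidden1_size) / np.sqrt(hidden1_size)
gates = 1.0 / (1.0 + np.exp(-(activation @ W)))   # sigmoid gate per unit
activation = activation * gates
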
    def create_model(self,
                     model_input,
                     vocab_size,
                     num_frames,
                     is_training=True,
                     l2_penalty=1e-8,
                     **unused_params):

        num_layers = 3
        lstm_size = 900
        activation_proj_dim = int(lstm_size * 1.18)
        pool_size = 2
        num_filters = [128, 128]
        filter_sizes = [1, 3]
        features_size = int(sum(num_filters))
        self.is_training = is_training

        cnn_input = model_input

        cnn_max_frames = model_input.get_shape().as_list()[1]

        lstm_memories = []

        for layer in range(num_layers):

            if layer > 0:
                cnn_output = self.cnn(cnn_input,
                                      num_filters=num_filters,
                                      filter_sizes=filter_sizes,
                                      sub_scope="cnn%d" % (layer + 1))
                tf.summary.histogram("cnn_output_{}".format(layer), cnn_output)

                cnn_output = slim.batch_norm(cnn_output,
                                             center=True,
                                             scale=True,
                                             is_training=self.is_training,
                                             scope="cnn_output_bn_layer_" +
                                             str(layer))
                tf.summary.histogram(
                    "cnn_output_after_bn_before_tanh_{}".format(layer),
                    cnn_output)
            else:
                cnn_output = slim.batch_norm(cnn_input,
                                             center=True,
                                             scale=True,
                                             is_training=self.is_training,
                                             scope="cnn_output_bn_layer_" +
                                             str(layer))
                tf.summary.histogram(
                    "cnn_output_after_bn_before_tanh_{}".format(layer),
                    cnn_output)

            cnn_output_tanh = tf.nn.tanh(cnn_output)
            tf.summary.histogram(
                "cnn_output_after_bn_after_tanh_{}".format(layer),
                cnn_output_tanh)

            lstm_memory = self.rnn(cnn_output_tanh,
                                   lstm_size,
                                   num_frames,
                                   sub_scope="rnn%d" %
                                   (layer + 1))  # None x lstm_size
            tf.summary.histogram("lstm_memory_{}".format(layer), lstm_memory)

            lstm_memory = tf.nn.l2_normalize(lstm_memory, 1)
            tf.summary.histogram("lstm_memory_after_l2Norm_{}".format(layer),
                                 lstm_memory)

            lstm_memories.append(lstm_memory)

            max_pooled_cnn_output = tf.layers.max_pooling1d(cnn_output_tanh,
                                                            pool_size=3,
                                                            strides=2,
                                                            padding='same')

            # for the next cnn layer
            cnn_input = max_pooled_cnn_output
            num_frames = tf.maximum(num_frames // pool_size, 1)

        concat_lstm_memory = tf.concat(lstm_memories, 1)
        concat_lstm_memory = tf.nn.l2_normalize(concat_lstm_memory, 1)
        print("\n\n\nconcat_lstm_memory size: {} \n\n\n".format(
            concat_lstm_memory.get_shape()))

        vlad_dim = concat_lstm_memory.get_shape().as_list()[1]

        concat_lstm_memory_weights = tf.get_variable(
            "concat_lstm_memory_weights", [vlad_dim, activation_proj_dim],
            initializer=tf.glorot_uniform_initializer())

        activation = tf.matmul(concat_lstm_memory,
                               concat_lstm_memory_weights)  # None x lstm_size

        concat_lstm_memory_biases = tf.get_variable(
            "concat_lstm_memory_biases", [activation_proj_dim],
            initializer=tf.random_normal_initializer(stddev=0.01))
        activation += concat_lstm_memory_biases

        ## gating
        gating_weights = tf.get_variable(
            "gating_weights_2", [activation_proj_dim, activation_proj_dim],
            initializer=tf.random_normal_initializer(
                stddev=1 / math.sqrt(activation_proj_dim)))

        gates = tf.matmul(activation, gating_weights)

        gates = slim.batch_norm(gates,
                                center=True,
                                scale=True,
                                is_training=self.is_training,
                                scope="activation_gating_bn")

        gates = tf.sigmoid(gates)

        activation = tf.multiply(activation, gates)
        tf.summary.histogram("activation_before_video_model", activation)

        aggregated_model = getattr(video_level_models,
                                   FLAGS.video_level_classifier_model)

        return aggregated_model().create_model(model_input=activation,
                                               vocab_size=vocab_size,
                                               is_training=self.is_training,
                                               **unused_params)
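
For intuition about the three-layer pyramid above: each layer pools time with stride 2, so the per-layer LSTM summaries see progressively shorter sequences. A sketch assuming a 300-frame input:

frames, pool_stride = 300, 2
for layer in range(3):
    print('layer %d sees about %d frames' % (layer, frames))
    frames = max(frames // pool_stride, 1)   # 300 -> 150 -> 75
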
def main(args):
    """Get dataset hyperparameters."""
    assert len(args) == 2 and isinstance(args[1], str)
    dataset_name = args[1]
    logger.info('Using dataset: {}'.format(dataset_name))

    """Set reproduciable random seed"""
    tf.set_random_seed(1234)

    coord_add = get_coord_add(dataset_name)
    dataset_size = get_dataset_size_train(dataset_name)
    num_classes = get_num_classes(dataset_name)
    create_inputs = get_create_inputs(dataset_name, is_train=True, epochs=cfg.epoch)

    with tf.Graph().as_default(), tf.device('/cpu:0'):
        """Get global_step."""
        global_step = tf.get_variable(
            'global_step', [], initializer=tf.constant_initializer(0), trainable=False)

        """Get batches per epoch."""
        num_batches_per_epoch = int(dataset_size / cfg.batch_size)

        """Use exponential decay leanring rate?"""
        lrn_rate = tf.maximum(tf.train.exponential_decay(
            1e-3, global_step, num_batches_per_epoch, 0.8), 1e-5)
        tf.summary.scalar('learning_rate', lrn_rate)
        opt = tf.train.AdamOptimizer()  # lrn_rate

        """Get batch from data queue."""
        batch_x, batch_labels = create_inputs()
        # batch_y = tf.one_hot(batch_labels, depth=10, axis=1, dtype=tf.float32)

        """Define the dataflow graph."""
        with tf.device('/gpu:0'):
            with slim.arg_scope([slim.variable], device='/cpu:0'):
                batch_x_squash = tf.divide(batch_x, 255.)
                batch_x = slim.batch_norm(batch_x, center=False, is_training=True, trainable=True)
                output = net.build_arch_baseline(batch_x, is_train=True,
                                                 num_classes=num_classes)
                loss, recon_loss, _ = net.cross_ent_loss(output, batch_x_squash, batch_labels)
                acc = net.test_accuracy(output, batch_labels)
                tf.summary.scalar('train_acc', acc)
                tf.summary.scalar('recon_loss', recon_loss)
                tf.summary.scalar('all_loss', loss)

            """Compute gradient."""
            grad = opt.compute_gradients(loss)
            # See: https://stackoverflow.com/questions/40701712/how-to-check-nan-in-gradients-in-tensorflow-when-updating
            grad_check = [tf.check_numerics(g, message='Gradient NaN Found!')
                          for g, _ in grad if g is not None] + [tf.check_numerics(loss, message='Loss NaN Found')]

        """Apply graident."""
        with tf.control_dependencies(grad_check):
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            with tf.control_dependencies(update_ops):
                train_op = opt.apply_gradients(grad, global_step=global_step)

        """Set Session settings."""
        sess = tf.Session(config=tf.ConfigProto(
            allow_soft_placement=True, log_device_placement=False))
        sess.run(tf.local_variables_initializer())
        sess.run(tf.global_variables_initializer())

        """Set Saver."""
        var_to_save = [v for v in tf.global_variables(
        ) if 'Adam' not in v.name]  # Don't save redundant Adam beta/gamma
        saver = tf.train.Saver(var_list=var_to_save, max_to_keep=cfg.epoch)

        """Display parameters"""
        total_p = np.sum([np.prod(v.get_shape().as_list()) for v in var_to_save]).astype(np.int32)
        train_p = np.sum([np.prod(v.get_shape().as_list())
                          for v in tf.trainable_variables()]).astype(np.int32)
        logger.info('Total Parameters: {}'.format(total_p))
        logger.info('Trainable Parameters: {}'.format(train_p))

        # read snapshot
        # latest = os.path.join(cfg.logdir, 'model.ckpt-4680')
        # saver.restore(sess, latest)
        """Set summary op."""
        summary_op = tf.summary.merge_all()

        """Start coord & queue."""
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        """Set summary writer"""
        if not os.path.exists(cfg.logdir + '/cnn_baseline/{}/train_log/'.format(dataset_name)):
            os.makedirs(cfg.logdir + '/cnn_baseline/{}/train_log/'.format(dataset_name))
        summary_writer = tf.summary.FileWriter(
            cfg.logdir + '/cnn_baseline/{}/train_log/'.format(dataset_name), graph=sess.graph)

        """Main loop."""
        for step in range(cfg.epoch * num_batches_per_epoch + 1):
            tic = time.time()
            """"TF queue would pop batch until no file"""
            try:
                _, loss_value, summary_str = sess.run(
                    [train_op, loss, summary_op])
                logger.info('%d iteration finished in ' % step + '%f second' %
                            (time.time() - tic) + ' loss=%f' % loss_value)
            except KeyboardInterrupt:
                sess.close()
                sys.exit()
            except tf.errors.InvalidArgumentError:
                logger.warning('%d iteration contains NaN gradients. Discard.' % step)
                continue
            else:
                """Write to summary."""
                if step % 5 == 0:
                    summary_writer.add_summary(summary_str, step)

                """Epoch wise linear annealling."""
                if (step % num_batches_per_epoch) == 0:

                    """Save model periodically"""
                    ckpt_path = os.path.join(
                        cfg.logdir + '/cnn_baseline/{}'.format(dataset_name), 'model-{:.4f}.ckpt'.format(loss_value))
                    saver.save(sess, ckpt_path, global_step=step)

        """Join threads"""
        coord.join(threads)
    def forward(self, reshaped_input):

        feature_size = self.feature_size
        cluster_size = self.cluster_size
        add_batch_norm = self.add_batch_norm
        max_frames = self.max_frames
        is_training = self.is_training
        max_pool = self.max_pool

        cluster_weights = tf.get_variable("cluster_weights",
          [feature_size, cluster_size],
          initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(feature_size)))
        
        tf.summary.histogram("cluster_weights", cluster_weights)
        activation = tf.matmul(reshaped_input, cluster_weights)
        
        if add_batch_norm:
          activation = slim.batch_norm(
              activation,
              center=True,
              scale=True,
              is_training=is_training,
              scope="cluster_bn")
        else:
          cluster_biases = tf.get_variable("cluster_biases",
            [cluster_size],
            initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(feature_size)))
          tf.summary.histogram("cluster_biases", cluster_biases)
          activation += cluster_biases

        activation = tf.nn.softmax(activation)

        activation = tf.reshape(activation, [-1, max_frames, cluster_size])

        activation_sum = tf.reduce_sum(activation,1)
        
        activation_max = tf.reduce_max(activation,1)
        activation_max = tf.nn.l2_normalize(activation_max,1)


        dim_red = tf.get_variable("dim_red",
          [cluster_size, feature_size],
          initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(feature_size)))
 
        cluster_weights_2 = tf.get_variable("cluster_weights_2",
          [feature_size, cluster_size],
          initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(feature_size)))
        
        tf.summary.histogram("cluster_weights_2", cluster_weights_2)
        
        activation = tf.matmul(activation_max, dim_red)
        activation = tf.matmul(activation, cluster_weights_2)
        
        if add_batch_norm:
          activation = slim.batch_norm(
              activation,
              center=True,
              scale=True,
              is_training=is_training,
              scope="cluster_bn_2")
        else:
          cluster_biases = tf.get_variable("cluster_biases_2",
            [cluster_size],
            initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(feature_size)))
          tf.summary.histogram("cluster_biases_2", cluster_biases)
          activation += cluster_biases

        activation = tf.sigmoid(activation)

        activation = tf.multiply(activation,activation_sum)
        activation = tf.nn.l2_normalize(activation,1)

        return activation
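
In this forward pass the cluster assignments are pooled two ways over frames; the max-pooled path goes through the dim_red/cluster_weights_2 bottleneck and a sigmoid, then gates the sum-pooled path. A NumPy sketch (toy sizes, batch norm omitted, names illustrative):

import numpy as np

B, N, K, D = 2, 5, 6, 3
assign = np.random.rand(B, N, K)                      # soft assignments per frame
a_sum = assign.sum(1)                                 # sum-pooled, (B, K)
a_max = assign.max(1)                                 # max-pooled, (B, K)
a_max /= np.linalg.norm(a_max, axis=1, keepdims=True)
dim_red = np.random.randn(K, D)                       # bottleneck down
w2 = np.random.randn(D, K)                            # bottleneck up
gates = 1.0 / (1.0 + np.exp(-(a_max @ dim_red @ w2)))
out = gates * a_sum
out /= np.linalg.norm(out, axis=1, keepdims=True)
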
Exemple #39
0
def create_ds_cnn_model(fingerprint_input, model_settings, model_size_info, 
                          is_training):
  """Builds a model with depthwise separable convolutional neural network
  Model definition is based on https://arxiv.org/abs/1704.04861 and
  Tensorflow implementation: https://github.com/Zehaos/MobileNet

  model_size_info: defines number of layers, followed by the DS-Conv layer
    parameters in the order {number of conv features, conv filter height, 
    width and stride in y,x dir.} for each of the layers. 
  Note that first layer is always regular convolution, but the remaining 
    layers are all depthwise separable convolutions.
  """

  def ds_cnn_arg_scope(weight_decay=0):
    """Defines the default ds_cnn argument scope.
    Args:
      weight_decay: The weight decay to use for regularizing the model.
    Returns:
      An `arg_scope` to use for the DS-CNN model.
    """
    with slim.arg_scope(
        [slim.convolution2d, slim.separable_convolution2d],
        weights_initializer=slim.initializers.xavier_initializer(),
        biases_initializer=slim.init_ops.zeros_initializer(),
        weights_regularizer=slim.l2_regularizer(weight_decay)) as sc:
      return sc

  def _depthwise_separable_conv(inputs,
                                num_pwc_filters,
                                sc,
                                kernel_size,
                                stride):
    """ Helper function to build the depth-wise separable convolution layer.
    """

    # skip pointwise by setting num_outputs=None
    depthwise_conv = slim.separable_convolution2d(inputs,
                                                  num_outputs=None,
                                                  stride=stride,
                                                  depth_multiplier=1,
                                                  kernel_size=kernel_size,
                                                  scope=sc+'/depthwise_conv')

    bn = slim.batch_norm(depthwise_conv, scope=sc+'/dw_batch_norm')
    pointwise_conv = slim.convolution2d(bn,
                                        num_pwc_filters,
                                        kernel_size=[1, 1],
                                        scope=sc+'/pointwise_conv')
    bn = slim.batch_norm(pointwise_conv, scope=sc+'/pw_batch_norm')
    return bn


  if is_training:
    dropout_prob = tf.placeholder(tf.float32, name='dropout_prob')

  label_count = model_settings['label_count']
  input_frequency_size = model_settings['dct_coefficient_count']
  input_time_size = model_settings['spectrogram_length']
  fingerprint_4d = tf.reshape(fingerprint_input,
                              [-1, input_time_size, input_frequency_size, 1])
 
  t_dim = input_time_size
  f_dim = input_frequency_size

  #Extract model dimensions from model_size_info
  num_layers = model_size_info[0]
  conv_feat = [None]*num_layers
  conv_kt = [None]*num_layers
  conv_kf = [None]*num_layers
  conv_st = [None]*num_layers
  conv_sf = [None]*num_layers
  i=1
  for layer_no in range(0,num_layers):
    conv_feat[layer_no] = model_size_info[i]
    i += 1
    conv_kt[layer_no] = model_size_info[i]
    i += 1
    conv_kf[layer_no] = model_size_info[i]
    i += 1
    conv_st[layer_no] = model_size_info[i]
    i += 1
    conv_sf[layer_no] = model_size_info[i]
    i += 1

  scope = 'DS-CNN'
  with tf.variable_scope(scope) as sc:
    end_points_collection = sc.name + '_end_points'
    with slim.arg_scope([slim.convolution2d, slim.separable_convolution2d],
                        activation_fn=None,
                        weights_initializer=slim.initializers.xavier_initializer(),
                        biases_initializer=slim.init_ops.zeros_initializer(),
                        outputs_collections=[end_points_collection]):
      with slim.arg_scope([slim.batch_norm],
                          is_training=is_training,
                          decay=0.96,
                          updates_collections=None,
                          activation_fn=tf.nn.relu):
        for layer_no in range(0, num_layers):
          if layer_no == 0:
            net = slim.convolution2d(fingerprint_4d, conv_feat[layer_no],
                                     [conv_kt[layer_no], conv_kf[layer_no]],
                                     stride=[conv_st[layer_no], conv_sf[layer_no]],
                                     padding='SAME', scope='conv_1')
            net = slim.batch_norm(net, scope='conv_1/batch_norm')
          else:
            net = _depthwise_separable_conv(net, conv_feat[layer_no],
                                            kernel_size=[conv_kt[layer_no], conv_kf[layer_no]],
                                            stride=[conv_st[layer_no], conv_sf[layer_no]],
                                            sc='conv_ds_' + str(layer_no))
          t_dim = math.ceil(t_dim/float(conv_st[layer_no]))
          f_dim = math.ceil(f_dim/float(conv_sf[layer_no]))

        net = slim.avg_pool2d(net, [t_dim, f_dim], scope='avg_pool')

    net = tf.squeeze(net, [1, 2], name='SpatialSqueeze')
    logits = slim.fully_connected(net, label_count, activation_fn=None, scope='fc1')

  if is_training:
    return logits, dropout_prob
  else:
    return logits
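
A concrete (hypothetical) model_size_info for a five-layer DS-CNN, laid out as the docstring describes: the layer count, then {features, kernel height, kernel width, stride y, stride x} per layer. The call sketch assumes fingerprint_input and model_settings are defined as in the surrounding code:

model_size_info = [5,                 # num_layers
                   64, 10, 4, 2, 2,   # layer 0: regular conv
                   64, 3, 3, 1, 1,    # layer 1: ds-conv
                   64, 3, 3, 1, 1,    # layer 2: ds-conv
                   64, 3, 3, 1, 1,    # layer 3: ds-conv
                   64, 3, 3, 1, 1]    # layer 4: ds-conv
logits, dropout_prob = create_ds_cnn_model(fingerprint_input, model_settings,
                                           model_size_info, is_training=True)
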
    def forward(self,reshaped_input):
        cluster_weights = tf.get_variable("cluster_weights",
          [self.feature_size, self.cluster_size],
          initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(self.feature_size)))
     
        covar_weights = tf.get_variable("covar_weights",
          [self.feature_size, self.cluster_size],
          initializer = tf.random_normal_initializer(mean=1.0, stddev=1 /math.sqrt(self.feature_size)))
      
        covar_weights = tf.square(covar_weights)
        eps = tf.constant([1e-6])
        covar_weights = tf.add(covar_weights,eps)

        tf.summary.histogram("cluster_weights", cluster_weights)
        activation = tf.matmul(reshaped_input, cluster_weights)
        if self.add_batch_norm:
          activation = slim.batch_norm(
              activation,
              center=True,
              scale=True,
              is_training=self.is_training,
              scope="cluster_bn")
        else:
          cluster_biases = tf.get_variable("cluster_biases",
            [self.cluster_size],
            initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(self.feature_size)))
          tf.summary.histogram("cluster_biases", cluster_biases)
          activation += cluster_biases
        
        activation = tf.nn.softmax(activation)
        tf.summary.histogram("cluster_output", activation)

        activation = tf.reshape(activation, [-1, self.max_frames, self.cluster_size])

        a_sum = tf.reduce_sum(activation,-2,keep_dims=True)

        if not FLAGS.fv_couple_weights:
            cluster_weights2 = tf.get_variable("cluster_weights2",
              [1,self.feature_size, self.cluster_size],
              initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(self.feature_size)))
        else:
            cluster_weights2 = tf.scalar_mul(FLAGS.fv_coupling_factor,cluster_weights)

        a = tf.multiply(a_sum,cluster_weights2)
        
        activation = tf.transpose(activation,perm=[0,2,1])
        
        reshaped_input = tf.reshape(reshaped_input,[-1,self.max_frames,self.feature_size])
        fv1 = tf.matmul(activation,reshaped_input)
        
        fv1 = tf.transpose(fv1,perm=[0,2,1])

        # computing second order FV
        a2 = tf.multiply(a_sum,tf.square(cluster_weights2)) 

        b2 = tf.multiply(fv1,cluster_weights2) 
        fv2 = tf.matmul(activation,tf.square(reshaped_input)) 
     
        fv2 = tf.transpose(fv2,perm=[0,2,1])
        fv2 = tf.add_n([a2,fv2,tf.scalar_mul(-2,b2)])

        fv2 = tf.divide(fv2,tf.square(covar_weights))
        fv2 = tf.subtract(fv2,a_sum)

        fv2 = tf.reshape(fv2,[-1,self.cluster_size*self.feature_size])
      
        fv2 = tf.nn.l2_normalize(fv2,1)
        fv2 = tf.reshape(fv2,[-1,self.cluster_size*self.feature_size])
        fv2 = tf.nn.l2_normalize(fv2,1)

        fv1 = tf.subtract(fv1,a)
        fv1 = tf.divide(fv1,covar_weights) 

        fv1 = tf.nn.l2_normalize(fv1,1)
        fv1 = tf.reshape(fv1,[-1,self.cluster_size*self.feature_size])
        fv1 = tf.nn.l2_normalize(fv1,1)

        return tf.concat([fv1,fv2],1)
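
The fv2 lines above rely on expanding the soft-assignment-weighted second moment around each cluster center: sum_i a_i (x_i - mu)^2 = sum_i a_i x_i^2 - 2 mu sum_i a_i x_i + mu^2 sum_i a_i, which is exactly what tf.add_n([a2, fv2, tf.scalar_mul(-2, b2)]) assembles before the covariance normalization. A minimal NumPy check of that identity (illustrative only, not part of the snippet):

import numpy as np

rng = np.random.RandomState(0)
a = rng.rand(10)      # soft assignments of 10 frames to one cluster
x = rng.randn(10)     # one feature dimension across those frames
mu = 0.3              # the cluster center (cluster_weights2 above)

lhs = np.sum(a * (x - mu) ** 2)
rhs = np.sum(a * x ** 2) - 2 * mu * np.sum(a * x) + mu ** 2 * np.sum(a)
assert np.allclose(lhs, rhs)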
Exemple #41
    def _build_planner(self, scaled_beliefs, m={}):
        debug = self._debug
        is_training = self._is_training
        batch_size = tf.shape(scaled_beliefs[0])[0]
        image_scaler = self._upscale_image
        estimate_size = self._estimate_size
        value_map_size = (estimate_size, estimate_size, 1)
        num_actions = self._num_actions
        num_iterations = self._num_iterations

        def _fuse_belief(belief):
            with slim.arg_scope(
                [slim.conv2d],
                    activation_fn=tf.nn.elu,
                    weights_initializer=tf.truncated_normal_initializer(
                        stddev=1),
                    biases_initializer=tf.constant_initializer(0),
                    stride=1,
                    padding='SAME',
                    reuse=tf.AUTO_REUSE):
                net = slim.conv2d(belief, 1, [1, 1], scope='fuser_combine')
                return net

        class HierarchicalVINCell(tf.nn.rnn_cell.RNNCell):
            @property
            def state_size(self):
                return tf.TensorShape(value_map_size)

            @property
            def output_size(self):
                return self.state_size

            def __call__(self, inputs, state, scope=None):
                # Upscale previous value map
                state = image_scaler(state)

                estimate, _, values = [
                    tf.expand_dims(layer, axis=3)
                    for layer in tf.unstack(inputs, axis=3)
                ]
                with slim.arg_scope([slim.conv2d], reuse=tf.AUTO_REUSE):
                    rewards_map = _fuse_belief(
                        tf.concat([estimate, values, state], axis=3))
                    actions_map = slim.conv2d(
                        rewards_map,
                        num_actions, [3, 3],
                        weights_initializer=tf.truncated_normal_initializer(
                            stddev=0.42),
                        biases_initializer=tf.constant_initializer(0),
                        scope='VIN_actions_initial')
                    values_map = tf.reduce_max(actions_map,
                                               axis=3,
                                               keep_dims=True)

                with slim.arg_scope([slim.conv2d], reuse=tf.AUTO_REUSE):
                    for i in xrange(num_iterations - 1):
                        rv = tf.concat([rewards_map, values_map], axis=3)
                        actions_map = slim.conv2d(
                            rv,
                            num_actions, [3, 3],
                            weights_initializer=tf.
                            truncated_normal_initializer(stddev=0.42),
                            biases_initializer=tf.constant_initializer(0),
                            scope='VIN_actions')
                        values_map = tf.reduce_max(actions_map,
                                                   axis=3,
                                                   keep_dims=True)

                return values_map, values_map

        beliefs = tf.stack([
            slim.batch_norm(belief, is_training=is_training)
            for belief in scaled_beliefs
        ],
                           axis=1)
        vin_cell = HierarchicalVINCell()
        interm_values_map, final_values_map = tf.nn.dynamic_rnn(
            vin_cell,
            beliefs,
            initial_state=vin_cell.zero_state(batch_size, tf.float32),
            swap_memory=True)
        m['value_map'] = interm_values_map

        values_features = slim.flatten(final_values_map)
        actions_logit = slim.fully_connected(
            values_features,
            num_actions**2,
            weights_initializer=tf.truncated_normal_initializer(stddev=0.03),
            biases_initializer=tf.constant_initializer(0),
            activation_fn=tf.nn.elu,
            scope='logit_output_1')
        actions_logit = slim.fully_connected(
            actions_logit,
            num_actions,
            weights_initializer=tf.truncated_normal_initializer(stddev=0.5),
            biases_initializer=tf.constant_initializer(1.0 / num_actions),
            scope='logit_output_2')

        return actions_logit
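
The HierarchicalVINCell above runs a value-iteration network: each step stacks the rewards map with the current values map, convolves the pair into per-action Q maps, and takes a max over the action channel to get the next values map. A minimal sketch of that recurrence in isolation (a hedged TF1-style illustration; the function and scope names are assumptions):

import tensorflow as tf
slim = tf.contrib.slim

def value_iteration(rewards_map, num_actions=8, num_iterations=10):
    # rewards_map: [batch, h, w, 1]
    values_map = tf.zeros_like(rewards_map)
    for _ in range(num_iterations):
        q = slim.conv2d(tf.concat([rewards_map, values_map], axis=3),
                        num_actions, [3, 3], activation_fn=None,
                        reuse=tf.AUTO_REUSE, scope='vi_q')   # per-action Q maps
        values_map = tf.reduce_max(q, axis=3, keep_dims=True)
    return values_map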
  def create_model(self,
                   model_input,
                   vocab_size,
                   num_frames,
                   iterations=None,
                   add_batch_norm=None,
                   sample_random_frames=None,
                   cluster_size=None,
                   hidden_size=None,
                   is_training=True,
                   **unused_params):
    iterations = iterations or FLAGS.iterations
    add_batch_norm = add_batch_norm or FLAGS.netvlad_add_batch_norm
    random_frames = sample_random_frames or FLAGS.sample_random_frames
    cluster_size = cluster_size or FLAGS.netvlad_cluster_size
    hidden1_size = hidden_size or FLAGS.netvlad_hidden_size
    relu = FLAGS.netvlad_relu
    dimred = FLAGS.netvlad_dimred
    gating = FLAGS.gating
    remove_diag = FLAGS.gating_remove_diag
    lightvlad = FLAGS.lightvlad
    vlagd = FLAGS.vlagd

    num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
    if random_frames:
      model_input = utils.SampleRandomFrames(model_input, num_frames,
                                             iterations)
    else:
      model_input = utils.SampleRandomSequence(model_input, num_frames,
                                               iterations)
    

    max_frames = model_input.get_shape().as_list()[1]
    feature_size = model_input.get_shape().as_list()[2]
    reshaped_input = tf.reshape(model_input, [-1, feature_size])

    if lightvlad:
      video_NetVLAD = LightVLAD(1024,max_frames,cluster_size, add_batch_norm, is_training)
      audio_NetVLAD = LightVLAD(128,max_frames,cluster_size/2, add_batch_norm, is_training)
    elif vlagd:
      video_NetVLAD = NetVLAGD(1024,max_frames,cluster_size, add_batch_norm, is_training)
      audio_NetVLAD = NetVLAGD(128,max_frames,cluster_size/2, add_batch_norm, is_training)
    else:
      video_NetVLAD = NetVLAD(1024,max_frames,cluster_size, add_batch_norm, is_training)
      audio_NetVLAD = NetVLAD(128,max_frames,cluster_size/2, add_batch_norm, is_training)

  
    if add_batch_norm:  # and not lightvlad:
      reshaped_input = slim.batch_norm(
          reshaped_input,
          center=True,
          scale=True,
          is_training=is_training,
          scope="input_bn")

    with tf.variable_scope("video_VLAD"):
        vlad_video = video_NetVLAD.forward(reshaped_input[:,0:1024]) 

    with tf.variable_scope("audio_VLAD"):
        vlad_audio = audio_NetVLAD.forward(reshaped_input[:,1024:])

    vlad = tf.concat([vlad_video, vlad_audio],1)

    vlad_dim = vlad.get_shape().as_list()[1] 
    hidden1_weights = tf.get_variable("hidden1_weights",
      [vlad_dim, hidden1_size],
      initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(cluster_size)))
       
    activation = tf.matmul(vlad, hidden1_weights)

    if add_batch_norm and relu:
      activation = slim.batch_norm(
          activation,
          center=True,
          scale=True,
          is_training=is_training,
          scope="hidden1_bn")

    else:
      hidden1_biases = tf.get_variable("hidden1_biases",
        [hidden1_size],
        initializer = tf.random_normal_initializer(stddev=0.01))
      tf.summary.histogram("hidden1_biases", hidden1_biases)
      activation += hidden1_biases
   
    if relu:
      activation = tf.nn.relu6(activation)
   

    if gating:
        gating_weights = tf.get_variable("gating_weights_2",
          [hidden1_size, hidden1_size],
          initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(hidden1_size)))
        
        gates = tf.matmul(activation, gating_weights)
 
        if remove_diag:
            #removes diagonals coefficients
            diagonals = tf.matrix_diag_part(gating_weights)
            gates = gates - tf.multiply(diagonals,activation)

       
        if add_batch_norm:
          gates = slim.batch_norm(
              gates,
              center=True,
              scale=True,
              is_training=is_training,
              scope="gating_bn")
        else:
          gating_biases = tf.get_variable("gating_biases",
            [hidden1_size],
            initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(feature_size)))
          gates += gating_biases

        gates = tf.sigmoid(gates)

        activation = tf.multiply(activation,gates)

    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_classifier_model)


    return aggregated_model().create_model(
        model_input=activation,
        vocab_size=vocab_size,
        is_training=is_training,
        **unused_params)
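
The gating branch above is a context-gating unit: every dimension of the activation is rescaled by an input-dependent gate in (0, 1) computed from the activation itself. A minimal NumPy sketch of the mechanism (illustrative only):

import numpy as np

rng = np.random.RandomState(0)
activation = rng.randn(2, 16)
gating_weights = rng.randn(16, 16) / np.sqrt(16)

gates = 1.0 / (1.0 + np.exp(-activation.dot(gating_weights)))  # sigmoid
gated = activation * gates
assert gated.shape == activation.shape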
Exemple #43
    def _build_mapper(self, m={}, estimator=None):
        debug = self._debug
        is_training = self._is_training
        sequence_length = self._sequence_length
        visual_input = self._visual_input
        egomotion = self._egomotion
        reward = self._reward
        estimate_map = self._estimate_map_list
        estimate_scale = self._estimate_scale
        estimate_shape = self._estimate_shape

        def _estimate(image):
            def _xavier_init(num_in, num_out):
                stddev = np.sqrt(4. / (num_in + num_out))
                return tf.truncated_normal_initializer(stddev=stddev)

            def _constrain_confidence(belief):
                estimate, confidence = tf.unstack(belief, axis=3)
                return tf.stack([estimate, tf.nn.sigmoid(confidence)], axis=3)

            beliefs = []
            net = image

            with slim.arg_scope(
                [slim.conv2d, slim.fully_connected, slim.conv2d_transpose],
                    activation_fn=tf.nn.elu,
                    biases_initializer=tf.constant_initializer(0),
                    reuse=tf.AUTO_REUSE):
                last_output_channels = 3

                with slim.arg_scope([slim.conv2d], stride=1, padding='VALID'):
                    for index, output in enumerate([(32, [7, 7]), (48, [7, 7]),
                                                    (64, [5, 5]), (64, [5,
                                                                        5])]):
                        channels, filter_size = output
                        net = slim.conv2d(net,
                                          channels,
                                          filter_size,
                                          scope='mapper_conv_{}'.format(index),
                                          weights_initializer=_xavier_init(
                                              np.prod(filter_size) *
                                              last_output_channels, channels))
                        last_output_channels = channels

                    net = slim.fully_connected(
                        net,
                        200,
                        scope='mapper_fc',
                        weights_initializer=_xavier_init(
                            last_output_channels, 200))
                    last_output_channels = 200

                with slim.arg_scope([slim.conv2d_transpose],
                                    stride=1,
                                    padding='SAME'):
                    for index, output in enumerate((64, 32, 2)):
                        net = slim.conv2d_transpose(
                            net,
                            output, [7, 7],
                            scope='mapper_deconv_{}'.format(index),
                            weights_initializer=_xavier_init(
                                7 * 7 * last_output_channels, output))
                        last_output_channels = output

                    beliefs.append(net)
                    for i in xrange(estimate_scale - 1):
                        net = slim.conv2d_transpose(
                            net,
                            2, [6, 6],
                            weights_initializer=_xavier_init(
                                6 * 6 * last_output_channels, 2),
                            scope='mapper_upscale_{}'.format(i))
                        last_output_channels = 2
                        beliefs.append(self._upscale_image(net))

            return [_constrain_confidence(belief) for belief in beliefs]

        def _apply_egomotion(tensor, scale_index, ego):
            translation, rotation = tf.unstack(ego, axis=1)

            cos_rot = tf.cos(rotation)
            sin_rot = tf.sin(rotation)
            zero = tf.zeros_like(rotation)
            scale = tf.constant(
                (2**scale_index) / (300. / self._estimate_size),
                dtype=tf.float32)

            transform = tf.stack([
                cos_rot, sin_rot,
                tf.multiply(tf.negative(translation), scale),
                tf.negative(sin_rot), cos_rot, zero, zero, zero
            ],
                                 axis=1)
            return tf.contrib.image.transform(tensor,
                                              transform,
                                              interpolation='BILINEAR')

        def _delta_reward_map(reward):
            # Embeds the step reward near the center of an estimate-sized map
            # (assumes a square map, i.e. h == w).
            h, w, c = estimate_shape
            m_h, m_w = int((h - 1) / 2), int((w - 1) / 2)

            return tf.pad(
                tf.expand_dims(reward, axis=2),
                tf.constant([[0, 0], [m_h - 1, w - m_h], [m_w - 1, w - m_w]]))

        def _warp(temp_belief, prev_belief):
            temp_estimate, temp_confidence, temp_rewards = tf.unstack(
                temp_belief, axis=3)
            prev_estimate, prev_confidence, prev_rewards = tf.unstack(
                prev_belief, axis=3)

            current_confidence = temp_confidence + prev_confidence
            current_estimate = tf.divide(
                tf.multiply(temp_estimate, temp_confidence) +
                tf.multiply(prev_estimate, prev_confidence),
                current_confidence)
            current_rewards = temp_rewards + prev_rewards
            current_belief = tf.stack(
                [current_estimate, current_confidence, current_rewards],
                axis=3)
            return current_belief

        class BiLinearSamplingCell(tf.nn.rnn_cell.RNNCell):
            @property
            def state_size(self):
                return [tf.TensorShape(estimate_shape)] * estimate_scale

            @property
            def output_size(self):
                return self.state_size

            def __call__(self, inputs, state, scope=None):
                image, ego, re = inputs

                delta_reward_map = tf.expand_dims(_delta_reward_map(re),
                                                  axis=3)

                current_scaled_estimates = _estimate(
                    image) if estimator is None else estimator(image)
                current_scaled_estimates = [
                    tf.concat([estimate, delta_reward_map], axis=3)
                    for estimate in current_scaled_estimates
                ]
                previous_scaled_estimates = [
                    _apply_egomotion(belief, scale_index, ego)
                    for scale_index, belief in enumerate(state)
                ]
                outputs = [
                    _warp(c, p) for c, p in zip(current_scaled_estimates,
                                                previous_scaled_estimates)
                ]

                return outputs, outputs

        normalized_input = slim.batch_norm(visual_input,
                                           is_training=is_training)
        bilinear_cell = BiLinearSamplingCell()
        interm_beliefs, final_belief = tf.nn.dynamic_rnn(
            bilinear_cell,
            (normalized_input, egomotion, tf.expand_dims(reward, axis=2)),
            sequence_length=sequence_length,
            initial_state=estimate_map,
            swap_memory=True)
        m['estimate_map_list'] = interm_beliefs
        return final_belief
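
The _warp rule above fuses the freshly estimated belief with the egomotion-warped previous belief as a confidence-weighted average, while confidences and rewards simply accumulate. A minimal NumPy sketch of the fusion (illustrative only):

import numpy as np

def fuse(est_new, conf_new, est_old, conf_old):
    conf = conf_new + conf_old
    est = (est_new * conf_new + est_old * conf_old) / conf
    return est, conf

est, conf = fuse(est_new=1.0, conf_new=3.0, est_old=0.0, conf_old=1.0)
assert np.isclose(est, 0.75) and conf == 4.0   # pulled toward the more confident input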
Exemple #44
def _batch_norm_fn(x, scope=None):
    if scope is None:
        scope = tf.get_variable_scope().name + "/bn"
    return slim.batch_norm(x, scope=scope)
Exemple #45
def build_frrn(inputs, num_classes, preset_model='FRRN-A'):
    """
    Builds the Full Resolution Residual Network model.

    Arguments:
      inputs: The input tensor
      num_classes: Number of classes
      preset_model: Which model to use. Select FRRN-A or FRRN-B

    Returns:
      FRRN model
    """

    if preset_model == 'FRRN-A':

        #####################
        # Initial Stage   
        #####################
        net = slim.conv2d(inputs, 48, kernel_size=5, activation_fn=None)
        net = slim.batch_norm(net)
        net = tf.nn.relu(net)

        net = ResidualUnit(net, n_filters=48, filter_size=3)
        net = ResidualUnit(net, n_filters=48, filter_size=3)
        net = ResidualUnit(net, n_filters=48, filter_size=3)


        #####################
        # Downsampling Path 
        #####################
        pool_stream = slim.pool(net, [2, 2], stride=[2, 2], pooling_type='MAX')
        res_stream = slim.conv2d(net, 32, kernel_size=1, activation_fn=None)

        pool_stream, res_stream = FullResolutionResidualUnit(pool_stream=pool_stream, res_stream=res_stream, n_filters_3=96, n_filters_1=32, pool_scale=2)
        pool_stream, res_stream = FullResolutionResidualUnit(pool_stream=pool_stream, res_stream=res_stream, n_filters_3=96, n_filters_1=32, pool_scale=2)
        pool_stream, res_stream = FullResolutionResidualUnit(pool_stream=pool_stream, res_stream=res_stream, n_filters_3=96, n_filters_1=32, pool_scale=2)

        pool_stream = slim.pool(pool_stream, [2, 2], stride=[2, 2], pooling_type='MAX') 
        pool_stream, res_stream = FullResolutionResidualUnit(pool_stream=pool_stream, res_stream=res_stream, n_filters_3=192, n_filters_1=32, pool_scale=4)
        pool_stream, res_stream = FullResolutionResidualUnit(pool_stream=pool_stream, res_stream=res_stream, n_filters_3=192, n_filters_1=32, pool_scale=4)
        pool_stream, res_stream = FullResolutionResidualUnit(pool_stream=pool_stream, res_stream=res_stream, n_filters_3=192, n_filters_1=32, pool_scale=4)
        pool_stream, res_stream = FullResolutionResidualUnit(pool_stream=pool_stream, res_stream=res_stream, n_filters_3=192, n_filters_1=32, pool_scale=4)

        pool_stream = slim.pool(pool_stream, [2, 2], stride=[2, 2], pooling_type='MAX')
        pool_stream, res_stream = FullResolutionResidualUnit(pool_stream=pool_stream, res_stream=res_stream, n_filters_3=384, n_filters_1=32, pool_scale=8)
        pool_stream, res_stream = FullResolutionResidualUnit(pool_stream=pool_stream, res_stream=res_stream, n_filters_3=384, n_filters_1=32, pool_scale=8)

        pool_stream = slim.pool(pool_stream, [2, 2], stride=[2, 2], pooling_type='MAX')
        pool_stream, res_stream = FullResolutionResidualUnit(pool_stream=pool_stream, res_stream=res_stream, n_filters_3=384, n_filters_1=32, pool_scale=16)
        pool_stream, res_stream = FullResolutionResidualUnit(pool_stream=pool_stream, res_stream=res_stream, n_filters_3=384, n_filters_1=32, pool_scale=16)

        #####################
        # Upsampling Path 
        #####################
        pool_stream = Unpooling(pool_stream, 2)
        pool_stream, res_stream = FullResolutionResidualUnit(pool_stream=pool_stream, res_stream=res_stream, n_filters_3=192, n_filters_1=32, pool_scale=8)
        pool_stream, res_stream = FullResolutionResidualUnit(pool_stream=pool_stream, res_stream=res_stream, n_filters_3=192, n_filters_1=32, pool_scale=8)

        pool_stream = Unpooling(pool_stream, 2)
        pool_stream, res_stream = FullResolutionResidualUnit(pool_stream=pool_stream, res_stream=res_stream, n_filters_3=192, n_filters_1=32, pool_scale=4)
        pool_stream, res_stream = FullResolutionResidualUnit(pool_stream=pool_stream, res_stream=res_stream, n_filters_3=192, n_filters_1=32, pool_scale=4)

        pool_stream = Unpooling(pool_stream, 2)
        pool_stream, res_stream = FullResolutionResidualUnit(pool_stream=pool_stream, res_stream=res_stream, n_filters_3=96, n_filters_1=32, pool_scale=2)
        pool_stream, res_stream = FullResolutionResidualUnit(pool_stream=pool_stream, res_stream=res_stream, n_filters_3=96, n_filters_1=32, pool_scale=2)

        pool_stream = Unpooling(pool_stream, 2)

        #####################
        # Final Stage 
        #####################
        net = tf.concat([pool_stream, res_stream], axis=-1)
        net = ResidualUnit(net, n_filters=48, filter_size=3)
        net = ResidualUnit(net, n_filters=48, filter_size=3)
        net = ResidualUnit(net, n_filters=48, filter_size=3)

        net = slim.conv2d(net, num_classes, [1, 1], activation_fn=None, scope='logits')
        return net

        
    elif preset_model == 'FRRN-B':
        #####################
        # Initial Stage   
        #####################
        net = slim.conv2d(inputs, 48, kernel_size=5, activation_fn=None)
        net = slim.batch_norm(net)
        net = tf.nn.relu(net)

        net = ResidualUnit(net, n_filters=48, filter_size=3)
        net = ResidualUnit(net, n_filters=48, filter_size=3)
        net = ResidualUnit(net, n_filters=48, filter_size=3)


        #####################
        # Downsampling Path 
        #####################
        pool_stream = slim.pool(net, [2, 2], stride=[2, 2], pooling_type='MAX')
        res_stream = slim.conv2d(net, 32, kernel_size=1, activation_fn=None)

        pool_stream, res_stream = FullResolutionResidualUnit(pool_stream=pool_stream, res_stream=res_stream, n_filters_3=96, n_filters_1=32, pool_scale=2)
        pool_stream, res_stream = FullResolutionResidualUnit(pool_stream=pool_stream, res_stream=res_stream, n_filters_3=96, n_filters_1=32, pool_scale=2)
        pool_stream, res_stream = FullResolutionResidualUnit(pool_stream=pool_stream, res_stream=res_stream, n_filters_3=96, n_filters_1=32, pool_scale=2)

        pool_stream = slim.pool(pool_stream, [2, 2], stride=[2, 2], pooling_type='MAX') 
        pool_stream, res_stream = FullResolutionResidualUnit(pool_stream=pool_stream, res_stream=res_stream, n_filters_3=192, n_filters_1=32, pool_scale=4)
        pool_stream, res_stream = FullResolutionResidualUnit(pool_stream=pool_stream, res_stream=res_stream, n_filters_3=192, n_filters_1=32, pool_scale=4)
        pool_stream, res_stream = FullResolutionResidualUnit(pool_stream=pool_stream, res_stream=res_stream, n_filters_3=192, n_filters_1=32, pool_scale=4)
        pool_stream, res_stream = FullResolutionResidualUnit(pool_stream=pool_stream, res_stream=res_stream, n_filters_3=192, n_filters_1=32, pool_scale=4)

        pool_stream = slim.pool(pool_stream, [2, 2], stride=[2, 2], pooling_type='MAX')
        pool_stream, res_stream = FullResolutionResidualUnit(pool_stream=pool_stream, res_stream=res_stream, n_filters_3=384, n_filters_1=32, pool_scale=8)
        pool_stream, res_stream = FullResolutionResidualUnit(pool_stream=pool_stream, res_stream=res_stream, n_filters_3=384, n_filters_1=32, pool_scale=8)

        pool_stream = slim.pool(pool_stream, [2, 2], stride=[2, 2], pooling_type='MAX')
        pool_stream, res_stream = FullResolutionResidualUnit(pool_stream=pool_stream, res_stream=res_stream, n_filters_3=384, n_filters_1=32, pool_scale=16)
        pool_stream, res_stream = FullResolutionResidualUnit(pool_stream=pool_stream, res_stream=res_stream, n_filters_3=384, n_filters_1=32, pool_scale=16)

        pool_stream = slim.pool(pool_stream, [2, 2], stride=[2, 2], pooling_type='MAX')
        pool_stream, res_stream = FullResolutionResidualUnit(pool_stream=pool_stream, res_stream=res_stream, n_filters_3=384, n_filters_1=32, pool_scale=32)
        pool_stream, res_stream = FullResolutionResidualUnit(pool_stream=pool_stream, res_stream=res_stream, n_filters_3=384, n_filters_1=32, pool_scale=32)

        #####################
        # Upsampling Path 
        #####################
        pool_stream = Unpooling(pool_stream, 2)
        pool_stream, res_stream = FullResolutionResidualUnit(pool_stream=pool_stream, res_stream=res_stream, n_filters_3=192, n_filters_1=32, pool_scale=16)
        pool_stream, res_stream = FullResolutionResidualUnit(pool_stream=pool_stream, res_stream=res_stream, n_filters_3=192, n_filters_1=32, pool_scale=16)

        pool_stream = Unpooling(pool_stream, 2)
        pool_stream, res_stream = FullResolutionResidualUnit(pool_stream=pool_stream, res_stream=res_stream, n_filters_3=192, n_filters_1=32, pool_scale=8)
        pool_stream, res_stream = FullResolutionResidualUnit(pool_stream=pool_stream, res_stream=res_stream, n_filters_3=192, n_filters_1=32, pool_scale=8)

        pool_stream = Unpooling(pool_stream, 2)
        pool_stream, res_stream = FullResolutionResidualUnit(pool_stream=pool_stream, res_stream=res_stream, n_filters_3=192, n_filters_1=32, pool_scale=4)
        pool_stream, res_stream = FullResolutionResidualUnit(pool_stream=pool_stream, res_stream=res_stream, n_filters_3=192, n_filters_1=32, pool_scale=4)

        pool_stream = Unpooling(pool_stream, 2)
        pool_stream, res_stream = FullResolutionResidualUnit(pool_stream=pool_stream, res_stream=res_stream, n_filters_3=96, n_filters_1=32, pool_scale=2)
        pool_stream, res_stream = FullResolutionResidualUnit(pool_stream=pool_stream, res_stream=res_stream, n_filters_3=96, n_filters_1=32, pool_scale=2)

        pool_stream = Unpooling(pool_stream, 2)

        #####################
        # Final Stage 
        #####################
        net = tf.concat([pool_stream, res_stream], axis=-1)
        net = ResidualUnit(net, n_filters=48, filter_size=3)
        net = ResidualUnit(net, n_filters=48, filter_size=3)
        net = ResidualUnit(net, n_filters=48, filter_size=3)

        net = slim.conv2d(net, num_classes, [1, 1], activation_fn=None, scope='logits')
        return net

    else:
        raise ValueError("Unsupported FRRN model '%s'. This function only supports FRRN-A and FRRN-B" % (preset_model)) 
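
A hedged usage sketch for build_frrn, assuming the helper units it calls (ResidualUnit, FullResolutionResidualUnit, Unpooling) are in scope. The input resolution here is an assumption; it just needs to be divisible by the network's total pooling factor (16 for FRRN-A, 32 for FRRN-B):

import tensorflow as tf

inputs = tf.placeholder(tf.float32, [None, 256, 256, 3])
logits = build_frrn(inputs, num_classes=12, preset_model='FRRN-A')  # [None, 256, 256, 12]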
Exemple #46
 def batch_norm(net):
     net = slim.batch_norm(net, center=center, scale=True, epsilon=1e-5, is_training=training)
     if not center:
         net = tf.nn.bias_add(net, slim.variable('biases', shape=[net.get_shape().as_list()[-1]], initializer=tf.zeros_initializer()))
     return net
def main(args):
    """Get dataset hyperparameters."""
    assert len(args) == 3 and isinstance(args[1], str) and isinstance(args[2], str)
    dataset_name = args[1]
    model_name = args[2]

    """Set reproduciable random seed"""
    tf.set_random_seed(1234)

    coord_add = get_coord_add(dataset_name)
    dataset_size_train = get_dataset_size_train(dataset_name)
    dataset_size_test = get_dataset_size_test(dataset_name)
    num_classes = get_num_classes(dataset_name)
    create_inputs = get_create_inputs(
        dataset_name, is_train=False, epochs=cfg.epoch)

    with tf.Graph().as_default():
        num_batches_per_epoch_train = int(dataset_size_train / cfg.batch_size)
        num_batches_test = 2  # int(dataset_size_test / cfg.batch_size * 0.1)

        batch_x, batch_labels = create_inputs()
        batch_squash = tf.divide(batch_x, 255.)
        batch_x_norm = slim.batch_norm(batch_x, center=False, is_training=False, trainable=False)
        output, pose_out = net.build_arch(batch_x_norm, coord_add,
                                          is_train=False, num_classes=num_classes)
        tf.logging.debug(pose_out.get_shape())

        batch_acc = net.test_accuracy(output, batch_labels)
        m_op = tf.constant(0.9)
        loss, spread_loss, mse, recon_img_squash = net.spread_loss(
            output, pose_out, batch_squash, batch_labels, m_op)
        tf.summary.scalar('spread_loss', spread_loss)
        tf.summary.scalar('reconstruction_loss', mse)
        tf.summary.scalar('all_loss', loss)
        data_size = int(batch_x.get_shape()[1])
        recon_img = tf.multiply(tf.reshape(recon_img_squash, shape=[
                                cfg.batch_size, data_size, data_size, 1]), 255.)
        orig_img = tf.reshape(batch_x, shape=[
            cfg.batch_size, data_size, data_size, 1])
        tf.summary.image('orig_image', orig_img)
        tf.summary.image('recon_image', recon_img)
        saver = tf.train.Saver()

        step = 0

        tf.summary.scalar('accuracy', batch_acc)
        summary_op = tf.summary.merge_all()

        with tf.Session(config=tf.ConfigProto(
                allow_soft_placement=True, log_device_placement=False)) as sess:
            sess.run(tf.local_variables_initializer())
            sess.run(tf.global_variables_initializer())

            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)
            if not os.path.exists(cfg.test_logdir + '/{}/{}/'.format(model_name, dataset_name)):
                os.makedirs(cfg.test_logdir + '/{}/{}/'.format(model_name, dataset_name))
            summary_writer = tf.summary.FileWriter(
                cfg.test_logdir + '/{}/{}/'.format(model_name, dataset_name), graph=sess.graph)  # graph=sess.graph, huge!

            files = os.listdir(cfg.logdir + '/{}/{}/'.format(model_name, dataset_name))
            for epoch in range(14, 15):
                # the checkpoint filename embeds the loss value, so locate the file by its global-step suffix instead of building the exact name
                ckpt_re = ".ckpt-%d" % (num_batches_per_epoch_train * epoch)
                for __file in files:
                    if __file.endswith(ckpt_re + ".index"):
                        ckpt = os.path.join(
                            cfg.logdir + '/{}/{}/'.format(model_name, dataset_name), __file[:-6])
                # ckpt = os.path.join(cfg.logdir, "model.ckpt-%d" % (num_batches_per_epoch_train * epoch))
                saver.restore(sess, ckpt)

                accuracy_sum = 0
                for i in range(num_batches_test):
                    batch_acc_v, summary_str, orig_image, recon_image = sess.run(
                        [batch_acc, summary_op, orig_img, recon_img])
                    print('%d batches are tested.' % step)
                    summary_writer.add_summary(summary_str, step)

                    accuracy_sum += batch_acc_v

                    step += 1
                    # display original/reconstructed images in matplotlib
                    plot_imgs(orig_image, i, 'ori')
                    plot_imgs(recon_image, i, 'rec')

                ave_acc = accuracy_sum / num_batches_test
                print('the average accuracy is %f' % ave_acc)
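
main() expects argv-style arguments [script, dataset_name, model_name]. A hedged launch sketch (the dataset and model names are hypothetical):

import sys
import tensorflow as tf

if __name__ == "__main__":
    tf.app.run(main=main, argv=sys.argv)   # e.g. python test.py smallNORB caps_net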
  def create_model(self,
                   model_input,
                   vocab_size,
                   num_frames,
                   iterations=None,
                   add_batch_norm=None,
                   sample_random_frames=None,
                   cluster_size=None,
                   hidden_size=None,
                   is_training=True,
                   **unused_params):
    iterations = iterations or FLAGS.iterations
    add_batch_norm = add_batch_norm or FLAGS.dbof_add_batch_norm
    random_frames = sample_random_frames or FLAGS.sample_random_frames
    cluster_size = cluster_size or FLAGS.dbof_cluster_size
    hidden1_size = hidden_size or FLAGS.dbof_hidden_size
    fc_dimred = FLAGS.fc_dimred
    relu = FLAGS.dbof_relu
    max_pool = FLAGS.softdbof_maxpool

    num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
    if random_frames:
      model_input = utils.SampleRandomFrames(model_input, num_frames,
                                             iterations)
    else:
      model_input = utils.SampleRandomSequence(model_input, num_frames,
                                               iterations)
    max_frames = model_input.get_shape().as_list()[1]
    feature_size = model_input.get_shape().as_list()[2]
    reshaped_input = tf.reshape(model_input, [-1, feature_size])
    tf.summary.histogram("input_hist", reshaped_input)

    video_Dbof = SoftDBoF(1024,max_frames,cluster_size, max_pool, add_batch_norm, is_training)
    audio_Dbof = SoftDBoF(128,max_frames,cluster_size/8, max_pool, add_batch_norm, is_training)


    if add_batch_norm:
      reshaped_input = slim.batch_norm(
          reshaped_input,
          center=True,
          scale=True,
          is_training=is_training,
          scope="input_bn")

    with tf.variable_scope("video_DBOF"):
        dbof_video = video_Dbof.forward(reshaped_input[:,0:1024]) 

    with tf.variable_scope("audio_DBOF"):
        dbof_audio = audio_Dbof.forward(reshaped_input[:,1024:])

    dbof = tf.concat([dbof_video, dbof_audio],1)

    dbof_dim = dbof.get_shape().as_list()[1] 

    if fc_dimred:
        hidden1_weights = tf.get_variable("hidden1_weights",
          [dbof_dim, hidden1_size],
          initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(cluster_size)))
        tf.summary.histogram("hidden1_weights", hidden1_weights)
        activation = tf.matmul(dbof, hidden1_weights)

        if add_batch_norm and relu:
          activation = slim.batch_norm(
              activation,
              center=True,
              scale=True,
              is_training=is_training,
              scope="hidden1_bn")
        else:
          hidden1_biases = tf.get_variable("hidden1_biases",
            [hidden1_size],
            initializer = tf.random_normal_initializer(stddev=0.01))
          tf.summary.histogram("hidden1_biases", hidden1_biases)
          activation += hidden1_biases

        if relu:
          activation = tf.nn.relu6(activation)
        tf.summary.histogram("hidden1_output", activation)
    else:
        activation = dbof

    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_classifier_model)

    
    return aggregated_model().create_model(
        model_input=activation,
        vocab_size=vocab_size,
        is_training=is_training,
        **unused_params)
  def create_model(self,
                   model_input,
                   vocab_size,
                   num_frames,
                   iterations=None,
                   add_batch_norm=None,
                   sample_random_frames=None,
                   cluster_size=None,
                   hidden_size=None,
                   is_training=True,
                   **unused_params):
    iterations = iterations or FLAGS.iterations
    add_batch_norm = add_batch_norm or FLAGS.dbof_add_batch_norm
    random_frames = sample_random_frames or FLAGS.sample_random_frames
    cluster_size = cluster_size or FLAGS.dbof_cluster_size
    hidden1_size = hidden_size or FLAGS.dbof_hidden_size

    num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
    if random_frames:
      model_input = utils.SampleRandomFrames(model_input, num_frames,
                                             iterations)
    else:
      model_input = utils.SampleRandomSequence(model_input, num_frames,
                                               iterations)
    max_frames = model_input.get_shape().as_list()[1]
    feature_size = model_input.get_shape().as_list()[2]
    reshaped_input = tf.reshape(model_input, [-1, feature_size])
    tf.summary.histogram("input_hist", reshaped_input)

    if add_batch_norm:
      reshaped_input = slim.batch_norm(
          reshaped_input,
          center=True,
          scale=True,
          is_training=is_training,
          scope="input_bn")

    cluster_weights = tf.get_variable("cluster_weights",
      [feature_size, cluster_size],
      initializer = tf.random_normal_initializer(stddev=1 / math.sqrt(feature_size)))
    tf.summary.histogram("cluster_weights", cluster_weights)
    activation = tf.matmul(reshaped_input, cluster_weights)
    if add_batch_norm:
      activation = slim.batch_norm(
          activation,
          center=True,
          scale=True,
          is_training=is_training,
          scope="cluster_bn")
    else:
      cluster_biases = tf.get_variable("cluster_biases",
        [cluster_size],
        initializer = tf.random_normal(stddev=1 / math.sqrt(feature_size)))
      tf.summary.histogram("cluster_biases", cluster_biases)
      activation += cluster_biases
    activation = tf.nn.relu6(activation)
    tf.summary.histogram("cluster_output", activation)

    activation = tf.reshape(activation, [-1, max_frames, cluster_size])
    activation = utils.FramePooling(activation, FLAGS.dbof_pooling_method)

    hidden1_weights = tf.get_variable("hidden1_weights",
      [cluster_size, hidden1_size],
      initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(cluster_size)))
    tf.summary.histogram("hidden1_weights", hidden1_weights)
    activation = tf.matmul(activation, hidden1_weights)
    if add_batch_norm:
      activation = slim.batch_norm(
          activation,
          center=True,
          scale=True,
          is_training=is_training,
          scope="hidden1_bn")
    else:
      hidden1_biases = tf.get_variable("hidden1_biases",
        [hidden1_size],
        initializer = tf.random_normal_initializer(stddev=0.01))
      tf.summary.histogram("hidden1_biases", hidden1_biases)
      activation += hidden1_biases
    activation = tf.nn.relu6(activation)
    tf.summary.histogram("hidden1_output", activation)

    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_classifier_model)
    return aggregated_model().create_model(
        model_input=activation,
        vocab_size=vocab_size,
        **unused_params)
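
utils.FramePooling above collapses the frame axis of the [batch, max_frames, cluster_size] activations into a single clip-level vector. A minimal NumPy sketch of the two common pooling methods (illustrative shapes only):

import numpy as np

activation = np.random.randn(4, 300, 64).astype(np.float32)  # [batch, max_frames, cluster_size]
max_pooled = activation.max(axis=1)    # 'max' pooling
avg_pooled = activation.mean(axis=1)   # 'average' pooling
assert max_pooled.shape == (4, 64) and avg_pooled.shape == (4, 64)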
Exemple #51
    def create_model(self,
                     model_input,
                     vocab_size,
                     num_frames,
                     iterations=None,
                     add_batch_norm=None,
                     sample_random_frames=None,
                     cluster_size=None,
                     hidden_size=None,
                     is_training=True,
                     **unused_params):
        iterations = iterations or FLAGS.iterations
        add_batch_norm = add_batch_norm or FLAGS.netvlad_add_batch_norm
        random_frames = sample_random_frames or FLAGS.sample_random_frames
        cluster_size = cluster_size or FLAGS.netvlad_cluster_size
        hidden1_size = hidden_size or FLAGS.netvlad_hidden_size
        relu = FLAGS.netvlad_relu
        dimred = FLAGS.netvlad_dimred
        gating = FLAGS.gating
        remove_diag = FLAGS.gating_remove_diag
        lightvlad = FLAGS.lightvlad
        vlagd = FLAGS.vlagd
        SVD_dim = FLAGS.SVD_dim

        num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
        if random_frames:
            model_input = utils.SampleRandomFrames(model_input, num_frames,
                                                   iterations)
        else:
            model_input = utils.SampleRandomSequence(model_input, num_frames,
                                                     iterations)

        max_frames = model_input.get_shape().as_list()[1]
        feature_size = model_input.get_shape().as_list()[2]
        reshaped_input = tf.reshape(model_input, [-1, feature_size])

        video_NetVLAD = LightVLAD(1024, max_frames, int(cluster_size),
                                  add_batch_norm, is_training)
        audio_NetVLAD = LightVLAD(128, max_frames, int(cluster_size / 2),
                                  add_batch_norm, is_training)

        if add_batch_norm:  # and not lightvlad:
            reshaped_input = slim.batch_norm(reshaped_input,
                                             center=True,
                                             scale=True,
                                             is_training=is_training,
                                             scope="input_bn")

        with tf.variable_scope("video_VLAD"):
            vlad_video = video_NetVLAD.forward(reshaped_input[:, 0:1024])

        with tf.variable_scope("audio_VLAD"):
            vlad_audio = audio_NetVLAD.forward(reshaped_input[:, 1024:])

        vlad = tf.concat([vlad_video, vlad_audio], 1)  # None x vlad_dim

        vlad_dim = vlad.get_shape().as_list()[1]

        ##### simpler SVD #####
        SVD_mat1 = tf.get_variable("hidden1_weights", [vlad_dim, SVD_dim],
                                   initializer=tf.glorot_uniform_initializer())

        SVD_mat2 = tf.get_variable("hidden2_weights",
                                   [SVD_dim, int(hidden1_size * 2)],
                                   initializer=tf.glorot_uniform_initializer())

        SVD_mat1_biases = tf.get_variable(
            "SVD_mat1_biases", [SVD_dim],
            initializer=tf.random_normal_initializer(stddev=0.01))

        SVD_mat2_biases = tf.get_variable(
            "SVD_mat2_biases", [int(hidden1_size * 2)],
            initializer=tf.random_normal_initializer(stddev=0.01))
        ##### simpler SVD #####

        activation = tf.matmul(vlad, SVD_mat1)  # None x 256
        activation += SVD_mat1_biases
        activation = tf.matmul(activation, SVD_mat2)  # None x 2*hidden1_size
        activation += SVD_mat2_biases
        tf.summary.histogram("activation_before_bn", activation)

        activation = slim.batch_norm(activation,
                                     center=True,
                                     scale=True,
                                     is_training=is_training,
                                     scope="hidden1_bn")
        tf.summary.histogram("activation_after_bn", activation)

        ## gating part
        gating_weights = tf.get_variable(
            "gating_weights_2", [int(2 * hidden1_size), hidden1_size],
            initializer=tf.random_normal_initializer(stddev=1 /
                                                     math.sqrt(hidden1_size)))

        gates = tf.matmul(activation, gating_weights)

        gates = slim.batch_norm(gates,
                                center=True,
                                scale=True,
                                is_training=is_training,
                                scope="gating_bn")

        gates = tf.sigmoid(gates)
        tf.summary.histogram("gates_layer", gates)
        ## gating part

        ## hidden layer
        activation = tf.nn.tanh(activation)
        tf.summary.histogram("activation_after_bn_after_1_tanh", activation)

        activation_hidden_weights = tf.get_variable(
            "activation_hidden_weights", [int(hidden1_size * 2), hidden1_size],
            initializer=tf.glorot_uniform_initializer())

        activation = tf.matmul(activation, activation_hidden_weights)
        activation = slim.batch_norm(activation,
                                     center=True,
                                     scale=True,
                                     is_training=is_training,
                                     scope="hidden_layer_bn")
        tf.summary.histogram("activation_after_bn_after_1_tanh_after_bn",
                             activation)

        activation = tf.nn.tanh(activation)
        tf.summary.histogram(
            "activation_after_bn_after_1_tanh_after_bn_after_2_tanh",
            activation)

        activation = tf.multiply(activation, gates)
        tf.summary.histogram("activation_right_before_video", activation)

        aggregated_model = getattr(video_level_models,
                                   FLAGS.video_level_classifier_model)

        return aggregated_model().create_model(model_input=activation,
                                               vocab_size=vocab_size,
                                               is_training=is_training,
                                               **unused_params)
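
The "simpler SVD" block above replaces one vlad_dim x (2*hidden1_size) dense layer with two low-rank factors of inner dimension SVD_dim, which is where the parameter savings come from. A back-of-the-envelope sketch (the sizes are assumptions, not the model's actual flags):

vlad_dim, hidden1_size, SVD_dim = 65536, 1024, 256

full_rank = vlad_dim * (2 * hidden1_size)                      # ~134M weights
low_rank = vlad_dim * SVD_dim + SVD_dim * (2 * hidden1_size)   # ~17M weights
print(full_rank, low_rank, full_rank / float(low_rank))        # roughly 7.8x fewer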
Exemple #52
    def build_CVAE_v2(self):
        with tf.variable_scope('Encoder', reuse=self.reuse_variables):
            """
            Q(Z|X)(approximate posterior distribution) encoder : Multivariate Gaussian
            """
            "Initial conv block, H/2"
            net = slim.conv2d(inputs=self.model_input,
                              num_outputs=32,
                              kernel_size=7,
                              stride=2,
                              scope='conv1')
            net = slim.batch_norm(net, is_training=False)
            net = tf.nn.relu(net)
            net = slim.max_pool2d(net, 3, stride=2, padding='SAME')

            # "H/4"
            # net = slim.conv2d(inputs=net, num_outputs=64, kernel_size=3, stride=2)
            # net = slim.conv2d(inputs=net, num_outputs=64, kernel_size=3, stride=1)
            #
            # "H/8"
            # net = slim.conv2d(inputs=net, num_outputs=128, kernel_size=3, stride=2)
            # net = slim.conv2d(inputs=net, num_outputs=128, kernel_size=3, stride=1)
            #
            # "H/16"
            # net = slim.conv2d(inputs=net, num_outputs=256, kernel_size=3, stride=2)
            # net = slim.conv2d(inputs=net, num_outputs=256, kernel_size=3, stride=1)
            #
            # "H/32"
            # net = slim.conv2d(inputs=net, num_outputs=512, kernel_size=3, stride=2)
            # net = slim.conv2d(inputs=net, num_outputs=512, kernel_size=3, stride=1)

            # "H/4"
            # net = slim.max_pool2d(net, kernel_size=3, stride=2, padding='SAME')
            #
            # "H/8"
            # net = slim.conv2d(inputs=net, num_outputs=64, kernel_size=3, stride=2)
            #
            # "dilated dense convolution"
            # net = self.denseconv(net, num_outputs=64, kernel_size=3, dilation_rate=3)
            # net = self.denseconv(net, num_outputs=64, kernel_size=3, dilation_rate=6)
            #
            # "denseblock series, H/8"
            # conv_dense = self.conv_denseblock(net, num_outputs=128, kernel_size=3)
            # max_dense = self.max_denseblock(net, num_outputs=128, kernel_size=3)
            # dense = tf.concat([conv_dense, max_dense], axis=3)
            #
            # "H/32"
            # net = self.conv_denseblock(dense, num_outputs=512, kernel_size=3)

            "mean / variation"
            self.z_mu = slim.avg_pool2d(net,
                                        kernel_size=3,
                                        stride=1,
                                        padding='SAME')
            self.z_log_var = slim.avg_pool2d(net**2, 3, 1,
                                             'SAME') - self.z_mu**2

            with tf.variable_scope('Sampling_z'):
                """
                sampling z using reparameterization trick
                """
                epsilon = tf.random_normal(shape=tf.shape(self.z_mu),
                                           dtype=tf.float32)
                self.z_sample = self.z_mu + tf.exp(
                    self.z_log_var / 2.) * epsilon

        with tf.variable_scope('Decoder', reuse=self.reuse_variables):
            """
            P(X|Z) (likelihood)
            """
            self.de_net = slim.conv2d_transpose(inputs=self.z_sample,
                                                num_outputs=256,
                                                kernel_size=3,
                                                stride=2,
                                                scope='deconv1')
            self.de_net = slim.conv2d_transpose(inputs=self.de_net,
                                                num_outputs=128,
                                                kernel_size=3,
                                                stride=2,
                                                scope='deconv2')
            self.de_net = slim.conv2d_transpose(inputs=self.de_net,
                                                num_outputs=32,
                                                kernel_size=3,
                                                stride=2,
                                                scope='deconv3')
            self.de_net = slim.conv2d_transpose(inputs=self.de_net,
                                                num_outputs=16,
                                                kernel_size=3,
                                                stride=2,
                                                scope='deconv4')
            # conv = slim.conv2d(self.de_net, num_outputs=8, kernel_size=3, stride=1)
            self.logits = slim.conv2d_transpose(inputs=self.de_net,
                                                num_outputs=3,
                                                kernel_size=3,
                                                stride=2,
                                                activation_fn=tf.nn.elu)
Exemple #53
    def build_AE(self):
        with tf.variable_scope('Encoder', reuse=self.reuse_variables):
            "Initial conv block, H/2"
            net = slim.conv2d(inputs=self.model_input,
                              num_outputs=32,
                              kernel_size=7,
                              stride=2,
                              scope='conv1')
            net = slim.batch_norm(net)
            net = tf.nn.relu(net)

            "H/4"
            net = slim.conv2d(inputs=net,
                              num_outputs=32,
                              kernel_size=3,
                              stride=2)

            "H/8"
            net = slim.max_pool2d(net, kernel_size=3, stride=2, padding='SAME')
            conv2_net = slim.conv2d(inputs=net,
                                    num_outputs=64,
                                    kernel_size=3,
                                    stride=1,
                                    scope='conv2')

            "denseblock series, H/8"
            conv_dense = self.conv_denseblock(conv2_net,
                                              num_outputs=128,
                                              kernel_size=3)
            max_dense = self.max_denseblock(conv2_net,
                                            num_outputs=128,
                                            kernel_size=3)
            dense = tf.concat([conv_dense, max_dense], axis=3)

            "dilated dense convolution"
            net = self.denseconv(dense,
                                 num_outputs=512,
                                 kernel_size=3,
                                 dilation_rate=3)
            net = self.denseconv(net,
                                 num_outputs=512,
                                 kernel_size=3,
                                 dilation_rate=6)

            # "H/16"
            # net = slim.avg_pool2d(net, kernel_size=3, padding='SAME')
        with tf.variable_scope('Decoder', reuse=self.reuse_variables):
            "H/4"
            self.de_net = slim.conv2d_transpose(
                inputs=net,
                num_outputs=256,
                kernel_size=3,
                stride=2,
                scope='deconv1')  # (B, 76, 76, 256)
            conv_denet = slim.conv2d(inputs=self.de_net,
                                     num_outputs=128,
                                     kernel_size=3)

            "H/2"
            self.de_net2 = slim.conv2d_transpose(
                inputs=conv_denet,
                num_outputs=64,
                kernel_size=3,
                stride=2,
                scope='deconv2')  # (B, 152, 152, 128)
            conv_denet = slim.conv2d(inputs=self.de_net2,
                                     num_outputs=32,
                                     kernel_size=3)

            "H"
            self.de_net3 = slim.conv2d_transpose(inputs=conv_denet,
                                                 num_outputs=16,
                                                 kernel_size=3,
                                                 stride=2,
                                                 scope='deconv3')
            self.logits = slim.conv2d(inputs=self.de_net3,
                                      num_outputs=1,
                                      kernel_size=3)
def get_embd(inputs,
             is_training_dropout,
             is_training_bn,
             config,
             reuse=False,
             scope='embd_extractor'):
    with tf.variable_scope(scope, reuse=reuse):
        net = inputs
        end_points = {}
        if config['backbone_type'].startswith('resnet_v2_m'):
            arg_sc = modifiedResNet_v2.resnet_arg_scope(
                weight_decay=config['weight_decay'],
                batch_norm_decay=config['bn_decay'])
            with slim.arg_scope(arg_sc):
                if config['backbone_type'] == 'resnet_v2_m_50':
                    net, end_points = modifiedResNet_v2.resnet_v2_m_50(
                        net, is_training=is_training_bn, return_raw=True)
                elif config['backbone_type'] == 'resnet_v2_m_101':
                    net, end_points = modifiedResNet_v2.resnet_v2_m_101(
                        net, is_training=is_training_bn, return_raw=True)
                elif config['backbone_type'] == 'resnet_v2_m_152':
                    net, end_points = modifiedResNet_v2.resnet_v2_m_152(
                        net, is_training=is_training_bn, return_raw=True)
                elif config['backbone_type'] == 'resnet_v2_m_200':
                    net, end_points = modifiedResNet_v2.resnet_v2_m_200(
                        net, is_training=is_training_bn, return_raw=True)
                else:
                    raise ValueError('Invalid backbone type.')
        elif config['backbone_type'] == 'mbv3':
            net = mobilenetv3(
                net,
                mode='face.large',
                is_train=is_training_bn,
            )
        elif config['backbone_type'].startswith('resnet_v2'):
            arg_sc = ResNet_v2.resnet_arg_scope(
                weight_decay=config['weight_decay'],
                batch_norm_decay=config['bn_decay'])
            with slim.arg_scope(arg_sc):
                if config['backbone_type'] == 'resnet_v2_50':
                    net, end_points = ResNet_v2.resnet_v2_50(
                        net, is_training=is_training_bn, return_raw=True)
                elif config['backbone_type'] == 'resnet_v2_101':
                    net, end_points = ResNet_v2.resnet_v2_101(
                        net, is_training=is_training_bn, return_raw=True)
                elif config['backbone_type'] == 'resnet_v2_152':
                    net, end_points = ResNet_v2.resnet_v2_152(
                        net, is_training=is_training_bn, return_raw=True)
                elif config['backbone_type'] == 'resnet_v2_200':
                    net, end_points = ResNet_v2.resnet_v2_200(
                        net, is_training=is_training_bn, return_raw=True)
                else:
                    raise ValueError('Invalid backbone type.')
        else:
            raise ValueError('Invalid backbone type.')

        if config['out_type'] == 'E':
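            # NOTE: arg_sc is only defined on the ResNet branches above; the
            # 'mbv3' backbone combined with out_type 'E' would hit an
            # undefined arg_sc here.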
            with slim.arg_scope(arg_sc):
                net = slim.batch_norm(net,
                                      activation_fn=None,
                                      is_training=is_training_bn)
                net = slim.dropout(net,
                                   keep_prob=config['keep_prob'],
                                   is_training=is_training_dropout)
                net = slim.flatten(net)
                net = slim.fully_connected(net,
                                           config['embd_size'],
                                           normalizer_fn=None,
                                           activation_fn=None)
                net = slim.batch_norm(net,
                                      scale=False,
                                      activation_fn=None,
                                      is_training=is_training_bn)
                end_points['embds'] = net
        elif config['out_type'] == 'N':
            end_points['embds'] = net
        else:
            raise ValueError('Invalid out type.')

        return net, end_points
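
# A hedged usage sketch for get_embd (values illustrative, not from the
# source): the config keys below are exactly the ones the function reads,
# and 112x112 is a typical face-recognition input size.
import tensorflow as tf

config = {
    'backbone_type': 'resnet_v2_50',
    'weight_decay': 5e-4,
    'bn_decay': 0.9,
    'out_type': 'E',
    'keep_prob': 0.4,
    'embd_size': 512,
}
images = tf.placeholder(tf.float32, [None, 112, 112, 3])
embds, end_points = get_embd(images, is_training_dropout=False,
                             is_training_bn=False, config=config)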
def mobilenet(inputs,
              num_classes=1000,
              is_training=True,
              width_multiplier=1,
              scope='MobileNet'):
    """ MobileNet
     More detail, please refer to Google's paper(https://arxiv.org/abs/1704.04861).
     Args:
    inputs: a tensor of size [batch_size, height, width, channels].
    num_classes: number of predicted classes.
    is_training: whether or not the model is being trained.
    scope: Optional scope for the variables.
    Returns:
    logits: the pre-softmax activations, a tensor of size
      [batch_size, `num_classes`]
    end_points: a dictionary from components of the network to the corresponding
       activation.
    """
    def _depthwise_separable_conv(inputs,
                                  num_pwc_filters,
                                  width_multiplier,
                                  sc,
                                  downsample=False):
        """ Helper function to build the depth-wise separable convolution layer.
        """
        num_pwc_filters = round(num_pwc_filters * width_multiplier)
        _stride = 2 if downsample else 1

        # skip pointwise by setting num_outputs=None
        depthwise_conv = slim.separable_convolution2d(inputs,
                                                      num_outputs=None,
                                                      stride=_stride,
                                                      depth_multiplier=1,
                                                      kernel_size=[3, 3],
                                                      scope=sc +
                                                      '/depthwise_conv')

        bn = slim.batch_norm(depthwise_conv, scope=sc + '/dw_batch_norm')
        pointwise_conv = slim.convolution2d(bn,
                                            num_pwc_filters,
                                            kernel_size=[1, 1],
                                            scope=sc + '/pointwise_conv')
        bn = slim.batch_norm(pointwise_conv, scope=sc + '/pw_batch_norm')
        return bn
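
    # Cost intuition for the helper above (a worked example, not from the
    # source): a standard 3x3 conv with C_in = C_out = 256 needs
    # 3*3*256*256 = 589,824 weights, while the depthwise-separable
    # factorization needs 3*3*256 + 1*1*256*256 = 67,840, roughly 8.7x fewer.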

    with tf.variable_scope(scope) as sc:
        end_points_collection = sc.name + '_end_points'
        with slim.arg_scope([slim.convolution2d, slim.separable_convolution2d],
                            activation_fn=None,
                            outputs_collections=[end_points_collection]):
            with slim.arg_scope([slim.batch_norm],
                                is_training=is_training,
                                activation_fn=tf.nn.relu,
                                fused=True):
                net = slim.convolution2d(inputs,
                                         round(32 * width_multiplier), [3, 3],
                                         stride=2,
                                         padding='SAME',
                                         scope='conv_1')
                net = slim.batch_norm(net, scope='conv_1/batch_norm')

                net = _depthwise_separable_conv(net,
                                                64,
                                                width_multiplier,
                                                sc='conv_ds_2')
                net = _depthwise_separable_conv(net,
                                                128,
                                                width_multiplier,
                                                downsample=True,
                                                sc='conv_ds_3')
                net = _depthwise_separable_conv(net,
                                                128,
                                                width_multiplier,
                                                sc='conv_ds_4')
                net = _depthwise_separable_conv(net,
                                                256,
                                                width_multiplier,
                                                downsample=True,
                                                sc='conv_ds_5')
                net = _depthwise_separable_conv(net,
                                                256,
                                                width_multiplier,
                                                sc='conv_ds_6')
                net = _depthwise_separable_conv(net,
                                                512,
                                                width_multiplier,
                                                downsample=True,
                                                sc='conv_ds_7')

                net = _depthwise_separable_conv(net,
                                                512,
                                                width_multiplier,
                                                sc='conv_ds_8')
                net = _depthwise_separable_conv(net,
                                                512,
                                                width_multiplier,
                                                sc='conv_ds_9')
                net = _depthwise_separable_conv(net,
                                                512,
                                                width_multiplier,
                                                sc='conv_ds_10')
                net = _depthwise_separable_conv(net,
                                                512,
                                                width_multiplier,
                                                sc='conv_ds_11')
                net = _depthwise_separable_conv(net,
                                                512,
                                                width_multiplier,
                                                sc='conv_ds_12')

                net = _depthwise_separable_conv(net,
                                                1024,
                                                width_multiplier,
                                                downsample=True,
                                                sc='conv_ds_13')
                net = _depthwise_separable_conv(net,
                                                1024,
                                                width_multiplier,
                                                sc='conv_ds_14')
                net = slim.avg_pool2d(net, [7, 7], scope='avg_pool_15')

        def get_tensor_aliases(tensor):
            """Get a list with the aliases of the input tensor.
            If the tensor does not have any alias, it defaults to its op.name
            or its name.
            Args:
              tensor: A `Tensor`.
            Returns:
              A list of strings with the aliases of the tensor.
            """
            if hasattr(tensor, 'aliases'):
                aliases = tensor.aliases
            else:
                if tensor.name[-2:] == ':0':
                    # Use op.name for tensor ending in :0
                    aliases = [tensor.op.name]
                else:
                    aliases = [tensor.name]
            return aliases

        end_points = slim.utils.convert_collection_to_dict(
            end_points_collection)
        net = tf.squeeze(net, [1, 2], name='SpatialSqueeze')
        end_points['squeeze'] = net
        logits = slim.fully_connected(net,
                                      num_classes,
                                      activation_fn=None,
                                      scope='fc_16')
        predictions = slim.softmax(logits, scope='Predictions')

        end_points['Logits'] = logits
        end_points['Predictions'] = predictions

    return logits, end_points
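
# A minimal usage sketch (assumptions, not from the source): 224x224 inputs
# keep the final 7x7 average pool aligned with the last feature map, and
# width_multiplier=1 gives the full-width model.
import tensorflow as tf

images = tf.placeholder(tf.float32, [None, 224, 224, 3])
logits, end_points = mobilenet(images, num_classes=1000,
                               is_training=False, width_multiplier=1)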
Example #56
    def D(self, inputs, reuse=False, training=True):
        """
        feed forward procedure
        :param inputs:  shape [batch_size, time_step, channel]
        :return:
        """
        keep_prob = 1.0
        if training:
            keep_prob = 0.5
        norm_scale = False
        with tf.variable_scope("D", reuse=reuse):
            with tf.name_scope("Reshaping_data") as scope:
                x_image = tf.reshape(inputs, [-1, self.config.FEATURE_LEN, 1, 1])
            with tf.name_scope("Conv1") as scope:
                a_conv1 = slim.conv2d(x_image, num_outputs=self.config.num_filt_1, kernel_size=[5, 1], scope='conv1')

            with tf.name_scope('Batch_norm_conv1') as scope:
                a_conv1 = slim.batch_norm(a_conv1, is_training=self.bn_train, scale=norm_scale,
                                          updates_collections=None)
                h_conv1 = tf.nn.relu(a_conv1)

                # h_conv1 = slim.avg_pool2d(h_conv1, kernel_size=2, stride=2, padding='SAME')

            with tf.variable_scope('classification_branch'):
                with tf.name_scope("Conv2") as scope:
                    W_conv2 = tf.get_variable("Conv_Layer_2",
                                              shape=[4, 1, self.config.num_filt_1, self.config.num_filt_2],
                                              initializer=initializer)
                    b_conv2 = bias_variable([self.config.num_filt_2], 'bias_for_Conv_Layer_2')
                    a_conv2 = conv2d(h_conv1, W_conv2) + b_conv2

                with tf.name_scope('Batch_norm_conv2') as scope:
                    a_conv2 = slim.batch_norm(a_conv2, is_training=self.bn_train, scale=norm_scale,
                                              updates_collections=None)
                    h_conv2 = tf.nn.relu(a_conv2)
                    # h_conv2 = slim.max_pool2d(h_conv2, kernel_size=2, stride=2, padding="SAME")

                if self.config.DATASET_NAME != 'Adiac':
                    with tf.name_scope("Conv3") as scope:
                        W_conv2 = tf.get_variable("Conv_Layer_3",
                                                  shape=[4, 1, self.config.num_filt_2, self.config.num_filt_3],
                                                  initializer=initializer)
                        b_conv2 = bias_variable([self.config.num_filt_3], 'bias_for_Conv_Layer_3')
                        a_conv2 = conv2d(h_conv2, W_conv2, kernel=2) + b_conv2

                    with tf.name_scope('Batch_norm_conv3') as scope:
                        a_conv2 = slim.batch_norm(a_conv2, is_training=self.bn_train,
                                                  scale=True, updates_collections=None)
                        h_conv2 = tf.nn.relu(a_conv2)

                with tf.name_scope("Fully_Connected1") as scope:
                    W_fc1 = tf.get_variable("Fully_Connected_layer_1",
                                            shape=[np.prod(h_conv2.get_shape().as_list()[1:]), self.config.num_fc_1],
                                            initializer=initializer)
                    b_fc1 = bias_variable([self.config.num_fc_1], 'bias_for_Fully_Connected_Layer_1')
                    h_conv3_flat = tf.reshape(h_conv2, [-1, np.prod(h_conv2.get_shape().as_list()[1:])])

                    h_fc1 = tf.nn.relu(tf.matmul(h_conv3_flat, W_fc1) + b_fc1)

                with tf.name_scope("Fully_Connected2") as scope:
                    h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)
                    W_fc2 = tf.get_variable("W_fc2", shape=[self.config.num_fc_1, self.config.NUM_CLASSES],
                                            initializer=initializer)
                    b_fc2 = tf.Variable(tf.constant(0.1, shape=[self.config.NUM_CLASSES]), name='b_fc2')
                    logits = tf.matmul(h_fc1_drop, W_fc2) + b_fc2

            with tf.variable_scope('real_fake_branch'):
                with tf.name_scope("Conv2") as scope:
                    W_conv2 = tf.get_variable("Conv_Layer_2",
                                              shape=[4, 1, self.config.num_filt_1, self.config.num_filt_2],
                                              initializer=initializer)
                    b_conv2 = bias_variable([self.config.num_filt_2], 'bias_for_Conv_Layer_2')
                    a_conv2 = conv2d(h_conv1, W_conv2) + b_conv2

                with tf.name_scope('Batch_norm_conv2') as scope:
                    a_conv2 = slim.batch_norm(a_conv2, is_training=self.bn_train, updates_collections=None)
                    h_conv2 = tf.nn.relu(a_conv2)

                with tf.name_scope("Fully_Connected1") as scope:
                    W_fc1 = tf.get_variable("Fully_Connected_layer_1",
                                            shape=[self.config.FEATURE_LEN * self.config.num_filt_2,
                                                   self.config.num_fc_1],
                                            initializer=initializer)
                    b_fc1 = bias_variable([self.config.num_fc_1], 'bias_for_Fully_Connected_Layer_1')
                    h_conv3_flat = tf.reshape(h_conv2, [-1, self.config.FEATURE_LEN * self.config.num_filt_2])
                    h_fc1 = tf.nn.relu(tf.matmul(h_conv3_flat, W_fc1) + b_fc1)

                with tf.name_scope("Fully_Connected2") as scope:
                    h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)
                    W_fc2 = tf.get_variable("W_fc2", shape=[self.config.num_fc_1, self.config.NUM_CLASSES],
                                            initializer=initializer)
                    b_fc2 = tf.Variable(tf.constant(0.1, shape=[1]), name='b_fc2')
                    real_fake_logits = tf.matmul(h_fc1_drop, W_fc2) + b_fc2
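                    # NOTE: tf.sigmoid below turns these scores into
                    # probabilities, so the returned "real_fake_logits" are
                    # no longer raw logits; downstream losses should not
                    # apply a sigmoid again.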
                    real_fake_logits = tf.sigmoid(real_fake_logits)

            return logits, real_fake_logits
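
# Reading of the code above (not an authorial statement): D shares the Conv1
# trunk between a classification branch (class logits) and a real/fake branch,
# the standard layout for a semi-supervised GAN discriminator.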
Example #57
    def forward(self, reshaped_input):

        cluster_weights = tf.get_variable(
            "cluster_weights", [self.feature_size, self.cluster_size],
            initializer=tf.random_normal_initializer(
                stddev=1 / math.sqrt(self.feature_size)))

        tf.summary.histogram("cluster_weights", cluster_weights)
        activation = tf.matmul(reshaped_input, cluster_weights)

        if self.add_batch_norm:
            activation = slim.batch_norm(activation,
                                         center=True,
                                         scale=True,
                                         is_training=self.is_training,
                                         scope="cluster_bn")
        else:
            cluster_biases = tf.get_variable(
                "cluster_biases", [cluster_size],
                initializer=tf.random_normal_initializer(
                    stddev=1 / math.sqrt(self.feature_size)))
            tf.summary.histogram("cluster_biases", cluster_biases)
            activation += cluster_biases

        activation = tf.nn.softmax(activation)
        tf.summary.histogram("cluster_output", activation)

        activation = tf.reshape(activation,
                                [-1, self.max_frames, self.cluster_size])

        a_sum = tf.reduce_sum(activation, -2, keep_dims=True)

        cluster_weights2 = tf.get_variable(
            "cluster_weights2", [1, self.feature_size, self.cluster_size],
            initializer=tf.random_normal_initializer(
                stddev=1 / math.sqrt(self.feature_size)))

        a = tf.multiply(a_sum, cluster_weights2)

        activation = tf.transpose(activation, perm=[0, 2, 1])

        reshaped_input = tf.reshape(reshaped_input,
                                    [-1, self.max_frames, self.feature_size])
        vlad = tf.matmul(activation, reshaped_input)
        vlad = tf.transpose(vlad, perm=[0, 2, 1])
        vlad = tf.subtract(vlad, a)

        vlad = tf.transpose(vlad, perm=[0, 2, 1])
        vlad = tf.reshape(vlad, [-1, self.feature_size])

        vlad_softmax = self.embedgaussian_relation(vlad, 1 / float(64))

        nonlocal_g = tf.get_variable(
            "nonlocal_g", [self.feature_size, self.cluster_size],
            initializer=tf.random_normal_initializer(
                stddev=1 / math.sqrt(self.feature_size)))
        nonlocal_out = tf.get_variable(
            "nonlocal_out", [self.cluster_size, self.feature_size],
            initializer=tf.random_normal_initializer(
                stddev=1 / math.sqrt(self.cluster_size)))

        vlad_g = tf.matmul(vlad, nonlocal_g)
        vlad_g = tf.reshape(vlad_g, [-1, self.cluster_size, self.cluster_size])
        vlad_g = tf.matmul(vlad_softmax, vlad_g)
        vlad_g = tf.reshape(vlad_g, [-1, self.cluster_size])

        vlad_g = tf.matmul(vlad_g, nonlocal_out)
        vlad_g = tf.reshape(vlad_g, [-1, self.cluster_size, self.feature_size])
        vlad = tf.reshape(vlad, [-1, self.cluster_size, self.feature_size])
        vlad = vlad + vlad_g

        vlad = tf.transpose(vlad, perm=[0, 2, 1])
        vlad = tf.nn.l2_normalize(vlad, 1)  # [b,f,c]

        vlad = tf.reshape(vlad, [-1, self.cluster_size * self.feature_size])
        vlad = tf.nn.l2_normalize(vlad, 1)

        return vlad
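
# A self-contained numpy sketch (an illustration of the aggregation above, not
# the source API): soft-assign each frame descriptor to clusters, then collect
# assignment-weighted residuals against the cluster centers, per sample.
import numpy as np

T, F, K = 5, 8, 4                     # frames, feature size, clusters
x = np.random.randn(T, F)             # frame descriptors
w = np.random.randn(F, K)             # cluster_weights
c = np.random.randn(F, K)             # cluster centers (cluster_weights2)

a = np.exp(x @ w)
a /= a.sum(axis=1, keepdims=True)     # soft assignments, shape (T, K)
vlad = x.T @ a - c * a.sum(axis=0)    # (F, K): sum_t a[t, k] * (x[t] - c[:, k])
print(vlad.shape)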
def main(args):
    """Get dataset hyperparameters."""
    assert len(args) == 3 and isinstance(args[1], str) and isinstance(args[2], str)
    dataset_name = args[1]
    model_name = args[2]
    coord_add = get_coord_add(dataset_name)
    dataset_size_train = get_dataset_size_train(dataset_name)
    dataset_size_test = get_dataset_size_test(dataset_name)
    num_classes = get_num_classes(dataset_name)
    create_inputs = get_create_inputs(
        dataset_name, is_train=False, epochs=cfg.epoch)

    """Set reproduciable random seed"""
    tf.set_random_seed(1234)

    with tf.Graph().as_default():
        num_batches_per_epoch_train = int(dataset_size_train / cfg.batch_size)
        num_batches_test = int(dataset_size_test / cfg.batch_size * 0.1)

        batch_x, batch_labels = create_inputs()
        batch_x = slim.batch_norm(batch_x, center=False, is_training=False, trainable=False)
        if model_name == "caps":
            output, _ = net.build_arch(batch_x, coord_add,
                                       is_train=False, num_classes=num_classes)
        elif model_name == "cnn_baseline":
            output = net.build_arch_baseline(batch_x,
                                             is_train=False, num_classes=num_classes)
        else:
            raise "Please select model from 'caps' or 'cnn_baseline' as the secondary argument of eval.py!"
        batch_acc = net.test_accuracy(output, batch_labels)
        saver = tf.train.Saver()

        step = 0

        summaries = []
        summaries.append(tf.summary.scalar('accuracy', batch_acc))
        summary_op = tf.summary.merge(summaries)

        with tf.Session(config=tf.ConfigProto(
                allow_soft_placement=True, log_device_placement=False)) as sess:
            sess.run(tf.local_variables_initializer())
            sess.run(tf.global_variables_initializer())

            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)
            if not os.path.exists(cfg.test_logdir + '/{}/{}/'.format(model_name, dataset_name)):
                os.makedirs(cfg.test_logdir + '/{}/{}/'.format(model_name, dataset_name))
            summary_writer = tf.summary.FileWriter(
                cfg.test_logdir + '/{}/{}/'.format(model_name, dataset_name), graph=sess.graph)  # graph=sess.graph, huge!

            files = os.listdir(cfg.logdir + '/{}/{}/'.format(model_name, dataset_name))
            for epoch in range(1, cfg.epoch):
                # The checkpoint file name also embeds the loss value, so match
                # this epoch's checkpoint by its ".ckpt-<step>.index" suffix and
                # strip ".index" to recover the prefix for Saver.restore.
                ckpt_re = ".ckpt-%d" % (num_batches_per_epoch_train * epoch)
                for __file in files:
                    if __file.endswith(ckpt_re + ".index"):
                        ckpt = os.path.join(cfg.logdir + '/{}/{}/'.format(model_name, dataset_name), __file[:-6])
                # ckpt = os.path.join(cfg.logdir, "model.ckpt-%d" % (num_batches_per_epoch_train * epoch))
                saver.restore(sess, ckpt)

                accuracy_sum = 0
                for i in range(num_batches_test):
                    batch_acc_v, summary_str = sess.run([batch_acc, summary_op])
                    print('%d batches tested.' % (step + 1))
                    summary_writer.add_summary(summary_str, step)

                    accuracy_sum += batch_acc_v

                    step += 1

                ave_acc = accuracy_sum / num_batches_test
                print('The average accuracy is %f.' % ave_acc)

            coord.join(threads)
  def create_model(self,
                   model_input,
                   vocab_size,
                   num_frames,
                   iterations=None,
                   add_batch_norm=None,
                   sample_random_frames=None,
                   cluster_size=None,
                   hidden_size=None,
                   is_training=True,
                   **unused_params):
    iterations = iterations or FLAGS.iterations
    add_batch_norm = add_batch_norm or FLAGS.dbof_add_batch_norm
    random_frames = sample_random_frames or FLAGS.sample_random_frames
    cluster_size = cluster_size or FLAGS.dbof_cluster_size
    hidden1_size = hidden_size or FLAGS.dbof_hidden_size
    relu = FLAGS.dbof_relu
    cluster_activation = FLAGS.dbof_activation

    num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
    if random_frames:
      model_input = utils.SampleRandomFrames(model_input, num_frames,
                                             iterations)
    else:
      model_input = utils.SampleRandomSequence(model_input, num_frames,
                                               iterations)
    max_frames = model_input.get_shape().as_list()[1]
    feature_size = model_input.get_shape().as_list()[2]
    reshaped_input = tf.reshape(model_input, [-1, feature_size])
    tf.summary.histogram("input_hist", reshaped_input)

    if cluster_activation == 'glu':
        cluster_size = 2*cluster_size

    video_Dbof = DBoF(1024, max_frames, cluster_size, cluster_activation, add_batch_norm, is_training)
    audio_Dbof = DBoF(128, max_frames, cluster_size // 8, cluster_activation, add_batch_norm, is_training)


    if add_batch_norm:
      reshaped_input = slim.batch_norm(
          reshaped_input,
          center=True,
          scale=True,
          is_training=is_training,
          scope="input_bn")

    with tf.variable_scope("video_DBOF"):
        dbof_video = video_Dbof.forward(reshaped_input[:,0:1024]) 

    with tf.variable_scope("audio_DBOF"):
        dbof_audio = audio_Dbof.forward(reshaped_input[:,1024:])

    dbof = tf.concat([dbof_video, dbof_audio],1)

    dbof_dim = dbof.get_shape().as_list()[1] 

    hidden1_weights = tf.get_variable("hidden1_weights",
      [dbof_dim, hidden1_size],
      initializer=tf.random_normal_initializer(stddev=1 / math.sqrt(cluster_size)))
    tf.summary.histogram("hidden1_weights", hidden1_weights)
    activation = tf.matmul(dbof, hidden1_weights)

    if add_batch_norm and relu:
      activation = slim.batch_norm(
          activation,
          center=True,
          scale=True,
          is_training=is_training,
          scope="hidden1_bn")
    else:
      hidden1_biases = tf.get_variable("hidden1_biases",
        [hidden1_size],
        initializer = tf.random_normal_initializer(stddev=0.01))
      tf.summary.histogram("hidden1_biases", hidden1_biases)
      activation += hidden1_biases

    if relu:
      activation = tf.nn.relu6(activation)
    tf.summary.histogram("hidden1_output", activation)

    aggregated_model = getattr(video_level_models,
                               FLAGS.video_level_classifier_model)
    
    return aggregated_model().create_model(
        model_input=activation,
        vocab_size=vocab_size,
        **unused_params)
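
# Why cluster_size is doubled when cluster_activation == 'glu' (a sketch under
# the usual gated-linear-unit definition, not taken from the source): a GLU
# splits its input in half and gates one half with the other, halving the
# width again.
import tensorflow as tf

def glu(x):
    a, b = tf.split(x, num_or_size_splits=2, axis=-1)
    return a * tf.sigmoid(b)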
def network_model__creation(incident, reuse=None, weight_decay=1e-8):
    

    init_fc_weight = tf.truncated_normal_initializer(stddev=1e-3)
    init_fc_bias = tf.zeros_initializer()
    regularizer_fc = slim.l2_regularizer(weight_decay)


    non_Linearity = tf.nn.elu
    conv_weight_init = tf.truncated_normal_initializer(stddev=1e-3)
    conv_bias_init = tf.zeros_initializer()
    conv_regularizer = slim.l2_regularizer(weight_decay)

    def batch_norm_fn(x):
        return slim.batch_norm(x, scope=tf.get_variable_scope().name + "/bn")

    network_model_ = incident
    network_model_ = slim.conv2d(
        network_model_, 32, [3, 3], stride=1, activation_fn=non_Linearity,
        padding="SAME", normalizer_fn=batch_norm_fn, scope="conv1_1",
        weights_initializer=conv_weight_init, biases_initializer=conv_bias_init,
        weights_regularizer=conv_regularizer)


    network_model_ = slim.conv2d(
        network_model_, 32, [3, 3], stride=1, activation_fn=non_Linearity,
        padding="SAME", normalizer_fn=batch_norm_fn, scope="conv1_2",
        weights_initializer=conv_weight_init, biases_initializer=conv_bias_init,
        weights_regularizer=conv_regularizer)

    # NOTE(nwojke): This is missing a padding="SAME" to match the CNN
    # architecture in Table 1 of the paper. Information on how this affects
    # performance on MOT 16 training sequences can be found in
    # issue 10 https://github.com/nwojke/deep_sort/issues/10
    network_model_ = slim.max_pool2d(network_model_, [3, 3], [2, 2], scope="pool1")

    network_model_ = residual_block(
        network_model_, "conv2_1", non_Linearity, conv_weight_init, conv_bias_init,
        conv_regularizer, increase_dim=False, if_first=True)
    network_model_ = residual_block(
        network_model_, "conv2_3", non_Linearity, conv_weight_init, conv_bias_init,
        conv_regularizer, increase_dim=False)

    network_model_ = residual_block(
        network_model_, "conv3_1", non_Linearity, conv_weight_init, conv_bias_init,
        conv_regularizer, increase_dim=True)
    network_model_ = residual_block(
        network_model_, "conv3_3", non_Linearity, conv_weight_init, conv_bias_init,
        conv_regularizer, increase_dim=False)

    network_model_ = residual_block(
        network_model_, "conv4_1", non_Linearity, conv_weight_init, conv_bias_init,
        conv_regularizer, increase_dim=True)
    network_model_ = residual_block(
        network_model_, "conv4_3", non_Linearity, conv_weight_init, conv_bias_init,
        conv_regularizer, increase_dim=False)

    feature_dim = network_model_.get_shape().as_list()[-1]
    network_model_ = slim.flatten(network_model_)

    network_model_ = slim.dropout(network_model_, keep_prob=0.6)
    network_model_ = slim.fully_connected(
        network_model_, feature_dim, activation_fn=non_Linearity,
        normalizer_fn=batch_norm_fn, weights_regularizer=regularizer_fc,
        scope="fc1", weights_initializer=init_fc_weight,
        biases_initializer=init_fc_bias)

    model_features = network_model_

    # Features in rows, normalize axis 1.
    model_features = slim.batch_norm(model_features, scope="ball", reuse=reuse)
    feature_norm = tf.sqrt(
        tf.constant(1e-8, tf.float32) +
        tf.reduce_sum(tf.square(model_features), [1], keepdims=True))
    model_features = model_features / feature_norm
    return model_features, None
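
# The manual normalization above matches tf.nn.l2_normalize up to how the
# epsilon floor enters (added under the square root here, a lower bound on the
# squared norm there); a minimal sketch of the equivalence:
import tensorflow as tf

x = tf.placeholder(tf.float32, [None, 128])
manual = x / tf.sqrt(1e-8 + tf.reduce_sum(tf.square(x), axis=1, keepdims=True))
builtin = tf.nn.l2_normalize(x, axis=1, epsilon=1e-8)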