Example #1
    def apply_gradients(self, grads_tvars, global_step=None, name=None):
        self._grads, self._tvars = zip(*[(g, t) for g, t in grads_tvars
                                         if g is not None])

        # for manual gradient clipping
        if self._clip_thresh_var is not None:
            self._grads, self._grads_norm = tf.clip_by_global_norm(
                self._grads, self._clip_thresh_var)

        # loosely adaptive gradient clipping, in case an exploding gradient ruins the statistics
        if self._use_adapt_grad_clip:
            thresh = tf.cond(
                self._do_tune,
                lambda: tf.sqrt(self._stat_protect_fac *
                                self._adapt_grad_clip_thresh**2),
                lambda: tf.to_float(tf.constant(LARGE_FLOAT_VAL)))
            self._grads, self._grads_norm = tf.clip_by_global_norm(
                self._grads, thresh)

        with tf.variable_scope("before_apply"):
            before_apply_op = self.before_apply()

        with tf.variable_scope("update_hyper"):
            with tf.control_dependencies([before_apply_op]):
                update_hyper_op = self.update_hyper_param()

        with tf.variable_scope("apply_updates"):
            with tf.control_dependencies([update_hyper_op]):

                # clip exploding gradient according to h_max
                if self._use_adapt_grad_clip:
                    thresh = tf.cond(
                        tf.greater(tf.global_norm(self._grads),
                                   self._adapt_grad_clip_thresh),
                        lambda: self._adapt_grad_clip_target_val,
                        lambda: tf.to_float(tf.constant(LARGE_FLOAT_VAL)))
                    self._grads, self._grads_norm = tf.clip_by_global_norm(
                        self._grads, thresh)

                apply_grad_op = self._optimizer.apply_gradients(
                    zip(self._grads, self._tvars), global_step, name)

        with tf.control_dependencies([apply_grad_op]):
            self._increment_global_step_op = tf.assign(self._global_step,
                                                       self._global_step + 1)

            self._adapt_grad_clip_thresh_op = tf.assign(
                self._adapt_grad_clip_thresh, tf.sqrt(self._h_max))
            self._adapt_grad_clip_target_val_op = tf.assign(
                self._adapt_grad_clip_target_val, tf.sqrt(self._h_max))
            # self._adapt_grad_clip_target_val_op = \
            #   tf.assign(self._adapt_grad_clip_target_val, tf.sqrt(tf.sqrt(self._h_max * self._h_min)))

        return tf.group(before_apply_op, update_hyper_op, apply_grad_op,
                        self._adapt_grad_clip_thresh_op,
                        self._adapt_grad_clip_target_val_op,
                        self._increment_global_step_op)
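
The clipping steps above are plain tf.clip_by_global_norm calls with different thresholds. A minimal sketch of that call in isolation, assuming TF 2.x eager execution and hypothetical gradient values:

import tensorflow as tf

grads = [tf.constant([3.0, 4.0]), tf.constant([12.0])]   # global norm = sqrt(9 + 16 + 144) = 13
clipped, norm = tf.clip_by_global_norm(grads, clip_norm=5.0)
print(norm.numpy())                  # 13.0: the global norm before clipping
print([g.numpy() for g in clipped])  # each gradient scaled by 5 / 13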
Example #2
 def get_lr_tensor(self):
     lr = (1.0 - tf.sqrt(self._mu))**2 / (self._h_min + EPS)
     lr = tf.minimum(
         lr,
         lr * (tf.to_float(self._global_step) + 1.0) / 10.0 /
         tf.to_float(tf.constant(self._curv_win_width)))
     return lr
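
The rule above uses the base step size (1 - sqrt(mu))^2 / h_min and linearly ramps it in over the first 10 * curv_win_width steps. A minimal numpy sketch of the same arithmetic, with hypothetical values for mu, h_min and curv_win_width:

import numpy as np

mu, h_min, curv_win_width = 0.9, 2.0, 20  # hypothetical values
base_lr = (1.0 - np.sqrt(mu)) ** 2 / h_min
for step in (0, 99, 500):
    lr = min(base_lr, base_lr * (step + 1.0) / 10.0 / curv_win_width)
    print(step, lr)  # the cap is reached once step + 1 >= 10 * curv_win_width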
Example #3
 def get_cubic_root(self):
     # We have the equation x^2 D^2 + (1-x)^4 * C / h_min^2
     # where x = sqrt(mu).
     # We substitute x, which is sqrt(mu), with x = y + 1.
     # It gives y^3 + py = q
     # where p = (D^2 h_min^2)/(2*C) and q = -p.
     # We use Vieta's substitution to compute the root.
     # There is only one real solution y (which is in [0, 1] ).
     # http://mathworld.wolfram.com/VietasSubstitution.html
     # assert_array = \
     #   [tf.Assert(tf.logical_not(tf.is_nan(self._dist_to_opt_avg) ), [self._dist_to_opt_avg,]),
     #   tf.Assert(tf.logical_not(tf.is_nan(self._h_min) ), [self._h_min,]),
     #   tf.Assert(tf.logical_not(tf.is_nan(self._grad_var) ), [self._grad_var,]),
     #   tf.Assert(tf.logical_not(tf.is_inf(self._dist_to_opt_avg) ), [self._dist_to_opt_avg,]),
     #   tf.Assert(tf.logical_not(tf.is_inf(self._h_min) ), [self._h_min,]),
     #   tf.Assert(tf.logical_not(tf.is_inf(self._grad_var) ), [self._grad_var,])]
     # with tf.control_dependencies(assert_array):
     # EPS in the numerator prevents the momentum from being exactly one in the case of a zero gradient
     p = (self._dist_to_opt_avg +
          EPS)**2 * (self._h_min + EPS)**2 / 2 / (self._grad_var + EPS)
     w3 = (-tf.sqrt(p**2 + 4.0 / 27.0 * p**3) - p) / 2.0
     w = tf.sign(w3) * tf.pow(tf.abs(w3), 1.0 / 3.0)
     y = w - p / 3.0 / (w + EPS)
     x = y + 1
     return x
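
As a sanity check of the closed form above, the depressed cubic y^3 + p*y + p = 0 (from y^3 + p*y = q with q = -p) can be solved numerically and compared with the Vieta/Cardano expression used in the code. A minimal numpy sketch with hypothetical values for D, h_min and C:

import numpy as np

dist_to_opt_avg, h_min, grad_var = 1.3, 0.7, 2.1   # hypothetical D, h_min, C
p = dist_to_opt_avg**2 * h_min**2 / 2.0 / grad_var
w3 = (-np.sqrt(p**2 + 4.0 / 27.0 * p**3) - p) / 2.0
w = np.sign(w3) * np.abs(w3) ** (1.0 / 3.0)
x_closed_form = w - p / 3.0 / w + 1.0              # x = y + 1

roots = np.roots([1.0, 0.0, p, p])                 # y^3 + p*y + p = 0
y_real = roots[np.argmin(np.abs(roots.imag))].real # the single real root
print(x_closed_form, y_real + 1.0)                 # the two values should agree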
Example #4
    def pooling_layer(self, x, pooling_type=None):
        '''
      Add a pooling layer across the whole utterance.
      Input: [B, T, D]
        --> Reduce along T

      Statistics pooling output: [B, D * 2]
      Average pooling output: [B, D]
    '''
        assert_rank3 = tf.debugging.assert_rank(x, 3)
        with tf.control_dependencies([assert_rank3]):
            x = tf.identity(x)

        pooling_type = pooling_type if pooling_type else self.netconf[
            'frame_pooling_type']
        if pooling_type == 'stats':
            with tf.name_scope('stats_pooling'):
                mean, var = tf.nn.moments(x, 1)
                x = tf.concat([mean, tf.sqrt(var + 1e-6)], 1)
        elif pooling_type == 'average':
            with tf.name_scope('average_pooling'):
                mean, _ = tf.nn.moments(x, 1)
                x = mean
        else:
            raise ValueError('Unsupported frame_pooling_type: %s' %
                             (pooling_type))

        assert_rank2 = tf.debugging.assert_rank(x, 2)
        with tf.control_dependencies([assert_rank2]):
            x = tf.identity(x)

        return x
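
A minimal standalone sketch of the two pooling modes, assuming TF 2.x eager execution and a hypothetical [B, T, D] input, just to show the shape contract from the docstring:

import tensorflow as tf

x = tf.random.normal([4, 50, 128])                       # [B, T, D]
mean, var = tf.nn.moments(x, axes=[1])                   # reduce along T
stats = tf.concat([mean, tf.sqrt(var + 1e-6)], axis=1)
print(stats.shape)                                       # (4, 256): statistics pooling, [B, D * 2]
print(mean.shape)                                        # (4, 128): average pooling, [B, D]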
Example #5
def _freq_feat_graph(feat_name, **kwargs):
  winlen = kwargs.get('winlen')
  winstep = kwargs.get('winstep')
  feature_size = kwargs.get('feature_size')
  sr = kwargs.get('sr')  #pylint: disable=invalid-name
  nfft = kwargs.get('nfft')
  del nfft

  assert feat_name in ('fbank', 'spec')

  params = speech_ops.speech_params(
      sr=sr,
      bins=feature_size,
      add_delta_deltas=False,
      audio_frame_length=winlen,
      audio_frame_step=winstep)

  graph = None
  if feat_name == 'fbank':
    # get session
    if feat_name not in _global_sess:
      graph = tf.Graph()
      #pylint: disable=not-context-manager
      with graph.as_default():
        # fbank
        filepath = tf.placeholder(dtype=tf.string, shape=[], name='wavpath')
        waveforms, sample_rate = speech_ops.read_wav(filepath, params)
        del sample_rate
        fbank = speech_ops.extract_feature(waveforms, params)
        # shape must be [T, D, C]
        feat = tf.identity(fbank, name=feat_name)
  elif feat_name == 'spec':
    # magnitude spec
    if feat_name not in _global_sess:
      graph = tf.Graph()
      #pylint: disable=not-context-manager
      with graph.as_default():
        filepath = tf.placeholder(dtype=tf.string, shape=[], name='wavpath')
        waveforms, sample_rate = speech_ops.read_wav(filepath, params)

        spec = py_x_ops.spectrum(
            waveforms[:, 0],
            tf.cast(sample_rate, tf.dtypes.float32),
            output_type=1)  #output_type: 1, power spec; 2 log power spec
        spec = tf.sqrt(spec)
        # shape must be [T, D, C]
        spec = tf.expand_dims(spec, -1)
        feat = tf.identity(spec, name=feat_name)
  else:
    raise ValueError(f"Not support freq feat: {feat_name}.")

  return graph, (_get_out_tensor_name('wavpath', 0),
                 _get_out_tensor_name(feat_name, 0))
Example #6
 def dist_to_opt(self):
     dist_to_opt_ops = []
     # running average of the norm of the gradient
     self._grad_norm = tf.sqrt(self._grad_norm_squared)
     avg_op = self._moving_averager.apply([
         self._grad_norm,
     ])
     dist_to_opt_ops.append(avg_op)
     with tf.control_dependencies([avg_op]):
         self._grad_norm_avg = self._moving_averager.average(
             self._grad_norm)
         # single iteration distance estimation
         # note that self._grad_norm_avg is per variable
         self._dist_to_opt = (self._grad_norm_avg /
                              (self._grad_norm_squared_avg + EPS))
     # running average of distance
     avg_op = self._moving_averager.apply([self._dist_to_opt])
     dist_to_opt_ops.append(avg_op)
     with tf.control_dependencies([avg_op]):
         self._dist_to_opt_avg = tf.identity(
             self._moving_averager.average(self._dist_to_opt))
         if self._sparsity_debias:
             self._dist_to_opt_avg /= (tf.sqrt(self._sparsity_avg) + EPS)
     return dist_to_opt_ops
Example #7
 def get_mu_tensor(self):
     root = self.get_cubic_root()
     dr = tf.maximum((self._h_max + EPS) / (self._h_min + EPS), 1.0 + EPS)
     mu = tf.maximum(root**2, ((tf.sqrt(dr) - 1) / (tf.sqrt(dr) + 1))**2)
     return mu
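
The second term in the max is the classic heavy-ball momentum for a quadratic with condition number dr = h_max / h_min, i.e. ((sqrt(dr) - 1) / (sqrt(dr) + 1))^2. A minimal numpy sketch with hypothetical curvature estimates (ignoring the EPS terms):

import numpy as np

h_max, h_min, root = 25.0, 1.0, 0.3   # hypothetical curvature range and cubic root
dr = max(h_max / h_min, 1.0)          # condition number estimate
mu_lower = ((np.sqrt(dr) - 1.0) / (np.sqrt(dr) + 1.0)) ** 2
mu = max(root ** 2, mu_lower)
print(mu_lower, mu)                   # 4/9 ~= 0.444 here, so mu = 0.444...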
Example #8
def arcface_loss(embedding,
                 labels,
                 out_num,
                 weights=None,
                 s=64.,
                 m=0.5,
                 limit_to_pi=True):
    '''
  https://github.com/auroua/InsightFace_TF/blob/master/losses/face_losses.py
  :param embedding: the input embedding vectors
  :param labels:  the input labels, the shape should be eg: (batch_size, 1)
  :param s: scalar value default is 64
  :param out_num: output class num
  :param weights: a tf.variable with shape (embedding.shape[-1], out_num)
                  or None to make a new one internally. default = None
  :param m: the margin value, default is 0.5
  :return: the final calculated output; this output is sent into tf.nn.softmax directly
  '''
    cos_m = math.cos(m)
    sin_m = math.sin(m)
    mm = sin_m * m  # issue 1
    threshold = math.cos(math.pi - m)
    with tf.variable_scope('arcface_loss'):
        # inputs and weights norm
        embedding_norm = tf.norm(embedding, axis=1, keep_dims=True)
        embedding = tf.div(embedding, embedding_norm, name='norm_embedding')
        if weights is None:
            weights = tf.get_variable(
                name='weights',
                shape=[embedding.shape[-1].value, out_num],
                initializer=tf.initializers.glorot_uniform())
        weights_norm = tf.norm(weights, axis=0, keep_dims=True)
        weights = tf.div(weights, weights_norm, name='norm_weights')
        # cos(theta+m)
        cos_t = tf.matmul(embedding, weights, name='cos_t')
        cos_t2 = tf.square(cos_t, name='cos_2')
        sin_t2 = tf.subtract(1., cos_t2, name='sin_2')
        sin_t = tf.sqrt(sin_t2, name='sin_t')
        cos_mt = s * tf.subtract(tf.multiply(cos_t, cos_m),
                                 tf.multiply(sin_t, sin_m),
                                 name='cos_mt')

        if limit_to_pi:
            # this condition controls the theta+m should in range [0, pi]
            #      0<=theta+m<=pi
            #     -m<=theta<=pi-m
            cond_v = cos_t - threshold
            cond = tf.cast(tf.nn.relu(cond_v, name='if_else'), dtype=tf.bool)

            keep_val = s * (cos_t - mm)
            cos_mt_temp = tf.where(cond, cos_mt, keep_val)
        else:
            cos_mt_temp = cos_mt

        mask = tf.one_hot(labels, depth=out_num, name='one_hot_mask')
        # mask = tf.squeeze(mask, 1)
        inv_mask = tf.subtract(1., mask, name='inverse_mask')

        s_cos_t = tf.multiply(s, cos_t, name='scalar_cos_t')

        output = tf.add(tf.multiply(s_cos_t, inv_mask),
                        tf.multiply(cos_mt_temp, mask),
                        name='arcface_loss_output')
    return output
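
A minimal usage sketch, assuming TF 1.x graph mode, hypothetical shapes, and sparse integer labels; the margin-adjusted, scaled logits go straight into a softmax cross-entropy loss as the docstring suggests:

embeddings = tf.placeholder(tf.float32, [None, 512], name='embeddings')  # backbone features
labels = tf.placeholder(tf.int64, [None], name='labels')                 # sparse class ids

logits = arcface_loss(embeddings, labels, out_num=1000)  # hypothetical class count
loss = tf.reduce_mean(
    tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits))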
Example #9
    def call(self, inputs, training=None, mask=None):

        query, key, value = self._unpack(inputs)

        query_mask, key_mask, _ = self._unpack(mask)

        batch_size = tf.shape(query)[0]
        dimension_query = query.get_shape().as_list()[-1]
        seq_len = tf.shape(query)[-2]
        key_len = tf.shape(key)[-2]
        feature_dim = tf.shape(value)[-1]

        query = tf.matmul(
            query,
            tf.tile(tf.expand_dims(self.kernel_query, 0), [batch_size, 1, 1]))
        key = tf.matmul(
            key, tf.tile(tf.expand_dims(self.kernel_key, 0),
                         [batch_size, 1, 1]))
        value = tf.matmul(
            value,
            tf.tile(tf.expand_dims(self.kernel_value, 0), [batch_size, 1, 1]))
        if self.use_bias:
            query += self.b_query
            key += self.b_key
            value += self.b_value

        def _reshape_multihead(origin_input):
            """
      reshape for multi head
        Input shape: (Batch size, steps, features)
        Output shape: (Batch size * head num, steps, features // head num)
      """
            return tf.concat(tf.split(origin_input, self.head_num, axis=2),
                             axis=0)

        def _reshape_mask(mask):
            """
      repeat mask for multi head
        Input shape: (Batch size, steps)
        Output shape: (Batch size * head num, steps)
      """
            if mask is None:
                return None
            seq_len = tf.shape(mask)[1]
            mask = tf.expand_dims(mask, axis=1)
            mask = tf.tile(mask, [1, self.head_num, 1])
            return tf.reshape(mask, shape=(-1, seq_len))

        query_ = _reshape_multihead(query)
        key_ = _reshape_multihead(key)
        value_ = _reshape_multihead(value)

        key_mask = _reshape_mask(key_mask)

        # (Batch size * head num, query steps, key steps)
        similaritys = tf.matmul(query_, tf.transpose(key_, [0, 2, 1]))
        # scale
        similaritys /= tf.sqrt(tf.cast(dimension_query, tf.float32))
        if self.sequence_mask:
            ones = tf.ones((seq_len, key_len))
            similaritys -= (ones - tf.matrix_band_part(ones, -1, 0)) * 1e9
        if key_mask is not None:
            similaritys -= (1.0 - tf.cast(tf.expand_dims(key_mask, axis=-2),
                                          tf.float32)) * 1e9

        attention_weights = tf.keras.activations.softmax(similaritys)
        attention_outputs = tf.matmul(attention_weights, value_)
        attention_outputs = tf.reshape(
            attention_outputs,
            (-1, self.head_num, seq_len, feature_dim // self.head_num))
        attention_outputs = tf.transpose(attention_outputs, [0, 2, 1, 3])
        attention_outputs = tf.reshape(attention_outputs,
                                       (-1, seq_len, feature_dim))

        attention_outputs = tf.matmul(
            attention_outputs,
            tf.tile(tf.expand_dims(self.kernel_project, 0),
                    [batch_size, 1, 1]))
        if self.use_bias:
            attention_outputs += self.b_project
        if self.activation is not None:
            attention_outputs = self.activation(attention_outputs)

        if query_mask is not None:
            attention_outputs *= tf.cast(tf.expand_dims(query_mask, axis=-1),
                                         tf.float32)

        return attention_outputs
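
The head split performed by _reshape_multihead above, and its inverse, can be checked in isolation. A minimal sketch assuming TF 2.x eager execution and hypothetical sizes:

import tensorflow as tf

head_num = 4
x = tf.random.normal([2, 10, 64])                           # (batch, steps, features)
heads = tf.concat(tf.split(x, head_num, axis=2), axis=0)    # (batch * head, steps, features / head)
print(heads.shape)                                          # (8, 10, 16)
merged = tf.concat(tf.split(heads, head_num, axis=0), axis=2)
print(merged.shape)                                         # (2, 10, 64): back to the input layout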