def apply_gradients(self, grads_tvars, global_step=None, name=None):
  self._grads, self._tvars = zip(
      *[(g, t) for g, t in grads_tvars if g is not None])

  # for manual gradient clipping
  if self._clip_thresh_var is not None:
    self._grads, self._grads_norm = tf.clip_by_global_norm(
        self._grads, self._clip_thresh_var)

  # loosely adaptive clipping of gradient in case exploding gradient ruins statistics
  if self._use_adapt_grad_clip:
    thresh = tf.cond(
        self._do_tune,
        lambda: tf.sqrt(self._stat_protect_fac * self._adapt_grad_clip_thresh**2),
        lambda: tf.to_float(tf.constant(LARGE_FLOAT_VAL)))
    self._grads, self._grads_norm = tf.clip_by_global_norm(self._grads, thresh)

  with tf.variable_scope("before_apply"):
    before_apply_op = self.before_apply()

  with tf.variable_scope("update_hyper"):
    with tf.control_dependencies([before_apply_op]):
      update_hyper_op = self.update_hyper_param()

  with tf.variable_scope("apply_updates"):
    with tf.control_dependencies([update_hyper_op]):
      # clip exploding gradient according to h_max
      if self._use_adapt_grad_clip:
        thresh = tf.cond(
            tf.greater(tf.global_norm(self._grads), self._adapt_grad_clip_thresh),
            lambda: self._adapt_grad_clip_target_val,
            lambda: tf.to_float(tf.constant(LARGE_FLOAT_VAL)))
        self._grads, self._grads_norm = tf.clip_by_global_norm(self._grads, thresh)
      apply_grad_op = self._optimizer.apply_gradients(
          zip(self._grads, self._tvars), global_step, name)

  with tf.control_dependencies([apply_grad_op]):
    self._increment_global_step_op = tf.assign(self._global_step,
                                               self._global_step + 1)
    self._adapt_grad_clip_thresh_op = \
        tf.assign(self._adapt_grad_clip_thresh, tf.sqrt(self._h_max))
    self._adapt_grad_clip_target_val_op = \
        tf.assign(self._adapt_grad_clip_target_val, tf.sqrt(self._h_max))
    # self._adapt_grad_clip_target_val_op = \
    #     tf.assign(self._adapt_grad_clip_target_val,
    #               tf.sqrt(tf.sqrt(self._h_max * self._h_min)))

  return tf.group(before_apply_op, update_hyper_op, apply_grad_op,
                  self._adapt_grad_clip_thresh_op,
                  self._adapt_grad_clip_target_val_op,
                  self._increment_global_step_op)
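# A minimal usage sketch (illustrative): wiring apply_gradients() above into a
# training op. `opt` is assumed to be an instance of the enclosing optimizer
# wrapper; only the apply_gradients(grads_tvars, global_step) signature defined
# above is relied upon, everything else is standard TF 1.x.
def build_train_op(opt, loss, global_step=None):
  tvars = tf.trainable_variables()
  grads = tf.gradients(loss, tvars)
  return opt.apply_gradients(list(zip(grads, tvars)), global_step=global_step)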
def get_lr_tensor(self):
  # lr = (1 - sqrt(mu))^2 / h_min, linearly warmed up over roughly the first
  # curv_win_width / 10 steps.
  lr = (1.0 - tf.sqrt(self._mu))**2 / (self._h_min + EPS)
  lr = tf.minimum(
      lr, lr * (tf.to_float(self._global_step) + 1.0) / 10.0 /
      tf.to_float(tf.constant(self._curv_win_width)))
  return lr
def get_cubic_root(self):
  # We have the equation x^2 D^2 + (1 - x)^4 * C / h_min^2,
  # where x = sqrt(mu).
  # We substitute x, which is sqrt(mu), with x = y + 1.
  # It gives y^3 + p * y = q,
  # where p = (D^2 h_min^2) / (2 * C) and q = -p.
  # We use Vieta's substitution to compute the root.
  # There is only one real solution y (which is in [0, 1]).
  # http://mathworld.wolfram.com/VietasSubstitution.html
  # assert_array = \
  #     [tf.Assert(tf.logical_not(tf.is_nan(self._dist_to_opt_avg)), [self._dist_to_opt_avg,]),
  #      tf.Assert(tf.logical_not(tf.is_nan(self._h_min)), [self._h_min,]),
  #      tf.Assert(tf.logical_not(tf.is_nan(self._grad_var)), [self._grad_var,]),
  #      tf.Assert(tf.logical_not(tf.is_inf(self._dist_to_opt_avg)), [self._dist_to_opt_avg,]),
  #      tf.Assert(tf.logical_not(tf.is_inf(self._h_min)), [self._h_min,]),
  #      tf.Assert(tf.logical_not(tf.is_inf(self._grad_var)), [self._grad_var,])]
  # with tf.control_dependencies(assert_array):

  # EPS in the numerator to prevent momentum being exactly one in case of 0 gradient
  p = (self._dist_to_opt_avg + EPS)**2 * (self._h_min + EPS)**2 / 2 \
      / (self._grad_var + EPS)
  w3 = (-tf.sqrt(p**2 + 4.0 / 27.0 * p**3) - p) / 2.0
  w = tf.sign(w3) * tf.pow(tf.abs(w3), 1.0 / 3.0)
  y = w - p / 3.0 / (w + EPS)
  x = y + 1
  return x
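# A small standalone sanity check (illustrative, not part of the optimizer):
# for a sample p it verifies numerically that the closed-form expression above
# solves the depressed cubic y^3 + p*y + p = 0 obtained from Vieta's
# substitution, and returns x = y + 1 = sqrt(mu) together with the residual.
import numpy as np  # assumed available for this check only

def _check_cubic_root(p=0.3):
  w3 = (-np.sqrt(p**2 + 4.0 / 27.0 * p**3) - p) / 2.0
  w = np.sign(w3) * np.abs(w3)**(1.0 / 3.0)
  y = w - p / (3.0 * w)
  residual = y**3 + p * y + p  # should be ~0
  return y + 1.0, residual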
def pooling_layer(self, x, pooling_type=None):
  ''' Add a pooling layer across the whole utterance.
      Input: [B, T, D] --> reduce along T
      Statistics pooling output: [B, D * 2]
      Average pooling output: [B, D]
  '''
  assert_rank3 = tf.debugging.assert_rank(x, 3)
  with tf.control_dependencies([assert_rank3]):
    x = tf.identity(x)

  pooling_type = pooling_type if pooling_type else self.netconf['frame_pooling_type']
  if pooling_type == 'stats':
    with tf.name_scope('stats_pooling'):
      mean, var = tf.nn.moments(x, 1)
      x = tf.concat([mean, tf.sqrt(var + 1e-6)], 1)
  elif pooling_type == 'average':
    with tf.name_scope('average_pooling'):
      mean, _ = tf.nn.moments(x, 1)
      x = mean
  else:
    raise ValueError('Unsupported frame_pooling_type: %s' % (pooling_type))

  assert_rank2 = tf.debugging.assert_rank(x, 2)
  with tf.control_dependencies([assert_rank2]):
    x = tf.identity(x)
  return x
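# A minimal standalone sketch of the same statistics pooling outside the class
# (illustrative; no self.netconf needed): [B, T, D] -> [B, 2 * D] by
# concatenating the per-utterance mean and standard deviation over time.
def stats_pooling(x):
  mean, var = tf.nn.moments(x, 1)
  return tf.concat([mean, tf.sqrt(var + 1e-6)], 1)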
def _freq_feat_graph(feat_name, **kwargs):
  winlen = kwargs.get('winlen')
  winstep = kwargs.get('winstep')
  feature_size = kwargs.get('feature_size')
  sr = kwargs.get('sr')  #pylint: disable=invalid-name
  nfft = kwargs.get('nfft')
  del nfft

  assert feat_name in ('fbank', 'spec')

  params = speech_ops.speech_params(
      sr=sr,
      bins=feature_size,
      add_delta_deltas=False,
      audio_frame_length=winlen,
      audio_frame_step=winstep)

  graph = None
  if feat_name == 'fbank':
    # get session
    if feat_name not in _global_sess:
      graph = tf.Graph()
      #pylint: disable=not-context-manager
      with graph.as_default():
        # fbank
        filepath = tf.placeholder(dtype=tf.string, shape=[], name='wavpath')
        waveforms, sample_rate = speech_ops.read_wav(filepath, params)
        del sample_rate
        fbank = speech_ops.extract_feature(waveforms, params)
        # shape must be [T, D, C]
        feat = tf.identity(fbank, name=feat_name)
  elif feat_name == 'spec':
    # magnitude spec
    if feat_name not in _global_sess:
      graph = tf.Graph()
      #pylint: disable=not-context-manager
      with graph.as_default():
        filepath = tf.placeholder(dtype=tf.string, shape=[], name='wavpath')
        waveforms, sample_rate = speech_ops.read_wav(filepath, params)
        spec = py_x_ops.spectrum(
            waveforms[:, 0],
            tf.cast(sample_rate, tf.dtypes.float32),
            output_type=1)  # output_type: 1, power spec; 2, log power spec
        spec = tf.sqrt(spec)
        # shape must be [T, D, C]
        spec = tf.expand_dims(spec, -1)
        feat = tf.identity(spec, name=feat_name)
  else:
    raise ValueError(f"Unsupported freq feat: {feat_name}.")

  return graph, (_get_out_tensor_name('wavpath', 0),
                 _get_out_tensor_name(feat_name, 0))
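# A hedged usage sketch: how the returned graph and tensor names might be
# consumed, assuming the feature graph has not yet been cached in _global_sess
# (so graph is not None). The kwargs values below are illustrative assumptions,
# not defaults read from this file.
def _extract_fbank(wav_path):
  graph, (in_name, out_name) = _freq_feat_graph(
      'fbank', winlen=0.025, winstep=0.010, feature_size=40, sr=16000, nfft=512)
  with tf.Session(graph=graph) as sess:
    return sess.run(out_name, feed_dict={in_name: wav_path})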
def dist_to_opt(self):
  dist_to_opt_ops = []
  # running average of the norm of gradient
  self._grad_norm = tf.sqrt(self._grad_norm_squared)
  avg_op = self._moving_averager.apply([
      self._grad_norm,
  ])
  dist_to_opt_ops.append(avg_op)
  with tf.control_dependencies([avg_op]):
    self._grad_norm_avg = self._moving_averager.average(self._grad_norm)
  # single iteration distance estimation
  # note that self._grad_norm_avg is per variable
  self._dist_to_opt = (self._grad_norm_avg /
                       (self._grad_norm_squared_avg + EPS))
  # running average of distance
  avg_op = self._moving_averager.apply([self._dist_to_opt])
  dist_to_opt_ops.append(avg_op)
  with tf.control_dependencies([avg_op]):
    self._dist_to_opt_avg = tf.identity(
        self._moving_averager.average(self._dist_to_opt))
    if self._sparsity_debias:
      self._dist_to_opt_avg /= (tf.sqrt(self._sparsity_avg) + EPS)
  return dist_to_opt_ops
def get_mu_tensor(self):
  root = self.get_cubic_root()
  # dr is the dynamic range of the curvature estimates, h_max / h_min
  dr = tf.maximum((self._h_max + EPS) / (self._h_min + EPS), 1.0 + EPS)
  mu = tf.maximum(root**2, ((tf.sqrt(dr) - 1) / (tf.sqrt(dr) + 1))**2)
  return mu
def arcface_loss(embedding,
                 labels,
                 out_num,
                 weights=None,
                 s=64.,
                 m=0.5,
                 limit_to_pi=True):
  '''
  https://github.com/auroua/InsightFace_TF/blob/master/losses/face_losses.py
  :param embedding: the input embedding vectors
  :param labels: the input labels, the shape should be eg: (batch_size, 1)
  :param s: scalar value, default is 64
  :param out_num: output class num
  :param weights: a tf.variable with shape (embedding.shape[-1], out_num),
                  or None to make a new one internally. default = None
  :param m: the margin value, default is 0.5
  :return: the final calculated output; this output is sent into tf.nn.softmax directly
  '''
  cos_m = math.cos(m)
  sin_m = math.sin(m)
  mm = sin_m * m  # issue 1
  threshold = math.cos(math.pi - m)
  with tf.variable_scope('arcface_loss'):
    # inputs and weights norm
    embedding_norm = tf.norm(embedding, axis=1, keep_dims=True)
    embedding = tf.div(embedding, embedding_norm, name='norm_embedding')
    if weights is None:
      weights = tf.get_variable(
          name='weights',
          shape=[embedding.shape[-1].value, out_num],
          initializer=tf.glorot_uniform_initializer())
    weights_norm = tf.norm(weights, axis=0, keep_dims=True)
    weights = tf.div(weights, weights_norm, name='norm_weights')
    # cos(theta + m)
    cos_t = tf.matmul(embedding, weights, name='cos_t')
    cos_t2 = tf.square(cos_t, name='cos_2')
    sin_t2 = tf.subtract(1., cos_t2, name='sin_2')
    sin_t = tf.sqrt(sin_t2, name='sin_t')
    cos_mt = s * tf.subtract(
        tf.multiply(cos_t, cos_m), tf.multiply(sin_t, sin_m), name='cos_mt')

    if limit_to_pi:
      # this condition controls that theta + m stays in range [0, pi]
      # 0 <= theta + m <= pi
      # -m <= theta <= pi - m
      cond_v = cos_t - threshold
      cond = tf.cast(tf.nn.relu(cond_v, name='if_else'), dtype=tf.bool)
      keep_val = s * (cos_t - mm)
      cos_mt_temp = tf.where(cond, cos_mt, keep_val)
    else:
      cos_mt_temp = cos_mt

    mask = tf.one_hot(labels, depth=out_num, name='one_hot_mask')
    # mask = tf.squeeze(mask, 1)
    inv_mask = tf.subtract(1., mask, name='inverse_mask')
    s_cos_t = tf.multiply(s, cos_t, name='scalar_cos_t')
    output = tf.add(
        tf.multiply(s_cos_t, inv_mask),
        tf.multiply(cos_mt_temp, mask),
        name='arcface_loss_output')
  return output
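# A hedged usage sketch: turning the scaled arcface logits into a training
# loss. Shapes and the class count are illustrative assumptions; labels are
# taken here to be int class ids of shape (batch_size,).
def arcface_softmax_loss(embedding, labels, num_classes):
  logits = arcface_loss(embedding, labels, out_num=num_classes)
  return tf.reduce_mean(
      tf.nn.sparse_softmax_cross_entropy_with_logits(
          labels=labels, logits=logits))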
def call(self, inputs, training=None, mask=None):
  query, key, value = self._unpack(inputs)
  query_mask, key_mask, _ = self._unpack(mask)

  batch_size = tf.shape(query)[0]
  dimension_query = query.get_shape().as_list()[-1]
  seq_len = tf.shape(query)[-2]
  key_len = tf.shape(key)[-2]
  feature_dim = tf.shape(value)[-1]

  query = tf.matmul(
      query,
      tf.tile(tf.expand_dims(self.kernel_query, 0), [batch_size, 1, 1]))
  key = tf.matmul(
      key, tf.tile(tf.expand_dims(self.kernel_key, 0), [batch_size, 1, 1]))
  value = tf.matmul(
      value, tf.tile(tf.expand_dims(self.kernel_value, 0), [batch_size, 1, 1]))
  if self.use_bias:
    query += self.b_query
    key += self.b_key
    value += self.b_value

  def _reshape_multihead(origin_input):
    """
    reshape for multi head
      Input shape: (Batch size, steps, features)
      Output shape: (Batch size * head num, steps, features // head num)
    """
    return tf.concat(tf.split(origin_input, self.head_num, axis=2), axis=0)

  def _reshape_mask(mask):
    """
    repeat mask for multi head
      Input shape: (Batch size, steps)
      Output shape: (Batch size * head num, steps)
    """
    if mask is None:
      return None
    seq_len = tf.shape(mask)[1]
    mask = tf.expand_dims(mask, axis=1)
    mask = tf.tile(mask, [1, self.head_num, 1])
    return tf.reshape(mask, shape=(-1, seq_len))

  query_ = _reshape_multihead(query)
  key_ = _reshape_multihead(key)
  value_ = _reshape_multihead(value)

  key_mask = _reshape_mask(key_mask)

  # (Batch size * head num, query steps, key steps)
  similaritys = tf.matmul(query_, tf.transpose(key_, [0, 2, 1]))
  # scale
  similaritys /= tf.sqrt(tf.cast(dimension_query, tf.float32))

  if self.sequence_mask:
    ones = tf.ones((seq_len, key_len))
    similaritys -= (ones - tf.matrix_band_part(ones, -1, 0)) * 1e9
  if key_mask is not None:
    similaritys -= (1.0 - tf.cast(tf.expand_dims(key_mask, axis=-2),
                                  tf.float32)) * 1e9

  attention_weights = tf.keras.activations.softmax(similaritys)
  attention_outputs = tf.matmul(attention_weights, value_)
  attention_outputs = tf.reshape(
      attention_outputs,
      (-1, self.head_num, seq_len, feature_dim // self.head_num))
  attention_outputs = tf.transpose(attention_outputs, [0, 2, 1, 3])
  attention_outputs = tf.reshape(attention_outputs,
                                 (-1, seq_len, feature_dim))

  attention_outputs = tf.matmul(
      attention_outputs,
      tf.tile(tf.expand_dims(self.kernel_project, 0), [batch_size, 1, 1]))
  if self.use_bias:
    attention_outputs += self.b_project
  if self.activation is not None:
    attention_outputs = self.activation(attention_outputs)

  if query_mask is not None:
    attention_outputs *= tf.cast(
        tf.expand_dims(query_mask, axis=-1), tf.float32)

  return attention_outputs
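# A hedged usage sketch: self-attention with the layer defined above.
# `attn_layer` stands for an instance of the enclosing Keras layer; only the
# call([query, key, value], mask=[...]) contract used above is assumed.
def apply_self_attention(attn_layer, x, x_mask):
  # use the same tensor as query, key and value
  return attn_layer([x, x, x], mask=[x_mask, x_mask, x_mask])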