def _finish(self, caches):
    """Clip the accumulated update steps and apply them to the variables."""
    if self.clip > 0:
        S_t = [cache['s_t'] for cache in caches]
        S_t, _ = tf.clip_by_global_norm(S_t, self.clip)
        for cache, s_t in zip(caches, S_t):
            cache['s_t'] = s_t
    for cache in caches:
        x_tm1 = cache['x_tm1']
        s_t = cache['s_t']
        updates = cache['updates']
        with tf.name_scope('update_' + x_tm1.op.name), tf.device(x_tm1.device):
            if 'idxs' in cache:
                idxs = cache['idxs']
                x_t = tf.scatter_sub(x_tm1, idxs, s_t)
                if self.chi > 0:
                    x_t_ = tf.gather(x_t, idxs)
                    x_bar_t, t_x_bar = self._sparse_moving_average(
                        x_tm1, idxs, x_t_, 'x', beta=self.chi)
            else:
                x_t = tf.assign_sub(x_tm1, s_t)
                if self.chi > 0:
                    x_bar_t, t_x_bar = self._dense_moving_average(
                        x_tm1, x_t, 'x', beta=self.chi)
            updates.append(x_t)
            if self.chi > 0:
                updates.extend([x_bar_t, t_x_bar])
    update_ops = [tf.group(*cache['updates']) for cache in caches]
    return tf.group(*update_ops, name='update')
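# A minimal, self-contained sketch (not part of the optimizer above; values are
# made up) contrasting the two update paths used in _finish: tf.scatter_sub
# rewrites only the indexed rows of a variable, while tf.assign_sub applies a
# dense update to every element. TF 1.x graph-mode code is assumed.
import tensorflow as tf

var = tf.Variable([[1.0, 1.0], [2.0, 2.0], [3.0, 3.0]])
sparse_step = tf.scatter_sub(var, [0, 2], [[0.1, 0.1], [0.3, 0.3]])  # rows 0 and 2 only
dense_step = tf.assign_sub(var, 0.01 * tf.ones_like(var))            # every element

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(sparse_step))  # [[0.9, 0.9], [2.0, 2.0], [2.7, 2.7]]
    print(sess.run(dense_step))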
def _center_loss_func(labels, features, alpha, num_classes,
                      centers, feature_dim):
    assert feature_dim == features.get_shape()[1]
    labels = K.reshape(labels, [-1])
    # labels = K.argmax(labels, axis=1)
    labels = tf.to_int32(labels)
    centers_batch = K.gather(centers, labels)
    # move each sample's center towards the sample by a factor of (1 - alpha)
    diff = (1 - alpha) * (centers_batch - features)
    centers = tf.scatter_sub(centers, labels, diff)
    # re-gather the centers for this batch after the update
    centers_batch = K.gather(centers, labels)
    loss = K.mean(K.square(features - centers_batch))
    return loss
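# Hedged usage sketch: one way _center_loss_func might be bound into a Keras
# loss with functools.partial. The `centers` variable, the constants, and the
# commented model.compile call are illustrative assumptions, not part of the
# snippet above; `K` is assumed to be the Keras backend under TF 1.x.
import functools
from keras import backend as K

NUM_CLASSES, FEATURE_DIM, ALPHA = 10, 64, 0.5   # assumed values
centers = K.zeros((NUM_CLASSES, FEATURE_DIM))   # shared center matrix, updated by the loss

center_loss = functools.partial(_center_loss_func,
                                alpha=ALPHA,
                                num_classes=NUM_CLASSES,
                                centers=centers,
                                feature_dim=FEATURE_DIM)
# model.compile(optimizer='adam', loss=center_loss)  # y_true -> labels, y_pred -> features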
def center_loss(features, label, alfa, nrof_classes):
    """Center loss based on the paper "A Discriminative Feature Learning Approach for Deep Face Recognition"
       (http://ydwen.github.io/papers/WenECCV16.pdf)
    """
    nrof_features = features.get_shape()[1]
    centers = tf.get_variable('centers', [nrof_classes, nrof_features],
                              dtype=tf.float32,
                              initializer=tf.constant_initializer(0),
                              trainable=False)
    label = tf.reshape(label, [-1])
    centers_batch = tf.gather(centers, label)
    diff = (1 - alfa) * (centers_batch - features)
    centers = tf.scatter_sub(centers, label, diff)
    loss = tf.reduce_mean(tf.square(features - centers_batch))
    return loss, centers
def center_loss(features, label, alfa, nrof_classes):
    nrof_features = features.get_shape()[1]
    centers = tf.get_variable('centers', [nrof_classes, nrof_features],
                              dtype=tf.float32,
                              initializer=tf.constant_initializer(0),
                              trainable=False)
    label = tf.reshape(label, [-1])
    centers_batch = tf.gather(centers, label)
    diff = (1 - alfa) * (centers_batch - features)
    centers = tf.scatter_sub(centers, label, diff)
    # unlike the variant above, make the loss depend on the centers update,
    # so evaluating the loss also applies the tf.scatter_sub op
    with tf.control_dependencies([centers]):
        loss = tf.reduce_mean(tf.square(features - centers_batch))
    return loss, centers
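# Hedged usage sketch (the dense layer, shapes, and hyperparameters are
# illustrative, not from the snippet above): because the loss is built under
# tf.control_dependencies, minimizing it also triggers the tf.scatter_sub
# update of the non-trainable centers variable.
import tensorflow as tf

inputs = tf.placeholder(tf.float32, [None, 784])
label_batch = tf.placeholder(tf.int64, [None])
prelogits = tf.layers.dense(inputs, 128)  # embedding features

center_loss_term, centers = center_loss(prelogits, label_batch,
                                        alfa=0.95, nrof_classes=10)
# in practice this term is usually added to a softmax cross-entropy loss
total_loss = 0.5 * center_loss_term
train_op = tf.train.AdamOptimizer(1e-3).minimize(total_loss)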
def _apply_sparse_shared(self, grad_values, grad_indices, var):
    shape = np.array(var.get_shape())
    var_rank = len(shape)
    # For the sparse case, we only update the accumulator representing the sparse
    # dimension. In this case SM3 is similar to isotropic Adagrad but with a
    # better bound (due to the max operator).
    #
    # We do not use the column accumulator because it would be updated for
    # every gradient step and would significantly overestimate the gradient
    # square. The row accumulator, in contrast, can take advantage of the
    # sparsity in the gradients. Even if one implements the column accumulator,
    # it will result in a no-op because the row accumulators will have lower
    # values.
    #
    # Note that we do not run this code path for the experiments in our paper,
    # as on TPU all the sparse gradients are densified.
    if var_rank > 1:
        accumulator_var = self.get_slot(var, "accumulator_" + str(0))
        accumulator = tf.gather(accumulator_var, grad_indices)
        shape_for_broadcasting = tf.concat(
            [[tf.shape(accumulator)[0]], [1] * (var_rank - 1)], 0)
        accumulator = tf.reshape(accumulator, shape_for_broadcasting)
        accumulator += grad_values * grad_values
    else:
        accumulator_var = self.get_slot(var, "accumulator")
        accumulator = tf.scatter_add(accumulator_var, grad_indices,
                                     grad_values * grad_values)
    accumulator_inv_sqrt = tf.rsqrt(accumulator + 1e-30)
    scaled_g = (grad_values * accumulator_inv_sqrt)
    updates = []
    with tf.control_dependencies([scaled_g]):
        if var_rank > 1:
            axes = list(range(1, var_rank))
            new_accumulator = tf.reduce_max(accumulator, axis=axes)
            updates = [
                tf.scatter_update(accumulator_var, grad_indices,
                                  new_accumulator)
            ]
        with tf.control_dependencies(updates):
            return tf.scatter_sub(var, grad_indices,
                                  self._learning_rate_tensor * scaled_g)
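# Simplified, self-contained sketch (hypothetical shapes and values, not the
# exact branch above) of the idea behind the final update: accumulate squared
# gradients for the touched rows, scale the gradient by the inverse square root
# of the accumulator, then apply the step with tf.scatter_sub. The touched rows
# are gathered explicitly here for clarity.
import tensorflow as tf

var = tf.Variable(tf.ones([4]))
accumulator_var = tf.Variable(tf.zeros([4]))
grad_indices = tf.constant([0, 2])
grad_values = tf.constant([0.5, -0.25])
learning_rate = 0.1

accumulator = tf.scatter_add(accumulator_var, grad_indices,
                             grad_values * grad_values)
accum_rows = tf.gather(accumulator, grad_indices)       # rows touched by the gradient
scaled_g = grad_values * tf.rsqrt(accum_rows + 1e-30)
step = tf.scatter_sub(var, grad_indices, learning_rate * scaled_g)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(step))  # only var[0] and var[2] change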
def center_loss(labels, features, alpha=ALPHA, num_classes=NUM_CLASSES):
    """Compute the center loss and update the per-class centers.

    :param labels: Tensor of sample labels, not one-hot encoded, shape (batch_size,).
    :param features: Tensor of sample features (output of the last fc layer), shape (batch_size, num_classes).
    :param alpha: Number between 0 and 1 controlling the learning rate of the class centers; see the original paper for details.
    :param num_classes: Integer, the total number of classes, i.e. the number of neurons in the classification output.
    :return: Tensor, the center loss.
    """
    # dimensionality of the features, e.g. 256
    len_features = features.get_shape()[1]
    # create a Variable of shape [num_classes, len_features] to store the class centers
    # of the whole network; trainable=False because the centers are not updated by gradients
    centers = tf.get_variable('centers', [num_classes, len_features],
                              dtype=tf.float32,
                              initializer=tf.constant_initializer(0),
                              trainable=False)
    # flatten the labels to 1-D (a no-op if they already are)
    labels = tf.reshape(labels, [-1])
    # for each sample in the mini-batch, gather the center of its class
    centers_batch = tf.gather(centers, labels)
    # difference between the mini-batch features and their corresponding centers
    diff = centers_batch - features
    # count how often each class appears in the mini-batch; see Eq. (4) in the original paper
    unique_label, unique_idx, unique_count = tf.unique_with_counts(labels)
    appear_times = tf.gather(unique_count, unique_idx)
    appear_times = tf.reshape(appear_times, [-1, 1])
    diff = diff / tf.cast((1 + appear_times), tf.float32)
    diff = alpha * diff
    # update the centers
    centers_update_op = tf.scatter_sub(centers, labels, diff)
    # use tf.control_dependencies so the centers are updated whenever the loss is computed
    with tf.control_dependencies([centers_update_op]):
        # compute the center loss
        c_loss = tf.nn.l2_loss(features - centers_batch)
    return c_loss
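# Tiny illustration (made-up labels) of the appear_times correction above:
# tf.unique_with_counts maps every position back to how often its class occurs
# in the mini-batch, so centers of frequent classes move by a smaller step.
import tensorflow as tf

labels = tf.constant([3, 1, 3, 3, 1])
unique_label, unique_idx, unique_count = tf.unique_with_counts(labels)
appear_times = tf.gather(unique_count, unique_idx)

with tf.Session() as sess:
    print(sess.run(appear_times))  # [3 2 3 3 2]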
def __init__(self, n_sample, minibatch_sz, m1_inp_shape, m2_inp_shape,
             m1_layers, m2_layers, msi_layers, m1_cause_init, m2_cause_init,
             msi_cause_init, reg_m1_causes, reg_m2_causes, reg_msi_causes,
             lr_m1_causes, lr_m2_causes, lr_msi_causes, reg_m1_filters,
             reg_m2_filters, reg_msi_filters, lr_m1_filters, lr_m2_filters,
             lr_msi_filters):
    self.m1_inp_shape = m1_inp_shape
    self.m2_inp_shape = m2_inp_shape
    self.m1_layers = m1_layers
    self.m2_layers = m2_layers
    self.msi_layers = msi_layers

    # create placeholders
    self.x_m1 = tf.placeholder(tf.float32, shape=[minibatch_sz, m1_inp_shape])
    self.x_m2 = tf.placeholder(tf.float32, shape=[minibatch_sz, m2_inp_shape])
    self.batch = tf.placeholder(tf.int32, shape=[])

    # create filters and cause for m1
    self.m1_filters = []
    self.m1_causes = []
    for i in range(len(self.m1_layers)):
        filter_name = 'm1_filter_%d' % i
        cause_name = 'm1_cause_%d' % i
        if i == 0:
            self.m1_filters += [
                tf.get_variable(filter_name,
                                shape=[self.m1_layers[i], self.m1_inp_shape])
            ]
        else:
            self.m1_filters += [
                tf.get_variable(filter_name,
                                shape=[self.m1_layers[i], self.m1_layers[i - 1]])
            ]
        init = tf.constant_initializer(m1_cause_init[i])
        self.m1_causes += [
            tf.get_variable(cause_name,
                            shape=[n_sample, self.m1_layers[i]],
                            initializer=init)
        ]

    # create filters and cause for m2
    self.m2_filters = []
    self.m2_causes = []
    for i in range(len(self.m2_layers)):
        filter_name = 'm2_filter_%d' % i
        cause_name = 'm2_cause_%d' % i
        if i == 0:
            self.m2_filters += [
                tf.get_variable(filter_name,
                                shape=[self.m2_layers[i], self.m2_inp_shape])
            ]
        else:
            self.m2_filters += [
                tf.get_variable(filter_name,
                                shape=[self.m2_layers[i], self.m2_layers[i - 1]])
            ]
        init = tf.constant_initializer(m2_cause_init[i])
        self.m2_causes += [
            tf.get_variable(cause_name,
                            shape=[n_sample, self.m2_layers[i]],
                            initializer=init)
        ]

    # create filters and cause for msi
    self.msi_filters = []
    self.msi_causes = []
    for i in range(len(self.msi_layers)):
        if i == 0:
            # add filters for m1
            filter_name = 'msi_m1_filter'
            self.msi_filters += [
                tf.get_variable(filter_name,
                                shape=[self.msi_layers[i], self.m1_layers[-1]])
            ]
            # add filters for m2
            filter_name = 'msi_m2_filter'
            self.msi_filters += [
                tf.get_variable(filter_name,
                                shape=[self.msi_layers[i], self.m2_layers[-1]])
            ]
        else:
            filter_name = 'msi_filter_%d' % i
            self.msi_filters += [
                tf.get_variable(filter_name,
                                shape=[self.msi_layers[i], self.msi_layers[i - 1]])
            ]
        cause_name = 'msi_cause_%d' % i
        init = tf.constant_initializer(msi_cause_init[i])
        self.msi_causes += [
            tf.get_variable(cause_name,
                            shape=[n_sample, self.msi_layers[i]],
                            initializer=init)
        ]

    # compute predictions
    current_batch = tf.range(self.batch * minibatch_sz,
                             (self.batch + 1) * minibatch_sz)

    # m1 predictions
    self.m1_minibatch = []
    self.m1_predictions = []
    for i in range(len(self.m1_layers)):
        self.m1_minibatch += [
            tf.gather(self.m1_causes[i], indices=current_batch, axis=0)
        ]
        self.m1_predictions += [
            tf.nn.leaky_relu(tf.matmul(self.m1_minibatch[i], self.m1_filters[i]))
        ]

    # m2 predictions
    self.m2_minibatch = []
    self.m2_predictions = []
    for i in range(len(self.m2_layers)):
        self.m2_minibatch += [
            tf.gather(self.m2_causes[i], indices=current_batch, axis=0)
        ]
        self.m2_predictions += [
            tf.nn.leaky_relu(tf.matmul(self.m2_minibatch[i], self.m2_filters[i]))
        ]

    # msi predictions
    self.msi_minibatch = []
    self.msi_predictions = []
    for i in range(len(self.msi_layers)):
        self.msi_minibatch += [
            tf.gather(self.msi_causes[i], indices=current_batch, axis=0)
        ]
        if i == 0:
            self.msi_predictions += [
                tf.nn.leaky_relu(
                    tf.matmul(self.msi_minibatch[i], self.msi_filters[i]))
            ]  # m1 prediction
            self.msi_predictions += [
                tf.nn.leaky_relu(
                    tf.matmul(self.msi_minibatch[i], self.msi_filters[i + 1]))
            ]  # m2 prediction
        else:
            self.msi_predictions += [
                tf.nn.leaky_relu(
                    tf.matmul(self.msi_minibatch[i], self.msi_filters[i + 1]))
            ]

    # add ops for computing gradients for m1 causes and for updating weights
    self.m1_bu_error = []
    self.m1_update_filter = []
    self.m1_cause_grad = []
    for i in range(len(self.m1_layers)):
        if i == 0:
            self.m1_bu_error += [
                tf.losses.mean_squared_error(
                    self.x_m1, self.m1_predictions[i],
                    reduction=tf.losses.Reduction.NONE)
            ]
        else:
            self.m1_bu_error += [
                tf.losses.mean_squared_error(
                    tf.stop_gradient(self.m1_minibatch[i - 1]),
                    self.m1_predictions[i],
                    reduction=tf.losses.Reduction.NONE)
            ]

        # compute top-down prediction error
        if len(self.m1_layers) > (i + 1):
            # there are more layers in this modality
            td_error = tf.losses.mean_squared_error(
                tf.stop_gradient(self.m1_predictions[i + 1]),
                self.m1_minibatch[i],
                reduction=tf.losses.Reduction.NONE)
        else:
            # this is the only layer in this modality
            td_error = tf.losses.mean_squared_error(
                tf.stop_gradient(self.msi_predictions[0]),
                self.m1_minibatch[i],
                reduction=tf.losses.Reduction.NONE)
        reg_error = reg_m1_causes[i] * (self.m1_minibatch[i]**2)
        # reg_error = tf.keras.regularizers.l2(reg_m1_causes[i])(self.m1_minibatch[i])
        self.m1_cause_grad += [
            tf.gradients([self.m1_bu_error[i], td_error, reg_error],
                         self.m1_minibatch[i])[0]
        ]

        # ops for updating weights
        reg_error = reg_m1_filters[i] * (self.m1_filters[i]**2)
        m1_filter_grad = tf.gradients([self.m1_bu_error[i], reg_error],
                                      self.m1_filters[i])[0]
        self.m1_update_filter += [
            tf.assign_sub(self.m1_filters[i],
                          lr_m1_filters[i] * m1_filter_grad)
        ]

    # add ops for computing gradients for m2 causes and for updating weights
    self.m2_bu_error = []
    self.m2_update_filter = []
    self.m2_cause_grad = []
    for i in range(len(self.m2_layers)):
        if i == 0:
            self.m2_bu_error += [
                tf.losses.mean_squared_error(
                    self.x_m2, self.m2_predictions[i],
                    reduction=tf.losses.Reduction.NONE)
            ]
        else:
            self.m2_bu_error += [
                tf.losses.mean_squared_error(
                    tf.stop_gradient(self.m2_minibatch[i - 1]),
                    self.m2_predictions[i],
                    reduction=tf.losses.Reduction.NONE)
            ]

        # compute top-down prediction error
        if len(self.m2_layers) > (i + 1):
            # there are more layers in this modality
            td_error = tf.losses.mean_squared_error(
                tf.stop_gradient(self.m2_predictions[i + 1]),
                self.m2_minibatch[i],
                reduction=tf.losses.Reduction.NONE)
        else:
            # this is the only layer in this modality
            td_error = tf.losses.mean_squared_error(
                tf.stop_gradient(self.msi_predictions[1]),
                self.m2_minibatch[i],
                reduction=tf.losses.Reduction.NONE)
        reg_error = reg_m2_causes[i] * (self.m2_minibatch[i]**2)
        # reg_error = tf.keras.regularizers.l2(reg_m2_causes[i])(self.m2_minibatch[i])
        self.m2_cause_grad += [
            tf.gradients([self.m2_bu_error[i], td_error, reg_error],
                         self.m2_minibatch[i])[0]
        ]

        # add ops for updating weights
        reg_error = reg_m2_filters[i] * (self.m2_filters[i]**2)
        m2_filter_grad = tf.gradients([self.m2_bu_error[i], reg_error],
                                      self.m2_filters[i])[0]
        self.m1_update_filter += [
            tf.assign_sub(self.m2_filters[i],
                          lr_m2_filters[i] * m2_filter_grad)
        ]
        # else:
        #     raise NotImplementedError

    # add ops for computing gradients for msi causes
    self.msi_bu_error = []
    self.msi_reg_error = []
    self.msi_update_filter = []
    self.msi_cause_grad = []
    for i in range(len(self.msi_layers)):
        if i == 0:
            self.msi_bu_error += [
                tf.losses.mean_squared_error(
                    tf.stop_gradient(self.m1_minibatch[-1]),
                    self.msi_predictions[i],
                    reduction=tf.losses.Reduction.NONE)
            ]
            self.msi_bu_error += [
                tf.losses.mean_squared_error(
                    tf.stop_gradient(self.m2_minibatch[-1]),
                    self.msi_predictions[i + 1],
                    reduction=tf.losses.Reduction.NONE)
            ]
            self.msi_reg_error += [
                reg_msi_causes[i] * (self.msi_minibatch[i]**2)
            ]
            # self.msi_reg_error += [tf.keras.regularizers.l2(reg_msi_causes[i])(self.msi_minibatch[i])]

            if len(self.msi_layers) > 1:
                raise NotImplementedError
            else:
                self.msi_cause_grad += [
                    tf.gradients([
                        self.msi_bu_error[i], self.msi_bu_error[i + 1],
                        self.msi_reg_error[i]
                    ], self.msi_minibatch[i])[0]
                ]

            # add ops for updating weights
            reg_error = reg_msi_filters[i] * (self.msi_filters[i]**2)
            msi_filter_grad = tf.gradients(
                [self.msi_bu_error[i], reg_error], self.msi_filters[i])[0]
            self.msi_update_filter += [
                tf.assign_sub(self.msi_filters[i],
                              lr_msi_filters[i] * msi_filter_grad)
            ]

            reg_error = reg_msi_filters[i + 1] * (self.msi_filters[i + 1]**2)
            msi_filter_grad = tf.gradients(
                [self.msi_bu_error[i + 1], reg_error], self.msi_filters[i + 1])[0]
            self.msi_update_filter += [
                tf.assign_sub(self.msi_filters[i + 1],
                              lr_msi_filters[i + 1] * msi_filter_grad)
            ]
        else:
            raise NotImplementedError

    # add ops for updating causes
    self.m1_update_cause = []
    self.m2_update_cause = []
    self.msi_update_cause = []
    with tf.control_dependencies(self.m1_cause_grad + self.m2_cause_grad +
                                 self.msi_cause_grad):
        # m1 modality
        for i in range(len(self.m1_layers)):
            self.m1_update_cause += [
                tf.scatter_sub(self.m1_causes[i],
                               indices=current_batch,
                               updates=(lr_m1_causes[i] * self.m1_cause_grad[i]))
            ]
        # m2 modality
        for i in range(len(self.m2_layers)):
            self.m2_update_cause += [
                tf.scatter_sub(self.m2_causes[i],
                               indices=current_batch,
                               updates=(lr_m2_causes[i] * self.m2_cause_grad[i]))
            ]
        # msi modality
        for i in range(len(self.msi_layers)):
            self.msi_update_cause += [
                tf.scatter_sub(self.msi_causes[i],
                               indices=current_batch,
                               updates=(lr_msi_causes[i] * self.msi_cause_grad[i]))
            ]
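# Hedged, self-contained sketch of the cause-update pattern used in the class
# above: every sample owns one row of a large variable, and tf.scatter_sub
# applies a gradient step only to the rows of the current minibatch. All names
# and sizes here are illustrative.
import tensorflow as tf

n_sample, minibatch_sz, dim = 12, 4, 8
causes = tf.Variable(tf.random_normal([n_sample, dim]))
batch = tf.placeholder(tf.int32, shape=[])
current_batch = tf.range(batch * minibatch_sz, (batch + 1) * minibatch_sz)

minibatch_causes = tf.gather(causes, current_batch, axis=0)
loss = tf.reduce_sum(minibatch_causes ** 2)                 # stand-in objective
cause_grad = tf.gradients(loss, minibatch_causes)[0]
update_cause = tf.scatter_sub(causes, current_batch, 0.1 * cause_grad)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(update_cause, feed_dict={batch: 1})            # updates rows 4..7 only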