def train_mode(): xn, batch_mean, batch_var = tf.nn.fused_batch_norm(x, gamma, beta, epsilon=eps, is_training=True) moving_sigma = tf.sqrt(moving_var, 'sigma') r = tf.stop_gradient( tf.clip_by_value(tf.sqrt(batch_var / moving_var), 1.0 / rmax, rmax)) d = tf.stop_gradient( tf.clip_by_value((batch_mean - moving_mean) / moving_sigma, -dmax, dmax)) xn = xn * r + d #update_bn_ema(xn, batch_mean, batch_var, moving_mean, moving_var, decay) update_op1 = moving_averages.assign_moving_average(moving_mean, batch_mean, decay, zero_debias=False, name='mean_ema_op') update_op2 = moving_averages.assign_moving_average(moving_var, batch_var, decay, zero_debias=False, name='var_ema_op') add_model_variable(moving_mean) add_model_variable(moving_var) with tf.control_dependencies([update_op1, update_op2]): return tf.identity(xn, name='bn')
def freeze_affine_getter(getter, *args, **kwargs): # custom getter to freeze affine params inside bn name = args[0] if len(args) else kwargs.get('name') if name.endswith('/gamma') or name.endswith('/beta'): kwargs['trainable'] = False ret = getter(*args, **kwargs) add_model_variable(ret) else: ret = getter(*args, **kwargs) return ret
def custom_getter(getter, *args, **kwargs): trainable = kwargs.get('trainable', True) name = args[0] if len(args) else kwargs.get('name') if skip_collection: kwargs['trainable'] = False v = getter(*args, **kwargs) if skip_collection: add_model_variable(v) if trainable and stop_gradient: v = tf.stop_gradient(v, name='freezed_' + name) return v
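# Usage sketch (an assumption, not part of the snippets above): custom getters such as
# freeze_affine_getter are attached through tf.variable_scope's `custom_getter` argument,
# so every tf.get_variable call inside the scope is routed through them. The toy model
# below is hypothetical.
def _toy_affine(x):
    # stand-in for a real network; creates variables named .../gamma and .../beta
    gamma = tf.get_variable('bn/gamma', [1], initializer=tf.ones_initializer())
    beta = tf.get_variable('bn/beta', [1], initializer=tf.zeros_initializer())
    return x * gamma + beta

with tf.variable_scope('net', custom_getter=freeze_affine_getter):
    out = _toy_affine(tf.placeholder(tf.float32, [None, 1]))
# gamma/beta created above are now non-trainable and registered as model variables.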
def update_bn_ema(xn, batch_mean, batch_var, moving_mean, moving_var, decay): # TODO is there a way to use zero_debias in multi-GPU? update_op1 = moving_averages.assign_moving_average( moving_mean, batch_mean, decay, zero_debias=False, name='mean_ema_op') update_op2 = moving_averages.assign_moving_average( moving_var, batch_var, decay, zero_debias=False, name='var_ema_op') add_model_variable(moving_mean) add_model_variable(moving_var) with tf.control_dependencies([update_op1, update_op2]): return tf.identity(xn, name='output')
def update_bn_ema(output, batch_mean, batch_var, moving_mean, moving_var, decay): from tensorflow.contrib.framework import add_model_variable from tensorflow.python.training import moving_averages update_op1 = moving_averages.assign_moving_average(moving_mean, batch_mean, decay, zero_debias=False, name='mean_ema_op') update_op2 = moving_averages.assign_moving_average(moving_var, batch_var, decay, zero_debias=False, name='var_ema_op') add_model_variable(moving_mean) add_model_variable(moving_var) # seems faster than delayed update, but might behave otherwise in distributed settings. tf.compat.v1.add_to_collections(tf.compat.v1.GraphKeys.UPDATE_OPS, update_op1) tf.compat.v1.add_to_collections(tf.compat.v1.GraphKeys.UPDATE_OPS, update_op2) return tf.identity(output)
def _update_bn_ema(_xn, _batch_mean, _batch_var, _moving_mean, _moving_var, _decay):
    _update_op1 = moving_averages.assign_moving_average(
        _moving_mean, _batch_mean, _decay, zero_debias=False, name='mean_ema_op')
    _update_op2 = moving_averages.assign_moving_average(
        _moving_var, _batch_var, _decay, zero_debias=False, name='var_ema_op')
    add_model_variable(_moving_mean)
    add_model_variable(_moving_var)
    # seems faster than delayed update, but might behave otherwise in distributed settings.
    with tf.control_dependencies([_update_op1, _update_op2]):
        return tf.identity(_xn, name='output')
def update_bn_ema(xn, batch_mean, batch_var, moving_mean, moving_var, decay): # TODO is there a way to use zero_debias in multi-GPU? update_op1 = moving_averages.assign_moving_average( moving_mean, batch_mean, decay, zero_debias=False, name='mean_ema_op') update_op2 = moving_averages.assign_moving_average( moving_var, batch_var, decay, zero_debias=False, name='var_ema_op') add_model_variable(moving_mean) add_model_variable(moving_var) # seems faster than delayed update, but might behave otherwise in distributed settings. with tf.control_dependencies([update_op1, update_op2]): return tf.identity(xn, name='output')
def update_bn_ema(xn, batch_mean, batch_var, moving_mean, moving_var, decay): from tensorflow.contrib.framework import add_model_variable # TODO is there a way to use zero_debias in multi-GPU? update_op1 = moving_averages.assign_moving_average( moving_mean, batch_mean, decay, zero_debias=False, name='mean_ema_op') update_op2 = moving_averages.assign_moving_average( moving_var, batch_var, decay, zero_debias=False, name='var_ema_op') add_model_variable(moving_mean) add_model_variable(moving_var) # seems faster than delayed update, but might behave otherwise in distributed settings. with tf.control_dependencies([update_op1, update_op2]): return tf.identity(xn, name='output')
def update_bn_ema(xn, batch_mean, batch_var, moving_mean, moving_var, decay): # TODO is there a way to use zero_debias in multi-GPU? update_op1 = moving_averages.assign_moving_average( moving_mean, batch_mean, decay, zero_debias=False, name='mean_ema_op') update_op2 = moving_averages.assign_moving_average( moving_var, batch_var, decay, zero_debias=False, name='var_ema_op') # Only add to model var when we update them add_model_variable(moving_mean) add_model_variable(moving_var) # TODO add an option, and maybe enable it for replica mode? # with tf.control_dependencies([update_op1, update_op2]): # return tf.identity(xn, name='output') tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, update_op1) tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, update_op2) return xn
def update_bn_ema(xn, batch_mean, batch_var, moving_mean, moving_var, decay): # TODO is there a way to use zero_debias in multi-GPU? update_op1 = moving_averages.assign_moving_average( moving_mean, batch_mean, decay, zero_debias=False, name='mean_ema_op') update_op2 = moving_averages.assign_moving_average( moving_var, batch_var, decay, zero_debias=False, name='var_ema_op') add_model_variable(moving_mean) add_model_variable(moving_var) # seems faster than delayed update, but might behave otherwise in distributed settings. # TODO add an option, and maybe enable it for replica mode? # with tf.control_dependencies([update_op1, update_op2]): # return tf.identity(xn, name='output') tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, update_op1) tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, update_op2) return xn
def update_bn_ema(self, xn, batch_mean, batch_var, moving_mean, moving_var, decay): update_op1 = moving_averages.assign_moving_average(moving_mean, batch_mean, decay, zero_debias=False, name='mean_ema_op') update_op2 = moving_averages.assign_moving_average(moving_var, batch_var, decay, zero_debias=False, name='var_ema_op') # Only add to model var when we update them add_model_variable(moving_mean) add_model_variable(moving_var) tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, update_op1) tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, update_op2) return xn
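# Sketch (an assumption reconciling the variants above): a single update_bn_ema that
# exposes the `internal_update` flag used by the BatchNorm wrappers further down --
# either force the EMA update through control dependencies on the output, or hand the
# update ops to tf.GraphKeys.UPDATE_OPS for the training loop to run.
def update_bn_ema(xn, batch_mean, batch_var, moving_mean, moving_var, decay, internal_update=False):
    update_op1 = moving_averages.assign_moving_average(
        moving_mean, batch_mean, decay, zero_debias=False, name='mean_ema_op')
    update_op2 = moving_averages.assign_moving_average(
        moving_var, batch_var, decay, zero_debias=False, name='var_ema_op')
    add_model_variable(moving_mean)
    add_model_variable(moving_var)
    if internal_update:
        # seems faster than delayed update, but might behave otherwise in distributed settings.
        with tf.control_dependencies([update_op1, update_op2]):
            return tf.identity(xn, name='output')
    tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, update_op1)
    tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, update_op2)
    return tf.identity(xn, name='output')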
def update_ema(xn, moving_max, moving_min, decay): batch_max = tf.reduce_max(xn, axis=[0, 1, 2]) batch_min = tf.reduce_min(xn, axis=[0, 1, 2]) update_op1 = moving_averages.assign_moving_average(moving_max, batch_max, decay, zero_debias=False, name='max_ema_op') update_op2 = moving_averages.assign_moving_average(moving_min, batch_min, decay, zero_debias=False, name='min_ema_op') # Only add to model var when we update them add_model_variable(moving_min) add_model_variable(moving_max) tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, update_op1) tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, update_op2) return xn
def QuantizedWeight(name, x, n, nbit=2):
    """
    Quantize weight.
    Args:
        name (str): operator's name.
        x (tf.Tensor): a 4D tensor.
            Must have known number of channels, but can have other unknown dimensions.
        n (int or float): fan-in of the layer, used to set the He-style variance (2 / n)
            of the weight initialization.
        nbit (int): number of bits of quantized weight. Defaults to 2.
    Returns:
        tf.Tensor with attribute `variables`.
    Variable Names:
    * ``basis``: basis of quantized weight.
    Note:
        About multi-GPU training: moving averages across GPUs are not aggregated.
        Batch statistics are computed by main training tower.
        This is consistent with most frameworks.
    """
    num_filters = x.get_shape().as_list()[-1]
    init_basis = []
    base = NORM_PPF_0_75 * ((2. / n)**0.5) / (2**(nbit - 1))
    for j in range(nbit):
        init_basis.append([(2**j) * base for i in range(num_filters)])
    init_basis = tf.constant_initializer(init_basis)
    bit_dims = [nbit, num_filters]
    num_levels = 2**nbit
    delta = EPS
    # initialize level multiplier
    init_level_multiplier = []
    for i in range(num_levels):
        level_multiplier_i = [0. for j in range(nbit)]
        level_number = i
        for j in range(nbit):
            binary_code = level_number % 2
            if binary_code == 0:
                binary_code = -1
            level_multiplier_i[j] = float(binary_code)
            level_number = level_number // 2
        init_level_multiplier.append(level_multiplier_i)
    # initialize threshold multiplier
    init_thrs_multiplier = []
    for i in range(1, num_levels):
        thrs_multiplier_i = [0. for j in range(num_levels)]
        thrs_multiplier_i[i - 1] = 0.5
        thrs_multiplier_i[i] = 0.5
        init_thrs_multiplier.append(thrs_multiplier_i)

    with tf.variable_scope(name):
        basis = tf.get_variable('basis', bit_dims, tf.float32,
                                initializer=init_basis, trainable=False)
        level_codes = tf.constant(init_level_multiplier)
        thrs_multiplier = tf.constant(
            init_thrs_multiplier
        )  # ValueError: Cannot create a tensor proto whose content is larger than 2GB.
        sum_multiplier = tf.constant(
            1., shape=[1, tf.reshape(x, [-1, num_filters]).get_shape()[0]])
        sum_multiplier_basis = tf.constant(1., shape=[1, nbit])
        # calculate levels and sort
        levels = tf.matmul(level_codes, basis)
        levels, sort_id = tf.nn.top_k(tf.transpose(levels, [1, 0]), num_levels)
        levels = tf.reverse(levels, [-1])
        sort_id = tf.reverse(sort_id, [-1])
        levels = tf.transpose(levels, [1, 0])
        sort_id = tf.transpose(sort_id, [1, 0])
        # calculate threshold
        thrs = tf.matmul(thrs_multiplier, levels)
        # calculate level codes per channel
        reshape_x = tf.reshape(x, [-1, num_filters])
        level_codes_channelwise_dims = tf.stack([num_levels * num_filters, nbit])
        level_codes_channelwise = tf.fill(level_codes_channelwise_dims, 0.)
        for i in range(num_levels):
            eq = tf.equal(sort_id, i)
            level_codes_channelwise = tf.where(
                tf.reshape(eq, [-1]),
                level_codes_channelwise + level_codes[i],
                level_codes_channelwise)
        level_codes_channelwise = tf.reshape(
            level_codes_channelwise, [num_levels, num_filters, nbit])
        # calculate output y and its binary code
        y = tf.zeros_like(x) + levels[0]  # output
        zero_dims = tf.stack([tf.shape(reshape_x)[0] * num_filters, nbit])
        bits_y = tf.fill(zero_dims, -1.)
        zero_y = tf.zeros_like(x)
        zero_bits_y = tf.fill(zero_dims, 0.)
        zero_bits_y = tf.reshape(zero_bits_y, [-1, num_filters, nbit])
        for i in range(num_levels - 1):
            g = tf.greater(x, thrs[i])
            y = tf.where(g, zero_y + levels[i + 1], y)
            bits_y = tf.where(
                tf.reshape(g, [-1]),
                tf.reshape(zero_bits_y + level_codes_channelwise[i + 1], [-1, nbit]),
                bits_y)
        bits_y = tf.reshape(bits_y, [-1, num_filters, nbit])

        ctx = get_current_tower_context()  # current tower context
        # training
        if ctx.is_main_training_tower:
            BT = tf.transpose(bits_y, [2, 0, 1])
            # calculate BTxB
            BTxB = []
            for i in range(nbit):
                for j in range(nbit):
                    BTxBij = tf.multiply(BT[i], BT[j])
                    BTxBij = tf.matmul(sum_multiplier, BTxBij)
                    if i == j:
                        mat_one = tf.ones([1, num_filters])
                        BTxBij = BTxBij + (delta * mat_one)  # + E
                    BTxB.append(BTxBij)
            BTxB = tf.reshape(tf.stack(values=BTxB), [nbit, nbit, num_filters])
            # calculate inverse of BTxB
            if nbit > 2:
                BTxB_transpose = tf.transpose(BTxB, [2, 0, 1])
                # 1) naive
                # BTxB_inv = tf.matrix_inverse(BTxB_transpose)
                # 2) try/except; note the except branch only fires for errors raised while
                # building the graph, not for matrices that turn out singular at run time.
                try:
                    BTxB_inv = tf.matrix_inverse(BTxB_transpose, adjoint=None, name=None)
                except Exception:
                    # regularize with a small diagonal term before inverting
                    BTxB_ttt = BTxB_transpose + 1e-6 * tf.eye(nbit, batch_shape=[num_filters])
                    BTxB_inv = tf.matrix_inverse(BTxB_ttt, adjoint=None, name=None)
                BTxB_inv = tf.transpose(BTxB_inv, [1, 2, 0])
            elif nbit == 2:
                det = tf.multiply(BTxB[0][0], BTxB[1][1]) - tf.multiply(BTxB[0][1], BTxB[1][0])
                inv = []
                inv.append(BTxB[1][1] / det)
                inv.append(-BTxB[0][1] / det)
                inv.append(-BTxB[1][0] / det)
                inv.append(BTxB[0][0] / det)
                BTxB_inv = tf.reshape(tf.stack(values=inv), [nbit, nbit, num_filters])
            elif nbit == 1:
                BTxB_inv = tf.reciprocal(BTxB)
            # calculate BTxX
            BTxX = []
            for i in range(nbit):
                BTxXi0 = tf.multiply(BT[i], reshape_x)
                BTxXi0 = tf.matmul(sum_multiplier, BTxXi0)
                BTxX.append(BTxXi0)
            BTxX = tf.reshape(tf.stack(values=BTxX), [nbit, num_filters])
            BTxX = BTxX + (delta * basis)  # + basis
            # calculate new basis
            new_basis = []
            for i in range(nbit):
                new_basis_i = tf.multiply(BTxB_inv[i], BTxX)
                new_basis_i = tf.matmul(sum_multiplier_basis, new_basis_i)
                add_moving_summary(tf.reduce_mean(new_basis_i, name='new_basis_bit' + str(i)))
                new_basis.append(new_basis_i)
            new_basis = tf.reshape(tf.stack(values=new_basis), [nbit, num_filters])
            # create moving averages op
            update_moving_basis = moving_averages.assign_moving_average(
                basis, new_basis, MOVING_AVERAGES_FACTOR)
            add_model_variable(basis)
            tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, update_moving_basis)
            # add_moving_summary(tf.identity(basis, name='basis'), tf.identity(new_basis, name='basis_new'))
            # add_moving_summary(tf.identity(basis, name='basis'))
        y = x + tf.stop_gradient(-x) + tf.stop_gradient(y)  # gradient: y=x
        y.variables = VariableHolder(basis=basis)
        return y
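# Usage sketch (an assumption, not part of the snippets above): quantizing a conv kernel
# with QuantizedWeight. `in_ch`/`out_ch` and the surrounding layer are hypothetical, and
# the call assumes it runs under a tensorpack TowerContext; `n` is the fan-in that the
# basis initialization divides by.
in_ch, out_ch = 64, 128
kernel = tf.get_variable('W', [3, 3, in_ch, out_ch],
                         initializer=tf.variance_scaling_initializer(2.0))
kernel_q = QuantizedWeight('Wquant', kernel, n=3 * 3 * in_ch, nbit=2)
# feature_map = tf.nn.conv2d(feature_map, kernel_q, strides=[1, 1, 1, 1], padding='SAME')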
def BatchNormV2(x, use_local_stat=None, decay=0.9, epsilon=1e-5): """ Batch normalization layer as described in: `Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariance Shift <http://arxiv.org/abs/1502.03167>`_. :param input: a NHWC or NC tensor :param use_local_stat: bool. whether to use mean/var of this batch or the moving average. Default to True in training and False in inference. :param decay: decay rate. default to 0.9. :param epsilon: default to 1e-5. Note that only the first training tower maintains a moving average. """ shape = x.get_shape().as_list() assert len(shape) in [2, 4] n_out = shape[-1] # channel assert n_out is not None, "Input to BatchNorm cannot have unknown channels!" if len(shape) == 2: x = tf.reshape(x, [-1, 1, 1, n_out]) beta = tf.get_variable('beta', [n_out], initializer=tf.constant_initializer()) gamma = tf.get_variable('gamma', [n_out], initializer=tf.constant_initializer(1.0)) # x * gamma + beta ctx = get_current_tower_context() if use_local_stat is None: use_local_stat = ctx.is_training if use_local_stat != ctx.is_training: logger.warn("[BatchNorm] use_local_stat != is_training") moving_mean = tf.get_variable('mean/EMA', [n_out], initializer=tf.constant_initializer(), trainable=False) moving_var = tf.get_variable('variance/EMA', [n_out], initializer=tf.constant_initializer(), trainable=False) if use_local_stat: xn, batch_mean, batch_var = tf.nn.fused_batch_norm(x, gamma, beta, epsilon=epsilon, is_training=True) # maintain EMA only in the main training tower if ctx.is_main_training_tower: update_op1 = moving_averages.assign_moving_average( moving_mean, batch_mean, decay, zero_debias=False, name='mean_ema_op') update_op2 = moving_averages.assign_moving_average( moving_var, batch_var, decay, zero_debias=False, name='var_ema_op') add_model_variable(moving_mean) add_model_variable(moving_var) else: assert not ctx.is_training, "In training, local statistics has to be used!" # TODO do I need to add_model_variable. # consider some fixed-param tasks, such as load model and fine tune one layer # fused seems slower in inference #xn, _, _ = tf.nn.fused_batch_norm(x, gamma, beta, #moving_mean, moving_var, #epsilon=epsilon, is_training=False, name='output') xn = tf.nn.batch_normalization(x, moving_mean, moving_var, beta, gamma, epsilon) # TODO for other towers, maybe can make it depend some op later if ctx.is_main_training_tower: with tf.control_dependencies([update_op1, update_op2]): return tf.identity(xn, name='output') else: return tf.identity(xn, name='output')
def BatchNorm(x, use_local_stat=None, decay=0.9, epsilon=1e-5, use_scale=True, use_bias=True, gamma_init=tf.constant_initializer(1.0), data_format='NHWC', internal_update=False): """ Batch Normalization layer, as described in the paper: `Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariance Shift <http://arxiv.org/abs/1502.03167>`_. Args: x (tf.Tensor): a 4D or 2D tensor. When 4D, the layout should match data_format. use_local_stat (bool): whether to use mean/var of the current batch or the moving average. Defaults to True in training and False in inference. decay (float): decay rate of moving average. epsilon (float): epsilon to avoid divide-by-zero. use_scale, use_bias (bool): whether to use the extra affine transformation or not. gamma_init: initializer for gamma (the scale). internal_update (bool): if False, add EMA update ops to `tf.GraphKeys.UPDATE_OPS`. If True, update EMA inside the layer which will be slightly slower. Returns: tf.Tensor: a tensor named ``output`` with the same shape of x. Variable Names: * ``beta``: the bias term. Will be zero-inited by default. * ``gamma``: the scale term. Will be one-inited by default. Input will be transformed by ``x * gamma + beta``. * ``mean/EMA``: the moving average of mean. * ``variance/EMA``: the moving average of variance. Note: 1. About multi-GPU training: moving averages across GPUs are not aggregated. Batch statistics are computed independently. This is consistent with most frameworks. 2. Combinations of ``use_local_stat`` and ``ctx.is_training``: * ``use_local_stat == is_training``: standard BN, EMA are maintained during training and used during inference. * ``use_local_stat and not is_training``: still use local (batch) statistics in inference. * ``not use_local_stat and is_training``: use EMA to normalize in training. This is useful when you load a pre-trained BN and don't want to fine tune the EMA. EMA will not be updated in this case. """ shape = x.get_shape().as_list() ndims = len(shape) assert ndims in [2, 4] if ndims == 2: data_format = 'NHWC' if data_format == 'NCHW': n_out = shape[1] else: n_out = shape[-1] # channel assert n_out is not None, "Input to BatchNorm cannot have unknown channels!" beta, gamma, moving_mean, moving_var = get_bn_variables( n_out, use_scale, use_bias, gamma_init) ctx = get_current_tower_context() if use_local_stat is None: use_local_stat = ctx.is_training use_local_stat = bool(use_local_stat) if use_local_stat: if ndims == 2: x = tf.reshape(x, [-1, 1, 1, n_out]) # fused_bn only takes 4D input # fused_bn has error using NCHW? (see #190) xn, batch_mean, batch_var = tf.nn.fused_batch_norm( x, gamma, beta, epsilon=epsilon, is_training=True, data_format=data_format) if ndims == 2: xn = tf.squeeze(xn, [1, 2]) else: if ctx.is_training: assert get_tf_version_number() >= 1.4, \ "Fine tuning a BatchNorm model with fixed statistics is only " \ "supported after https://github.com/tensorflow/tensorflow/pull/12580 " if ctx.is_main_training_tower: # only warn in first tower logger.warn( "[BatchNorm] Using moving_mean/moving_variance in training." ) # Using moving_mean/moving_variance in training, which means we # loaded a pre-trained BN and only fine-tuning the affine part. 
xn, _, _ = tf.nn.fused_batch_norm(x, gamma, beta, mean=moving_mean, variance=moving_var, epsilon=epsilon, data_format=data_format, is_training=False) else: # non-fused op is faster for inference # TODO test if this is still true if ndims == 4 and data_format == 'NCHW': [g, b, mm, mv] = [ reshape_for_bn(_, ndims, n_out, data_format) for _ in [gamma, beta, moving_mean, moving_var] ] xn = tf.nn.batch_normalization(x, mm, mv, b, g, epsilon) else: # avoid the reshape if possible (when channel is the last dimension) xn = tf.nn.batch_normalization(x, moving_mean, moving_var, beta, gamma, epsilon) # maintain EMA only on one GPU is OK, even in replicated mode. # because training time doesn't use EMA if ctx.is_main_training_tower: add_model_variable(moving_mean) add_model_variable(moving_var) if ctx.is_main_training_tower and use_local_stat: ret = update_bn_ema(xn, batch_mean, batch_var, moving_mean, moving_var, decay, internal_update) else: ret = tf.identity(xn, name='output') vh = ret.variables = VariableHolder(mean=moving_mean, variance=moving_var) if use_scale: vh.gamma = gamma if use_bias: vh.beta = beta return ret
def QuantizedActiv(x, nbit=2):
    """
    Quantize activation.
    Args:
        x (tf.Tensor): a 4D tensor.
        nbit (int): number of bits of quantized activation. Defaults to 2.
    Returns:
        tf.Tensor with attribute `variables`.
    Variable Names:
    * ``basis``: basis of quantized activation.
    Note:
        About multi-GPU training: moving averages across GPUs are not aggregated.
        Batch statistics are computed by main training tower.
        This is consistent with most frameworks.
    """
    init_basis = [(NORM_PPF_0_75 * 2 / (2**nbit - 1)) * (2.**i) for i in range(nbit)]  # initialize the basis vector
    init_basis = tf.constant_initializer(init_basis)
    bit_dims = [nbit, 1]
    num_levels = 2**nbit  # number of quantization levels
    # initialize level multiplier:
    # a list of 2**nbit entries, each an nbit-wide {0,1} code; e.g. level 7
    # corresponds to the code {bits 0,1,2,3: 1,1,1,0}.
    init_level_multiplier = []
    for i in range(0, num_levels):
        level_multiplier_i = [0. for j in range(nbit)]
        level_number = i
        for j in range(nbit):
            level_multiplier_i[j] = float(level_number % 2)
            level_number = level_number // 2
        init_level_multiplier.append(level_multiplier_i)
    # initialize threshold multiplier:
    # 2**nbit - 1 rows, each a list of 2**nbit initial values, e.g.
    # [[0.5,0.5,0,...,0], [0,0.5,0.5,0,...,0], ..., [0,...,0,0.5,0.5]]
    init_thrs_multiplier = []
    for i in range(1, num_levels):
        thrs_multiplier_i = [0. for j in range(num_levels)]
        thrs_multiplier_i[i - 1] = 0.5
        thrs_multiplier_i[i] = 0.5
        init_thrs_multiplier.append(thrs_multiplier_i)
    # init_thrs_multiplier shape: 15 x 16 (for nbit=4)
    with tf.variable_scope('ActivationQuantization'):
        basis = tf.get_variable('basis', bit_dims, tf.float32,
                                initializer=init_basis, trainable=False)
        ctx = get_current_tower_context()  # current tower context
        # calculate levels and sort
        level_codes = tf.constant(init_level_multiplier)
        levels = tf.matmul(level_codes, basis)  # V*B: quantized value of each level, e.g. 0, v1, v2, v1+v2, v3, v3+v1
        levels, sort_id = tf.nn.top_k(tf.transpose(levels, [1, 0]), num_levels)  # sorted in descending order
        levels = tf.reverse(levels, [-1])    # back to ascending order
        sort_id = tf.reverse(sort_id, [-1])  # matching indices, ascending
        levels = tf.transpose(levels, [1, 0])  # for nbit=4, levels shape: 16 x 1
        sort_id = tf.transpose(sort_id, [1, 0])
        # calculate threshold
        thrs_multiplier = tf.constant(init_thrs_multiplier)  # shape: 15 x 16
        thrs = tf.matmul(thrs_multiplier, levels)  # per-level thresholds, (q(l-1)+q(l))/2 in the paper; shape: 15x16 * 16x1 = 15x1
        # calculate output y and its binary code;
        # allocate space for the quantized values and codes (example: x of shape 1x5x2x2)
        y = tf.zeros_like(x)  # output
        reshape_x = tf.reshape(x, [-1])  # shape: 1*5*2*2 = 20
        zero_dims = tf.stack([tf.shape(reshape_x)[0], nbit])  # shape: 20 x 4
        bits_y = tf.fill(zero_dims, 0.)
        zero_y = tf.zeros_like(x)
        zero_bits_y = tf.fill(zero_dims, 0.)
        for i in range(num_levels - 1):
            g = tf.greater(x, thrs[i])  # compare against this level's threshold
            y = tf.where(g, zero_y + levels[i + 1], y)  # entries above the threshold take the next level's quantized value
            bits_y = tf.where(tf.reshape(g, [-1]),
                              zero_bits_y + level_codes[sort_id[i + 1][0]],
                              bits_y)  # and its binary code
        # training
        if ctx.is_main_training_tower:
            BT = tf.matrix_transpose(bits_y)  # shape: 4 x 20
            # calculate BTxB
            BTxB = []
            for i in range(nbit):
                for j in range(nbit):
                    BTxBij = tf.multiply(BT[i], BT[j])
                    BTxBij = tf.reduce_sum(BTxBij)
                    BTxB.append(BTxBij)
            BTxB = tf.reshape(tf.stack(values=BTxB), [nbit, nbit])
            BTxB_inv = tf.matrix_inverse(BTxB)
            # calculate BTxX
            BTxX = []
            for i in range(nbit):
                BTxXi0 = tf.multiply(BT[i], reshape_x)
                BTxXi0 = tf.reduce_sum(BTxXi0)
                BTxX.append(BTxXi0)
            BTxX = tf.reshape(tf.stack(values=BTxX), [nbit, 1])
            new_basis = tf.matmul(BTxB_inv, BTxX)  # calculate new basis
            # create moving averages op
            update_moving_basis = moving_averages.assign_moving_average(
                basis, new_basis, MOVING_AVERAGES_FACTOR)
            add_model_variable(basis)
            tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, update_moving_basis)
            for i in range(nbit):
                tf.summary.scalar('basis%d' % i, new_basis[i][0])
        x_clip = tf.minimum(x, levels[num_levels - 1])  # gradient clip
        y = x_clip + tf.stop_gradient(-x_clip) + tf.stop_gradient(y)  # gradient: y=clip(x)
        y.variables = VariableHolder(basis=basis)
        return y
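# Usage sketch (an assumption, not part of the snippets above): quantizing activations
# after a non-linearity with QuantizedActiv. The placeholder shape is arbitrary and the
# call assumes it runs under a tensorpack TowerContext.
l = tf.placeholder(tf.float32, [32, 56, 56, 64])
l = tf.nn.relu(l)
l = QuantizedActiv(l, nbit=2)  # forward: quantized values; backward: gradient of clip(x)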
def BatchNorm(x, use_local_stat=None, decay=0.9, epsilon=1e-5): """ Batch normalization layer, as described in the paper: `Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariance Shift <http://arxiv.org/abs/1502.03167>`_. Args: x (tf.Tensor): a NHWC or NC tensor. use_local_stat (bool): whether to use mean/var of the current batch or the moving average. Defaults to True in training and False in inference. decay (float): decay rate of moving average. epsilon (float): epsilon to avoid divide-by-zero. Returns: tf.Tensor: a tensor named ``output`` with the same shape of x. Variable Names: * ``beta``: the bias term. * ``gamma``: the scale term. Input will be transformed by ``x * gamma + beta``. * ``mean/EMA``: the moving average of mean. * ``variance/EMA``: the moving average of variance. Note: In multi-tower training, only the first training tower maintains a moving average. This is consistent with most frameworks. """ shape = x.get_shape().as_list() assert len(shape) in [2, 4] n_out = shape[-1] # channel assert n_out is not None, "Input to BatchNorm cannot have unknown channels!" if len(shape) == 2: x = tf.reshape(x, [-1, 1, 1, n_out]) beta = tf.get_variable('beta', [n_out], initializer=tf.constant_initializer()) gamma = tf.get_variable('gamma', [n_out], initializer=tf.constant_initializer(1.0)) # x * gamma + beta ctx = get_current_tower_context() if use_local_stat is None: use_local_stat = ctx.is_training if use_local_stat != ctx.is_training: logger.warn("[BatchNorm] use_local_stat != is_training") moving_mean = tf.get_variable('mean/EMA', [n_out], initializer=tf.constant_initializer(), trainable=False) moving_var = tf.get_variable('variance/EMA', [n_out], initializer=tf.constant_initializer(), trainable=False) if use_local_stat: xn, batch_mean, batch_var = tf.nn.fused_batch_norm(x, gamma, beta, epsilon=epsilon, is_training=True) # maintain EMA only in the main training tower if ctx.is_main_training_tower: # TODO a way to use debias in multitower. update_op1 = moving_averages.assign_moving_average( moving_mean, batch_mean, decay, zero_debias=False, name='mean_ema_op') update_op2 = moving_averages.assign_moving_average( moving_var, batch_var, decay, zero_debias=False, name='var_ema_op') add_model_variable(moving_mean) add_model_variable(moving_var) else: assert not ctx.is_training, "In training, local statistics has to be used!" # TODO do I need to add_model_variable. # consider some fixed-param tasks, such as load model and fine tune one layer # fused seems slower in inference # xn, _, _ = tf.nn.fused_batch_norm(x, gamma, beta, # moving_mean, moving_var, # epsilon=epsilon, is_training=False, name='output') xn = tf.nn.batch_normalization(x, moving_mean, moving_var, beta, gamma, epsilon) if len(shape) == 2: xn = tf.squeeze(xn, [1, 2]) # TODO for other towers, maybe can make it depend some op later # TODO update it later (similar to slim) might be faster? if ctx.is_main_training_tower: with tf.control_dependencies([update_op1, update_op2]): return tf.identity(xn, name='output') else: return tf.identity(xn, name='output')
def BatchNorm(inputs, training=None, momentum=0.9, epsilon=1e-5, center=True, scale=True, gamma_initializer=tf.ones_initializer(), data_format='channels_last', internal_update=False): """ Mostly equivalent to `tf.layers.batch_normalization`, but difference in the following: 1. Accepts `data_format` rather than `axis`. For 2D input, this argument will be ignored. 2. Default value for `momentum` and `epsilon` is different. 3. Default value for `training` is automatically obtained from `TowerContext`. 4. Support the `internal_update` option. Args: internal_update (bool): if False, add EMA update ops to `tf.GraphKeys.UPDATE_OPS`. If True, update EMA inside the layer by control dependencies. Variable Names: * ``beta``: the bias term. Will be zero-inited by default. * ``gamma``: the scale term. Will be one-inited by default. Input will be transformed by ``x * gamma + beta``. * ``mean/EMA``: the moving average of mean. * ``variance/EMA``: the moving average of variance. Note: 1. About multi-GPU training: moving averages across GPUs are not aggregated. Batch statistics are computed independently. This is consistent with most frameworks. 2. Combinations of ``training`` and ``ctx.is_training``: * ``training == ctx.is_training``: standard BN, EMA are maintained during training and used during inference. This is the default. * ``training and not ctx.is_training``: still use batch statistics in inference. * ``not training and ctx.is_training``: use EMA to normalize in training. This is useful when you load a pre-trained BN and don't want to fine tune the EMA. EMA will not be updated in this case. """ data_format = get_data_format(data_format, tfmode=False) shape = inputs.get_shape().as_list() ndims = len(shape) assert ndims in [2, 4] if ndims == 2: data_format = 'NHWC' if data_format == 'NCHW': n_out = shape[1] else: n_out = shape[-1] # channel assert n_out is not None, "Input to BatchNorm cannot have unknown channels!" beta, gamma, moving_mean, moving_var = get_bn_variables( n_out, scale, center, gamma_initializer) ctx = get_current_tower_context() use_local_stat = training if use_local_stat is None: use_local_stat = ctx.is_training use_local_stat = bool(use_local_stat) if use_local_stat: if ndims == 2: inputs = tf.reshape( inputs, [-1, 1, 1, n_out]) # fused_bn only takes 4D input # fused_bn has error using NCHW? (see #190) xn, batch_mean, batch_var = tf.nn.fused_batch_norm( inputs, gamma, beta, epsilon=epsilon, is_training=True, data_format=data_format) if ndims == 2: xn = tf.squeeze(xn, [1, 2]) else: if ctx.is_training: assert get_tf_version_tuple() >= (1, 4), \ "Fine tuning a BatchNorm model with fixed statistics is only " \ "supported after https://github.com/tensorflow/tensorflow/pull/12580 " if ctx.is_main_training_tower: # only warn in first tower logger.warn( "[BatchNorm] Using moving_mean/moving_variance in training." ) # Using moving_mean/moving_variance in training, which means we # loaded a pre-trained BN and only fine-tuning the affine part. xn, _, _ = tf.nn.fused_batch_norm(inputs, gamma, beta, mean=moving_mean, variance=moving_var, epsilon=epsilon, data_format=data_format, is_training=False) else: if ndims == 4: xn, _, _ = tf.nn.fused_batch_norm(inputs, gamma, beta, mean=moving_mean, variance=moving_var, epsilon=epsilon, data_format=data_format, is_training=False) else: xn = tf.nn.batch_normalization(inputs, moving_mean, moving_var, beta, gamma, epsilon) # maintain EMA only on one GPU is OK, even in replicated mode. 
# because training time doesn't use EMA if ctx.is_main_training_tower: add_model_variable(moving_mean) add_model_variable(moving_var) if ctx.is_main_training_tower and use_local_stat: ret = update_bn_ema(xn, batch_mean, batch_var, moving_mean, moving_var, momentum, internal_update) else: ret = tf.identity(xn, name='output') vh = ret.variables = VariableHolder(mean=moving_mean, variance=moving_var) if scale: vh.gamma = gamma if center: vh.beta = beta return ret
def BatchRenorm(x, rmax, dmax, decay=0.9, epsilon=1e-5, use_scale=True, use_bias=True, gamma_init=None, data_format='channels_last'): """ Batch Renormalization layer, as described in the paper: `Batch Renormalization: Towards Reducing Minibatch Dependence in Batch-Normalized Models <https://arxiv.org/abs/1702.03275>`_. This implementation is a wrapper around `tf.layers.batch_normalization`. Args: x (tf.Tensor): a NHWC or NC tensor. rmax, dmax (tf.Tensor): a scalar tensor, the maximum allowed corrections. decay (float): decay rate of moving average. epsilon (float): epsilon to avoid divide-by-zero. use_scale, use_bias (bool): whether to use the extra affine transformation or not. Returns: tf.Tensor: a tensor named ``output`` with the same shape of x. Variable Names: * ``beta``: the bias term. * ``gamma``: the scale term. Input will be transformed by ``x * gamma + beta``. * ``moving_mean, renorm_mean, renorm_mean_weight``: See TF documentation. * ``moving_variance, renorm_stddev, renorm_stddev_weight``: See TF documentation. """ shape = x.get_shape().as_list() ndims = len(shape) assert ndims in [2, 4] if ndims == 2: data_format = 'channels_last' # error using NCHW? (see #190) x = tf.reshape(x, [-1, 1, 1, shape[1]]) ctx = get_current_tower_context() coll_bk = backup_collection([tf.GraphKeys.UPDATE_OPS]) layer = tf.layers.BatchNormalization( axis=1 if data_format == 'channels_first' else 3, momentum=decay, epsilon=epsilon, center=use_bias, scale=use_scale, renorm=True, renorm_clipping={ 'rmin': 1.0 / rmax, 'rmax': rmax, 'dmax': dmax}, renorm_momentum=0.99, gamma_initializer=gamma_init, fused=False) xn = layer.apply(x, training=ctx.is_training, scope=tf.get_variable_scope()) if ctx.is_main_training_tower: for v in layer.non_trainable_variables: add_model_variable(v) else: # only run UPDATE_OPS in the first tower restore_collection(coll_bk) if ndims == 2: xn = tf.squeeze(xn, [1, 2]) ret = tf.identity(xn, name='output') # TODO not sure whether to add moving_mean/moving_var to VH now vh = ret.variables = VariableHolder() if use_scale: vh.gamma = layer.gamma if use_bias: vh.beta = layer.beta return ret
def BatchNormV1(x, use_local_stat=None, decay=0.9, epsilon=1e-5): shape = x.get_shape().as_list() assert len(shape) in [2, 4] n_out = shape[-1] # channel assert n_out is not None beta = tf.get_variable('beta', [n_out], initializer=tf.constant_initializer()) gamma = tf.get_variable('gamma', [n_out], initializer=tf.constant_initializer(1.0)) if len(shape) == 2: batch_mean, batch_var = tf.nn.moments(x, [0], keep_dims=False) else: batch_mean, batch_var = tf.nn.moments(x, [0, 1, 2], keep_dims=False) # just to make a clear name. batch_mean = tf.identity(batch_mean, 'mean') batch_var = tf.identity(batch_var, 'variance') emaname = 'EMA' ctx = get_current_tower_context() if use_local_stat is None: use_local_stat = ctx.is_training if use_local_stat != ctx.is_training: logger.warn("[BatchNorm] use_local_stat != is_training") if use_local_stat: # training tower if ctx.is_training: # reuse = tf.get_variable_scope().reuse with tf.variable_scope(tf.get_variable_scope(), reuse=False): # BatchNorm in reuse scope can be tricky! Moving mean/variance are not reused with tf.name_scope(None): # https://github.com/tensorflow/tensorflow/issues/2740 # if reuse=True, try to find and use the existing statistics # how to use multiple tensors to update one EMA? seems impossbile ema = tf.train.ExponentialMovingAverage(decay=decay, name=emaname) ema_apply_op = ema.apply([batch_mean, batch_var]) ema_mean, ema_var = ema.average(batch_mean), ema.average(batch_var) if ctx.is_main_training_tower: # inside main training tower add_model_variable(ema_mean) add_model_variable(ema_var) else: # no apply() is called here, no magic vars will get created, # no reuse issue will happen assert not ctx.is_training with tf.name_scope(None): ema = tf.train.ExponentialMovingAverage(decay=decay, name=emaname) mean_var_name = ema.average_name(batch_mean) var_var_name = ema.average_name(batch_var) if ctx.is_main_tower: # main tower, but needs to use global stat. global stat must be from outside # when reuse=True, the desired variable name could # actually be different, because a different var is created # for different reuse tower ema_mean = tf.get_variable('mean/' + emaname, [n_out]) ema_var = tf.get_variable('variance/' + emaname, [n_out]) else: # use statistics in another tower G = tf.get_default_graph() ema_mean = ctx.find_tensor_in_main_tower(G, mean_var_name + ':0') ema_var = ctx.find_tensor_in_main_tower(G, var_var_name + ':0') if use_local_stat: batch = tf.cast(tf.shape(x)[0], tf.float32) mul = tf.where(tf.equal(batch, 1.0), 1.0, batch / (batch - 1)) batch_var = batch_var * mul # use unbiased variance estimator in training with tf.control_dependencies([ema_apply_op] if ctx.is_training else []): # only apply EMA op if is_training return tf.nn.batch_normalization( x, batch_mean, batch_var, beta, gamma, epsilon, 'output') else: return tf.nn.batch_normalization( x, ema_mean, ema_var, beta, gamma, epsilon, 'output')
def BatchRenorm(x, rmax, dmax, momentum=0.9, epsilon=1e-5, center=True, scale=True, gamma_initializer=None, data_format='channels_last'): """ Batch Renormalization layer, as described in the paper: `Batch Renormalization: Towards Reducing Minibatch Dependence in Batch-Normalized Models <https://arxiv.org/abs/1702.03275>`_. This implementation is a wrapper around `tf.layers.batch_normalization`. Args: x (tf.Tensor): a NHWC or NC tensor. rmax, dmax (tf.Tensor): a scalar tensor, the maximum allowed corrections. decay (float): decay rate of moving average. epsilon (float): epsilon to avoid divide-by-zero. use_scale, use_bias (bool): whether to use the extra affine transformation or not. Returns: tf.Tensor: a tensor named ``output`` with the same shape of x. Variable Names: * ``beta``: the bias term. * ``gamma``: the scale term. Input will be transformed by ``x * gamma + beta``. * ``moving_mean, renorm_mean, renorm_mean_weight``: See TF documentation. * ``moving_variance, renorm_stddev, renorm_stddev_weight``: See TF documentation. """ shape = x.get_shape().as_list() ndims = len(shape) assert ndims in [2, 4] if ndims == 2: data_format = 'channels_first' ctx = get_current_tower_context() coll_bk = backup_collection([tf.GraphKeys.UPDATE_OPS]) layer = tf.layers.BatchNormalization( axis=1 if data_format == 'channels_first' else 3, momentum=momentum, epsilon=epsilon, center=center, scale=scale, renorm=True, renorm_clipping={ 'rmin': 1.0 / rmax, 'rmax': rmax, 'dmax': dmax }, renorm_momentum=0.99, gamma_initializer=gamma_initializer, fused=False, _reuse=tf.get_variable_scope().reuse) xn = layer.apply(x, training=ctx.is_training, scope=tf.get_variable_scope()) if ctx.is_main_training_tower: for v in layer.non_trainable_variables: add_model_variable(v) else: # only run UPDATE_OPS in the first tower restore_collection(coll_bk) if ndims == 2: xn = tf.squeeze(xn, [1, 2]) ret = tf.identity(xn, name='output') # TODO not sure whether to add moving_mean/moving_var to VH now vh = ret.variables = VariableHolder() if scale: vh.gamma = layer.gamma if center: vh.beta = layer.beta return ret
def sync_batch_norm(inputs, decay=0.999, axis=-1, epsilon=0.001, activation_fn=None, updates_collections=tf.GraphKeys.UPDATE_OPS, is_training=True, reuse=None, variables_collections=None, trainable=True, scope=None, num_dev=1): ''' num_dev is how many gpus you use. this function is from https://github.com/jianlong-yuan/syncbn-tensorflow/blob/master/syncbn.py ''' # shape of inputs is [batch, height, width, depth] num_outputs = inputs.get_shape().as_list()[-1] # print (f"num_outputs = {num_outputs}") # 3 if scope is None: scope = 'batch_normalization' with tf.variable_scope(scope, reuse=reuse): # initializer, gamma and beta is trainable, moving_mean and moving_var is not gamma = tf.get_variable(name='gamma', shape=[num_outputs], dtype=tf.float32, initializer=tf.constant_initializer(1.0), trainable=trainable, collections=variables_collections) beta = tf.get_variable(name='beta', shape=[num_outputs], dtype=tf.float32, initializer=tf.constant_initializer(0.0), trainable=trainable, collections=variables_collections) moving_mean = tf.get_variable(name='moving_mean', shape=[num_outputs], dtype=tf.float32, initializer=tf.constant_initializer(0.0), trainable=False, collections=variables_collections) moving_var = tf.get_variable(name='moving_variance', shape=[num_outputs], dtype=tf.float32, initializer=tf.constant_initializer(1.0), trainable=False, collections=variables_collections) # is_training and trainable is logical and # this is same with [math_ops.logical_and())] # (https://github.com/tensorflow/tensorflow/blob/ # 508f76b1d9925304cedd56d51480ec380636cb82/tensorflow/ # python/keras/layers/normalization.py#L621) if is_training and trainable: # only one GPU if num_dev == 1: mean, var = tf.nn.moments(inputs, axes=axis) # multi GPUs else: # avarage moving_mean and moving_var in multi GPUs shared_name = tf.get_variable_scope().name batch_mean = tf.reduce_mean(inputs, axis=axis) batch_mean = gen_nccl_ops.nccl_all_reduce( input=batch_mean, reduction='sum', num_devices=num_dev, shared_name=shared_name + '_NCCL_mean') * (1.0 / num_dev) batch_mean_square = tf.reduce_mean(tf.square(inputs), axis=axis) batch_mean_square = gen_nccl_ops.nccl_all_reduce( input=batch_mean_square, reduction='sum', num_devices=num_dev, shared_name=shared_name + '_NCCL_mean_square') * (1.0 / num_dev) mean = batch_mean var = batch_mean_square - tf.square(batch_mean) outputs = tf.nn.batch_normalization(inputs, mean, var, beta, gamma, epsilon) # print (outputs.device) # /device:GPU:1 # those code block is executed in every GPUs # just assign moving_mean and moving_var in GPU:0 if int(outputs.device[-1]) == 0: update_moving_mean_op = tf.assign( moving_mean, moving_mean * decay + mean * (1 - decay)) update_moving_var_op = tf.assign( moving_var, moving_var * decay + var * (1 - decay)) add_model_variable(moving_mean) add_model_variable(moving_var) if updates_collections is None: with tf.control_dependencies( [update_moving_mean_op, update_moving_var_op]): outputs = tf.identity(outputs) else: tf.add_to_collections(updates_collections, update_moving_mean_op) tf.add_to_collections(updates_collections, update_moving_var_op) outputs = tf.identity(outputs) else: outputs = tf.identity(outputs) else: outputs, _, _ = tf.nn.fused_batch_norm(inputs, gamma, beta, mean=moving_mean, variance=moving_var, epsilon=epsilon, is_training=False) if activation_fn is not None: outputs = activation_fn(outputs) return outputs
def syncBatchNorm(inputs, axis=-1, momentum=0.99, epsilon=0.001, updates_collections=tf.GraphKeys.UPDATE_OPS, reuse=None, variables_collections=None, training=False, trainable=True, name=None, GPUNumber=1): ''' this function is from https://github.com/jianlong-yuan/syncbn-tensorflow/blob/master/syncbn.py ''' shapeList = inputs.get_shape().as_list() num_outputs = shapeList[axis] # print (f"num_outputs = {num_outputs}") # 512 axes = [i for i in range(len(shapeList))] # when the dimension is 1, axes = [], this also run well! del axes[axis] # print (f"axes = {axes}") # [0, 1, 2] if name is None: name = 'batch_normalization' with tf.variable_scope(name, reuse=reuse) as scope: # initializer, gamma and beta is trainable, moving_mean and moving_var is not gamma = tf.get_variable(name='gamma', shape=[num_outputs], dtype=tf.float32, initializer=tf.constant_initializer(1.0), trainable=trainable, collections=variables_collections) beta = tf.get_variable(name='beta', shape=[num_outputs], dtype=tf.float32, initializer=tf.constant_initializer(0.0), trainable=trainable, collections=variables_collections) moving_mean = tf.get_variable(name='moving_mean', shape=[num_outputs], dtype=tf.float32, initializer=tf.constant_initializer(0.0), trainable=False, collections=variables_collections) moving_var = tf.get_variable(name='moving_variance', shape=[num_outputs], dtype=tf.float32, initializer=tf.constant_initializer(1.0), trainable=False, collections=variables_collections) def branchTrue(): ''' update the batch mean and batch variance ''' # only one GPU if GPUNumber == 1: batch_mean = tf.reduce_mean(inputs, axis=axes, name="batch_mean") batch_mean_square = tf.reduce_mean(tf.square(inputs), axis=axes) # multi GPUs else: # avarage moving_mean and moving_var in multi GPUs shared_name = re.sub('tower[0-9]+/', '', tf.get_variable_scope().name) batch_mean = tf.reduce_mean(inputs, axis=axes) # Utilize NCCL batch_mean = gen_nccl_ops.nccl_all_reduce( input=batch_mean, reduction='sum', num_devices=GPUNumber, shared_name=shared_name + '_NCCL_mean') * (1.0 / GPUNumber) batch_mean_square = tf.reduce_mean(tf.square(inputs), axis=axes) batch_mean_square = gen_nccl_ops.nccl_all_reduce( input=batch_mean_square, reduction='sum', num_devices=GPUNumber, shared_name=shared_name + '_NCCL_mean_square') * (1.0 / GPUNumber) batch_var = batch_mean_square - tf.square(batch_mean) outputs = tf.nn.batch_normalization(inputs, batch_mean, batch_var, beta, gamma, epsilon) return outputs, batch_mean, batch_var def branchFalse(): ''' the same with moving_mean and moving_var ''' outputs = tf.nn.batch_normalization(inputs, moving_mean, moving_var, beta, gamma, epsilon) # use the default tensor, this code will not update moving_mean and moving_var # for batch_mean+(moving_mean-batch_mean)*momentum = moving_mean # is batch_mean == moving_mean with tf.variable_scope(scope, reuse=tf.AUTO_REUSE): batch_mean = tf.get_variable("moving_mean") batch_var = tf.get_variable("moving_variance") return outputs, batch_mean, batch_var outputs, batch_mean, batch_var = tf.cond( tf.math.logical_and(training, trainable), branchTrue, branchFalse) # those code block is executed in every GPUs # just assign moving_mean and moving_var in GPU:0 if int(outputs.device[-1]) == 0: update_moving_mean_op = tf.assign( moving_mean, batch_mean + (moving_mean - batch_mean) * momentum) update_moving_var_op = tf.assign( moving_var, batch_var + (moving_var - batch_var) * momentum) add_model_variable(moving_mean) add_model_variable(moving_var) if updates_collections is None: with 
tf.control_dependencies( [update_moving_mean_op, update_moving_var_op]): outputs = tf.identity(outputs) else: tf.add_to_collections(updates_collections, update_moving_mean_op) tf.add_to_collections(updates_collections, update_moving_var_op) outputs = tf.identity(outputs) else: outputs = tf.identity(outputs) return outputs
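# Usage sketch (an assumption, not part of the snippets above): calling syncBatchNorm
# inside a per-GPU tower loop so that the NCCL all-reduce above averages batch statistics
# across devices. `tower_inputs` and the tower/device naming are hypothetical; the
# 'tower%d' prefix matches the re.sub pattern used for shared_name.
outputs = []
for gpu_id in range(4):
    with tf.device('/gpu:%d' % gpu_id), tf.variable_scope('tower%d' % gpu_id):
        out = syncBatchNorm(tower_inputs[gpu_id], training=True, trainable=True,
                            GPUNumber=4, name='bn')
        outputs.append(out)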
def BatchNorm(x, use_local_stat=False, decay=0.9, epsilon=1e-5, use_scale=True, use_bias=True, gamma_init=tf.constant_initializer(1.0), data_format='NCHW', internal_update=False, scope="bn"): global is_training with tf.variable_scope(scope): shape = x.get_shape().as_list() ndims = len(shape) assert ndims in [2, 4] if ndims == 2: data_format = 'NHWC' if data_format == 'NCHW': n_out = shape[1] else: n_out = shape[-1] # channel assert n_out is not None, "Input to BatchNorm cannot have unknown channels!" beta, gamma, moving_mean, moving_var = get_bn_variables( n_out, use_scale, use_bias, gamma_init) use_local_stat = bool(use_local_stat) if use_local_stat: if ndims == 2: x = tf.reshape( x, [-1, 1, 1, n_out]) # fused_bn only takes 4D input # fused_bn has error using NCHW? (see #190) xn, batch_mean, batch_var = tf.nn.fused_batch_norm( x, gamma, beta, epsilon=epsilon, is_training=True, data_format=data_format) if ndims == 2: xn = tf.squeeze(xn, [1, 2]) else: if is_training: # so ugly #assert get_tf_version_number() >= 1.4, \ # "Fine tuning a BatchNorm model with fixed statistics is only " \ # "supported after https://github.com/tensorflow/tensorflow/pull/12580 " #if ctx.is_main_training_tower: # only warn in first tower # logger.warn("[BatchNorm] Using moving_mean/moving_variance in training.") # Using moving_mean/moving_variance in training, which means we # loaded a pre-trained BN and only fine-tuning the affine part. xn, _, _ = tf.nn.fused_batch_norm(x, gamma, beta, mean=moving_mean, variance=moving_var, epsilon=epsilon, data_format=data_format, is_training=False) else: # non-fused op is faster for inference # TODO test if this is still true if ndims == 4 and data_format == 'NCHW': [g, b, mm, mv] = [ reshape_for_bn(_, ndims, n_out, data_format) for _ in [gamma, beta, moving_mean, moving_var] ] xn = tf.nn.batch_normalization(x, mm, mv, b, g, epsilon) else: # avoid the reshape if possible (when channel is the last dimension) xn = tf.nn.batch_normalization(x, moving_mean, moving_var, beta, gamma, epsilon) # maintain EMA only on one GPU is OK, even in replicated mode. # because training time doesn't use EMA #if ctx.is_main_training_tower: add_model_variable(moving_mean) add_model_variable(moving_var) if use_local_stat: # and ctx.is_main_training_tower: ret = update_bn_ema(xn, batch_mean, batch_var, moving_mean, moving_var, decay, internal_update) else: ret = tf.identity(xn, name='output') return ret
def QuantizedActiv(x, nbit=2): """ Quantize activation. Args: x (tf.Tensor): a 4D tensor. nbit (int): number of bits of quantized activation. Defaults to 2. Returns: tf.Tensor with attribute `variables`. Variable Names: * ``basis``: basis of quantized activation. Note: About multi-GPU training: moving averages across GPUs are not aggregated. Batch statistics are computed by main training tower. This is consistent with most frameworks. """ init_basis = [(NORM_PPF_0_75 * 2 / (2**nbit - 1)) * (2.**i) for i in range(nbit)] init_basis = tf.constant_initializer(init_basis) bit_dims = [nbit, 1] num_levels = 2**nbit # initialize level multiplier init_level_multiplier = [] for i in range(0, num_levels): level_multiplier_i = [0. for j in range(nbit)] level_number = i for j in range(nbit): level_multiplier_i[j] = float(level_number % 2) level_number = level_number // 2 init_level_multiplier.append(level_multiplier_i) # initialize threshold multiplier init_thrs_multiplier = [] for i in range(1, num_levels): thrs_multiplier_i = [0. for j in range(num_levels)] thrs_multiplier_i[i - 1] = 0.5 thrs_multiplier_i[i] = 0.5 init_thrs_multiplier.append(thrs_multiplier_i) with tf.variable_scope('ActivationQuantization'): basis = tf.get_variable('basis', bit_dims, tf.float32, initializer=init_basis, trainable=False) ctx = get_current_tower_context() # current tower context # calculate levels and sort level_codes = tf.constant(init_level_multiplier) levels = tf.matmul(level_codes, basis) levels, sort_id = tf.nn.top_k(tf.transpose(levels, [1, 0]), num_levels) levels = tf.reverse(levels, [-1]) sort_id = tf.reverse(sort_id, [-1]) levels = tf.transpose(levels, [1, 0]) sort_id = tf.transpose(sort_id, [1, 0]) # calculate threshold thrs_multiplier = tf.constant(init_thrs_multiplier) thrs = tf.matmul(thrs_multiplier, levels) # calculate output y and its binary code y = tf.zeros_like(x) # output reshape_x = tf.reshape(x, [-1]) zero_dims = tf.stack([tf.shape(reshape_x)[0], nbit]) bits_y = tf.fill(zero_dims, 0.) zero_y = tf.zeros_like(x) zero_bits_y = tf.fill(zero_dims, 0.) for i in range(num_levels - 1): g = tf.greater(x, thrs[i]) y = tf.where(g, zero_y + levels[i + 1], y) bits_y = tf.where(tf.reshape(g, [-1]), zero_bits_y + level_codes[sort_id[i + 1][0]], bits_y) # training if ctx.is_main_training_tower: BT = tf.matrix_transpose(bits_y) # calculate BTxB BTxB = [] for i in range(nbit): for j in range(nbit): BTxBij = tf.multiply(BT[i], BT[j]) BTxBij = tf.reduce_sum(BTxBij) # all dimensions are reduced, and a tensor with a single element is returned. i.e. 
6 BTxB.append(BTxBij) BTxB = tf.reshape(tf.stack(values=BTxB), [nbit, nbit]) # 1) naive # BTxB_inv = tf.matrix_inverse(BTxB) # 2) try excpet ->doesn't work well due to poor tf.matrix_inverse # try: # BTxB_inv = tf.matrix_inverse(BTxB, adjoint=None, name=None) # except: # BTxB_ttt = tf.add(BTxB, tf.math.scalar_mul(tf.identity((BTxB.shape)), 1e-4)) # BTxB_inv = tf.matrix_inverse(BTxB_ttt, adjoint=None, name=None) # calculate BTxX BTxX = [] for i in range(nbit): BTxXi0 = tf.multiply(BT[i], reshape_x) BTxXi0 = tf.reduce_sum(BTxXi0) BTxX.append(BTxXi0) BTxX = tf.reshape(tf.stack(values=BTxX), [nbit, 1]) # new_basis = tf.matmul(BTxB_inv, BTxX) # calculate new basis # 3) gaussian elimination new_basis = tf.linalg.lstsq(BTxB, BTxX, fast=False, l2_regularizer=1e-5) # create moving averages op updata_moving_basis = moving_averages.assign_moving_average( basis, new_basis, MOVING_AVERAGES_FACTOR) add_model_variable(basis) tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, updata_moving_basis) for i in range(nbit): tf.summary.scalar('basis%d' % i, new_basis[i][0]) x_clip = tf.minimum(x, levels[num_levels - 1]) # gradient clip y = x_clip + tf.stop_gradient(-x_clip) + tf.stop_gradient( y) # gradient: y=clip(x) y.variables = VariableHolder(basis=basis) return y
def BatchNorm(inputs, training=None, momentum=0.9, epsilon=1e-5, center=True, scale=True, gamma_initializer=tf.ones_initializer(), data_format='channels_last', internal_update=False): """ Mostly equivalent to `tf.layers.batch_normalization`, but difference in the following: 1. Accepts `data_format` rather than `axis`. For 2D input, this argument will be ignored. 2. Default value for `momentum` and `epsilon` is different. 3. Default value for `training` is automatically obtained from `TowerContext`. 4. Support the `internal_update` option. Args: internal_update (bool): if False, add EMA update ops to `tf.GraphKeys.UPDATE_OPS`. If True, update EMA inside the layer by control dependencies. Variable Names: * ``beta``: the bias term. Will be zero-inited by default. * ``gamma``: the scale term. Will be one-inited by default. Input will be transformed by ``x * gamma + beta``. * ``mean/EMA``: the moving average of mean. * ``variance/EMA``: the moving average of variance. Note: 1. About multi-GPU training: moving averages across GPUs are not aggregated. Batch statistics are computed independently. This is consistent with most frameworks. 2. Combinations of ``training`` and ``ctx.is_training``: * ``training == ctx.is_training``: standard BN, EMA are maintained during training and used during inference. This is the default. * ``training and not ctx.is_training``: still use batch statistics in inference. * ``not training and ctx.is_training``: use EMA to normalize in training. This is useful when you load a pre-trained BN and don't want to fine tune the EMA. EMA will not be updated in this case. """ data_format = get_data_format(data_format, tfmode=False) shape = inputs.get_shape().as_list() ndims = len(shape) assert ndims in [2, 4] if ndims == 2: data_format = 'NHWC' if data_format == 'NCHW': n_out = shape[1] else: n_out = shape[-1] # channel assert n_out is not None, "Input to BatchNorm cannot have unknown channels!" beta, gamma, moving_mean, moving_var = get_bn_variables(n_out, scale, center, gamma_initializer) ctx = get_current_tower_context() use_local_stat = training if use_local_stat is None: use_local_stat = ctx.is_training use_local_stat = bool(use_local_stat) if use_local_stat: if ndims == 2: inputs = tf.reshape(inputs, [-1, 1, 1, n_out]) # fused_bn only takes 4D input # fused_bn has error using NCHW? (see #190) xn, batch_mean, batch_var = tf.nn.fused_batch_norm( inputs, gamma, beta, epsilon=epsilon, is_training=True, data_format=data_format) if ndims == 2: xn = tf.squeeze(xn, [1, 2]) else: if ctx.is_training: assert get_tf_version_number() >= 1.4, \ "Fine tuning a BatchNorm model with fixed statistics is only " \ "supported after https://github.com/tensorflow/tensorflow/pull/12580 " if ctx.is_main_training_tower: # only warn in first tower logger.warn("[BatchNorm] Using moving_mean/moving_variance in training.") # Using moving_mean/moving_variance in training, which means we # loaded a pre-trained BN and only fine-tuning the affine part. xn, _, _ = tf.nn.fused_batch_norm( inputs, gamma, beta, mean=moving_mean, variance=moving_var, epsilon=epsilon, data_format=data_format, is_training=False) else: if ndims == 4: xn, _, _ = tf.nn.fused_batch_norm( inputs, gamma, beta, mean=moving_mean, variance=moving_var, epsilon=epsilon, data_format=data_format, is_training=False) else: # avoid the reshape if possible (when channel is the last dimension) xn = tf.nn.batch_normalization( inputs, moving_mean, moving_var, beta, gamma, epsilon) # maintain EMA only on one GPU is OK, even in replicated mode. 
# because training time doesn't use EMA if ctx.is_main_training_tower: add_model_variable(moving_mean) add_model_variable(moving_var) if ctx.is_main_training_tower and use_local_stat: ret = update_bn_ema(xn, batch_mean, batch_var, moving_mean, moving_var, momentum, internal_update) else: ret = tf.identity(xn, name='output') vh = ret.variables = VariableHolder(mean=moving_mean, variance=moving_var) if scale: vh.gamma = gamma if center: vh.beta = beta return ret
def BatchNorm(inputs, axis=None, training=None, momentum=0.9, epsilon=1e-5, center=True, scale=True, beta_initializer=tf.zeros_initializer(), gamma_initializer=tf.ones_initializer(), virtual_batch_size=None, data_format='channels_last', internal_update=False): """ Mostly equivalent to `tf.layers.batch_normalization`, but different in the following: 1. Accepts `data_format` when `axis` is None. For 2D input, this argument will be ignored. 2. Default value for `momentum` and `epsilon` is different. 3. Default value for `training` is automatically obtained from `TowerContext`. 4. Support the `internal_update` option. Args: internal_update (bool): if False, add EMA update ops to `tf.GraphKeys.UPDATE_OPS`. If True, update EMA inside the layer by control dependencies. Variable Names: * ``beta``: the bias term. Will be zero-inited by default. * ``gamma``: the scale term. Will be one-inited by default. Input will be transformed by ``x * gamma + beta``. * ``mean/EMA``: the moving average of mean. * ``variance/EMA``: the moving average of variance. Note: 1. About multi-GPU training: moving averages across GPUs are not aggregated. Batch statistics are computed independently. This is consistent with most frameworks. 2. Combinations of ``training`` and ``ctx.is_training``: * ``training == ctx.is_training``: standard BN, EMA are maintained during training and used during inference. This is the default. * ``training and not ctx.is_training``: still use batch statistics in inference. * ``not training and ctx.is_training``: use EMA to normalize in training. This is useful when you load a pre-trained BN and don't want to fine tune the EMA. EMA will not be updated in this case. """ # parse shapes data_format = get_data_format(data_format, tfmode=False) shape = inputs.get_shape().as_list() ndims = len(shape) assert ndims in [2, 4], ndims if axis is None: if ndims == 2: data_format = 'NHWC' axis = 1 else: axis = 1 if data_format == 'NCHW' else 3 # parse training/ctx ctx = get_current_tower_context() if training is None: training = ctx.is_training training = bool(training) TF_version = get_tf_version_number() if not training and ctx.is_training: assert TF_version >= 1.4, \ "Fine tuning a BatchNorm model with fixed statistics is only " \ "supported after https://github.com/tensorflow/tensorflow/pull/12580 " if ctx.is_main_training_tower: # only warn in first tower logger.warn( "[BatchNorm] Using moving_mean/moving_variance in training.") # Using moving_mean/moving_variance in training, which means we # loaded a pre-trained BN and only fine-tuning the affine part. coll_bk = backup_collection([tf.GraphKeys.UPDATE_OPS]) with rename_get_variable({ 'moving_mean': 'mean/EMA', 'moving_variance': 'variance/EMA' }): if TF_version >= 1.5: layer = tf.layers.BatchNormalization( axis=axis, momentum=momentum, epsilon=epsilon, center=center, scale=scale, beta_initializer=beta_initializer, gamma_initializer=gamma_initializer, virtual_batch_size=virtual_batch_size, fused=True) else: assert virtual_batch_size is None, "Feature not supported in this version of TF!" layer = tf.layers.BatchNormalization( axis=axis, momentum=momentum, epsilon=epsilon, center=center, scale=scale, beta_initializer=beta_initializer, gamma_initializer=gamma_initializer, fused=True) xn = layer.apply(inputs, training=training, scope=tf.get_variable_scope()) # maintain EMA only on one GPU is OK, even in replicated mode. 
    # because training time doesn't use EMA
    if ctx.is_main_training_tower:
        for v in layer.non_trainable_variables:
            add_model_variable(v)
    if not ctx.is_main_training_tower or internal_update:
        restore_collection(coll_bk)

    if training and internal_update:
        assert layer.updates
        with tf.control_dependencies(layer.updates):
            ret = tf.identity(xn, name='output')
    else:
        ret = tf.identity(xn, name='output')

    vh = ret.variables = VariableHolder(
        moving_mean=layer.moving_mean,
        mean=layer.moving_mean,                 # for backward-compatibility
        moving_variance=layer.moving_variance,
        variance=layer.moving_variance)         # for backward-compatibility
    if scale:
        vh.gamma = layer.gamma
    if center:
        vh.beta = layer.beta
    return ret
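# Hedged usage sketch for the wrapper above (the helper name below is made up
# and `loss` is assumed to exist): with internal_update=False the EMA update
# ops only land in tf.GraphKeys.UPDATE_OPS, so the caller has to run them
# together with the train op; with internal_update=True the layer adds the
# control dependency itself and this wiring is unnecessary.
def _train_op_with_bn_updates(loss, learning_rate=1e-3):
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        return tf.train.AdamOptimizer(learning_rate).minimize(loss)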
def BatchNorm_SplitGPU(x, use_local_stat=None, decay=0.9, epsilon=1e-5,
                       use_scale=True, use_bias=True,
                       gamma_init=tf.constant_initializer(1.0), data_format='NHWC',
                       internal_update=False, split_num=1):
    """
    Batch normalization in which the input batch is split into ``split_num``
    groups along the batch axis and each group is normalized with its own
    batch statistics.
    """
    print(split_num)
    if data_format == 'channels_last':
        data_format = 'NHWC'
    assert data_format == 'NHWC'
    shape = x.get_shape().as_list()
    ndims = len(shape)
    assert ndims in [2, 4]
    if ndims == 2:
        data_format = 'NHWC'
    if data_format == 'NCHW':
        n_out = shape[1]
    else:
        n_out = shape[-1]  # channel
    assert n_out is not None, "Input to BatchNorm cannot have unknown channels!"
    beta, gamma, moving_mean, moving_var = get_bn_variables(n_out, use_scale, use_bias, gamma_init)

    ctx = get_current_tower_context()
    if use_local_stat is None:
        use_local_stat = ctx.is_training
    use_local_stat = bool(use_local_stat)

    if use_local_stat:
        if ndims == 2:
            x = tf.reshape(x, [-1, 1, 1, n_out])    # fused_bn only takes 4D input
            # fused_bn has error using NCHW? (see #190)
        inputs = tf.concat(tf.split(x, split_num, 0), -1)   # N/S_n x H x W x C*S_n
        beta_ = tf.reshape([beta] * split_num, [-1])
        gamma_ = tf.reshape([gamma] * split_num, [-1])
        xn, batch_mean, batch_var = tf.nn.fused_batch_norm(
            inputs, gamma_, beta_, epsilon=epsilon,
            is_training=True, data_format=data_format)
        xn = tf.concat(tf.split(xn, split_num, 3), 0)
        # Alternative implementation with unfused ops:
        # inputs = tf.concat(tf.split(x, split_num, 0), -1)  # N/split_num x H x W x C*split_num
        # axis = [0, 1, 2]
        # batch_mean, batch_var = tf.nn.moments(inputs, axis)  # C*split_num
        # beta_ = tf.reshape([beta] * split_num, [-1])
        # gamma_ = tf.reshape([gamma] * split_num, [-1])
        # xn = tf.nn.batch_normalization(inputs, batch_mean, batch_var, beta_, gamma_, epsilon)
        # xn = tf.concat(tf.split(xn, split_num, 3), 0)
        if ndims == 2:
            xn = tf.squeeze(xn, [1, 2])
    else:
        if ctx.is_training:
            assert get_tf_version_number() >= 1.4, \
                "Fine tuning a BatchNorm model with fixed statistics is only " \
                "supported after https://github.com/tensorflow/tensorflow/pull/12580 "
            if ctx.is_main_training_tower:  # only warn in first tower
                logger.warn("[BatchNorm] Using moving_mean/moving_variance in training.")
            # Using moving_mean/moving_variance in training, which means we
            # loaded a pre-trained BN and only fine-tuning the affine part.
            xn, batch_mean, batch_var = tf.nn.fused_batch_norm(
                x, gamma, beta,
                mean=moving_mean, variance=moving_var, epsilon=epsilon,
                data_format=data_format, is_training=False)
        else:
            if ndims == 4 and data_format == 'NCHW':
                [g, b, mm, mv] = [reshape_for_bn(_, ndims, n_out, data_format)
                                  for _ in [gamma, beta, moving_mean, moving_var]]
                xn = tf.nn.batch_normalization(x, mm, mv, b, g, epsilon)
                batch_mean = tf.concat([moving_mean] * split_num, 0)
                batch_var = tf.concat([moving_var] * split_num, 0)
            else:
                # avoid the reshape if possible (when channel is the last dimension)
                xn = tf.nn.batch_normalization(
                    x, moving_mean, moving_var, beta, gamma, epsilon)
                batch_mean = tf.concat([moving_mean] * split_num, 0)
                batch_var = tf.concat([moving_var] * split_num, 0)

    # maintain EMA only on one GPU is OK, even in replicated mode.
    # because training time doesn't use EMA
    if ctx.is_main_training_tower:
        add_model_variable(moving_mean)
        add_model_variable(moving_var)
    if ctx.is_main_training_tower and use_local_stat:
        # batch_mean/batch_var hold per-group statistics (split_num * C entries);
        # the EMA is updated from the first group's slice only.
        ret = update_bn_ema(xn, batch_mean[:n_out], batch_var[:n_out],
                            moving_mean, moving_var, decay, internal_update)
    else:
        ret = tf.identity(xn, name='output')

    vh = ret.variables = VariableHolder(mean=moving_mean, variance=moving_var)
    if use_scale:
        vh.gamma = gamma
    if use_bias:
        vh.beta = beta
    assert batch_mean is not None, 'batch_mean is None'
    return ret
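# Sketch of the split trick used by BatchNorm_SplitGPU above (NumPy only,
# illustrative; the helper name is made up): splitting the batch into
# `split_num` groups along N and stacking the groups along the channel axis
# makes a single reduction over (N, H, W) yield *per-group* statistics for
# every channel. Assumes N is divisible by split_num.
import numpy as np

def _per_group_moments(x, split_num):
    # x: [N, H, W, C]; returns mean/var of shape [split_num * C], matching
    # what fused_batch_norm sees after the concat-along-channels trick.
    groups = np.split(x, split_num, axis=0)              # split_num arrays of [N/S, H, W, C]
    stacked = np.concatenate(groups, axis=-1)            # [N/S, H, W, C*S]
    mean = stacked.mean(axis=(0, 1, 2))
    var = stacked.var(axis=(0, 1, 2))
    # identical to computing moments of each group independently:
    ref_mean = np.concatenate([g.mean(axis=(0, 1, 2)) for g in groups])
    assert np.allclose(mean, ref_mean)
    return mean, var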
def sync_batch_norm(inputs, is_training=True, scope=None, red_axises=[0, 1, 2], bn_decay=0.999, epsilon=0.001, activation_fn=None, updates_collections=tf.GraphKeys.UPDATE_OPS, reuse=None, variables_collections=None, is_trainable=True, num_dev=3): ''' num_dev is how many gpus you use. ''' # red_axises = [0, 1, 2] num_outputs = inputs.get_shape().as_list()[-1] if scope is None: scope = 'BatchNorm' with tf.variable_scope(scope, 'BatchNorm', reuse=reuse): gamma = tf.get_variable(name='gamma', shape=[num_outputs], dtype=tf.float32, initializer=tf.constant_initializer(1.0), trainable=is_trainable, collections=variables_collections) beta = tf.get_variable(name='beta', shape=[num_outputs], dtype=tf.float32, initializer=tf.constant_initializer(0.0), trainable=is_trainable, collections=variables_collections) moving_mean = tf.get_variable(name='moving_mean', shape=[num_outputs], dtype=tf.float32, initializer=tf.constant_initializer(0.0), trainable=False, collections=variables_collections) moving_var = tf.get_variable(name='moving_variance', shape=[num_outputs], dtype=tf.float32, initializer=tf.constant_initializer(1.0), trainable=False, collections=variables_collections) if is_training is not None and is_trainable is not None: if num_dev == 1: mean, var = tf.nn.moments(inputs, red_axises) else: shared_name = tf.get_variable_scope().name batch_mean = tf.reduce_mean(inputs, axis=red_axises) batch_mean_square = tf.reduce_mean(tf.square(inputs), axis=red_axises) batch_mean = gen_nccl_ops.nccl_all_reduce( input=batch_mean, reduction='sum', num_devices=num_dev, shared_name=shared_name + '_NCCL_mean') * (1.0 / num_dev) batch_mean_square = gen_nccl_ops.nccl_all_reduce( input=batch_mean_square, reduction='sum', num_devices=num_dev, shared_name=shared_name + '_NCCL_mean_square') * (1.0 / num_dev) mean = batch_mean var = batch_mean_square - tf.square(batch_mean) outputs = tf.nn.batch_normalization(inputs, mean, var, beta, gamma, epsilon) if int(outputs.device[-1]) == 0: update_moving_mean_op = tf.assign( moving_mean, moving_mean * bn_decay + mean * (1 - bn_decay)) update_moving_var_op = tf.assign( moving_var, moving_var * bn_decay + var * (1 - bn_decay)) add_model_variable(moving_mean) add_model_variable(moving_var) if updates_collections is None: with tf.control_dependencies( [update_moving_mean_op, update_moving_var_op]): outputs = tf.identity(outputs) else: tf.add_to_collections(updates_collections, update_moving_mean_op) tf.add_to_collections(updates_collections, update_moving_var_op) outputs = tf.identity(outputs) else: outputs = tf.identity(outputs) else: outputs, _, _ = tf.nn.fused_batch_norm(inputs, gamma, beta, mean=moving_mean, variance=moving_var, epsilon=epsilon, is_training=False) #if activation_fn is not None: # outputs = activation_fn(outputs) return outputs
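# Illustrative check of the statistic aggregation used above (NumPy only; the
# helper name is made up): each device reduces E[x] and E[x^2] over its own
# shard, the NCCL all-reduce averages them, and the global variance is
# recovered as E[x^2] - E[x]^2. This is exact only when every device holds
# the same number of samples.
import numpy as np

def _check_cross_device_moments(num_dev=3, per_dev_batch=8, channels=4):
    shards = [np.random.randn(per_dev_batch, channels) for _ in range(num_dev)]
    mean = np.mean([s.mean(axis=0) for s in shards], axis=0)             # all-reduced E[x]
    mean_sq = np.mean([(s ** 2).mean(axis=0) for s in shards], axis=0)   # all-reduced E[x^2]
    var = mean_sq - mean ** 2
    full = np.concatenate(shards, axis=0)
    assert np.allclose(mean, full.mean(axis=0))
    assert np.allclose(var, full.var(axis=0))
    return mean, var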
def BatchNormV1(x, use_local_stat=None, decay=0.9, epsilon=1e-5): shape = x.get_shape().as_list() assert len(shape) in [2, 4] n_out = shape[-1] # channel assert n_out is not None beta = tf.get_variable('beta', [n_out], initializer=tf.constant_initializer()) gamma = tf.get_variable('gamma', [n_out], initializer=tf.constant_initializer(1.0)) if len(shape) == 2: batch_mean, batch_var = tf.nn.moments(x, [0], keep_dims=False) else: batch_mean, batch_var = tf.nn.moments(x, [0, 1, 2], keep_dims=False) # just to make a clear name. batch_mean = tf.identity(batch_mean, 'mean') batch_var = tf.identity(batch_var, 'variance') emaname = 'EMA' ctx = get_current_tower_context() if use_local_stat is None: use_local_stat = ctx.is_training if use_local_stat != ctx.is_training: logger.warn("[BatchNorm] use_local_stat != is_training") if use_local_stat: # training tower if ctx.is_training: # reuse = tf.get_variable_scope().reuse with tf.variable_scope(tf.get_variable_scope(), reuse=False): # BatchNorm in reuse scope can be tricky! Moving mean/variance are not reused with tf.name_scope( None ): # https://github.com/tensorflow/tensorflow/issues/2740 # TODO if reuse=True, try to find and use the existing statistics # how to use multiple tensors to update one EMA? seems impossbile ema = tf.train.ExponentialMovingAverage(decay=decay, name=emaname) ema_apply_op = ema.apply([batch_mean, batch_var]) ema_mean, ema_var = ema.average(batch_mean), ema.average( batch_var) if ctx.is_main_training_tower: # inside main training tower add_model_variable(ema_mean) add_model_variable(ema_var) else: # no apply() is called here, no magic vars will get created, # no reuse issue will happen assert not ctx.is_training with tf.name_scope(None): ema = tf.train.ExponentialMovingAverage(decay=decay, name=emaname) mean_var_name = ema.average_name(batch_mean) var_var_name = ema.average_name(batch_var) if ctx.is_main_tower: # main tower, but needs to use global stat. global stat must be from outside # TODO when reuse=True, the desired variable name could # actually be different, because a different var is created # for different reuse tower ema_mean = tf.get_variable('mean/' + emaname, [n_out]) ema_var = tf.get_variable('variance/' + emaname, [n_out]) else: # use statistics in another tower G = tf.get_default_graph() ema_mean = ctx.find_tensor_in_main_tower( G, mean_var_name + ':0') ema_var = ctx.find_tensor_in_main_tower(G, var_var_name + ':0') if use_local_stat: batch = tf.cast(tf.shape(x)[0], tf.float32) mul = tf.where(tf.equal(batch, 1.0), 1.0, batch / (batch - 1)) batch_var = batch_var * mul # use unbiased variance estimator in training with tf.control_dependencies( [ema_apply_op] if ctx.is_training else []): # only apply EMA op if is_training return tf.nn.batch_normalization(x, batch_mean, batch_var, beta, gamma, epsilon, 'output') else: return tf.nn.batch_normalization(x, ema_mean, ema_var, beta, gamma, epsilon, 'output')
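# Minimal sketch of the tf.train.ExponentialMovingAverage pattern BatchNormV1
# relies on (TF1 graph mode; `stat` is a stand-in for batch_mean/batch_var and
# the helper name is made up): ema.apply() creates and updates a shadow
# variable, ema.average() reads it back, and ema.average_name() gives the name
# another tower can use to look it up.
def _ema_shadow_sketch(stat, decay=0.9):
    ema = tf.train.ExponentialMovingAverage(decay=decay, name='EMA')
    ema_apply_op = ema.apply([stat])          # create/update the shadow variable
    shadow = ema.average(stat)                # the shadow variable itself
    with tf.control_dependencies([ema_apply_op]):
        return tf.identity(shadow)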
def BatchRenorm(x, rmax, dmax, decay=0.9, epsilon=1e-5, use_scale=True, use_bias=True, data_format='NHWC'): """ Batch Renormalization layer, as described in the paper: `Batch Renormalization: Towards Reducing Minibatch Dependence in Batch-Normalized Models <https://arxiv.org/abs/1702.03275>`_. This implementation is a wrapper around `tf.layers.batch_normalization`. Args: x (tf.Tensor): a NHWC or NC tensor. rmax, dmax (tf.Tensor): a scalar tensor, the maximum allowed corrections. decay (float): decay rate of moving average. epsilon (float): epsilon to avoid divide-by-zero. use_scale, use_bias (bool): whether to use the extra affine transformation or not. Returns: tf.Tensor: a tensor named ``output`` with the same shape of x. Variable Names: * ``beta``: the bias term. * ``gamma``: the scale term. Input will be transformed by ``x * gamma + beta``. * ``moving_mean, renorm_mean, renorm_mean_weight``: See TF documentation. * ``moving_variance, renorm_stddev, renorm_stddev_weight``: See TF documentation. """ shape = x.get_shape().as_list() ndims = len(shape) assert ndims in [2, 4] if ndims == 2: data_format = 'NHWC' # error using NCHW? (see #190) x = tf.reshape(x, [-1, 1, 1, shape[1]]) if data_format == 'NCHW': n_out = shape[1] else: n_out = shape[-1] # channel assert n_out is not None, "Input to BatchRenorm cannot have unknown channels!" ctx = get_current_tower_context() coll_bk = backup_collection([tf.GraphKeys.UPDATE_OPS]) layer = tf.layers.BatchNormalization( axis=1 if data_format == 'NCHW' else 3, momentum=decay, epsilon=epsilon, center=use_bias, scale=use_scale, renorm=True, renorm_clipping={ 'rmin': 1.0 / rmax, 'rmax': rmax, 'dmax': dmax}, renorm_momentum=0.99, fused=False) xn = layer.apply(x, training=ctx.is_training, scope=tf.get_variable_scope()) if ctx.has_own_variables: # Only apply update in this case. # Add these EMA to model_variables so that they will be synced # properly by replicated trainers. for v in layer.non_trainable_variables: add_model_variable(v) else: # Don't need update if we are sharing variables from an existing tower restore_collection(coll_bk) if ndims == 2: xn = tf.squeeze(xn, [1, 2]) ret = tf.identity(xn, name='output') # TODO not sure whether to add moving_mean/moving_var to VH now vh = ret.variables = VariableHolder() if use_scale: vh.gamma = layer.gamma if use_bias: vh.beta = layer.beta return ret
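# Sketch of the Batch Renormalization correction that the renorm_clipping
# arguments above control (NumPy only, illustrative; helper name is made up):
# the batch-normalized output is rescaled by r and shifted by d, both derived
# from the ratio between batch and moving statistics, clipped to
# [1/rmax, rmax] and [-dmax, dmax], and treated as constants for the gradient.
import numpy as np

def _batch_renorm_1d(x, moving_mean, moving_var, rmax, dmax, eps=1e-5):
    batch_mean, batch_var = x.mean(axis=0), x.var(axis=0)
    sigma_b = np.sqrt(batch_var + eps)
    sigma = np.sqrt(moving_var + eps)
    r = np.clip(sigma_b / sigma, 1.0 / rmax, rmax)
    d = np.clip((batch_mean - moving_mean) / sigma, -dmax, dmax)
    return (x - batch_mean) / sigma_b * r + d     # before applying gamma/beta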
def BatchNorm(inputs, axis=None, training=None, momentum=0.9, epsilon=1e-5, center=True, scale=True, beta_initializer=tf.zeros_initializer(), gamma_initializer=tf.ones_initializer(), virtual_batch_size=None, data_format='channels_last', internal_update=False, sync_statistics=None): """ Almost equivalent to `tf.layers.batch_normalization`, but different (and more powerful) in the following: 1. Accepts an alternative `data_format` option when `axis` is None. For 2D input, this argument will be ignored. 2. Default value for `momentum` and `epsilon` is different. 3. Default value for `training` is automatically obtained from tensorpack's `TowerContext`, but can be overwritten. 4. Support the `internal_update` option, which enables the use of BatchNorm layer inside conditionals. 5. Support the `sync_statistics` option, which is very useful in small-batch models. Args: internal_update (bool): if False, add EMA update ops to `tf.GraphKeys.UPDATE_OPS`. If True, update EMA inside the layer by control dependencies. They are very similar in speed, but `internal_update=True` can be used when you have conditionals in your model, or when you have multiple networks to train. Corresponding TF issue: https://github.com/tensorflow/tensorflow/issues/14699 sync_statistics (str or None): one of None "nccl", or "horovod". By default (None), it uses statistics of the input tensor to normalize. This is the standard way BatchNorm was done in most frameworks. When set to "nccl", this layer must be used under tensorpack's multi-GPU trainers. It uses the aggregated statistics of the whole batch (across all GPUs) to normalize. When set to "horovod", this layer must be used under tensorpack's :class:`HorovodTrainer`. It uses the aggregated statistics of the whole batch (across all MPI ranks) to normalize. Note that on single machine this is significantly slower than the "nccl" implementation. This implementation averages the per-GPU E[x] and E[x^2] among GPUs to compute global mean & variance. Therefore each GPU needs to have the same batch size. This option has no effect when not training. This option is also known as "Cross-GPU BatchNorm" as mentioned in: `MegDet: A Large Mini-Batch Object Detector <https://arxiv.org/abs/1711.07240>`_. Corresponding TF issue: https://github.com/tensorflow/tensorflow/issues/18222. Variable Names: * ``beta``: the bias term. Will be zero-inited by default. * ``gamma``: the scale term. Will be one-inited by default. * ``mean/EMA``: the moving average of mean. * ``variance/EMA``: the moving average of variance. Note: Combinations of ``training`` and ``ctx.is_training``: * ``training == ctx.is_training``: standard BN, EMA are maintained during training and used during inference. This is the default. * ``training and not ctx.is_training``: still use batch statistics in inference. * ``not training and ctx.is_training``: use EMA to normalize in training. This is useful when you load a pre-trained BN and don't want to fine tune the EMA. EMA will not be updated in this case. 
""" # parse shapes data_format = get_data_format(data_format, tfmode=False) shape = inputs.get_shape().as_list() ndims = len(shape) assert ndims in [2, 4], ndims if sync_statistics is not None: sync_statistics = sync_statistics.lower() assert sync_statistics in [None, 'nccl', 'horovod'], sync_statistics if axis is None: if ndims == 2: data_format = 'NHWC' axis = 1 else: axis = 1 if data_format == 'NCHW' else 3 else: data_format = 'NCHW' if axis == 1 else 'NHWC' num_chan = shape[axis] # parse training/ctx ctx = get_current_tower_context() if training is None: training = ctx.is_training training = bool(training) TF_version = get_tf_version_tuple() if not training and ctx.is_training: assert TF_version >= (1, 4), \ "Fine tuning a BatchNorm model with fixed statistics is only " \ "supported after https://github.com/tensorflow/tensorflow/pull/12580 " if ctx.is_main_training_tower: # only warn in first tower logger.warn( "[BatchNorm] Using moving_mean/moving_variance in training.") # Using moving_mean/moving_variance in training, which means we # loaded a pre-trained BN and only fine-tuning the affine part. if sync_statistics is None or not (training and ctx.is_training): coll_bk = backup_collection([tf.GraphKeys.UPDATE_OPS]) with rename_get_variable({ 'moving_mean': 'mean/EMA', 'moving_variance': 'variance/EMA' }): tf_args = dict(axis=axis, momentum=momentum, epsilon=epsilon, center=center, scale=scale, beta_initializer=beta_initializer, gamma_initializer=gamma_initializer, fused=(ndims == 4 and axis in [1, 3]), _reuse=tf.get_variable_scope().reuse) if TF_version >= (1, 5): tf_args['virtual_batch_size'] = virtual_batch_size else: assert virtual_batch_size is None, "Feature not supported in this version of TF!" layer = tf.layers.BatchNormalization(**tf_args) xn = layer.apply(inputs, training=training, scope=tf.get_variable_scope()) # maintain EMA only on one GPU is OK, even in replicated mode. # because during training, EMA isn't used if ctx.is_main_training_tower: for v in layer.non_trainable_variables: add_model_variable(v) if not ctx.is_main_training_tower or internal_update: restore_collection(coll_bk) if training and internal_update: assert layer.updates with tf.control_dependencies(layer.updates): ret = tf.identity(xn, name='output') else: ret = tf.identity(xn, name='output') vh = ret.variables = VariableHolder( moving_mean=layer.moving_mean, mean=layer.moving_mean, # for backward-compatibility moving_variance=layer.moving_variance, variance=layer.moving_variance) # for backward-compatibility if scale: vh.gamma = layer.gamma if center: vh.beta = layer.beta else: red_axis = [0] if ndims == 2 else ( [0, 2, 3] if axis == 1 else [0, 1, 2]) new_shape = None # don't need to reshape unless ... if ndims == 4 and axis == 1: new_shape = [1, num_chan, 1, 1] batch_mean = tf.reduce_mean(inputs, axis=red_axis) batch_mean_square = tf.reduce_mean(tf.square(inputs), axis=red_axis) if sync_statistics == 'nccl': if six.PY3 and TF_version <= (1, 9) and ctx.is_main_training_tower: logger.warn( "A TensorFlow bug will cause cross-GPU BatchNorm to fail. " "Apply this patch: https://github.com/tensorflow/tensorflow/pull/20360" ) from tensorflow.contrib.nccl.ops import gen_nccl_ops shared_name = re.sub('tower[0-9]+/', '', tf.get_variable_scope().name) num_dev = ctx.total if num_dev == 1: logger.warn( "BatchNorm(sync_statistics='nccl') is used with only one tower!" 
) else: batch_mean = gen_nccl_ops.nccl_all_reduce( input=batch_mean, reduction='sum', num_devices=num_dev, shared_name=shared_name + '_NCCL_mean') * (1.0 / num_dev) batch_mean_square = gen_nccl_ops.nccl_all_reduce( input=batch_mean_square, reduction='sum', num_devices=num_dev, shared_name=shared_name + '_NCCL_mean_square') * (1.0 / num_dev) elif sync_statistics == 'horovod': # Require https://github.com/uber/horovod/pull/331 import horovod.tensorflow as hvd batch_mean = hvd.allreduce(batch_mean, average=True) batch_mean_square = hvd.allreduce(batch_mean_square, average=True) batch_var = batch_mean_square - tf.square(batch_mean) batch_mean_vec = batch_mean batch_var_vec = batch_var beta, gamma, moving_mean, moving_var = get_bn_variables( num_chan, scale, center, beta_initializer, gamma_initializer) if new_shape is not None: batch_mean = tf.reshape(batch_mean, new_shape) batch_var = tf.reshape(batch_var, new_shape) # Using fused_batch_norm(is_training=False) is actually slightly faster, # but hopefully this call will be JITed in the future. xn = tf.nn.batch_normalization(inputs, batch_mean, batch_var, tf.reshape(beta, new_shape), tf.reshape(gamma, new_shape), epsilon) else: xn = tf.nn.batch_normalization(inputs, batch_mean, batch_var, beta, gamma, epsilon) if ctx.is_main_training_tower: ret = update_bn_ema(xn, batch_mean_vec, batch_var_vec, moving_mean, moving_var, momentum, internal_update) else: ret = tf.identity(xn, name='output') vh = ret.variables = VariableHolder( moving_mean=moving_mean, mean=moving_mean, # for backward-compatibility moving_variance=moving_var, variance=moving_var) # for backward-compatibility if scale: vh.gamma = gamma if center: vh.beta = beta return ret
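# Illustrative caveat for the sync_statistics path above (NumPy only; helper
# name is made up): averaging the per-GPU E[x] and E[x^2], as the nccl/horovod
# branches do, recovers the global batch statistics only when every GPU sees
# the same batch size; with unequal shards the unweighted average drifts from
# the true mean.
import numpy as np

def _unequal_shard_bias(seed=0):
    rng = np.random.RandomState(seed)
    shards = [rng.randn(2, 3), rng.randn(14, 3)]          # very different shard sizes
    averaged = np.mean([s.mean(axis=0) for s in shards], axis=0)
    true_mean = np.concatenate(shards, axis=0).mean(axis=0)
    return np.abs(averaged - true_mean).max()             # generally non-zero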
def BatchNorm(x, use_local_stat=None, decay=0.9, epsilon=1e-5): """ Batch normalization layer as described in: `Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariance Shift <http://arxiv.org/abs/1502.03167>`_. :param input: a NHWC or NC tensor :param use_local_stat: bool. whether to use mean/var of this batch or the moving average. Default to True in training and False in inference. :param decay: decay rate. default to 0.9. :param epsilon: default to 1e-5. """ shape = x.get_shape().as_list() assert len(shape) in [2, 4] n_out = shape[-1] # channel assert n_out is not None beta = tf.get_variable('beta', [n_out], initializer=tf.zeros_initializer) gamma = tf.get_variable('gamma', [n_out], initializer=tf.ones_initializer) if len(shape) == 2: batch_mean, batch_var = tf.nn.moments(x, [0], keep_dims=False) else: batch_mean, batch_var = tf.nn.moments(x, [0, 1, 2], keep_dims=False) # just to make a clear name. batch_mean = tf.identity(batch_mean, 'mean') batch_var = tf.identity(batch_var, 'variance') emaname = 'EMA' ctx = get_current_tower_context() if use_local_stat is None: use_local_stat = ctx.is_training if use_local_stat != ctx.is_training: logger.warn("[BatchNorm] use_local_stat != is_training") if use_local_stat: # training tower with tf.name_scope( None): # https://github.com/tensorflow/tensorflow/issues/2740 ema = tf.train.ExponentialMovingAverage(decay=decay, name=emaname) ema_apply_op = ema.apply([batch_mean, batch_var]) ema_mean, ema_var = ema.average(batch_mean), ema.average(batch_var) if ctx.is_main_training_tower: # inside main training tower add_model_variable(ema_mean) add_model_variable(ema_var) else: if ctx.is_main_tower: # not training, but main tower. need to create the vars with tf.name_scope(None): ema = tf.train.ExponentialMovingAverage(decay=decay, name=emaname) ema_apply_op = ema.apply([batch_mean, batch_var]) ema_mean, ema_var = ema.average(batch_mean), ema.average( batch_var) else: # use statistics in another tower G = tf.get_default_graph() # figure out the var name with tf.name_scope(None): ema = tf.train.ExponentialMovingAverage(decay=decay, name=emaname) mean_var_name = ema.average_name(batch_mean) + ':0' var_var_name = ema.average_name(batch_var) + ':0' ema_mean = ctx.find_tensor_in_main_tower(G, mean_var_name) ema_var = ctx.find_tensor_in_main_tower(G, var_var_name) #logger.info("In prediction, using {} instead of {} for {}".format( #mean_name, ema_mean.name, batch_mean.name)) if use_local_stat: with tf.control_dependencies([ema_apply_op]): batch = tf.cast(tf.shape(x)[0], tf.float32) mul = tf.select(tf.equal(batch, 1.0), 1.0, batch / (batch - 1)) batch_var = batch_var * mul # use unbiased variance estimator in training return tf.nn.batch_normalization(x, batch_mean, batch_var, beta, gamma, epsilon, 'output') else: return tf.nn.batch_normalization(x, ema_mean, ema_var, beta, gamma, epsilon, 'output')
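# Sketch of the unbiased-variance correction applied above (NumPy only; helper
# name is made up): tf.nn.moments returns the biased (population) variance,
# and multiplying by N / (N - 1) gives the unbiased sample estimate used to
# normalize during training ("use unbiased variance estimator in training").
import numpy as np

def _unbiased_correction(x):
    n = x.shape[0]
    assert n > 1, "need at least two samples for the unbiased estimate"
    biased = x.var(axis=0)                     # what tf.nn.moments computes
    unbiased = biased * (n / (n - 1.0))        # same role as the `mul` factor above
    assert np.allclose(unbiased, x.var(axis=0, ddof=1))
    return unbiased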
def BatchNorm(x, use_local_stat=None, decay=0.9, epsilon=1e-5, use_scale=True, use_bias=True, gamma_init=tf.constant_initializer(1.0), data_format='channels_last', internal_update=False): """ Batch Normalization layer, as described in the paper: `Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariance Shift <http://arxiv.org/abs/1502.03167>`_. Args: x (tf.Tensor): a 4D or 2D tensor. When 4D, the layout should match data_format. use_local_stat (bool): whether to use mean/var of the current batch or the moving average. Defaults to True in training and False in inference. decay (float): decay rate of moving average. epsilon (float): epsilon to avoid divide-by-zero. use_scale, use_bias (bool): whether to use the extra affine transformation or not. gamma_init: initializer for gamma (the scale). internal_update (bool): if False, add EMA update ops to `tf.GraphKeys.UPDATE_OPS`. If True, update EMA inside the layer which will be slightly slower. Returns: tf.Tensor: a tensor named ``output`` with the same shape of x. Variable Names: * ``beta``: the bias term. Will be zero-inited by default. * ``gamma``: the scale term. Will be one-inited by default. Input will be transformed by ``x * gamma + beta``. * ``mean/EMA``: the moving average of mean. * ``variance/EMA``: the moving average of variance. Note: 1. About multi-GPU training: moving averages across GPUs are not aggregated. Batch statistics are computed independently. This is consistent with most frameworks. 2. Combinations of ``use_local_stat`` and ``ctx.is_training``: * ``use_local_stat == is_training``: standard BN, EMA are maintained during training and used during inference. * ``use_local_stat and not is_training``: still use local (batch) statistics in inference. * ``not use_local_stat and is_training``: use EMA to normalize in training. This is useful when you load a pre-trained BN and don't want to fine tune the EMA. EMA will not be updated in this case. """ data_format = get_data_format(data_format, tfmode=False) shape = x.get_shape().as_list() ndims = len(shape) assert ndims in [2, 4] if ndims == 2: data_format = 'NHWC' if data_format == 'NCHW': n_out = shape[1] else: n_out = shape[-1] # channel assert n_out is not None, "Input to BatchNorm cannot have unknown channels!" beta, gamma, moving_mean, moving_var = get_bn_variables(n_out, use_scale, use_bias, gamma_init) ctx = get_current_tower_context() if use_local_stat is None: use_local_stat = ctx.is_training use_local_stat = bool(use_local_stat) if use_local_stat: if ndims == 2: x = tf.reshape(x, [-1, 1, 1, n_out]) # fused_bn only takes 4D input # fused_bn has error using NCHW? (see #190) xn, batch_mean, batch_var = tf.nn.fused_batch_norm( x, gamma, beta, epsilon=epsilon, is_training=True, data_format=data_format) if ndims == 2: xn = tf.squeeze(xn, [1, 2]) else: if ctx.is_training: assert get_tf_version_number() >= 1.4, \ "Fine tuning a BatchNorm model with fixed statistics is only " \ "supported after https://github.com/tensorflow/tensorflow/pull/12580 " if ctx.is_main_training_tower: # only warn in first tower logger.warn("[BatchNorm] Using moving_mean/moving_variance in training.") # Using moving_mean/moving_variance in training, which means we # loaded a pre-trained BN and only fine-tuning the affine part. 
            xn, _, _ = tf.nn.fused_batch_norm(
                x, gamma, beta,
                mean=moving_mean, variance=moving_var, epsilon=epsilon,
                data_format=data_format, is_training=False)
        else:
            if ndims == 4:
                xn, _, _ = tf.nn.fused_batch_norm(
                    x, gamma, beta,
                    mean=moving_mean, variance=moving_var, epsilon=epsilon,
                    data_format=data_format, is_training=False)
            else:
                # avoid the reshape if possible (when channel is the last dimension)
                xn = tf.nn.batch_normalization(
                    x, moving_mean, moving_var, beta, gamma, epsilon)

    # maintain EMA only on one GPU is OK, even in replicated mode.
    # because training time doesn't use EMA
    if ctx.is_main_training_tower:
        add_model_variable(moving_mean)
        add_model_variable(moving_var)
    if ctx.is_main_training_tower and use_local_stat:
        ret = update_bn_ema(xn, batch_mean, batch_var, moving_mean, moving_var,
                            decay, internal_update)
    else:
        ret = tf.identity(xn, name='output')

    vh = ret.variables = VariableHolder(mean=moving_mean, variance=moving_var)
    if use_scale:
        vh.gamma = gamma
    if use_bias:
        vh.beta = beta
    return ret
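# Illustrative identity for the frozen-statistics inference path above (NumPy
# only; helper name is made up): with fixed moving statistics, batch
# normalization is a per-channel affine map, so it can be folded into
# scale = gamma / sqrt(var + eps) and shift = beta - mean * scale (e.g. when
# merging BN into a preceding convolution).
import numpy as np

def _folded_inference_bn(x, moving_mean, moving_var, gamma, beta, eps=1e-5):
    scale = gamma / np.sqrt(moving_var + eps)
    shift = beta - moving_mean * scale
    folded = x * scale + shift
    reference = gamma * (x - moving_mean) / np.sqrt(moving_var + eps) + beta
    assert np.allclose(folded, reference)
    return folded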
def sync_batch_norm(inputs, decay=0.999, center=True, scale=False, epsilon=0.001, activation_fn=None, updates_collections=tf.GraphKeys.UPDATE_OPS, is_training=True, reuse=None, variables_collections=None, outputs_collections=None, trainable=True, scope=None, num_dev=1): ''' num_dev is how many gpus you use. ''' from tensorflow.contrib.nccl.ops import gen_nccl_ops from tensorflow.contrib.framework import add_model_variable red_axises = [0, 1, 2] num_outputs = inputs.get_shape().as_list()[-1] if scope is None: scope = 'BatchNorm' layer_variable_getter = _build_variable_getter() with variable_scope.variable_scope( scope, 'BatchNorm', reuse=reuse, custom_getter=layer_variable_getter) as sc: gamma = tf.get_variable(name='gamma', shape=[num_outputs], dtype=tf.float32, initializer=tf.constant_initializer(1.0), trainable=trainable, collections=variables_collections) beta = tf.get_variable(name='beta', shape=[num_outputs], dtype=tf.float32, initializer=tf.constant_initializer(0.0), trainable=trainable, collections=variables_collections) moving_mean = tf.get_variable(name='moving_mean', shape=[num_outputs], dtype=tf.float32, initializer=tf.constant_initializer(0.0), trainable=False, collections=variables_collections) moving_var = tf.get_variable(name='moving_variance', shape=[num_outputs], dtype=tf.float32, initializer=tf.constant_initializer(1.0), trainable=False, collections=variables_collections) if is_training and trainable: if num_dev == 1: mean, var = tf.nn.moments(inputs, red_axises) else: shared_name = tf.get_variable_scope().name batch_mean = tf.reduce_mean(inputs, axis=red_axises) batch_mean_square = tf.reduce_mean(tf.square(inputs), axis=red_axises) batch_mean = gen_nccl_ops.nccl_all_reduce( input=batch_mean, reduction='sum', num_devices=num_dev, shared_name=shared_name + '_NCCL_mean') * (1.0 / num_dev) batch_mean_square = gen_nccl_ops.nccl_all_reduce( input=batch_mean_square, reduction='sum', num_devices=num_dev, shared_name=shared_name + '_NCCL_mean_square') * (1.0 / num_dev) mean = batch_mean var = batch_mean_square - tf.square(batch_mean) outputs = tf.nn.batch_normalization(inputs, mean, var, beta, gamma, epsilon) if int(outputs.device[-1])== 0: update_moving_mean_op = tf.assign(moving_mean, moving_mean * decay + mean * (1 - decay)) update_moving_var_op = tf.assign(moving_var, moving_var * decay + var * (1 - decay)) add_model_variable(moving_mean) add_model_variable(moving_var) if updates_collections is None: with tf.control_dependencies([update_moving_mean_op, update_moving_var_op]): outputs = tf.identity(outputs) else: ops.add_to_collections(updates_collections, update_moving_mean_op) ops.add_to_collections(updates_collections, update_moving_var_op) outputs = tf.identity(outputs) else: outputs = tf.identity(outputs) else: outputs,_,_ = nn.fused_batch_norm(inputs, gamma, beta, mean=moving_mean, variance=moving_var, epsilon=epsilon, is_training=False) if activation_fn is not None: outputs = activation_fn(outputs) return utils.collect_named_outputs(outputs_collections, sc.name, outputs)