def test_elementwise():
    a = nd.ones(shape=(LARGE_X, SMALL_Y))
    b = nd.ones(shape=(LARGE_X, SMALL_Y))
    res = a + b
    assert np.sum(res[-1].asnumpy() == 2) == a.shape[1]
    res = a + 1
    assert np.sum(res[-1].asnumpy() == 2) == a.shape[1]
    res = nd.sqrt(a + 3)
    assert np.sum(res[-1].asnumpy() == 2) == a.shape[1]
def forward(self, x):
    with x.context:
        c = nd.softmax(self.b.data(), axis=1)
        u = nd.dot(x, self.w.data())
        s = nd.multiply(c, u)
        s_nrm = nd.sum(s * s)
        fact = s_nrm / (1. + s_nrm)
        v = fact * s / nd.sqrt(s_nrm)
        self.u_v = nd.sum(nd.multiply(u, v))
        return u
def grad_clipping(params, clipping_norm, ctx):
    """Gradient clipping."""
    if clipping_norm is not None:
        norm = nd.array([0.0], ctx)
        for p in params:
            norm += nd.sum(p.grad ** 2)
        norm = nd.sqrt(norm).asscalar()
        if norm > clipping_norm:
            for p in params:
                p.grad[:] *= clipping_norm / norm
def pure_batch_norm(x, gamma, beta, eps=1e-5):
    assert len(x.shape) in (2, 4)
    if len(x.shape) == 2:
        mean = x.mean(axis=0)
        variance = ((x - mean) ** 2).mean(axis=0)
    else:
        mean = x.mean(axis=(0, 2, 3), keepdims=True)
        variance = ((x - mean) ** 2).mean(axis=(0, 2, 3), keepdims=True)
    x_hat = (x - mean) / nd.sqrt(variance + eps)
    return gamma.reshape(mean.shape) * x_hat + beta.reshape(mean.shape)
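# A quick sanity check for the pure_batch_norm defined above (a minimal sketch;
# the toy input and the identity gamma/beta are made up for illustration).
# With gamma = 1 and beta = 0, each output column should end up with roughly
# zero mean and unit variance.
from mxnet import nd

x = nd.array([[1, 7], [2, 8], [3, 9], [4, 10]])   # 4 samples, 2 features
gamma = nd.ones(2)
beta = nd.zeros(2)
y = pure_batch_norm(x, gamma, beta)
print(y.mean(axis=0))                              # ~0 per feature
print(((y - y.mean(axis=0)) ** 2).mean(axis=0))    # ~1 per feature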
def grad_clipping(params, theta, ctx):
    """Gradient clipping."""
    if theta is not None:
        norm = nd.array([0.0], ctx)
        for p in params:
            norm += nd.sum(p.grad ** 2)
        norm = nd.sqrt(norm).asscalar()
        if norm > theta:
            for p in params:
                p.grad[:] *= theta / norm
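# The clipping helper above is meant to run between backward() and the parameter
# update. A minimal usage sketch, assuming a hypothetical pair of parameters with
# gradients already attached; the loss, learning rate and theta are illustrative.
import mxnet as mx
from mxnet import nd, autograd

ctx = mx.cpu()
w = nd.random.normal(shape=(4, 3), ctx=ctx)
b = nd.zeros((3,), ctx=ctx)
params = [w, b]
for p in params:
    p.attach_grad()

x = nd.random.normal(shape=(8, 4), ctx=ctx)
y = nd.random.normal(shape=(8, 3), ctx=ctx)
with autograd.record():
    loss = ((nd.dot(x, w) + b - y) ** 2).mean()
loss.backward()

grad_clipping(params, 5.0, ctx)      # rescale gradients if the global norm exceeds 5
for p in params:
    p[:] = p - 0.1 * p.grad          # plain SGD step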
def implement_0(self, x, label):
    '''following the sphereface code of caffe'''
    # weight normalize
    with x.context:
        w = self.weight.data()
        with mx.autograd.pause():
            w_norm = w / nd.sqrt(nd.sum(nd.power(w, 2), axis=1)).reshape((-1, 1))
            w[:] = w_norm
        # x_norm = |x|
        x_norm = nd.power(x, 2)
        x_norm = nd.sum(x_norm, axis=1)
        x_norm = nd.sqrt(x_norm)
        # cos_theta = x'w/|x|. note: |w| = 1
        cos_theta = nd.dot(x, w, transpose_b=True)
        cos_theta = cos_theta / x_norm.reshape((-1, 1))
        # cos_theta_quadratic & cos_theta_quartic
        cos_theta_quadratic = cos_theta ** 2
        cos_theta_quartic = cos_theta ** 4
        with mx.autograd.pause():
            # sign_0 = sign(cos_theta)
            sign_0 = nd.sign(cos_theta)
            # sign_3 = sign_0 * sign(2 * cos_theta_quadratic_ - 1)
            sign_3 = sign_0 * nd.sign(2 * cos_theta_quadratic - 1)
            # sign_4 = 2 * sign_0 + sign_3 - 3
            sign_4 = 2 * sign_0 + sign_3 - 3
        # phi_theta = (sign_3 * (8 * cos_theta_quartic - 8 * cos_theta_quadratic + 1) + sign_4)
        phi_theta = sign_3 * (8 * cos_theta_quartic - 8 * cos_theta_quadratic + 1) + sign_4
        x_norm_phi_theta = x_norm.reshape((-1, 1)) * phi_theta
        # i=j index
        with mx.autograd.pause():
            index = nd.one_hot(label, x_norm_phi_theta.shape[1])
        # output
        with mx.autograd.pause():
            lamb = self.__get_lambda()  # 10
        output = nd.dot(x, w, transpose_b=True)
        output2 = output * (1.0 - index) + x_norm_phi_theta * index
        output3 = (output2 + lamb * nd.dot(x, w, transpose_b=True)) / (1 + lamb)
        return output3
def squash(self, vectors, axis):
    epsilon = 1e-9
    vectors_l2norm = nd.square(vectors).sum(axis=axis, keepdims=True)  # .expand_dims(axis=axis)
    scale_factor = vectors_l2norm / (1 + vectors_l2norm)
    vectors_squashed = scale_factor * (vectors / nd.sqrt(vectors_l2norm + epsilon))  # element-wise
    return vectors_squashed
def forward(self, data, weight, mapping_label, depth):
    """ """
    with autograd.record():
        norm_data = nd.L2Normalization(data)
        norm_weight = nd.L2Normalization(weight)
        #
        fc7 = nd.dot(norm_data, norm_weight, transpose_b=True)
        #
        mapping_label_onehot = mx.nd.one_hot(indices=mapping_label, depth=depth,
                                             on_value=1.0, off_value=0.0)
        # cosface
        if self.loss_m1 == 1.0 and self.loss_m2 == 0.0:
            _one_hot = mapping_label_onehot * self.loss_m3
            fc7 = fc7 - _one_hot
        elif self.loss_m1 == 1.0 and self.loss_m3 == 0.0:
            fc7_onehot = fc7 * mapping_label_onehot
            cos_t = fc7_onehot
            t = nd.arccos(cos_t)
            if self.loss_m1 != 1.0:
                t = t * self.loss_m1
            if self.loss_m2 != 0.0:
                t = t + self.loss_m2
            margin_cos = nd.cos(t)
            if self.loss_m3 != 0.0:
                margin_cos = margin_cos - self.loss_m3
            margin_fc7 = margin_cos
            margin_fc7_onehot = margin_fc7 * mapping_label_onehot
            diff = margin_fc7_onehot - fc7_onehot
            fc7 = fc7 + diff
        else:
            cosine = fc7
            sine = nd.sqrt(1 - fc7 * fc7)
            m = nd.array([self.loss_m2], ctx=fc7.context)
            # phi = cosine * nd.cos(m) - sine * nd.sin(m)
            fc7_onehot = fc7 * mapping_label_onehot  # same one-hot masking as in the branch above
            cos_t = fc7_onehot
            t = nd.arccos(cos_t)
            phi = nd.cos(t + self.loss_m2)
            mask = cosine > phi
            print('mask', mask.shape)
            # keep only the logits that are still harder than the margin target
            hard_example = nd.where(cosine > phi, cosine, nd.zeros_like(cosine))
            self.t = self.t.as_in_context(fc7.context)
            self.t = cosine * mapping_label_onehot.mean() * 0.01 + (1 - 0.01) * self.t
            print("cosine", cosine.shape)
            print(self.t.shape)
            print('dasdasdasdad', hard_example.shape)
            cosine[mask] = hard_example * (self.t + hard_example)
            fc7 = mapping_label_onehot * phi + cosine * (1.0 - mapping_label_onehot)
        fc7 = fc7 * self.loss_s
    return fc7, mapping_label_onehot
def batch_norm2D(X, gamma, beta, is_training, moving_mean, moving_variance,
                 eps=1e-5, moving_momentum=0.9):
    '''
    In fact we still need batch normalization at test time, just with a change:
    the batch mean and variance used during training are replaced by the mean and
    variance of the *entire* training set. When the training set is very large this
    is expensive to compute, so we approximate it with a moving average instead.
    '''
    assert len(X.shape) in (2, 4)
    # fully connected: batch_size x feature
    if len(X.shape) == 2:
        # mean and variance of each input dimension over the batch
        mean = X.mean(axis=0)
        variance = ((X - mean) ** 2).mean(axis=0)
    # 2D convolution: batch_size x channel x height x width
    else:
        # mean and variance per channel; keep the 4D shape so broadcasting works
        mean = X.mean(axis=(0, 2, 3), keepdims=True)
        variance = ((X - mean) ** 2).mean(axis=(0, 2, 3), keepdims=True)
    # reshape so that broadcasting is correct
    moving_mean = moving_mean.reshape(mean.shape)
    moving_variance = moving_variance.reshape(mean.shape)
    # normalize
    if is_training:
        X_hat = (X - mean) / nd.sqrt(variance + eps)
        #!!! update the global mean and variance
        moving_mean[:] = moving_momentum * moving_mean + (1.0 - moving_momentum) * mean
        moving_variance[:] = moving_momentum * moving_variance + (1.0 - moving_momentum) * variance
    else:
        #!!! at test time use the global mean and variance
        X_hat = (X - moving_mean) / nd.sqrt(moving_variance + eps)
    # scale and shift
    return gamma.reshape(mean.shape) * X_hat + beta.reshape(mean.shape)
def forward(self, x):
    x_skip = x
    x = self.layer_norm(x)
    q = x
    k = x
    v = x
    # scale factor: square root of the number of dimensions of k
    dk = len(k.shape) ** 0.5
    qk = nd.softmax(q * k.T * 1. / dk)
    qkv = qk * v
    x = qkv + x_skip
    return x
def log_rmse(net, features, labels):
    """
    Evaluate the model with the root mean squared error of the logarithms.
    :param net:
    :param features:
    :param labels:
    :return:
    """
    clipped_preds = nd.clip(net(features), 1, float('inf'))
    rmse = nd.sqrt(2 * loss(clipped_preds.log(), labels.log()).mean())
    return rmse.asscalar()
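# log_rmse above closes over a module-level `loss` object; a hedged sketch of how
# it might be wired up. The tiny network and random data are placeholders, and the
# squared-error loss is assumed to be gluon's L2Loss.
from mxnet import nd, gluon
from mxnet.gluon import nn

loss = gluon.loss.L2Loss()           # the `loss` that log_rmse refers to

net = nn.Sequential()
net.add(nn.Dense(1))
net.initialize()

features = nd.random.uniform(shape=(16, 5))
labels = nd.random.uniform(low=1, high=100, shape=(16, 1))
print(log_rmse(net, features, labels))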
def normal():
    """
    Every element is sampled from a normal distribution with mean 0 and
    standard deviation 1. nd.sqrt(nd.power(a, 2).sum())
    :return:
    """
    n = nd.normal(0, 1, shape=(2, 2))
    logger.info(n)
    a = nd.array([1, 2, 3, 4])
    print(a.norm())
    print(nd.sqrt(nd.power(a, 2).sum()))
def pure_batch_norm(X, gamma, beta, eps=1e-5):
    if len(X.shape) not in (2, 4):
        raise ValueError('only supports dense or 2dconv')

    print("gamma", gamma)
    print("beta", beta)

    # dense
    if len(X.shape) == 2:
        C, N = X.shape
        # mini-batch mean
        mean = nd.mean(X, axis=0)
        print("mean:", mean)
        # mini-batch variance
        variance = nd.mean((X - mean) ** 2, axis=0)
        print("var:", variance)
        # normalize
        X_hat = (X - mean) * 1.0 / nd.sqrt(variance + eps)
        # scale and shift
        out = gamma * X_hat + beta

    # 2d conv
    elif len(X.shape) == 4:
        # extract the dimensions
        N, C, H, W = X.shape
        # mini-batch mean
        mean = nd.mean(X, axis=(0, 2, 3))
        print("mean", mean)
        # mini-batch variance
        variance = nd.mean((X - mean.reshape((1, C, 1, 1))) ** 2, axis=(0, 2, 3))
        print("variance", variance)
        # normalize
        X_hat = (X - mean.reshape((1, C, 1, 1))) * 1.0 / nd.sqrt(
            variance.reshape((1, C, 1, 1)) + eps)
        # X_hat = (X - mean.reshape((1, C, 1, 1)))
        print("X_hat", X_hat)
        # print(X_hat)
        # scale and shift
        out = gamma.reshape((1, C, 1, 1)) * X_hat + beta.reshape((1, C, 1, 1))

    return out
def forward(self, x):
    if autograd.is_training():
        _, *tmp = x.shape
        self.gamma.shape = [1] + tmp
        self.gamma._finish_deferred_init()
        self.beta.shape = [1] + tmp
        self.beta._finish_deferred_init()
    mu = x.mean(axis=1, keepdims=True)
    sigma = nd.sqrt(((x - mu) ** 2).mean(axis=1, keepdims=True))
    return ((x - mu) / (sigma + self.eps)) * self.gamma.data() + self.beta.data()
def batch_norm(x, gamma, beta, is_training, moving_mean, moving_variance,
               eps=1e-5, moving_momentum=0.9):
    assert len(x.shape) in (2, 4)
    if len(x.shape) == 2:
        mean = x.mean(axis=0)
        variance = ((x - mean) ** 2).mean(axis=0)
    else:
        mean = x.mean(axis=(0, 2, 3), keepdims=True)
        variance = ((x - mean) ** 2).mean(axis=(0, 2, 3), keepdims=True)
    # make sure the broadcasting mechanism works
    moving_mean = moving_mean.reshape(mean.shape)
    moving_variance = moving_variance.reshape(mean.shape)
    if is_training:
        x_hat = (x - mean) / nd.sqrt(variance + eps)
        # update the global mean and variance
        moving_mean[:] = moving_momentum * moving_mean + (1.0 - moving_momentum) * mean
        moving_variance[:] = moving_momentum * moving_variance + (1.0 - moving_momentum) * variance
    else:
        # testing: use the mean and variance accumulated during training
        x_hat = (x - moving_mean) / nd.sqrt(moving_variance + eps)
    return gamma.reshape(mean.shape) * x_hat + beta.reshape(mean.shape)
def pure_batch_norm(x, gamma, beta, eps=1e-5):
    assert len(x.shape) in (2, 4)
    if len(x.shape) == 2:
        # fc layer: batch * feature
        mean = x.mean(axis=0)
        variance = ((x - mean) ** 2).mean(axis=0)
    else:
        # 2D tensor: batch * channel * height * width
        mean = x.mean(axis=(0, 2, 3), keepdims=True)
        variance = ((x - mean) ** 2).mean(axis=(0, 2, 3), keepdims=True)
    x_hat = (x - mean) / nd.sqrt(variance + eps)
    return gamma.reshape(mean.shape) * x_hat + beta.reshape(mean.shape)
def squash(self, vectors, axis):
    epsilon = 1e-9
    vectors_l2norm = nd.square(vectors).sum(axis=axis, keepdims=True)
    assert vectors_l2norm.shape == (self.batch_size, 1, self.num_capsule, 1, 1)  # 1,10,1,1
    scale_factor = vectors_l2norm / (1 + vectors_l2norm)
    vectors_squashed = scale_factor * (vectors / nd.sqrt(vectors_l2norm + epsilon))  # element-wise
    return vectors_squashed
def sqrt(self, tensor_in):
    """
    Element-wise square-root value of the input.

    Args:
        tensor_in (Tensor): Tensor object

    Returns:
        MXNet NDArray: Element-wise square-root value.
    """
    tensor_in = self.astensor(tensor_in)
    return nd.sqrt(tensor_in)
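# For reference, the nd.sqrt call wrapped above is element-wise; a minimal
# standalone check (the values are arbitrary):
from mxnet import nd

t = nd.array([1.0, 4.0, 9.0])
print(nd.sqrt(t))    # [1. 2. 3.]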
def batch_norm(X, gamma, beta, moving_mean, moving_var, eps, momentum):
    # use autograd to tell whether we are in training or prediction mode
    if not autograd.is_training():
        # in prediction mode, use the moving-average mean and variance directly
        X_hat = (X - moving_mean) / nd.sqrt(moving_var + eps)
    else:
        assert len(X.shape) in (2, 4)
        if len(X.shape) == 2:
            # fully connected layer: mean and variance over the feature dimension
            mean = X.mean(axis=0)
            var = ((X - mean) ** 2).mean(axis=0)
        else:
            # 2D convolution: mean and variance over the channel dimension (axis=1)
            mean = X.mean(axis=(0, 2, 3), keepdims=True)
            var = ((X - mean) ** 2).mean(axis=(0, 2, 3), keepdims=True)
        X_hat = (X - mean) / nd.sqrt(var + eps)
        # update the moving averages of mean and variance
        moving_mean = momentum * moving_mean + (1.0 - momentum) * mean
        moving_var = momentum * moving_var + (1.0 - momentum) * var
    Y = gamma * X_hat + beta
    return Y, moving_mean, moving_var
def batch_norm(X, gamma, beta, is_training, moving_mean, moving_variance,
               eps=1e-5, moving_momentum=0.9):
    assert len(X.shape) in (2, 4)
    # fully connected: batch_size x feature
    if len(X.shape) == 2:
        mean = X.mean(axis=0)
        # print(mean)
        variance = ((X - mean) ** 2).mean(axis=0)
    # 2D convolution: batch_size x channels x height x width
    else:
        # compute mean and variance for each channel
        mean = X.mean(axis=(0, 2, 3), keepdims=True)
        # print(mean)
        variance = ((X - mean) ** 2).mean(axis=(0, 2, 3), keepdims=True)
    # reshape for correct broadcasting
    moving_mean = moving_mean.reshape(mean.shape)
    moving_variance = moving_variance.reshape(mean.shape)
    # normalization
    if is_training:
        X_hat = (X - mean) / nd.sqrt(variance + eps)
        # update global mean and variance
        moving_mean[:] = moving_momentum * moving_mean + (1.0 - moving_momentum) * mean
        moving_variance[:] = moving_momentum * moving_variance + (1.0 - moving_momentum) * variance
    else:
        X_hat = (X - moving_mean) / nd.sqrt(moving_variance + eps)
    return gamma.reshape(mean.shape) * X_hat + beta.reshape(mean.shape)
def forward(self, is_train, req, in_data, out_data, aux):
    x = in_data[0]
    gamma = in_data[1]
    beta = in_data[2]
    moving_mean = in_data[3]
    moving_var = in_data[4]
    # print(x.sum())
    y = out_data[0]
    if is_train:
        mean = nd.mean(x, axis=(0, 2, 3))
        var = nd.array(np.var(x.asnumpy(), axis=(0, 2, 3)))
        # print(moving_mean, self.momentum, mean)
        moving_mean = moving_mean * self.momentum + mean * (1 - self.momentum)
        moving_var = moving_var * self.momentum + var * (1 - self.momentum)
        self.assign(in_data[3], req[0], moving_mean)
        self.assign(in_data[4], req[0], moving_var)
    else:
        mean = moving_mean
        var = moving_var
    quan_gamma = self.quantize(gamma / (nd.sqrt(var + self.eps)))
    quan_beta = self.quantize(beta - mean * gamma / nd.sqrt(var + self.eps))
    y = nd.BatchNorm(x, gamma=quan_gamma, beta=quan_beta,
                     moving_mean=nd.zeros(shape=moving_mean.shape),
                     moving_var=nd.ones(shape=moving_var.shape),
                     eps=self.eps, momentum=self.momentum,
                     fix_gamma=self.fix_gamma, name=self.name)
    self.assign(out_data[0], req[0], mx.nd.array(y))
def batch_norm(X, gamma, beta, moving_mean, moving_var, eps, momentum):
    if not autograd.is_training():
        # prediction mode: use the moving-average mean and variance directly
        X_hat = (X - moving_mean) / nd.sqrt(moving_var + eps)
    else:
        assert len(X.shape) in (2, 4)  # only 2D or 4D inputs are supported
        if len(X.shape) == 2:
            # fully connected layer: mean and variance over the feature axis (axis=0)
            mean = X.mean(axis=0)  # axis=0: one mean per column
            var = ((X - mean) ** 2).mean(axis=0)
        else:
            # 2D convolution layer: mean and variance over the channel axis (axis=1)
            mean = X.mean(axis=(0, 2, 3), keepdims=True)
            var = ((X - mean) ** 2).mean(axis=(0, 2, 3), keepdims=True)
        # training mode: normalize with the current batch mean and variance
        X_hat = (X - mean) / nd.sqrt(var + eps)
        # moving average: update the moving mean and variance
        moving_mean = momentum * moving_mean + (1.0 - momentum) * mean
        moving_var = momentum * moving_var + (1.0 - momentum) * var
    Y = gamma * X_hat + beta  # scale and shift
    return Y, moving_mean, moving_var
def pure_batch_norm(X, gamma, beta, eps=1e-5):
    assert len(X.shape) in (2, 4)
    # fully connected: batch_size * feature
    if len(X.shape) == 2:
        # mean in the batch_size direction, one mean per feature
        mean = X.mean(axis=0)
        variance = ((X - mean) ** 2).mean(axis=0)
    # 2D conv
    else:
        # mean in the batch direction, one mean per channel
        mean = X.mean(axis=(0, 2, 3), keepdims=True)
        variance = ((X - mean) ** 2).mean(axis=(0, 2, 3), keepdims=True)

    X_hat = (X - mean) / nd.sqrt(variance + eps)
    return gamma.reshape(mean.shape) * X_hat + beta.reshape(mean.shape)  # reshape?
def getwh(scales, ratios, fw, fh, srmode):
    if srmode == 'few':
        num = scales.size + ratios.size - 1
        width = nd.zeros((num,))
        height = nd.zeros((num,))
        sqt_ratios = nd.sqrt(ratios)
        width[:ratios.size] = scales[0] * sqt_ratios
        height[:ratios.size] = width[:ratios.size] / ratios
        width[ratios.size:] = scales[1:] * sqt_ratios[0]
        height[ratios.size:] = width[ratios.size:] / ratios[0]
    else:
        rscales = nd.repeat(scales, ratios.size)
        rratios = nd.tile(ratios, scales.size)
        width = rscales * nd.sqrt(rratios)
        height = width / rratios
    width = width * fw
    height = height * fh
    return width, height
def batch_norm(X, gamma, beta, moving_mean, moving_var, eps, momentum):
    # use autograd to determine whether we are in training or prediction mode
    if not autograd.is_training():
        # in prediction mode, use the moving-average mean and variance directly
        X_hat = (X - moving_mean) / nd.sqrt(moving_var + eps)
    else:
        assert len(X.shape) in (2, 4)
        if len(X.shape) == 2:
            # fully connected layer: mean and variance over the feature dimension
            mean = X.mean(axis=0)
            var = ((X - mean) ** 2).mean(axis=0)
        else:
            # 2D convolution layer: mean and variance over the channel dimension (axis=1).
            # Keep X's shape so broadcasting works later.
            mean = X.mean(axis=(0, 2, 3), keepdims=True)
            var = ((X - mean) ** 2).mean(axis=(0, 2, 3), keepdims=True)
        # in training mode, standardize with the current mean and variance
        X_hat = (X - mean) / nd.sqrt(var + eps)
        # update the moving averages of the mean and variance
        moving_mean = momentum * moving_mean + (1.0 - momentum) * mean
        moving_var = momentum * moving_var + (1.0 - momentum) * var
    Y = gamma * X_hat + beta  # scale and shift
    return Y, moving_mean, moving_var
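# A small hedged sketch of exercising the batch_norm above in both modes.
# autograd.is_training() is True only inside autograd.record(), so the same call
# switches between batch statistics and the moving averages; the shapes and
# values below are illustrative.
from mxnet import nd, autograd

X = nd.random.normal(shape=(8, 4))
gamma, beta = nd.ones((4,)), nd.zeros((4,))
moving_mean, moving_var = nd.zeros((4,)), nd.ones((4,))

with autograd.record():    # training mode: batch statistics, moving averages updated
    Y, moving_mean, moving_var = batch_norm(
        X, gamma, beta, moving_mean, moving_var, eps=1e-5, momentum=0.9)

# prediction mode: the accumulated moving averages are used directly
Y_pred, _, _ = batch_norm(
    X, gamma, beta, moving_mean, moving_var, eps=1e-5, momentum=0.9)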
def batch_norm(X, gamma, beta, is_training, moving_mean, moving_variance,
               eps=1e-5, moving_momentum=0.9):
    assert len(X.shape) in (2, 4)
    # fully connected: batch_size x feature
    if len(X.shape) == 2:
        # mean and variance of each input dimension over the batch
        mean = X.mean(axis=0)
        variance = ((X - mean) ** 2).mean(axis=0)
    # 2D convolution: batch_size x channel x height x width
    else:
        # mean and variance per channel; keep the 4D shape so broadcasting works
        mean = X.mean(axis=(0, 2, 3), keepdims=True)
        variance = ((X - mean) ** 2).mean(axis=(0, 2, 3), keepdims=True)
    # reshape so that broadcasting is correct
    moving_mean = moving_mean.reshape(mean.shape)
    moving_variance = moving_variance.reshape(mean.shape)
    # normalize
    if is_training:
        X_hat = (X - mean) / nd.sqrt(variance + eps)
        #!!! update the global mean and variance
        moving_mean[:] = moving_momentum * moving_mean + (1.0 - moving_momentum) * mean
        moving_variance[:] = moving_momentum * moving_variance + (1.0 - moving_momentum) * variance
    else:
        #!!! at test time use the global mean and variance
        X_hat = (X - moving_mean) / nd.sqrt(moving_variance + eps)
    # scale and shift
    return gamma.reshape(mean.shape) * X_hat + beta.reshape(mean.shape)
def normalize(feature_map):
    """
    :param feature_map: either F_a or F_bp
    :return: normalized feature map response
    """
    response = nd.sum(feature_map * feature_map, axis=1, keepdims=True)
    normed_feature_map = feature_map / nd.sqrt(response)
    # response should be scaled to (0, 1)
    response = (response - nd.min(response)) / (nd.max(response) - nd.min(response))
    # When the array is on a device, ordinary operations do not change its storage location
    return normed_feature_map, response
def _merge_bn_to_condconv2d(m):
    if isinstance(m, CondConv2D):
        base_name = m.name.replace(conv_name, bn_name)
        print(f"Merge {base_name} to {m.name}")
        gamma = bn_collections[base_name + "_gamma"]
        beta = bn_collections[base_name + "_beta"]
        mean = bn_collections[base_name + "_running_mean"]
        var = bn_collections[base_name + "_running_var"]
        weight = m.weight.data()
        w_shape = m.weight.shape
        m.weight.set_data((weight.reshape(0, 0, -1) * gamma.reshape(0, 0, 1)
                           / nd.sqrt(var + 1e-10).reshape(0, 0, 1)).reshape(w_shape))
        if m.bias is None:
            m._kwargs['no_bias'] = False
            m.bias = m.params.get('bias', shape=w_shape[:2],
                                  init="zeros", allow_deferred_init=True)
            m.bias.initialize()
            finished_params.append(m.bias.name)
        bias = m.bias.data()
        m.bias.set_data(gamma * (bias - mean) / nd.sqrt(var + 1e-10) + beta)
def adam(params, vs, sqrs, lr, batch_size, t):
    beta1 = 0.9
    beta2 = 0.999
    eps_stable = 1e-8
    for param, v, sqr in zip(params, vs, sqrs):
        g = param.grad / batch_size
        v[:] = beta1 * v + (1. - beta1) * g
        sqr[:] = beta2 * sqr + (1. - beta2) * nd.square(g)
        v_bias_corr = v / (1. - beta1 ** t)
        sqr_bias_corr = sqr / (1. - beta2 ** t)
        div = lr * v_bias_corr / (nd.sqrt(sqr_bias_corr) + eps_stable)
        param[:] = param - div
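# A minimal sketch of driving the adam updater above by hand; the toy parameter,
# the two state buffers and the hyper-parameters are illustrative assumptions.
from mxnet import nd, autograd

w = nd.random.normal(shape=(3,))
w.attach_grad()
params = [w]
vs = [nd.zeros_like(w)]       # first-moment buffer, same shape as the parameter
sqrs = [nd.zeros_like(w)]     # second-moment buffer

batch_size, lr = 4, 0.01
for t in range(1, 11):        # t starts at 1 so the bias correction is well defined
    data = nd.random.normal(shape=(batch_size, 3))
    with autograd.record():
        loss = nd.sum((nd.dot(data, w) - 1) ** 2)
    loss.backward()
    adam(params, vs, sqrs, lr, batch_size, t)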
def select_action(self, state):
    with autograd.record():
        mu, sigma_sq = self.model(state.as_in_context(model_ctx))
        # sigma_sq = nd.softrelu(sigma_sq)
        # the implementation of softplus
        sigma_sq = nd.log(1 + nd.exp(sigma_sq))
        eps = nd.random.normal(0, 1, mu.shape, dtype=np.float32)
        # calculate the probability
        action = mu + nd.sqrt(sigma_sq) * eps
        prob = normal(action, mu, sigma_sq)
        entropy = -0.5 * (nd.log(sigma_sq + math.pi * 2) + 1)
        log_prob = nd.log(prob)
    return action, log_prob, entropy
def pure_batch_norm(X, gamma, beta, eps=1e-5):
    assert len(X.shape) in (2, 4)
    # fully connected: batch_size x feature
    if len(X.shape) == 2:
        # mean and variance of each input dimension over the batch
        mean = X.mean(axis=0, keepdims=True)
        variance = ((X - mean) ** 2).mean(axis=0, keepdims=True)
    # 2D convolution: batch_size x channel x height x width
    else:
        # mean and variance per channel; keep the 4D shape so broadcasting works
        mean = X.mean(axis=(0, 2, 3), keepdims=True)
        variance = ((X - mean) ** 2).mean(axis=(0, 2, 3), keepdims=True)
    # normalize
    X_hat = (X - mean) / nd.sqrt(variance + eps)
    # scale and shift
    return gamma.reshape(mean.shape) * X_hat + beta.reshape(mean.shape)
def get_distance_matrix(x):
    """Get distance matrix given a matrix. Used in testing."""
    square = nd.sum(x ** 2.0, axis=1, keepdims=True)
    distance_square = square + square.transpose() - (2.0 * nd.dot(x, x.transpose()))
    return nd.sqrt(distance_square)
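# A quick hedged check of get_distance_matrix on a tiny set of points (rows are
# points; the values are arbitrary). The result should hold the pairwise
# Euclidean distances, e.g. 5 between (0, 0) and (3, 4).
from mxnet import nd

points = nd.array([[0, 0], [3, 4], [6, 8]])
print(get_distance_matrix(points))
# [[ 0.  5. 10.]
#  [ 5.  0.  5.]
#  [10.  5.  0.]]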