Example No. 1
def test_elementwise():
    a = nd.ones(shape=(LARGE_X, SMALL_Y))
    b = nd.ones(shape=(LARGE_X, SMALL_Y))
    res = a + b
    assert np.sum(res[-1].asnumpy() == 2) == a.shape[1]
    res = a + 1
    assert np.sum(res[-1].asnumpy() == 2) == a.shape[1]
    res = nd.sqrt(a + 3)
    assert np.sum(res[-1].asnumpy() == 2) == a.shape[1]
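A minimal, hypothetical setup for running the test above; LARGE_X and SMALL_Y are assumed to be module-level size constants (the values here are small, purely illustrative ones), and nd/np the usual MXNet and NumPy imports.

import numpy as np
from mxnet import nd

LARGE_X = 1000   # illustrative stand-in for the large first dimension used in the real test
SMALL_Y = 50

test_elementwise()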
Example No. 2
 def forward(self, x):
     with x.context:
         c = nd.softmax(self.b.data(), axis=1)
         u = nd.dot(x, self.w.data())
         s = nd.multiply(c, u)
         s_nrm = nd.sum(s*s)
         fact = s_nrm / ( 1. + s_nrm)
         v = fact * s / nd.sqrt(s_nrm)
         self.u_v = nd.sum(nd.multiply(u, v))
         return u
def grad_clipping(params, clipping_norm, ctx):
    """Gradient clipping."""
    if clipping_norm is not None:
        norm = nd.array([0.0], ctx)
        for p in params:
            norm += nd.sum(p.grad ** 2)
        norm = nd.sqrt(norm).asscalar()
        if norm > clipping_norm:
            for p in params:
                p.grad[:] *= clipping_norm / norm
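A minimal usage sketch for the gradient-clipping helper above, assuming a single parameter whose gradient has already been populated (names and values are illustrative):

import mxnet as mx
from mxnet import nd, autograd

ctx = mx.cpu()
w = nd.random.normal(shape=(4, 3), ctx=ctx)
w.attach_grad()
with autograd.record():
    loss = (w * w).sum()
loss.backward()
grad_clipping([w], clipping_norm=1.0, ctx=ctx)  # rescales w.grad in place if its L2 norm exceeds 1.0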
def pure_batch_norm(x, gamma, beta, eps=1e-5):
    assert len(x.shape) in (2, 4)
    if len(x.shape) == 2:
        mean = x.mean(axis=0)
        variance = ((x - mean)**2).mean(axis=0)
    else:
        mean = x.mean(axis=(0, 2, 3), keepdims=True)
        variance = ((x - mean)**2).mean(axis=(0, 2, 3), keepdims=True)
    x_hat = (x - mean) / nd.sqrt(variance + eps)
    return gamma.reshape(mean.shape) * x_hat + beta.reshape(mean.shape)
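A quick usage sketch of pure_batch_norm for the 4-D (convolutional) branch, with illustrative shapes:

from mxnet import nd

x = nd.random.normal(shape=(8, 3, 16, 16))   # batch x channel x height x width
gamma = nd.ones(shape=(3,))                  # one scale per channel
beta = nd.zeros(shape=(3,))                  # one shift per channel
y = pure_batch_norm(x, gamma, beta)
print(y.mean(axis=(0, 2, 3)))                # per-channel means should be close to 0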
Example No. 5
def grad_clipping(params, theta, ctx):
    """Gradient clipping."""
    if theta is not None:
        norm = nd.array([0.0], ctx)
        for p in params:
            norm += nd.sum(p.grad ** 2)
        norm = nd.sqrt(norm).asscalar()
        if norm > theta:
            for p in params:
                p.grad[:] *= theta / norm
 def implement_0(self, x, label):
     '''
     following the sphereface code of caffe
     '''
     #  weight normalize
     with x.context:
         w = self.weight.data()
     with mx.autograd.pause():
         w_norm = w / nd.sqrt(nd.sum(nd.power(w, 2), axis=1)).reshape(
             (-1, 1))
         w[:] = w_norm
     #  x_norm = |x|
     x_norm = nd.power(x, 2)
     x_norm = nd.sum(x_norm, axis=1)
     x_norm = nd.sqrt(x_norm)
     #  cos_theta = x'w/|x|. note: |w| = 1
     cos_theta = nd.dot(x, w, transpose_b=True)
     cos_theta = cos_theta / x_norm.reshape((-1, 1))
     #  cos_theta_quadratic & cos_theta_quartic
     cos_theta_quadratic = cos_theta**2
     cos_theta_quartic = cos_theta**4
     with mx.autograd.pause():
         #  sign_0 = sign(cos_theta)
         sign_0 = nd.sign(cos_theta)
         #  sign_3 = sign_0 * sign(2 * cos_theta_quadratic_ - 1)
         sign_3 = sign_0 * nd.sign(2 * cos_theta_quadratic - 1)
         #  sign_4 = 2 * sign_0 + sign_3 - 3
         sign_4 = 2 * sign_0 + sign_3 - 3
     #  phi_theta = (sign_3 * (8 * cos_theta_quartic - 8 * cos_theta_quadratic + 1) + sign_4)
     phi_theta = sign_3 * (8 * cos_theta_quartic - 8 * cos_theta_quadratic +
                           1) + sign_4
     x_norm_phi_theta = x_norm.reshape((-1, 1)) * phi_theta
     #  i=j index
     with mx.autograd.pause():
         index = nd.one_hot(label, x_norm_phi_theta.shape[1])
     #  output
     with mx.autograd.pause():
         lamb = self.__get_lambda()  # 10
     output = nd.dot(x, w, transpose_b=True)
     output2 = output * (1.0 - index) + x_norm_phi_theta * index
     output3 = (output2 + lamb * nd.dot(x, w, transpose_b=True)) / (1 +
                                                                    lamb)
     return output3
Example No. 7
    def squash(self, vectors, axis):
        epsilon = 1e-9
        vectors_l2norm = nd.square(vectors).sum(
            axis=axis, keepdims=True)  #.expand_dims(axis=axis)

        scale_factor = vectors_l2norm / (1 + vectors_l2norm)
        vectors_squashed = scale_factor * (
            vectors / nd.sqrt(vectors_l2norm + epsilon))  # element-wise

        return vectors_squashed
def grad_clipping(params, theta, ctx):
    """Gradient clipping."""
    if theta is not None:
        norm = nd.array([0.0], ctx)
        for p in params:
            norm += nd.sum(p.grad ** 2)
        norm = nd.sqrt(norm).asscalar()
        if norm > theta:
            for p in params:
                p.grad[:] *= theta / norm
Example No. 9
def grad_clipping(params, clipping_norm, ctx):
    """Gradient clipping."""
    if clipping_norm is not None:
        norm = nd.array([0.0], ctx)
        for p in params:
            norm += nd.sum(p.grad ** 2)
        norm = nd.sqrt(norm).asscalar()
        if norm > clipping_norm:
            for p in params:
                p.grad[:] *= clipping_norm / norm
Example No. 10
    def forward(self, data, weight, mapping_label, depth):
        """
        """
        with autograd.record():
            norm_data = nd.L2Normalization(data)
            norm_weight = nd.L2Normalization(weight)
            #
            fc7 = nd.dot(norm_data, norm_weight, transpose_b=True)
            #
            mapping_label_onehot = mx.nd.one_hot(indices=mapping_label,
                                                 depth=depth,
                                                 on_value=1.0,
                                                 off_value=0.0)
            # cosface
            if self.loss_m1 == 1.0 and self.loss_m2 == 0.0:
                _one_hot = mapping_label_onehot * self.loss_m3
                fc7 = fc7 - _one_hot
            elif self.loss_m1 == 1.0 and self.loss_m3 == 0.0:
                fc7_onehot = fc7 * mapping_label_onehot
                cos_t = fc7_onehot
                t = nd.arccos(cos_t)
                if self.loss_m1 != 1.0:
                    t = t * self.loss_m1
                if self.loss_m2 != 0.0:
                    t = t + self.loss_m2
                margin_cos = nd.cos(t)
                if self.loss_m3 != 0.0:
                    margin_cos = margin_cos - self.loss_m3
                margin_fc7 = margin_cos
                margin_fc7_onehot = margin_fc7 * mapping_label_onehot
                diff = margin_fc7_onehot - fc7_onehot
                fc7 = fc7 + diff
            else:
                cosine = fc7
                sine = nd.sqrt(1 - fc7 * fc7)
                m = nd.array([self.loss_m2], ctx=fc7.context)
                # phi = cosine * nd.cos(m) - sine * nd.sin(m)
                cos_t = fc7_onehot
                t = nd.arccos(cos_t)
                phi = nd.cos(t + self.loss_m2)
                mask = cosine > phi
                print('mask', mask.shape)
                hard_example = cosine[mask]  # entries of cosine where cosine > phi
                self.t = self.t.as_in_context(fc7.context)
                self.t = cosine * mapping_label_onehot.mean() * 0.01 + (
                    1 - 0.01) * self.t
                print("cosine", cosine.shape)
                print(self.t.shape)
                print('hard_example', hard_example.shape)
                cosine[mask] = hard_example * (self.t + hard_example)
                fc7 = mapping_label_onehot * phi + cosine * (
                    1.0 - mapping_label_onehot)

            fc7 = fc7 * self.loss_s
            return fc7, mapping_label_onehot
Example No. 11
def batch_norm2D(X,
                 gamma,
                 beta,
                 is_training,
                 moving_mean,
                 moving_variance,
                 eps=1e-5,
                 moving_momentum=0.9):
    '''
    Batch normalization is still needed at test time, with one change: the per-batch mean
    and variance used during training are replaced by the mean and variance of the *entire*
    training set. Computing those exactly is expensive for large datasets, so we approximate
    them with a moving average.
    '''
    assert len(X.shape) in (2, 4)
    # fully connected: batch_size x feature
    if len(X.shape) == 2:
        # mean and variance of each input dimension across the batch
        mean = X.mean(axis=0)
        variance = ((X - mean)**2).mean(axis=0)
    # 2D convolution: batch_size x channel x height x width
    else:
        # per-channel mean and variance, keeping the 4D shape so broadcasting works
        mean = X.mean(axis=(0, 2, 3), keepdims=True)
        variance = ((X - mean)**2).mean(axis=(0, 2, 3), keepdims=True)
        # reshape so broadcasting works correctly
        moving_mean = moving_mean.reshape(mean.shape)
        moving_variance = moving_variance.reshape(mean.shape)

    # normalize
    if is_training:
        X_hat = (X - mean) / nd.sqrt(variance + eps)
        #!!! update the global (moving) mean and variance
        moving_mean[:] = moving_momentum * moving_mean + (
            1.0 - moving_momentum) * mean
        moving_variance[:] = moving_momentum * moving_variance + (
            1.0 - moving_momentum) * variance
    else:
        #!!! at test time, use the global (moving) mean and variance
        X_hat = (X - moving_mean) / nd.sqrt(moving_variance + eps)

    # scale and shift
    return gamma.reshape(mean.shape) * X_hat + beta.reshape(mean.shape)
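A usage sketch for batch_norm2D, assuming the moving statistics are preallocated once per layer (shapes are illustrative):

from mxnet import nd

x = nd.random.normal(shape=(8, 16))   # fully connected case: batch x feature
gamma, beta = nd.ones(shape=(16,)), nd.zeros(shape=(16,))
moving_mean, moving_variance = nd.zeros(shape=(16,)), nd.zeros(shape=(16,))

y_train = batch_norm2D(x, gamma, beta, True, moving_mean, moving_variance)   # training: updates the moving stats in place
y_test = batch_norm2D(x, gamma, beta, False, moving_mean, moving_variance)   # inference: uses the accumulated stats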
Example No. 12
 def forward(self, x):
     x_skip = x
     x = self.layer_norm(x)
     q = x
     k = x
     v = x
     # scale by sqrt(d_k), the size of the last (feature) dimension of k
     dk = k.shape[-1] ** 0.5
     qk = nd.softmax(nd.dot(q, k, transpose_b=True) / dk)
     qkv = nd.dot(qk, v)
     x = qkv + x_skip
     return x
Example No. 13
def log_rmse(net, features, labels):
    """
    使用对数均方误差评价模型
    :param net:
    :param features:
    :param labels:
    :return:
    """
    clipped_preds = nd.clip(net(features), 1, float('inf'))
    rmse = nd.sqrt(2 * loss(clipped_preds.log(), labels.log()).mean())
    return rmse.asscalar()
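A sketch of how log_rmse might be called; it assumes a module-level `loss` (here a gluon L2Loss) and any regression model `net` (all names and shapes are illustrative):

from mxnet import nd
from mxnet.gluon import nn, loss as gloss

loss = gloss.L2Loss()
net = nn.Dense(1)
net.initialize()

features = nd.random.uniform(shape=(32, 10))
labels = nd.random.uniform(low=1, high=100, shape=(32, 1))
print(log_rmse(net, features, labels))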
Example No. 14
def normal():
    """
    它的每个元素都随机采样于均值为0、标准差为1的正态分布。nd.sqrt(nd.power(a, 2).sum())
    :return:
    """
    n = nd.normal(0, 1, shape=(2, 2))
    logger.info(n)

    a = nd.array([1, 2, 3, 4])
    print(a.norm())
    print(nd.sqrt(nd.power(a, 2).sum()))
Example No. 15
def pure_batch_norm(X, gamma, beta, eps=1e-5):
    if len(X.shape) not in (2, 4):
        raise ValueError('only supports dense or 2dconv')

    print("gamma", gamma)
    print("beta", beta)
    # dense
    if len(X.shape) == 2:
        N, C = X.shape
        # mini-batch mean
        mean = nd.mean(X, axis=0)
        print("mean:", mean)
        # mini-batch variance
        variance = nd.mean((X - mean)**2, axis=0)
        print("var:", variance)
        # normalize
        X_hat = (X - mean) * 1.0 / nd.sqrt(variance + eps)
        # scale and shift
        out = gamma * X_hat + beta

    # 2d conv
    elif len(X.shape) == 4:
        # extract the dimensions
        N, C, H, W = X.shape
        # mini-batch mean
        mean = nd.mean(X, axis=(0, 2, 3))
        print("mean", mean)
        # mini-batch variance
        variance = nd.mean((X - mean.reshape((1, C, 1, 1)))**2, axis=(0, 2, 3))
        print("variance", variance)
        # normalize
        X_hat = (X - mean.reshape(
            (1, C, 1,
             1))) * 1.0 / nd.sqrt(variance.reshape((1, C, 1, 1)) + eps)
        #X_hat = (X - mean.reshape((1, C, 1, 1)))
        print("X_hat", X_hat)
        #print(X_hat)
        # scale and shift
        out = gamma.reshape((1, C, 1, 1)) * X_hat + beta.reshape((1, C, 1, 1))
    return out
Example No. 16
    def forward(self, x):
        if autograd.is_training():
            _, *tmp = x.shape
            self.gamma.shape = [1] + tmp
            self.gamma._finish_deferred_init()
            self.beta.shape = [1] + tmp
            self.beta._finish_deferred_init()

        mu = x.mean(axis=1, keepdims=True)
        sigma = nd.sqrt(((x - mu)**2).mean(axis=1, keepdims=True))
        return ((x - mu) /
                (sigma + self.eps)) * self.gamma.data() + self.beta.data()
Example No. 17
def batch_norm(x, gamma, beta, is_training, moving_mean, moving_variance, eps=1e-5, moving_momentum=0.9):
    assert len(x.shape) in (2, 4)
    if len(x.shape) == 2:
        mean = x.mean(axis=0)
        variance = ((x - mean) ** 2).mean(axis=0)
    else:
        mean = x.mean(axis=(0, 2, 3), keepdims=True)
        variance = ((x - mean) ** 2).mean(axis=(0, 2, 3), keepdims=True)
        # reshape so broadcasting works correctly
        moving_mean = moving_mean.reshape(mean.shape)
        moving_variance = moving_variance.reshape(mean.shape)
    if is_training:
        x_hat = (x - mean) / nd.sqrt(variance + eps)
        # update the global mean and variance
        moving_mean[:] = moving_momentum * moving_mean + (1.0 - moving_momentum) * mean
        moving_variance[:] = moving_momentum * moving_variance + (1.0 - moving_momentum) * variance
    else:
        # testing: using the training stage mean and variance
        x_hat = (x - moving_mean) / nd.sqrt(moving_variance + eps)

    return gamma.reshape(mean.shape) * x_hat + beta.reshape(mean.shape)
Example No. 18
def pure_batch_norm(x, gamma, beta, eps=1e-5):
    assert len(x.shape) in (2, 4)
    if len(x.shape) == 2:  # fc layer: batch * feature
        mean = x.mean(axis=0)
        variance = ((x - mean) ** 2).mean(axis=0)
    else:
        # 2D conv input: batch * channel * height * width
        mean = x.mean(axis=(0, 2, 3), keepdims=True)
        variance = ((x - mean) ** 2).mean(axis=(0, 2, 3), keepdims=True)

    x_hat = (x - mean) / nd.sqrt(variance + eps)
    return gamma.reshape(mean.shape) * x_hat + beta.reshape(mean.shape)
Example No. 19
    def squash(self, vectors, axis):
        epsilon = 1e-9
        vectors_l2norm = nd.square(vectors).sum(axis=axis, keepdims=True)

        assert vectors_l2norm.shape == (self.batch_size, 1, self.num_capsule,
                                        1, 1)  # 1,10,1,1

        scale_factor = vectors_l2norm / (1 + vectors_l2norm)
        vectors_squashed = scale_factor * (
            vectors / nd.sqrt(vectors_l2norm + epsilon))  # element-wise

        return vectors_squashed
Example No. 20
    def sqrt(self, tensor_in):
        """
        Element-wise square-root value of the input.

        Args:
            tensor_in (Tensor): Tensor object

        Returns:
            MXNet NDArray: Element-wise square-root value.
        """
        tensor_in = self.astensor(tensor_in)
        return nd.sqrt(tensor_in)
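This method only wraps nd.sqrt after converting its input with the backend's astensor; a standalone illustration of the same element-wise behaviour:

from mxnet import nd

x = nd.array([4.0, 9.0, 16.0])
print(nd.sqrt(x))   # [2. 3. 4.]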
Example No. 21
def batch_norm(X, gamma, beta, moving_mean, moving_var, eps, momentum):
    # use autograd to determine whether we are in training or prediction mode
    if not autograd.is_training():
        # in prediction mode, use the moving-average mean and variance directly
        X_hat = (X - moving_mean) / nd.sqrt(moving_var + eps)
    else:
        assert len(X.shape) in (2, 4)
        if len(X.shape) == 2:
            # fully connected layer: compute mean and variance over the feature dimension
            mean = X.mean(axis=0)
            var = ((X - mean)**2).mean(axis=0)
        else:
            # 2D convolution layer: compute mean and variance per channel (axis=1),
            # keeping the 4D shape so broadcasting works
            mean = X.mean(axis=(0, 2, 3), keepdims=True)
            var = ((X - mean)**2).mean(axis=(0, 2, 3), keepdims=True)
        X_hat = (X - mean) / nd.sqrt(var + eps)
        # update the moving-average mean and variance
        moving_mean = momentum * moving_mean + (1.0 - momentum) * mean
        moving_var = momentum * moving_var + (1.0 - momentum) * var
    Y = gamma * X_hat + beta
    return Y, moving_mean, moving_var
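Unlike the in-place variants elsewhere on this page, this batch_norm returns the updated moving statistics, so the caller has to capture them; a minimal sketch with illustrative shapes:

from mxnet import nd, autograd

x = nd.random.normal(shape=(4, 8))
gamma, beta = nd.ones(shape=(8,)), nd.zeros(shape=(8,))
moving_mean, moving_var = nd.zeros(shape=(8,)), nd.zeros(shape=(8,))

with autograd.record():   # autograd.is_training() is True inside record()
    y, moving_mean, moving_var = batch_norm(x, gamma, beta, moving_mean, moving_var,
                                            eps=1e-5, momentum=0.9)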
Example No. 22
def batch_norm(X,
               gamma,
               beta,
               is_training,
               moving_mean,
               moving_variance,
               eps=1e-5,
               moving_momentum=0.9):
    assert len(X.shape) in (2, 4)

    # batch_size x feature
    if len(X.shape) == 2:
        mean = X.mean(axis=0)
        # print(mean)
        variance = ((X - mean)**2).mean(axis=0)
    # 2D convolution: batch_size x channels x height x width
    else:
        mean = X.mean(
            axis=(0, 2, 3),
            keepdims=True)  # compute mean and variance for each channel
        # print(mean)
        variance = ((X - mean)**2).mean(axis=(0, 2, 3), keepdims=True)

        # reshape so broadcasting works correctly
        moving_mean = moving_mean.reshape(mean.shape)
        moving_variance = moving_variance.reshape(mean.shape)

    # normalization

    if is_training:
        X_hat = (X - mean) / nd.sqrt(variance + eps)
        # update global mean and variance
        moving_mean[:] = moving_momentum * moving_mean + (
            1.0 - moving_momentum) * mean
        moving_variance[:] = moving_momentum * moving_variance + (
            1.0 - moving_momentum) * variance
    else:
        X_hat = (X - moving_mean) / nd.sqrt(moving_variance + eps)

    return gamma.reshape(mean.shape) * X_hat + beta.reshape(mean.shape)
Example No. 23
    def forward(self, is_train, req, in_data, out_data, aux):
        x = in_data[0]
        gamma = in_data[1]
        beta = in_data[2]
        moving_mean = in_data[3]
        moving_var = in_data[4]
        # print(x.sum())
        y = out_data[0]

        if is_train:
            mean = nd.mean(x, axis=(0, 2, 3))
            var = nd.array(np.var(x.asnumpy(), axis=(0, 2, 3)))
            #print(moving_mean ,self.momentum, mean)
            moving_mean = moving_mean * self.momentum + mean * (1 -
                                                                self.momentum)
            moving_var = moving_var * self.momentum + var * (1 - self.momentum)
            self.assign(in_data[3], req[0], moving_mean)
            self.assign(in_data[4], req[0], moving_var)

        else:
            mean = moving_mean
            var = moving_var

        quan_gamma = self.quantize(gamma / (nd.sqrt(var + self.eps)))
        quan_beta = self.quantize(beta -
                                  mean * gamma / nd.sqrt(var + self.eps))

        y = nd.BatchNorm(x,
                         gamma=quan_gamma,
                         beta=quan_beta,
                         moving_mean=nd.zeros(shape=moving_mean.shape),
                         moving_var=nd.ones(shape=moving_var.shape),
                         eps=self.eps,
                         momentum=self.momentum,
                         fix_gamma=self.fix_gamma,
                         name=self.name)

        self.assign(out_data[0], req[0], mx.nd.array(y))
Example No. 24
def batch_norm(X, gamma, beta, moving_mean, moving_var, eps, momentum):
    if not autograd.is_training():  # autograd.record() is a context manager, not a mode check
        # prediction mode: use the moving-average mean and variance directly
        X_hat = (X - moving_mean) / nd.sqrt(moving_var + eps)
    else:
        assert len(X.shape) in (2, 4)  # input must be 2-D (dense) or 4-D (conv)
        if len(X.shape) == 2:
            # fully connected layer: mean and variance over the batch dimension (axis=0),
            # i.e. one mean per feature column
            mean = X.mean(axis=0)
            var = ((X - mean)**2).mean(axis=0)
        else:
            # 2D convolution layer: mean and variance per channel (axis=1)
            mean = X.mean(axis=(0, 2, 3), keepdims=True)
            var = ((X - mean)**2).mean(axis=(0, 2, 3), keepdims=True)

        # training mode: standardize with the current batch statistics
        X_hat = (X - mean) / nd.sqrt(var + eps)

        # update the moving-average mean and variance
        moving_mean = momentum * moving_mean + (1.0 - momentum) * mean
        moving_var = momentum * moving_var + (1.0 - momentum) * var
    Y = gamma * X_hat + beta  # scale and shift
    return Y, moving_mean, moving_var
Example No. 25
def pure_batch_norm(X, gamma, beta, eps=1e-5):
    assert len(X.shape) in (2, 4)
    # fully connect: batch_size * feature
    if len(X.shape) == 2:
        mean = X.mean(axis=0)   # mean in batch_size-direction, and each feature has a mean
        variance = ((X-mean)**2).mean(axis=0)
    # 2D conv
    else:
        # mean in batch-direction, each channel has a mean
        mean = X.mean(axis=(0, 2, 3), keepdims=True)
        variance = ((X - mean)**2).mean(axis=(0, 2, 3), keepdims=True)

    X_hat = (X - mean) / nd.sqrt(variance + eps)
    return gamma.reshape(mean.shape) * X_hat + beta.reshape(mean.shape)     # reshape?
Example No. 26
def getwh(scales, ratios, fw, fh, srmode):
    if srmode == 'few':
        num = scales.size + ratios.size - 1
        width = nd.zeros((num,))
        height = nd.zeros((num,))
        
        sqt_ratios = nd.sqrt(ratios)
        width[:ratios.size] = scales[0] * sqt_ratios
        height[:ratios.size] = width[:ratios.size] / ratios
        
        width[ratios.size:] = scales[1:] * sqt_ratios[0]
        height[ratios.size:] = width[ratios.size:] / ratios[0]
    else:
        rscales = nd.repeat(scales, ratios.size)
        rratios = nd.tile(ratios, scales.size)
        
        width = rscales * nd.sqrt(rratios)
        height = width / rratios
        
    width = width * fw
    height = height * fh
    
    return width, height
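A usage sketch for getwh with illustrative anchor scales and aspect ratios; in 'few' mode the first scale is paired with every ratio and the remaining scales with the first ratio, giving scales.size + ratios.size - 1 anchors:

from mxnet import nd

scales = nd.array([0.1, 0.2, 0.4])
ratios = nd.array([1.0, 2.0, 0.5])
w, h = getwh(scales, ratios, fw=1.0, fh=1.0, srmode='few')   # 3 + 3 - 1 = 5 anchor sizes
print(w, h)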
Example No. 27
def batch_norm(X, gamma, beta, moving_mean, moving_var, eps, momentum):
    # use autograd to determine whether we are in training or prediction mode
    if not autograd.is_training():
        # in prediction mode, use the moving-average mean and variance directly
        X_hat = (X - moving_mean) / nd.sqrt(moving_var + eps)
    else:
        assert len(X.shape) in (2, 4)
        if len(X.shape) == 2:
            # fully connected layer: compute mean and variance over the feature dimension
            mean = X.mean(axis=0)
            var = ((X - mean) ** 2).mean(axis=0)
        else:
            # 2D convolution layer: compute mean and variance per channel (axis=1),
            # keeping the shape of X so broadcasting works later
            mean = X.mean(axis=(0, 2, 3), keepdims=True)
            var = ((X - mean) ** 2).mean(axis=(0, 2, 3), keepdims=True)
        # in training mode, standardize with the current batch mean and variance
        X_hat = (X - mean) / nd.sqrt(var + eps)
        # update the moving-average mean and variance
        moving_mean = momentum * moving_mean + (1.0 - momentum) * mean
        moving_var = momentum * moving_var + (1.0 - momentum) * var
    Y = gamma * X_hat + beta  # scale and shift
    return Y, moving_mean, moving_var
Example No. 28
def batch_norm(X,
               gamma,
               beta,
               is_training,
               moving_mean,
               moving_variance,
               eps=1e-5,
               moving_momentum=0.9):
    assert len(X.shape) in (2, 4)
    # fully connected: batch_size x feature
    if len(X.shape) == 2:
        # mean and variance of each input dimension across the batch
        mean = X.mean(axis=0)
        variance = ((X - mean)**2).mean(axis=0)
    # 2D convolution: batch_size x channel x height x width
    else:
        # per-channel mean and variance, keeping the 4D shape so broadcasting works
        mean = X.mean(axis=(0, 2, 3), keepdims=True)
        variance = ((X - mean)**2).mean(axis=(0, 2, 3), keepdims=True)
        # reshape so broadcasting works correctly
        moving_mean = moving_mean.reshape(mean.shape)
        moving_variance = moving_variance.reshape(mean.shape)

    # normalize
    if is_training:
        X_hat = (X - mean) / nd.sqrt(variance + eps)
        #!!! update the global (moving) mean and variance
        moving_mean[:] = moving_momentum * moving_mean + (
            1.0 - moving_momentum) * mean
        moving_variance[:] = moving_momentum * moving_variance + (
            1.0 - moving_momentum) * variance
    else:
        #!!! at test time, use the global (moving) mean and variance
        X_hat = (X - moving_mean) / nd.sqrt(moving_variance + eps)

    # scale and shift
    return gamma.reshape(mean.shape) * X_hat + beta.reshape(mean.shape)
Example No. 29
def normalize(feature_map):
    """

    :param feature_map: either F_a or F_bp
    :return:
    normalized feature map
    response
    """
    response = nd.sum(feature_map * feature_map, axis=1, keepdims=True)
    normed_feature_map = feature_map / nd.sqrt(response)
    # response should be scaled to (0, 1)
    response = (response - nd.min(response)) / (nd.max(response) -
                                                nd.min(response))
    # When the array is on a device, ordinary operations do not change the storage location of the array
    return normed_feature_map, response
Example No. 30
    def _merge_bn_to_condconv2d(m):
        if isinstance(m, CondConv2D):
            base_name = m.name.replace(conv_name, bn_name)
            print(f"Merge {base_name} to {m.name}")
            gamma = bn_collections[base_name + "_gamma"]
            beta = bn_collections[base_name + "_beta"]
            mean = bn_collections[base_name + "_running_mean"]
            var = bn_collections[base_name + "_running_var"]

            weight = m.weight.data()
            w_shape = m.weight.shape
            m.weight.set_data((weight.reshape(0, 0, -1) * gamma.reshape(0, 0, 1) \
                                  / nd.sqrt(var + 1e-10).reshape(0, 0, 1)).reshape(w_shape))
            if m.bias is None:
                m._kwargs['no_bias'] = False
                m.bias = m.params.get('bias',
                                      shape=w_shape[:2],
                                      init="zeros",
                                      allow_deferred_init=True)
                m.bias.initialize()
                finished_params.append(m.bias.name)
            bias = m.bias.data()
            m.bias.set_data(gamma * (bias - mean) / nd.sqrt(var + 1e-10) +
                            beta)
Example No. 31
    def adam(params, vs, sqrs, lr, batch_size, t):
        beta1 = 0.9
        beta2 = 0.999
        eps_stable = 1e-8

        for param, v, sqr in zip(params, vs, sqrs):
            g = param.grad / batch_size

            v[:] = beta1 * v + (1. - beta1) * g
            sqr[:] = beta2 * sqr + (1. - beta2) * nd.square(g)

            v_bias_corr = v / (1. - beta1 ** t)
            sqr_bias_corr = sqr / (1. - beta2 ** t)

            div = lr * v_bias_corr / (nd.sqrt(sqr_bias_corr) + eps_stable)
            param[:] = param - div
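A sketch of the bookkeeping this Adam step expects, assuming the function is reachable at call scope: one state pair (v, sqr) per parameter, both starting at zero, and a time step t starting at 1 (values are illustrative):

from mxnet import nd, autograd

params = [nd.random.normal(shape=(5,))]
for p in params:
    p.attach_grad()
vs = [nd.zeros_like(p) for p in params]
sqrs = [nd.zeros_like(p) for p in params]

with autograd.record():
    loss = (params[0] ** 2).sum()
loss.backward()
adam(params, vs, sqrs, lr=0.01, batch_size=1, t=1)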
    def select_action(self, state):
        with autograd.record():
            mu, sigma_sq = self.model(state.as_in_context(model_ctx))
            # sigma_sq = nd.softrelu(sigma_sq)
            # the implementation of softplus
            sigma_sq = nd.log(1 + nd.exp(sigma_sq))

            eps = nd.random.normal(0, 1, mu.shape, dtype=np.float32)
            # calculate the probability
            action = mu + nd.sqrt(sigma_sq) * eps
            prob = normal(action, mu, sigma_sq)

            entropy = -0.5 * (nd.log(sigma_sq + math.pi * 2) + 1)
            log_prob = nd.log(prob)

        return action, log_prob, entropy
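The normal(action, mu, sigma_sq) call above refers to a Gaussian density helper defined elsewhere in that codebase; a sketch of what such a helper might look like:

import math
from mxnet import nd

def normal(x, mu, sigma_sq):
    # element-wise Gaussian probability density (illustrative helper)
    a = nd.exp(-((x - mu) ** 2) / (2 * sigma_sq))
    b = 1.0 / nd.sqrt(2 * sigma_sq * math.pi)
    return a * b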
Example No. 33
def pure_batch_norm(X, gamma, beta, eps=1e-5):
    assert len(X.shape) in (2, 4)
    # fully connected: batch_size x feature
    if len(X.shape) == 2:
        # mean and variance of each input dimension across the batch
        mean = X.mean(axis=0, keepdims=True)
        variance = ((X - mean)**2).mean(axis=0, keepdims=True)
    # 2D convolution: batch_size x channel x height x width
    else:
        # per-channel mean and variance, keeping the 4D shape so broadcasting works
        mean = X.mean(axis=(0, 2, 3), keepdims=True)
        variance = ((X - mean)**2).mean(axis=(0, 2, 3), keepdims=True)

    # normalize
    X_hat = (X - mean) / nd.sqrt(variance + eps)
    # scale and shift
    return gamma.reshape(mean.shape) * X_hat + beta.reshape(mean.shape)
Example No. 34
def get_distance_matrix(x):
    """Get distance matrix given a matrix. Used in testing."""
    square = nd.sum(x ** 2.0, axis=1, keepdims=True)
    distance_square = square + square.transpose() - (2.0 * nd.dot(x, x.transpose()))
    return nd.sqrt(distance_square)
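A quick check of get_distance_matrix on illustrative data; entry [i, j] is the Euclidean distance between rows i and j:

from mxnet import nd

x = nd.array([[0.0, 0.0], [3.0, 4.0]])
print(get_distance_matrix(x))   # expect 0 on the diagonal and 5 off it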