def compute_retrospective_loss(self, observed_arr, encoded_arr,
                                   decoded_arr, re_encoded_arr):
        '''
        Compute retrospective loss.

        Returns:
            `mxnet.ndarray.NDArray` of losses, one element per sample in the
            batch (the mean of the clipped delta over all non-batch axes).
        '''
        if self.__output_neuron_count == self.__hidden_neuron_count:
            target_arr = nd.broadcast_sub(
                encoded_arr, nd.expand_dims(observed_arr.mean(axis=2), axis=2))
            summary_delta_arr = nd.sqrt(nd.power(decoded_arr - target_arr, 2))
        else:
            # Sample a uniformly random subset of dimensions (without replacement).
            if self.__output_neuron_count > self.__hidden_neuron_count:
                all_dim_arr = np.arange(self.__output_neuron_count)
                np.random.shuffle(all_dim_arr)
                choiced_dim_arr = all_dim_arr[:self.__hidden_neuron_count]
                target_arr = nd.broadcast_sub(
                    encoded_arr,
                    nd.expand_dims(observed_arr[:, :,
                                                choiced_dim_arr].mean(axis=2),
                                   axis=2))
                summary_delta_arr = nd.sqrt(
                    nd.power(decoded_arr[:, :, choiced_dim_arr] - target_arr,
                             2))
            else:
                all_dim_arr = np.arange(self.__hidden_neuron_count)
                np.random.shuffle(all_dim_arr)
                choiced_dim_arr = all_dim_arr[:self.__output_neuron_count]
                target_arr = nd.broadcast_sub(
                    encoded_arr[:, :, choiced_dim_arr],
                    nd.expand_dims(observed_arr.mean(axis=2), axis=2))
                summary_delta_arr = nd.sqrt(
                    nd.power(decoded_arr - target_arr, 2))

        match_delta_arr = None
        for i in range(self.__batch_size):
            arr = nd.sqrt(
                nd.power(encoded_arr[i, -1] - re_encoded_arr[i, -1], 2))
            if match_delta_arr is None:
                match_delta_arr = nd.expand_dims(arr, axis=0)
            else:
                match_delta_arr = nd.concat(match_delta_arr,
                                            nd.expand_dims(arr, axis=0),
                                            dim=0)

        delta_arr = summary_delta_arr + nd.expand_dims(
            self.__retrospective_lambda * match_delta_arr, axis=1)
        v = nd.norm(delta_arr)
        if v > self.__grad_clip_threshold:
            delta_arr = delta_arr * self.__grad_clip_threshold / v

        loss = nd.mean(delta_arr, axis=0, exclude=True)

        return loss
def batch_norm(X,
               gamma,
               beta,
               is_training,
               moving_mean,
               moving_variance,
               eps=1e-5,
               moving_momentum=0.9):
    assert len(X.shape) in (2, 4)

    if len(X.shape) == 2:
        # Fully connected: batch_size x features.
        mean = X.mean(axis=0)
        variance = ((X - mean)**2).mean(axis=0)

    else:
        # 2D convolution: batch_size x channels x height x width.
        mean = X.mean(axis=(0, 2, 3), keepdims=True)
        variance = ((X - mean)**2).mean(axis=(0, 2, 3), keepdims=True)

        # Reshape so that broadcasting works correctly.
        moving_mean = moving_mean.reshape(mean.shape)
        moving_variance = moving_variance.reshape(mean.shape)

    if is_training:
        X_hat = (X - mean) / nd.sqrt(variance + eps)

        # Update the global (moving) mean and variance.
        moving_mean[:] = moving_momentum * moving_mean + (
            1.0 - moving_momentum) * mean
        moving_variance[:] = moving_momentum * moving_variance + (
            1.0 - moving_momentum) * variance

    else:
        # At test time, use the global statistics.
        X_hat = (X - moving_mean) / nd.sqrt(moving_variance + eps)

    # Scale and shift.
    return gamma.reshape(mean.shape) * X_hat + beta.reshape(mean.shape)
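A minimal driver for the batch_norm above, using toy shapes of my own choosing and assuming only that mxnet is installed: one training-mode call on a fully connected batch, which also updates the running statistics in place.

from mxnet import nd

X = nd.random.normal(shape=(4, 3))                     # batch of 4 samples, 3 features
gamma, beta = nd.ones(3), nd.zeros(3)
moving_mean, moving_variance = nd.zeros(3), nd.ones(3)

Y = batch_norm(X, gamma, beta, True, moving_mean, moving_variance)
print(Y.mean(axis=0))    # roughly zero per feature after normalization
print(moving_mean)       # nudged toward the batch mean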
Example #3
def batch_norm(X, gamma, beta, is_training, moving_mean, moving_variance,
               eps=1e-5, moving_momentum=0.9):
    assert len(X.shape) in (2, 4)
    # Fully connected: batch_size x features
    if len(X.shape) == 2:
        # Mean and variance of each input dimension over the batch.
        mean = X.mean(axis=0)
        variance = ((X - mean)**2).mean(axis=0)
    # 2D convolution: batch_size x channels x height x width
    else:
        # Per-channel mean and variance; keep the 4D shape so broadcasting works.
        mean = X.mean(axis=(0, 2, 3), keepdims=True)
        variance = ((X - mean)**2).mean(axis=(0, 2, 3), keepdims=True)
        # Reshape so that broadcasting works correctly.
        moving_mean = moving_mean.reshape(mean.shape)
        moving_variance = moving_variance.reshape(mean.shape)

    # Normalize.
    if is_training:
        X_hat = (X - mean) / nd.sqrt(variance + eps)
        # Update the global (moving) mean and variance.
        moving_mean[:] = moving_momentum * moving_mean + (
            1.0 - moving_momentum) * mean
        moving_variance[:] = moving_momentum * moving_variance + (
            1.0 - moving_momentum) * variance
    else:
        # At test time, use the global statistics.
        X_hat = (X - moving_mean) / nd.sqrt(moving_variance + eps)

    # Scale and shift.
    return gamma.reshape(mean.shape) * X_hat + beta.reshape(mean.shape)
Example #4
def adadelta(params, sqrs, deltas, rho, batch_size):
    eps_stable = 1e-5
    for param, sqr, delta in zip(params, sqrs, deltas):
        g = param.grad / batch_size
        sqr[:] = rho * sqr + (1. - rho) * nd.square(g)
        g_next = nd.sqrt(delta + eps_stable) / nd.sqrt(sqr + eps_stable) * g
        delta[:] = rho * delta + (1. - rho) * g_next * g_next
        param[:] -= g_next
Example #5
def adadelta(params, sqrs, deltas, batch_size, rho):
    eps_stable = 1e-5
    for param, sqr, delta in zip(params, sqrs, deltas):
        g = param.grad / batch_size
        sqr[:] = rho * sqr + (1. - rho) * nd.square(g)
        cur_delta = nd.sqrt(delta + eps_stable) / nd.sqrt(sqr + eps_stable) * g
        delta[:] = rho * delta + (1. - rho) * cur_delta * cur_delta
        param[:] -= cur_delta
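A hypothetical driver for the adadelta step above, on a single toy parameter; it assumes mxnet's nd and autograd and nothing else from the surrounding code.

from mxnet import nd, autograd

w = nd.array([1.0, -2.0])
w.attach_grad()
sqr, delta = nd.zeros_like(w), nd.zeros_like(w)   # per-parameter state

with autograd.record():
    loss = (w ** 2).sum()
loss.backward()

adadelta([w], [sqr], [delta], batch_size=1, rho=0.9)
print(w)   # moved slightly toward zero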
Example #6
    def forward(self, X1, X2):
        X1 = self.bimp(X1)
        X2 = self.bimp(X2)

        X1_norm = nd.sqrt(nd.sum(X1 * X1, axis=-1) + 1e-12)
        X2_norm = nd.sqrt(nd.sum(X2 * X2, axis=-1) + 1e-12)

        distance_cos = 1 - nd.sum(X1 * X2,
                                  axis=-1) / (X1_norm * X2_norm + 1e-12)

        return distance_cos
Example #7
    def update(self, index, weight, grad, state):
        assert (isinstance(weight, NDArray))
        assert (isinstance(grad, NDArray))
        self._update_count(index)
        lr = self._get_lr(index)
        wd = self._get_wd(index)
        t = self._index_update_count[index]

        with bulk(self._bulk):
            # preprocess grad
            grad *= self.rescale_grad
            if self.clip_gradient is not None:
                grad = clip(grad, -self.clip_gradient, self.clip_gradient)

            mean, var = state
            mean *= self.beta1
            mean += (1. - self.beta1) * grad
            var *= self.beta2
            var += (1. - self.beta2) * square(grad)

            r1 = weight.norm()
            if not self.bias_correction:
                r1 = minimum(maximum(r1, self.lower_bound), self.upper_bound)
                sqrt_var = sqrt(var)
                sqrt_var += self.epsilon
                g = mean / sqrt_var
                g += wd * weight
            else:
                # apply bias correction
                mean_hat = mean / (1. - power(self.beta1, t))
                var_hat = var / (1. - power(self.beta2, t))
                if self._eps_after_sqrt:
                    sqrt(var_hat, out=var_hat)
                    var_hat += self.epsilon
                else:
                    var_hat += self.epsilon
                    sqrt(var_hat, out=var_hat)
                mean_hat /= var_hat
                mean_hat += wd * weight
                g = mean_hat

            r2 = g.norm()

            # calculate lamb_trust_ratio
            ratio = r1 / r2
            # becomes NaN if ratio == NaN or 0, otherwise 0
            nan_or_zero = 1 - ratio / ratio
            r = where(nan_or_zero, ones_like(ratio), ratio)
            lr *= r

            # update weight
            g *= lr
            weight[:] -= g
Example #8
def verify_instance_norm_rewrite(shp, eps):
    # assert len(shp) == 4 # NCHW
    assert len(shp) >= 3
    vshp = (shp[1], )
    data_np = np.random.uniform(size=shp)
    gamma_np = np.random.uniform(size=vshp)
    beta_np = np.random.uniform(size=vshp)
    x = nd.array(data_np)
    gamma = nd.array(gamma_np)
    beta = nd.array(beta_np)

    # org op
    y = nd.InstanceNorm(x, gamma=gamma, beta=beta, eps=eps)

    # rewrite op
    axis = [i for i in range(len(shp)) if i != 1]
    for i in axis:
        gamma = nd.expand_dims(gamma, axis=i)
        beta = nd.expand_dims(beta, axis=i)

    n = np.prod(shp[2:])
    mean = nd.sum(x, axis=axis, keepdims=True) / n
    dev = x - mean
    var = nd.sum(dev * dev, axis=axis, keepdims=True) / n
    std = nd.sqrt(var) + eps
    frac = dev / std
    z = frac * gamma + beta

    # compare
    assert z.shape == y.shape
    zn, zp = get_norm(z)
    yn, yp = get_norm(y)
    rn = np.linalg.norm(zp - yp)
    print(zn, yn, rn)
 def goodness_of_function_optimizer_function(self):
     for param, sqr in zip(self.__params, self.__sqrs):
         g = param.grad / self.__batch_size
         # Note: this is an assignment, not +=
         sqr[:] = self.__gamma * sqr + (1. - self.__gamma) * nd.square(g)
         div = self.__learning_rate * g / nd.sqrt(sqr + self.__eps_stable)
         param[:] -= div
Example #10
def TimeseriesFromPSD_nd(param_noise):
    """
    GPU only
    """
    (asd_pos, asd_neg, low_f, high_f, high_f_, size, fs, fmin, fmax) = param_noise
    (*D_, N) = size
    D = reduce(lambda x, y: x * y, D_)
    # Gauss noise and its one-sided PSD without window
    gauss_noise = nd.random_normal(loc=0, scale=64, shape=(D, N), ctx=ctx)
    _, xf_noise, psd_gauss = oneSidedPeriodogram_nd(gauss_noise, fs=8192)
    psd_gauss = nd.array(psd_gauss, ctx=ctx, dtype='float64')

    psd_twosided = nd.concat(
        # low positive
        nd.zeros((D, low_f), ctx=ctx, dtype='float64'),
        # high positive
        psd_gauss[:, low_f:high_f] * asd_pos,
        nd.zeros((D, high_f_), ctx=ctx, dtype='float64'),
        nd.zeros((D, high_f_), ctx=ctx, dtype='float64'),
        # high negative
        psd_gauss[:, low_f:high_f][::-1] * asd_neg,
        # low negative
        nd.zeros((D, low_f), ctx=ctx, dtype='float64'),
        dim=1)
    amplitude = nd.sqrt(psd_twosided * 2 * fs * N)
    epsilon_imag = nd.random_uniform(
        low=0, high=1, shape=(D, N), ctx=ctx, dtype='float64') * 2 * np.pi
    re = nd.cos(epsilon_imag) * amplitude
    im = nd.sin(epsilon_imag) * amplitude
    temp = nd.zeros((D, N * 2), ctx=ctx)
    temp[:, ::2] = re
    temp[:, 1::2] = im
    timeseries = mx.contrib.ndarray.ifft(temp) / N
    return timeseries.reshape(size), psd_twosided
def merge(conv_w, gamma, beta, running_mean, running_var):
    gamma_over_var = gamma / nd.sqrt(running_var + 1e-5)
    gamma_over_var_expanded = nd.reshape(gamma_over_var,
                                         (gamma_over_var.shape[0], 1, 1, 1))
    new_w = gamma_over_var_expanded * nd.cast(conv_w, 'float32')
    new_b = beta - running_mean * gamma_over_var
    return new_w, new_b
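A minimal sketch of folding BatchNorm statistics into convolution weights with the merge helper above. The shapes are illustrative assumptions (8 output channels, 3x3 kernels); only mxnet's nd is required.

from mxnet import nd

conv_w = nd.random.normal(shape=(8, 3, 3, 3))    # (out_channels, in_channels, kH, kW)
gamma, beta = nd.ones(8), nd.zeros(8)
running_mean, running_var = nd.zeros(8), nd.ones(8)

new_w, new_b = merge(conv_w, gamma, beta, running_mean, running_var)
print(new_w.shape, new_b.shape)    # (8, 3, 3, 3) (8,)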
Example #12
def adagrad(params, sqrs, lr, batch_size):
    eps_stable = 1e-7
    for param, sqr in zip(params, sqrs):
        g = param.grad / batch_size
        sqr[:] += nd.square(g)
        div = lr * g / nd.sqrt(sqr + eps_stable)
        param[:] -= div
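A hypothetical driver for the adagrad step above, on a single toy parameter, assuming mxnet's nd and autograd.

from mxnet import nd, autograd

w = nd.array([1.0, 2.0])
w.attach_grad()
sqr = nd.zeros_like(w)          # accumulated squared gradients

with autograd.record():
    loss = (w ** 2).sum()
loss.backward()

adagrad([w], [sqr], lr=0.1, batch_size=1)
print(w)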
def verify_l2normalization_rewrite(shape, eps, mode):
    assert len(shape) == 4  # NCHW
    data_np = np.random.uniform(size=shape)
    x = nd.array(data_np)

    # org op
    y = nd.L2Normalization(x, eps=eps, mode=mode)

    # rewrite op
    z = nd.broadcast_mul(x, x)
    if mode == "channel":
        axis = [1]
    elif mode == "instance":
        axis = [1, 2, 3]
    elif mode == "spatial":
        axis = [2, 3]
    else:
        assert False, "not valid `mode` type: %s" % mode
    z = nd.sum(z, axis=axis)
    eps_tensor = nd.array([eps])
    z = nd.broadcast_add(z, eps_tensor)
    z = nd.sqrt(z)
    for i in axis:
        z = nd.expand_dims(z, axis=i)
        z = nd.repeat(z, repeats=shape[i], axis=i)
    z = nd.broadcast_div(x, z)

    # compare
    assert z.shape == y.shape
    zn, zp = get_norm(z)
    yn, yp = get_norm(y)
    rn = np.linalg.norm(zp - yp)
    print(zn, yn, rn)
Example #14
def rmsprop(params, sqrs, lr, gamma, batch_size):
    eps_stable = 1e-8
    for param, sqr in zip(params, sqrs):
        g = param.grad / batch_size
        sqr[:] = gamma * sqr + (1. - gamma) * nd.square(g)
        div = lr * g / nd.sqrt(sqr + eps_stable)
        param[:] -= div
Example #15
def adagrad(params, sqrs, lr, batch_size):
    eps_stable = 1e-7
    for param, sqr in zip(params, sqrs):
        g = param.grad / batch_size
        sqr[:] = sqr + nd.square(g)
        div = lr * g / (nd.sqrt(eps_stable + sqr))
        param[:] -= div
    def update(self):
        self.state_step += 1
        for idx, data in self.trace:
            grad = data.grad

            clr = self.args.lr
            # clr = self.args.lr / (1 + (self.state_step - 1) * group['lr_decay'])

            # the update is non-linear so indices must be unique
            grad_indices = idx
            grad_values = grad

            grad_sum = (grad_values * grad_values).mean(1)
            ctx = self.state_sum.context
            if ctx != grad_indices.context:
                grad_indices = grad_indices.as_in_context(ctx)
            if ctx != grad_sum.context:
                grad_sum = grad_sum.as_in_context(ctx)
            self.state_sum[grad_indices] += grad_sum
            std = self.state_sum[grad_indices]  # _sparse_mask
            std_values = nd.expand_dims(nd.sqrt(std) + 1e-10, 1)
            if self.gpu >= 0:
                std_values = std_values.as_in_context(mx.gpu(self.args.gpu))
            tmp = -clr * grad_values / std_values
            if tmp.context != ctx:
                tmp = tmp.as_in_context(ctx)
            # TODO(zhengda) the overhead is here.
            self.emb[grad_indices] = mx.nd.take(self.emb, grad_indices) + tmp
        self.trace = []
Example #17
    def update(self, index, weight, grad, state):
        assert(isinstance(weight, NDArray))
        assert(isinstance(grad, NDArray))
        self._update_count(index)
        lr = self._get_lr(index)
        wd = self._get_wd(index)

        t = self._index_update_count[index]

        # preprocess grad
        grad = grad * self.rescale_grad + wd * weight
        if self.clip_gradient is not None:
            grad = clip(grad, -self.clip_gradient, self.clip_gradient)

        # warming momentum schedule
        momentum_t = self.beta1 * (1. - 0.5 * (pow(0.96, t * self.schedule_decay)))
        momentum_t_1 = self.beta1 * (1. - 0.5 * (pow(0.96, (t + 1) * self.schedule_decay)))
        self.m_schedule = self.m_schedule * momentum_t
        m_schedule_next = self.m_schedule * momentum_t_1

        # update m_t and v_t
        m_t, v_t = state
        m_t[:] = self.beta1 * m_t + (1. - self.beta1) * grad
        v_t[:] = self.beta2 * v_t + (1. - self.beta2) * grad * grad

        grad_prime = grad / (1. - self.m_schedule)
        m_t_prime = m_t / (1. - m_schedule_next)
        v_t_prime = v_t / (1. - pow(self.beta2, t))
        m_t_bar = (1. - momentum_t) * grad_prime + momentum_t_1 * m_t_prime

        # update weight
        weight[:] -= lr * m_t_bar / (sqrt(v_t_prime) + self.epsilon)
    def update(self, index, weight, grad, state):
        assert (isinstance(weight, NDArray))
        assert (isinstance(grad, NDArray))
        self._update_count(index)
        lr = self._get_lr(index)
        wd = self._get_wd(index)

        is_sparse = grad.stype == 'row_sparse'
        history = state

        if is_sparse:
            kwargs = {
                'epsilon': self.float_stable_eps,
                'rescale_grad': self.rescale_grad
            }
            if self.clip_gradient:
                kwargs['clip_gradient'] = self.clip_gradient
            sparse.adaalter_update(weight,
                                   grad,
                                   history,
                                   out=weight,
                                   lr=lr,
                                   wd=wd,
                                   **kwargs)
            # raise NotImplementedError('AdaAlter has not been implemented for sparse nd')
        else:
            grad = grad * self.rescale_grad
            if self.clip_gradient is not None:
                grad = clip(grad, -self.clip_gradient, self.clip_gradient)
            div = grad / sqrt(history + self.float_stable_eps)
            weight[:] += (div + weight * wd) * -lr

            history[:] += square(grad)
def verify_l2normalization_rewrite_tile(shape, eps, mode):
    assert len(shape) == 4  # NCHW
    data_np = np.random.uniform(size=shape)
    x = nd.array(data_np)

    # org op
    y = nd.L2Normalization(x, eps, mode=mode)

    # rewrite op
    z = nd.broadcast_mul(x, x)
    if mode == "channel":
        axis = [1]
    elif mode == "instance":
        axis = [1, 2, 3]
    elif mode == "spatial":
        axis = [2, 3]
    else:
        assert False, "not valid `mode` type: %s" % mode
    reps = tuple(
        [shp if i in axis else 1 for i, shp in enumerate(list(shape))])
    z = nd.sum(z, axis=axis, keepdims=True)
    eps_tensor = nd.array([eps])
    z = nd.broadcast_add(z, eps_tensor)
    z = nd.sqrt(z)
    z = nd.tile(z, reps=reps)
    z = nd.broadcast_div(x, z)

    # compare
    assert z.shape == y.shape
    zn, zp = get_norm(z)
    yn, yp = get_norm(y)
    rn = np.linalg.norm(zp - yp)
    print(zn, yn, rn)
Example #20
def grad_clipping(params, theta):
    norm = nd.array([0.0], ctx)
    for p in params:
        norm += nd.sum(p.grad**2)
    norm = nd.sqrt(norm).asscalar()
    if norm > theta:
        for p in params:
            p.grad[:] *= theta / norm
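A hypothetical driver for the grad_clipping above. Note that this variant reads a global `ctx`; here we assume the CPU context.

import mxnet as mx
from mxnet import nd, autograd

ctx = mx.cpu()                       # grad_clipping above reads this global `ctx`

w = nd.array([3.0, 4.0], ctx=ctx)
w.attach_grad()
with autograd.record():
    loss = (w ** 2).sum()
loss.backward()                      # grad = [6, 8], so the gradient norm is 10

grad_clipping([w], theta=1.0)
print(nd.sqrt(nd.sum(w.grad ** 2)))  # scaled down to norm ~1.0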
Example #21
def get_global_norm(arrays):
    ctx = arrays[0].context
    total_norm = nd.add_n(*[
        nd.dot(x, x).as_in_context(ctx)
        for x in (arr.reshape((-1, )) for arr in arrays)
    ])
    total_norm = nd.sqrt(total_norm).asscalar()
    return total_norm
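A minimal sketch for get_global_norm above, on two toy arrays, assuming only mxnet's nd.

from mxnet import nd

arrays = [nd.ones((2, 3)), nd.full((4,), 2.0)]
# Sum of squares is 6 * 1 + 4 * 4 = 22, so the global norm is sqrt(22) ~= 4.69.
print(get_global_norm(arrays))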
Example #22
    def hybrid_forward(self, F, pred, label, sample_weight=None):

        #label = _reshape_like(F, label, pred)
        #loss = F.square(pred-label)
        #loss = _apply_weighting(F, loss, self._weight/2, sample_weight)
        loss = F.sqrt(F.square(pred - label))
        #return F.mean(loss, axis=self._batch_axis, exclude=True)
        return loss
Example #23
def adam(params, vs, sqrs, lr, batch_size, t,
         beta1=0.9, beta2=0.999, eps_stable=1e-8):
    # beta1, beta2 and eps_stable were implicit globals in the original snippet;
    # they are made explicit keyword arguments here.
    for param, v, sqr in zip(params, vs, sqrs):
        v[:] = beta1 * v + (1 - beta1) * param.grad
        sqr[:] = beta2 * sqr + (1 - beta2) * param.grad * param.grad
        # Bias-correct into temporaries so the running state is not overwritten.
        v_hat = v / (1 - beta1**t)
        sqr_hat = sqr / (1 - beta2**t)
        grad = v_hat / nd.sqrt(sqr_hat + eps_stable)
        param[:] = param - (lr / batch_size) * grad
Example #24
def grad_clipping(params, theta, ctx):
    if theta is not None:
        norm = nd.array([0.0], ctx)
        for p in params:
            norm += nd.sum(p.grad * p.grad)
        norm = nd.sqrt(norm).asscalar()
        if norm > theta:
            for p in params:
                p.grad[:] *= theta / norm
Example #25
def batched_l2_dist(a, b):
    a_squared = nd.power(nd.norm(a, axis=-1), 2)
    b_squared = nd.power(nd.norm(b, axis=-1), 2)

    squared_res = nd.add(nd.linalg_gemm(
        a, nd.transpose(b, axes=(0, 2, 1)), nd.broadcast_axes(nd.expand_dims(b_squared, axis=-2), axis=1, size=a.shape[1]), alpha=-2
    ), nd.expand_dims(a_squared, axis=-1))
    res = nd.sqrt(nd.clip(squared_res, 1e-30, np.finfo(np.float32).max))
    return res
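A minimal sketch for batched_l2_dist above. The shapes are illustrative assumptions: a is (batch, n, d) and b is (batch, m, d), giving a (batch, n, m) distance matrix; numpy and mxnet's nd are assumed to be imported.

import numpy as np
from mxnet import nd

a = nd.random.normal(shape=(2, 5, 4))   # (batch, n, d)
b = nd.random.normal(shape=(2, 3, 4))   # (batch, m, d)
dist = batched_l2_dist(a, b)
print(dist.shape)                       # (2, 5, 3): pairwise distances per batch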
Example #26
def grad_clipping(params, theta, ctx):
    if theta is not None:
        norm = nd.array([0.0], ctx)
        for p in params:
            norm += nd.sum(p.grad * p.grad)
        norm = nd.sqrt(norm).asscalar()
        if norm > theta:
            for p in params:
                p.grad[:] *= theta / norm
Example #27
 def goodness_of_function_optimizer_function(self):
     for param, v, sqr in zip(self.__params, self.__vs, self.__sqrs):
         g = param.grad / self.__batch_size
         v[:] = self.__beta1 * v + (1 - self.__beta1) * g
         sqr[:] = self.__beta2 * sqr + (1 - self.__beta2) * nd.square(g)
         v_hat = v / (1 - self.__beta1**self.__t)
         sqr_hat = sqr / (1 - self.__beta2**self.__t)
         div = self.__learning_rate * v_hat / nd.sqrt(sqr_hat +
                                                      self.__eps_stable)
         param[:] -= div
Example #28
def grad_clipping(params, clipping_norm, ctx):
    """Gradient clipping."""
    if clipping_norm is not None:
        norm = nd.array([0.0], ctx)
        for p in params:
            norm += nd.sum(p.grad ** 2)
        norm = nd.sqrt(norm).asscalar()
        if norm > clipping_norm:
            for p in params:
                p.grad[:] *= clipping_norm / norm
Example #29
def gradient_clipping(parameters, threshold, ctx):
    if threshold is not None:
        norm = nd.array([0.0], ctx)

        for parameter in parameters:
            norm += nd.sum(parameter.grad ** 2)
        norm = nd.sqrt(norm).asscalar()

        if norm > threshold:
            for parameter in parameters:
                parameter.grad[:] *= (threshold / norm)
Example #30
def adam(params, lr, vals, sqrs, iter, batch_size, beta1=0.9, beta2=0.999):
    eps_stable = 1e-8
    for param, val, sqr in zip(params, vals, sqrs):
        g = param.grad / batch_size
        val[:] = beta1 * val + (1 - beta1) * g
        sqr[:] = beta2 * sqr + (1 - beta2) * nd.square(g)
        #val_next = val / (1 - nd.power(beta1, iter))
        val_next = val / (1. - beta1**iter)
        #sqr_next = sqr / (1. - nd.power(beta2, iter))
        sqr_next = sqr / (1. - beta2**iter)
        g_next = lr * val_next / (nd.sqrt(sqr_next) + eps_stable)
        param[:] -= g_next
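A hypothetical driver for the adam update above, applying the first iteration to one toy parameter; it assumes mxnet's nd and autograd.

from mxnet import nd, autograd

w = nd.array([1.0, -2.0])
w.attach_grad()
val, sqr = nd.zeros_like(w), nd.zeros_like(w)

with autograd.record():
    loss = (w ** 2).sum()
loss.backward()

adam([w], lr=0.01, vals=[val], sqrs=[sqr], iter=1, batch_size=1)
print(w)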
def _ratio_enum(anchor, ratios):
    """
    Enumerate a set of anchors for each aspect ratio wrt an anchor.
    """

    w, h, x_ctr, y_ctr = _whctrs(anchor)
    size = w * h
    size_ratios = size / ratios
    ws = nd.round(nd.sqrt(size_ratios))
    hs = nd.round(ws * ratios)
    anchors = _mkanchors(ws, hs, x_ctr, y_ctr)
    return anchors
Example #32
def grad_clipping(params, theta, ctx):
    if theta is not None:
        norm = nd.array([0.0], ctx)
        for p in params:
            norm += nd.sum(p.grad ** 2)
        norm = nd.sqrt(norm).asscalar()
        if norm > theta:
            for p in params:
                p.grad[:] *= theta / norm