def weight_normalization_backward(inputs, dim=0, eps=1e-12):
    """
    Args:
        inputs (list of nn.Variable): Incoming grads/inputs to/of the forward function.
        kwargs (dict of arguments): Dictionary of the corresponding function arguments.

    Return:
        list of Variable: Return the gradients wrt inputs of the corresponding function.
    """
    dy = inputs[0]
    w = inputs[1]
    g = inputs[2]
    g_shape = g.shape
    dim += w.ndim * (dim < 0)

    # Create inverted norm of w
    sum_axes = list(filter(lambda x: x != dim, range(w.ndim)))
    w_pow = F.pow_scalar(w, 2.0)
    w_sum = F.sum(w_pow, sum_axes, True)
    w_add = F.add_scalar(w_sum, eps)
    w_norm_inv = F.pow_scalar(w_add, -0.5)

    dyw_sum = F.sum(dy * w, sum_axes, True)

    # w.r.t. dw
    g = g.reshape([s if i == dim else 1 for i, s in enumerate(w.shape)])
    dw = (dy - dyw_sum * (w_norm_inv ** 2) * w) * g * w_norm_inv

    # w.r.t. dg
    dg = dyw_sum * w_norm_inv
    dg = dg.reshape(g_shape)

    return dw, dg

def backward_impl(self, inputs, outputs, prop_down, accum):
    # inputs: [inputs_fwd_graph] + [inputs_bwd_graph] or
    # [inputs_fwd_graph] + [outputs_fwd_graph] + [inputs_bwd_graph]

    # Inputs
    x0 = inputs[0].data
    dy = inputs[1].data
    # Outputs
    dx0 = outputs[0].data
    # Grads of inputs
    g_x0 = inputs[0].grad
    g_dy = inputs[1].grad
    # Grads of outputs
    g_dx0 = outputs[0].grad

    # Computation
    if prop_down[0]:
        if accum[0]:
            g_x0 -= g_dx0 * dy * F.pow_scalar(x0, -2.0)
        else:
            g_x0.copy_from(-g_dx0 * dy * F.pow_scalar(x0, -2.0))
    if prop_down[1]:
        inp = nn.Variable(x0.shape).apply(data=x0, grad=g_dy, need_grad=True)
        out = nn.Variable(dy.shape).apply(grad=g_dx0)
        self.forward_func.backward([inp], [out], accum=[accum[1]])

def backward_impl(self, inputs, outputs, prop_down, accum):
    # inputs: [inputs_fwd_graph] + [inputs_bwd_graph] or
    # [inputs_fwd_graph] + [outputs_fwd_graph] + [inputs_bwd_graph]

    # Args
    axis = self.forward_func.info.args["axis"]

    # Inputs
    y0 = inputs[0].data
    dz = inputs[2].data
    # Outputs
    dy0 = outputs[0].data
    # Grads of inputs
    g_y0 = inputs[0].grad
    g_dz = inputs[2].grad
    # Grads of outputs
    g_dy0 = outputs[0].grad

    # Computation
    if prop_down[0]:
        if accum[0]:
            g_y0 += g_dy0 * dz * F.pow_scalar(y0, -2.0)
        else:
            g_y0.copy_from(g_dy0 * dz * F.pow_scalar(y0, -2.0))
    if prop_down[2]:
        if accum[2]:
            g_dz -= F.sum(g_dy0 * F.pow_scalar(y0, -1.0), axis, True)
        else:
            g_dz.copy_from(-F.sum(g_dy0 * F.pow_scalar(y0, -1.0), axis, True))

def f_layer_normalization(inp, beta, gamma):
    use_axis = [x for x in range(1, inp.ndim)]
    inp = F.sub2(inp, F.mean(inp, axis=use_axis, keepdims=True))
    inp = F.div2(
        inp,
        F.pow_scalar(
            F.mean(F.pow_scalar(inp, 2), axis=use_axis, keepdims=True), 0.5))
    return inp * F.broadcast(gamma, inp.shape) + F.broadcast(beta, inp.shape)

def distance(u, v, eps=1e-5):
    uu = F.sum(F.pow_scalar(u, 2), axis=1)
    vv = F.sum(F.pow_scalar(v, 2), axis=1)
    euclid_norm_pow2 = F.sum(F.pow_scalar(u - v, 2), axis=1)
    alpha = F.maximum2(F.constant(eps, shape=uu.shape), 1.0 - uu)
    beta = F.maximum2(F.constant(eps, shape=vv.shape), 1.0 - vv)
    return F.acosh(1 + 2 * euclid_norm_pow2 / (alpha * beta))

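# Usage sketch (not from the source): `distance` above computes a Poincare-ball
# distance between row-wise embeddings, so both inputs are assumed to lie inside
# the unit ball. The shapes and values below are illustrative only.
import numpy as np
import nnabla as nn

u = nn.Variable.from_numpy_array(
    np.random.uniform(-0.1, 0.1, (4, 8)).astype(np.float32))
v = nn.Variable.from_numpy_array(
    np.random.uniform(-0.1, 0.1, (4, 8)).astype(np.float32))
d = distance(u, v)  # shape (4,): one hyperbolic distance per row
d.forward()
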
def sigmas_regularization(ctx, log_var0, log_var1):
    with nn.context_scope(ctx):
        h0 = F.exp(log_var0)
        h0 = F.pow_scalar(h0, 0.5)
        h1 = F.exp(log_var1)
        h1 = F.pow_scalar(h1, 0.5)
        r = F.mean(F.squared_error(h0, h1))
    return r

def minibatch_stddev(x, eps=1e-8):
    b, _, h, w = x.shape
    mean = F.mean(x, axis=0, keepdims=True)
    std = F.pow_scalar(
        F.mean(F.pow_scalar(F.sub2(x, F.broadcast(mean, x.shape)), 2.),
               axis=0, keepdims=True) + eps, 0.5)
    std_channel = F.broadcast(F.mean(std, keepdims=True), (b, 1, h, w))
    x = F.concatenate(x, std_channel, axis=1)
    return x

def sr_loss_with_uncertainty(ctx, pred0, pred1, log_var0, log_var1):
    var0 = F.exp(log_var0)
    var1 = F.exp(log_var1)
    s0 = F.pow_scalar(var0, 0.5)
    s1 = F.pow_scalar(var1, 0.5)
    squared_error = F.squared_error(pred0, pred1)
    with nn.context_scope(ctx):
        loss = F.log(s1 / s0) + (var0 / var1 + squared_error / var1) * 0.5
        loss_sr = F.mean(loss)
    return loss_sr

def lsgan_loss(d_fake, d_real=None, persistent=True):
    if d_real is not None:
        # Discriminator loss
        loss_d_real = F.mean(F.pow_scalar(d_real - 1., 2.))
        loss_d_fake = F.mean(F.pow_scalar(d_fake, 2.))
        loss = (loss_d_real + loss_d_fake) * 0.5
        loss.persistent = persistent
        return loss
    else:
        # Generator loss, this form leads to minimization
        loss = F.mean(F.pow_scalar(d_fake - 1., 2.))
        loss.persistent = persistent
        return loss

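# Usage sketch (not from the source): d_real and d_fake stand in for hypothetical
# discriminator outputs on real and generated batches; here they are placeholder
# Variables so the two call patterns of lsgan_loss above can be shown.
import nnabla as nn

d_real = nn.Variable((16, 1), need_grad=True)
d_fake = nn.Variable((16, 1), need_grad=True)
loss_d = lsgan_loss(d_fake, d_real)  # discriminator loss: real -> 1, fake -> 0
loss_g = lsgan_loss(d_fake)          # generator loss: fake -> 1
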
def compute_mel(self, wave):
    hp = self.hparams
    reals, imags = F.stft(wave, window_size=hp.win_length,
                          stride=hp.hop_length, fft_size=hp.n_fft)
    linear = F.pow_scalar(
        F.add2(F.pow_scalar(reals, 2), F.pow_scalar(imags, 2)), 0.5)
    mels = F.batch_matmul(self.basis, linear)
    mels = F.log(F.clip_by_value(mels, 1e-5, np.inf)).apply(need_grad=False)
    return mels

def __call__(self, gen_rgb_out):
    out = conv_layer(gen_rgb_out, inmaps=3, outmaps=self.channels[0],
                     kernel_size=1, name_scope='Discriminator/Convinitial')
    inmaps = self.channels[0]
    for i in range(1, len(self.resolutions)):
        res = out.shape[2]
        outmaps = self.channels[i]
        out = res_block(out, res=res, outmaps=outmaps, inmaps=inmaps)
        inmaps = outmaps

    # Minibatch standard deviation feature
    N, C, H, W = out.shape
    group = min(N, self.stddev_group)
    stddev_mean = F.reshape(
        out, (group, -1, self.stddev_feat, C // self.stddev_feat, H, W),
        inplace=False)
    # mean = F.mean(stddev_mean, axis=0, keepdims=True)
    mean = F.mul_scalar(F.sum(stddev_mean, axis=0, keepdims=True),
                        1.0 / stddev_mean.shape[0], inplace=False)
    stddev_mean = F.mean(F.pow_scalar(F.sub2(stddev_mean, F.broadcast(
        mean, stddev_mean.shape)), 2.), axis=0, keepdims=False)
    stddev_mean = F.pow_scalar(F.add_scalar(
        stddev_mean, 1e-8, inplace=False), 0.5, inplace=False)
    stddev_mean = F.mean(stddev_mean, axis=[2, 3, 4], keepdims=True)
    stddev_mean = F.reshape(
        stddev_mean, stddev_mean.shape[:2] + stddev_mean.shape[3:],
        inplace=False)
    out = F.concatenate(out, F.tile(stddev_mean, (group, 1, H, W)), axis=1)

    out = conv_layer(out, inmaps=out.shape[1], outmaps=self.channels[-1],
                     kernel_size=3, name_scope='Discriminator/Convfinal')
    out = F.reshape(out, (N, -1), inplace=False)

    # Linear Layers
    lrmul = 1
    scale = 1 / (out.shape[1] ** 0.5) * lrmul
    W, bias = weight_init_fn(
        (out.shape[-1], self.channels[-1]),
        weight_var='Discriminator/final_linear_1/affine')
    out = F.affine(out, W * scale, bias * lrmul)
    out = F.mul_scalar(F.leaky_relu(
        out, alpha=0.2, inplace=False), np.sqrt(2), inplace=False)

    scale = 1 / (out.shape[1] ** 0.5) * lrmul
    W, bias = weight_init_fn(
        (out.shape[-1], 1),
        weight_var='Discriminator/final_linear_2/affine')
    out = F.affine(out, W * scale, bias * lrmul)
    return out

def sigma_regularization(ctx, log_var, one):
    with nn.context_scope(ctx):
        h = F.exp(log_var)
        h = F.pow_scalar(h, 0.5)
        h = F.mean(h, axis=1)
        r = F.mean(F.squared_error(h, one))
    return r

def sample_noise(inpt_size, out_size):
    _f = lambda x: F.sign(x) * F.pow_scalar(F.abs(x), 0.5)
    noise = _f(F.randn(shape=(inpt_size + out_size, )))
    eps_w = F.batch_matmul(F.reshape(noise[:inpt_size], (1, -1)),
                           F.reshape(noise[inpt_size:], (1, -1)), True)
    eps_b = noise[inpt_size:]
    return eps_w, eps_b

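# Usage sketch (not from the source): the helper above draws NoisyNet-style factorized
# Gaussian noise. For the hypothetical layer sizes below, eps_w is the outer product of
# the two transformed noise segments and eps_b is the output-side segment.
eps_w, eps_b = sample_noise(inpt_size=128, out_size=64)
# eps_w.shape == (128, 64), eps_b.shape == (64,)
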
def norm_backward(inputs, p=None, axes=None, keep_dims=False):
    """
    Args:
        inputs (list of nn.Variable): Incoming grads/inputs to/of the forward function.
        kwargs (dict of arguments): Dictionary of the corresponding function arguments.

    Return:
        list of Variable: Return the gradients wrt inputs of the corresponding function.
    """
    dy = inputs[0]
    x0 = inputs[1]

    if p is None:
        p = 2.0
    axes = list(range(x0.ndim)) if axes is None else force_list(axes)

    x_abs = F.abs(x0)
    x_pow = F.pow_scalar(x_abs, p)
    x_sum = F.sum(x_pow, axes, keepdims=True)

    # Add axis for mul2
    if not keep_dims:
        shape = list(x0.shape)
        for a in axes:
            shape[a] = 1
        dy = dy.reshape(shape)

    x_sign = no_grad(F.sign(x0))
    dx = dy * x_sum ** (1. / p - 1.) * x_abs ** (p - 1.) * x_sign

    return dx

def ce_loss_with_uncertainty(ctx, pred, y_l, log_var):
    r = F.randn(0., 1., log_var.shape)
    r = F.pow_scalar(F.exp(log_var), 0.5) * r
    h = pred + r
    with nn.context_scope(ctx):
        loss_ce = F.mean(F.softmax_cross_entropy(h, y_l))
    return loss_ce

def sigma_regularization(ctx, log_var, one):
    with nn.context_scope(ctx):
        h = F.exp(log_var)
        h = F.pow_scalar(h, 0.5)
        b = log_var.shape[0]
        r = F.sum(F.squared_error(h, one)) / b
    return r

def kl_divergence(ctx, pred, label, log_var):
    with nn.context_scope(ctx):
        s = F.pow_scalar(F.exp(log_var), 0.5)
        elms = softmax_with_temperature(ctx, label, s) \
            * F.log(F.softmax(pred, axis=1))
        loss = -F.mean(F.sum(elms, axis=1))
    return loss

def norm_normalization_backward(inputs, p=None, axes=None, eps=1e-12):
    """
    Args:
        inputs (list of nn.Variable): Incoming grads/inputs to/of the forward function.
        kwargs (dict of arguments): Dictionary of the corresponding function arguments.

    Return:
        list of Variable: Return the gradients wrt inputs of the corresponding function.
    """
    dy = inputs[0]
    x0 = inputs[1]

    if p is None:
        p = 2.0
    axes = list(range(x0.ndim)) if axes is None else force_list(axes)

    x_abs = F.abs(x0)
    x_pow = F.pow_scalar(x_abs, p)
    x_sum = F.sum(x_pow, axes, keepdims=True)
    # x_norm = x_sum ** (1./p)

    # Div2 backward
    dx = dy * x_sum ** (-1. / p)
    dx_norm = -dy * x0 * x_sum ** (-2. / p)
    dx_norm = sum_for_arithmetics(dx_norm, x_sum)

    # Norm backward
    x_sign = no_grad(F.sign(x0))
    dx += dx_norm * x_sum ** (1. / p - 1.) * x_abs ** (p - 1.) * x_sign

    return dx

def lsgan_loss(real, weight, fake=None):
    if fake is not None:
        loss = weight * F.mean(
            F.squared_error(F.constant(1, real.shape), real)
            + F.pow_scalar(fake, 2))
    else:
        loss = weight * \
            F.mean(F.squared_error(F.constant(1, real.shape), real))
    return loss

def lsgan_loss(feat, target_is_real=True, persistent=True):
    if target_is_real:
        label = F.constant(1, shape=feat.shape)
    else:
        label = F.constant(0, shape=feat.shape)
    loss = F.mean(F.pow_scalar(feat - label, 2.0))
    loss.persistent = persistent
    return loss

def regularize_noise(self, noises):
    loss = 0
    for noise in noises:
        size = noise.shape[2]
        while True:
            loss = (loss
                    + F.pow_scalar(
                        F.mean(noise * F.shift(
                            noise, shifts=(0, 0, 0, 1),
                            border_mode='reflect')), 2)
                    + F.pow_scalar(
                        F.mean(noise * F.shift(
                            noise, shifts=(0, 0, 1, 0),
                            border_mode='reflect')), 2))
            if size <= 8:
                break
            noise = F.reshape(noise, [-1, 1, size // 2, 2, size // 2, 2])
            noise = F.mean(noise, [3, 5])
            size //= 2
    return loss

def spectral_normalization_for_affine(w, itr=1, eps=1e-12, input_axis=1,
                                      test=False):
    W_sn = get_parameter_or_create("W_sn", w.shape, ConstantInitializer(0),
                                   False)
    if test:
        return W_sn

    d0 = np.prod(w.shape[0:-1])  # In
    d1 = np.prod(w.shape[-1])    # Out
    u0 = get_parameter_or_create("singular-vector", [d1], NormalInitializer(),
                                 False)
    u = F.reshape(u0, [d1, 1])
    # Power method
    for _ in range(itr):
        # v
        v = F.affine(w, u)
        v = F.div2(
            v,
            F.pow_scalar(F.sum(F.pow_scalar(v, 2.), keepdims=True) + eps,
                         0.5))
        v = F.reshape(v, [1, d0])
        # u
        u = F.affine(v, w)
        u = F.div2(
            u,
            F.pow_scalar(F.sum(F.pow_scalar(u, 2.), keepdims=True) + eps,
                         0.5))
        u = F.reshape(u, [d1, 1])
    # Iterate
    u = F.identity(u, outputs=[u0.data])
    u.persistent = True
    # No grad
    u.need_grad = False
    v.need_grad = False
    # Spectral normalization
    wv = F.affine(v, w)
    sigma = F.affine(wv, u)
    sigma = F.broadcast(F.reshape(sigma, [1 for _ in range(len(w.shape))]),
                        w.shape)
    w_sn = F.div2(w, sigma, outputs=[W_sn.data])
    w_sn.persistent = True
    return w_sn

def gen_path_regularize(fake_img, latents, mean_path_length, decay=0.01,
                        pl_weight=2.0):
    noise = F.randn(shape=fake_img.shape) / \
        np.sqrt(fake_img.shape[2] * fake_img.shape[3])
    gradient = nn.grad([F.sum(fake_img * noise)], [latents])[0]
    path_lengths = F.mean(F.sum(F.pow_scalar(gradient, 2), axis=1), axis=0)
    path_lengths = F.pow_scalar(path_lengths, 0.5)
    path_mean = mean_path_length + decay * \
        (F.mean(path_lengths) - mean_path_length)
    path_penalty = F.mean(
        F.pow_scalar(path_lengths - F.reshape(path_mean, (1, ),
                                              inplace=False), 2))
    return path_penalty * pl_weight, path_mean, path_lengths

def spectral_normalization_for_conv(w, itr=1, eps=1e-12, test=False):
    w_shape = w.shape
    W_sn = get_parameter_or_create("W_sn", w_shape, ConstantInitializer(0),
                                   False)
    if test:
        return W_sn

    d0 = w.shape[0]            # Out
    d1 = np.prod(w.shape[1:])  # In
    w = F.reshape(w, [d0, d1], inplace=False)
    u0 = get_parameter_or_create("singular-vector", [d0], NormalInitializer(),
                                 False)
    u = F.reshape(u0, [1, d0])
    # Power method
    for _ in range(itr):
        # v
        v = F.affine(u, w)
        v = F.div2(
            v,
            F.pow_scalar(F.sum(F.pow_scalar(v, 2.), keepdims=True) + eps,
                         0.5))
        v = F.reshape(v, [d1, 1])
        # u
        u = F.affine(w, v)
        u = F.div2(
            u,
            F.pow_scalar(F.sum(F.pow_scalar(u, 2.), keepdims=True) + eps,
                         0.5))
        u = F.reshape(u, [1, d0])
    # Iterate
    u = F.identity(u, outputs=[u0.data])
    u.persistent = True
    # No grad
    u.need_grad = False
    v.need_grad = False
    # Spectral normalization
    wv = F.affine(w, v)
    sigma = F.affine(u, wv)
    w_sn = F.div2(w, sigma)
    w_sn = F.reshape(w_sn, w_shape)
    w_sn = F.identity(w_sn, outputs=[W_sn.data])
    w_sn.persistent = True
    return w_sn

def __init__(self, waveglow, hp):
    mel_input = F.constant(shape=[1, hp.n_mels, 88])
    wave = waveglow.infer(mel_input, sigma=0)
    real, imag = F.stft(wave, window_size=hp.win_length,
                        stride=hp.hop_length, fft_size=hp.n_fft)
    bias_spec = F.pow_scalar(real ** 2 + imag ** 2, 0.5)
    bias_spec.forward(clear_buffer=True)
    self.bias_spec = bias_spec.d.copy()[:, :, 0][0, :, None]
    self.hparams = hp

def backward_impl(self, inputs, outputs, prop_down, accum):
    # inputs: [inputs_fwd_graph] + [inputs_bwd_graph] or
    # [inputs_fwd_graph] + [outputs_fwd_graph] + [inputs_bwd_graph]

    # Inputs
    x0 = inputs[0].data
    x1 = inputs[1].data
    dy = inputs[2].data
    # Outputs
    dx0 = outputs[0].data
    dx1 = outputs[1].data
    # Grads of inputs
    g_x0 = inputs[0].grad
    g_x1 = inputs[1].grad
    g_dy = inputs[2].grad
    # Grads of outputs
    g_dx0 = outputs[0].grad
    g_dx1 = outputs[1].grad

    # Computation
    x1_inv_square = F.pow_scalar(x1, -2.0)
    if prop_down[0]:
        if accum[0]:
            g_x0 -= g_dx1 * dy * x1_inv_square
        else:
            g_x0.copy_from(-g_dx1 * dy * x1_inv_square)
    if prop_down[1]:
        if accum[1]:
            g_x1 += dy * (g_dx1 * 2 * x0 * F.pow_scalar(x1, -3.0)
                          - g_dx0 * x1_inv_square)
        else:
            g_x1.copy_from(dy * (2 * g_dx1 * x0 * F.pow_scalar(x1, -3.0)
                                 - g_dx0 * x1_inv_square))
    if prop_down[2]:
        if accum[2]:
            g_dy += g_dx0 / x1 - g_dx1 * x0 * x1_inv_square
        else:
            g_dy.copy_from(g_dx0 / x1 - g_dx1 * x0 * x1_inv_square)

def _focal_loss(pred, gt):
    '''Modified focal loss. Exactly the same as CornerNet.
    Modified for more stability by using the log_sigmoid function.

    Arguments:
        pred (batch x c x h x w): logit (must be values before sigmoid activation)
        gt (batch x c x h x w)
    '''
    alpha = 2
    beta = 4

    pos_inds = F.greater_equal_scalar(gt, 1)
    neg_inds = 1 - pos_inds
    neg_weights = F.pow_scalar(1.0 - gt, beta)

    prob_pred = F.sigmoid(pred)
    pos_loss = F.log_sigmoid(pred) * F.pow_scalar(1.0 - prob_pred,
                                                  alpha) * pos_inds
    pos_loss = F.sum(pos_loss)
    neg_loss = F.log_sigmoid(-pred) * F.pow_scalar(
        prob_pred, alpha) * neg_weights * neg_inds
    neg_loss = F.sum(neg_loss)

    num_pos = F.maximum_scalar(F.sum(pos_inds), 1)
    loss = -(1 / num_pos) * (pos_loss + neg_loss)
    return loss

def spectrogram(wave, window_size):
    """Computes the spectrogram from the waveform.

    Args:
        wave (nn.Variable): Input waveform of shape (B, 1, L).
        window_size (int): Window size.

    Returns:
        nn.Variable: The magnitude spectrogram.
    """
    re, im = stft(wave, window_size=window_size, stride=window_size // 4,
                  fft_size=window_size)
    return F.pow_scalar(re ** 2 + im ** 2, 0.5)

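# Usage sketch (not from the source): dummy one-second waveforms, shaped (B, 1, L)
# as the docstring above expects, turned into magnitude spectrograms with an assumed
# 1024-sample window (hop = window_size // 4).
import numpy as np
import nnabla as nn

wave = nn.Variable.from_numpy_array(
    np.random.randn(2, 1, 16000).astype(np.float32))
spec = spectrogram(wave, window_size=1024)  # sqrt(re^2 + im^2)
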
def IN(inp, axes=[1], decay_rate=0.9, eps=1e-5, fix_parameters=True):
    """Instance Normalization
    """
    if inp.shape[0] == 1:
        return INByBatchNorm(inp, axes, decay_rate, eps, fix_parameters)

    b, c = inp.shape[0:2]
    spatial_shape = inp.shape[2:]
    shape_stat = [1 for _ in inp.shape]
    shape_stat[axes[0]] = inp.shape[axes[0]]

    beta = get_parameter_or_create("beta", shape_stat, ConstantInitializer(0),
                                   not fix_parameters)
    gamma = get_parameter_or_create("gamma", shape_stat,
                                    ConstantInitializer(1),
                                    not fix_parameters)

    # Instance normalization: normalize over the spatial dimensions
    axis = [i for i in range(len(inp.shape)) if i > 1]
    n = np.prod(spatial_shape)
    mean = F.sum(inp, axis=axis, keepdims=True) / n
    var = F.sum(F.pow_scalar(inp - mean, 2.0), axis=axis, keepdims=True) / n
    h = (inp - mean) / F.pow_scalar(var + eps, 0.5)
    return gamma * h + beta

def gradient_clipping(params, max_norm, norm_type=2):
    params = list(filter(lambda p: p.need_grad, params))
    norm_type = float(norm_type)

    if norm_type == float('inf'):
        total_norm = max(np.abs(p.g).max() for p in params)
    else:
        total_norm = 0.
        for p in params:
            param_norm = F.pow_scalar(F.sum(p.grad ** norm_type),
                                      1. / norm_type)
            total_norm += param_norm ** norm_type
        total_norm = (total_norm ** (1. / norm_type)).data

    clip_coeff = max_norm / (float(total_norm) + 1e-6)
    if clip_coeff < 1:
        for p in params:
            p.g = p.g * clip_coeff

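# Usage sketch (not from the source): clip the global gradient norm between backward()
# and the solver update. The tiny graph, parameter scope name and solver below are
# placeholders, not part of the original code.
import numpy as np
import nnabla as nn
import nnabla.functions as F
import nnabla.parametric_functions as PF
import nnabla.solvers as S

x = nn.Variable.from_numpy_array(np.random.randn(8, 16).astype(np.float32))
loss = F.mean(F.pow_scalar(PF.affine(x, 1, name='clip_demo'), 2.0))
solver = S.Sgd(lr=0.1)
solver.set_parameters(nn.get_parameters())

loss.forward()
solver.zero_grad()
loss.backward(clear_buffer=True)
gradient_clipping(list(nn.get_parameters().values()), max_norm=5.0)
solver.update()
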
def __pow__(self, other):
    """
    Element-wise power function.

    Implements the power operator expression ``A ** B``, together with
    :func:`~nnabla.variable.__rpow__`.

    When a scalar is specified for ``other``, this function performs an
    element-wise operation for all elements in ``self``.

    Args:
        other (float or ~nnabla.Variable): Internally calling
            :func:`~nnabla.functions.pow2` or
            :func:`~nnabla.functions.pow_scalar` according to the type.

    Returns: :class:`nnabla.Variable`
    """
    import nnabla.functions as F
    if isinstance(other, Variable):
        return F.pow2(self, other)
    return F.pow_scalar(self, other)

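# Illustrative sketch (not from the source) of the dispatch described above: a scalar
# exponent routes to F.pow_scalar, a Variable exponent routes to F.pow2.
import nnabla as nn

x = nn.Variable((2, 3))
y = nn.Variable((2, 3))
z_scalar = x ** 2.0      # -> F.pow_scalar(x, 2.0)
z_elementwise = x ** y   # -> F.pow2(x, y)
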
def sigma_regularization(ctx, log_var, one):
    with nn.context_scope(ctx):
        h = F.exp(log_var)
        h = F.pow_scalar(h, 0.5)
        r = F.mean(F.abs(h - one))
    return r
