def test_clip_by_value_forward(seed, shape, dtype):
    def convert(value):
        converter = dtype if dtype in (float, np.array) else dtype.from_numpy_array
        return converter(value)

    rng = np.random.RandomState(seed)
    x_data = rng.randn(*shape)
    x = nn.Variable.from_numpy_array(x_data)
    if dtype is float:
        min_data = rng.randn()
        max_data = rng.randn()
    else:
        min_data = rng.randn(*shape)
        max_data = rng.randn(*shape)
    min_ = convert(min_data)
    max_ = convert(max_data)
    if dtype is not np.array:
        with nn.auto_forward(True):
            y = F.clip_by_value(x, min_, max_)
        y_ref = ref_clip_by_value(x_data, min_data, max_data)
        if dtype in (nn.Variable, float):
            assert_allclose(y.d, y_ref)
        elif dtype is nn.NdArray:
            assert_allclose(y.data, y_ref)
    else:
        with pytest.raises(TypeError):
            y = F.clip_by_value(x, min_data, max_data)

def parametric_fixed_point_quantize_b_xmax(x, sign=True,
                                           n_init=8, n_min=2, n_max=16,
                                           xmax_init=1, xmax_min=0.001, xmax_max=10,
                                           fix_parameters=False):
    """Parametric version of `fixed_point_quantize` where the bitwidth `b`
    and dynamic range `xmax` are learnable parameters.

    Returns:
        ~nnabla.Variable: N-D array.
    """
    def clip_scalar(v, min_value, max_value):
        return F.minimum_scalar(F.maximum_scalar(v, min_value), max_value)

    def broadcast_scalar(v, shape):
        return F.broadcast(F.reshape(v, (1,) * len(shape), inplace=False), shape=shape)

    def quantize_pow2(v):
        return 2 ** F.round(F.log(v) / np.log(2.))

    n = get_parameter_or_create("n", (),
                                ConstantInitializer(n_init),
                                need_grad=True,
                                as_need_grad=not fix_parameters)
    xmax = get_parameter_or_create("xmax", (),
                                   ConstantInitializer(xmax_init),
                                   need_grad=True,
                                   as_need_grad=not fix_parameters)

    # ensure that bitwidth is in specified range and an integer
    n = F.round(clip_scalar(n, n_min, n_max))
    if sign:
        n = n - 1

    # ensure that dynamic range is in specified range
    xmax = clip_scalar(xmax, xmax_min, xmax_max)

    # compute step size from dynamic range and make sure that it is a pow2
    d = quantize_pow2(xmax / (2 ** n - 1))

    # compute min/max value that we can represent
    if sign:
        xmin = -xmax
    else:
        xmin = nn.Variable((1, ), need_grad=False)
        xmin.d = 0.

    # broadcast variables to correct size
    d = broadcast_scalar(d, shape=x.shape)
    xmin = broadcast_scalar(xmin, shape=x.shape)
    xmax = broadcast_scalar(xmax, shape=x.shape)

    # apply fixed-point quantization
    return d * F.round(F.clip_by_value(x, xmin, xmax) / d)

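# Usage sketch (assumption, not from the source): the quantizer is typically wrapped in a
# parameter scope per tensor so that its learnable "n" and "xmax" parameters are created
# per layer. The scope name "conv1/Wquant" below is purely illustrative.
import numpy as np
import nnabla as nn

w = nn.Variable.from_numpy_array(np.random.randn(64, 3, 3, 3).astype(np.float32))
with nn.parameter_scope("conv1/Wquant"):
    w_q = parametric_fixed_point_quantize_b_xmax(w, sign=True, n_init=8, xmax_init=1.0)
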
def policy_network(obs, action_size, name):
    with nn.parameter_scope(name):
        out = PF.affine(obs, 256, name='fc1')
        out = F.relu(out)
        out = PF.affine(out, 256, name='fc2')
        out = F.relu(out)
        mean = PF.affine(out, action_size, name='mean')
        logstd = PF.affine(out, action_size, name='logstd')
        clipped_logstd = F.clip_by_value(logstd, -20, 2)
        return Normal(mean, F.exp(clipped_logstd))

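# Usage sketch (assumption): how such a policy head is typically consumed in a SAC-style
# agent. Sizes are illustrative, and Normal is assumed to expose a .sample() method.
obs = nn.Variable((32, 11))                           # (batch, observation dim)
dist = policy_network(obs, action_size=3, name='pi')
action = F.tanh(dist.sample())                        # squash the Gaussian sample into [-1, 1]
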
def mask_weight(a, b):
    # differs substantially from the definition in the paper
    merged_mask = F.concatenate(a, b, axis=1)
    summed_mask = F.sum((merged_mask + 1) / 2, axis=1, keepdims=True)
    clipped = F.clip_by_value(summed_mask,
                              F.constant(0, shape=summed_mask.shape),
                              F.constant(1, shape=summed_mask.shape))
    z = clipped * 2 - 1
    mask = (1 - z) / 2
    return mask

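# Algebraically (assuming single-channel masks), mask = (1 - z) / 2
#   = 1 - clip((a + 1) / 2 + (b + 1) / 2, 0, 1),
# i.e. the output is 1 only where both inputs are -1 and 0 wherever either input is +1.
# A quick NumPy check of that identity (illustrative only):
import numpy as np

a = np.array([-1., -1.,  1.,  1.])
b = np.array([-1.,  1., -1.,  1.])
mask = 1 - np.clip((a + 1) / 2 + (b + 1) / 2, 0, 1)
print(mask)   # [1. 0. 0. 0.] -> 1 only where both masks are -1
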
def test_clip_by_value_forward(seed, shape):
    rng = np.random.RandomState(seed)
    x_data = rng.randn(*shape)
    min_data = rng.randn(*shape)
    max_data = rng.randn(*shape)
    x = nn.Variable.from_numpy_array(x_data)
    min_ = nn.Variable.from_numpy_array(min_data)
    max_ = nn.Variable.from_numpy_array(max_data)
    with nn.auto_forward(True):
        y = F.clip_by_value(x, min_, max_)
    y_ref = ref_clip_by_value(x_data, min_data, max_data)
    assert_allclose(y.d, y_ref)

def compute_mel(self, wave):
    hp = self.hparams
    reals, imags = F.stft(wave, window_size=hp.win_length,
                          stride=hp.hop_length, fft_size=hp.n_fft)
    linear = F.pow_scalar(
        F.add2(F.pow_scalar(reals, 2), F.pow_scalar(imags, 2)), 0.5)
    mels = F.batch_matmul(self.basis, linear)
    mels = F.log(F.clip_by_value(mels, 1e-5, np.inf)).apply(need_grad=False)
    return mels

def build_train_graph(self, x, t=None, dropout=0, noise=None, loss_scaling=None):
    B, C, H, W = x.shape
    if self.randflip:
        x = F.random_flip(x)
        assert x.shape == (B, C, H, W)
    if t is None:
        t = F.randint(low=0, high=self.diffusion.num_timesteps, shape=(B, ))
        # F.randint could return high with very low prob. Workaround to avoid this.
        t = F.clip_by_value(t, min=0, max=self.diffusion.num_timesteps - 0.5)

    loss_dict = self.diffusion.train_loss(model=partial(self._denoise, dropout=dropout),
                                          x_start=x, t=t, noise=noise)
    assert isinstance(loss_dict, AttrDict)

    # setup training loss
    loss_dict.batched_loss = loss_dict.mse
    if is_learn_sigma(self.model_var_type):
        assert "vlb" in loss_dict
        loss_dict.batched_loss += loss_dict.vlb * 1e-3
        # todo: implement loss aware sampler

    if loss_scaling is not None and loss_scaling > 1:
        loss_dict.batched_loss *= loss_scaling

    # setup flat training loss
    loss_dict.loss = F.mean(loss_dict.batched_loss)

    assert loss_dict.batched_loss.shape == t.shape == (B, )

    # Keep interval values to compute loss for each quantile
    t.persistent = True
    for v in loss_dict.values():
        v.persistent = True

    return loss_dict, t

def clip_quant_grads():
    ps = nn.get_parameters(grad_only=False)
    for p in ps:
        if (p.endswith("quantized_conv/W") or p.endswith("quantized_conv/b")
                or p.endswith("quantized_affine/W") or p.endswith("quantized_affine/b")):
            if cfg.w_quantize == 'parametric_fp_d_xmax':
                d = ps[p + "quant/" + cfg.w_quantize + "/d"]
                xmax = ps[p + "quant/" + cfg.w_quantize + "/xmax"]
                d.grad = F.clip_by_value(d.grad, -d.data, d.data)
                xmax.grad = F.clip_by_value(xmax.grad, -d.data, d.data)
            elif cfg.w_quantize == 'parametric_pow2_xmin_xmax':
                xmin = ps[p + "quant/" + cfg.w_quantize + "/xmin"]
                xmax = ps[p + "quant/" + cfg.w_quantize + "/xmax"]
                xmin.grad = F.clip_by_value(xmin.grad, -xmin.data, xmin.data)
                xmax.grad = F.clip_by_value(xmax.grad, -xmin.data, xmin.data)
        if 'Asize' in p.split('/'):
            if cfg.a_quantize == 'parametric_fp_d_xmax_relu':
                d = ps[p.replace(
                    "/Asize", "/Aquant/" + cfg.a_quantize.replace("_relu", "") + "/d")]
                xmax = ps[p.replace(
                    "/Asize", "/Aquant/" + cfg.a_quantize.replace("_relu", "") + "/xmax")]
                d.grad = F.clip_by_value(d.grad, -d.data, d.data)
                xmax.grad = F.clip_by_value(xmax.grad, -d.data, d.data)
            elif cfg.a_quantize == 'parametric_pow2_xmin_xmax_relu':
                xmin = ps[p.replace(
                    "/Asize", "/Aquant/" + cfg.a_quantize.replace("_relu", "") + "/xmin")]
                xmax = ps[p.replace(
                    "/Asize", "/Aquant/" + cfg.a_quantize.replace("_relu", "") + "/xmax")]
                xmin.grad = F.clip_by_value(xmin.grad, -xmin.data, xmin.data)
                xmax.grad = F.clip_by_value(xmax.grad, -xmin.data, xmin.data)

def compute_mel(wave, basis, hp):
    r"""Compute the mel-spectrogram from the waveform.

    Args:
        wave (nn.Variable): Waveform variable of shape (B, 1, L).
        basis (nn.Variable): Basis for mel-spectrogram computation.
        hp (HParams): Hyper-parameters.

    Returns:
        nn.Variable: Output variable.
    """
    reals, imags = stft(wave, window_size=hp.win_length,
                        stride=hp.hop_length, fft_size=hp.n_fft)
    linear = (reals**2 + imags**2)**0.5
    mels = F.batch_matmul(basis, linear)
    mels = F.log(F.clip_by_value(mels, 1e-5, np.inf))
    return mels

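# Call sketch (assumptions, not from the source): an 80-band librosa mel filter bank and
# SimpleNamespace hyper-parameters stand in for the real `basis` and `hp`; any basis of
# shape (1, n_mels, n_fft // 2 + 1) would do.
from types import SimpleNamespace
import librosa
import numpy as np
import nnabla as nn

hp = SimpleNamespace(win_length=1024, hop_length=256, n_fft=1024)
mel = librosa.filters.mel(sr=22050, n_fft=hp.n_fft, n_mels=80)     # (80, 513)
basis = nn.Variable.from_numpy_array(mel[np.newaxis])               # (1, 80, 513)
wave = nn.Variable((4, 1, 22050))                                   # (B, 1, L)
mels = compute_mel(wave, basis, hp)                                 # log-mel, floored at 1e-5
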
def __call__(self, batch_size, style_noises, truncation_psi=1.0, return_latent=False,
             mixing_layer_index=None, dlatent_avg_beta=0.995):
    with nn.parameter_scope(self.global_scope):
        # normalize noise inputs
        for i in range(len(style_noises)):
            style_noises[i] = F.div2(
                style_noises[i],
                F.pow_scalar(F.add_scalar(F.mean(style_noises[i]**2., axis=1, keepdims=True),
                                          1e-8, inplace=False),
                             0.5, inplace=False))

        # get latent code
        w = [mapping_network(style_noises[0], outmaps=self.mapping_network_dim,
                             num_layers=self.mapping_network_num_layers)]
        w += [mapping_network(style_noises[1], outmaps=self.mapping_network_dim,
                              num_layers=self.mapping_network_num_layers)]

        dlatent_avg = nn.parameter.get_parameter_or_create(
            name="dlatent_avg", shape=(1, 512))

        # Moving average update of dlatent_avg
        batch_avg = F.mean((w[0] + w[1]) * 0.5, axis=0, keepdims=True)
        update_op = F.assign(dlatent_avg, lerp(batch_avg, dlatent_avg, dlatent_avg_beta))
        update_op.name = 'dlatent_avg_update'
        dlatent_avg = F.identity(dlatent_avg) + 0 * update_op

        # truncation trick
        w = [lerp(dlatent_avg, _, truncation_psi) for _ in w]

        # generate output from generator
        constant_bc = nn.parameter.get_parameter_or_create(
            name="G_synthesis/4x4/Const/const", shape=(1, 512, 4, 4),
            initializer=np.random.randn(1, 512, 4, 4).astype(np.float32))
        constant_bc = F.broadcast(constant_bc, (batch_size,) + constant_bc.shape[1:])

        if mixing_layer_index is None:
            mixing_layer_index_var = F.randint(1, len(self.resolutions) * 2, (1,))
        else:
            mixing_layer_index_var = F.constant(val=mixing_layer_index, shape=(1,))
        mixing_switch_var = F.clip_by_value(
            F.arange(0, len(self.resolutions) * 2) - mixing_layer_index_var, 0, 1)
        mixing_switch_var_re = F.reshape(
            mixing_switch_var, (1, mixing_switch_var.shape[0], 1), inplace=False)
        w0 = F.reshape(w[0], (batch_size, 1, w[0].shape[1]), inplace=False)
        w1 = F.reshape(w[1], (batch_size, 1, w[0].shape[1]), inplace=False)
        w_mixed = w0 * mixing_switch_var_re + \
            w1 * (1 - mixing_switch_var_re)

        rgb_output = self.synthesis(w_mixed, constant_bc)

        if return_latent:
            return rgb_output, w_mixed
        else:
            return rgb_output

def p_mean_var(self, model, x_t, t, clip_denoised=True):
    """
    Compute mean and var of p(x_{t-1}|x_t) from model.

    Args:
        model (Callable): A callable that takes x_t and t and predicts noise (and more).
        x_t (nn.Variable): The (B, C, ...) tensor at timestep t (x_t).
        t (nn.Variable): A 1-D tensor of timesteps. The first axis represents batchsize.
        clip_denoised (bool): If True, clip the denoised signal into [-1, 1].

    Returns:
        An AttrDict containing the following items:
            "mean": the mean predicted by model.
            "var": the variance predicted by model (or pre-defined variance).
            "log_var": the log of "var".
            "xstart": the x_0 predicted from x_t and t by model.
    """
    B, C, H, W = x_t.shape
    assert t.shape == (B, )
    pred = model(x_t, t)

    if self.model_var_type == ModelVarType.LEARNED_RANGE:
        assert pred.shape == (B, 2 * C, H, W)
        pred_noise, pred_var_coeff = chunk(pred, num_chunk=2, axis=1)

        min_log = self._extract(self.posterior_log_var_clipped, t, x_t.shape)
        max_log = F.log(self._extract(self.betas, t, x_t.shape))

        # pred_var_coeff should be [0, 1]
        v = F.sigmoid(pred_var_coeff)
        model_log_var = v * max_log + (1 - v) * min_log
        model_var = F.exp(model_log_var)
    else:
        # Model only predicts noise
        pred_noise = pred

        model_log_var, model_var = {
            ModelVarType.FIXED_LARGE: lambda: (
                self._extract(self.log_betas_clipped, t, x_t.shape),
                self._extract(self.betas_clipped, t, x_t.shape)
            ),
            ModelVarType.FIXED_SMALL: lambda: (
                self._extract(self.posterior_log_var_clipped, t, x_t.shape),
                self._extract(self.posterior_var, t, x_t.shape)
            )
        }[self.model_var_type]()

    x_recon = self.predict_xstart_from_noise(x_t=x_t, t=t, noise=pred_noise)

    if clip_denoised:
        x_recon = F.clip_by_value(x_recon, -1, 1)

    model_mean, _, _ = self.q_posterior(x_start=x_recon, x_t=x_t, t=t)

    assert model_mean.shape == x_recon.shape == x_t.shape
    assert model_mean.shape == model_var.shape == model_log_var.shape or \
        (model_mean.shape[0] == model_var.shape[0] == model_log_var.shape[0]
         and model_var.shape[1:] == (1, 1, 1) and model_log_var.shape[1:] == (1, 1, 1))

    # returns
    ret = AttrDict()
    ret.mean = model_mean
    ret.var = model_var
    ret.log_var = model_log_var
    ret.xstart = x_recon
    return ret

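# Context sketch (assumption, within the same class): how a single reverse-diffusion
# (ancestral sampling) step typically consumes this output, using the standard DDPM
# reparameterization. The "no noise at t == 0" masking is omitted for brevity and would
# be needed in practice.
out = self.p_mean_var(model, x_t, t, clip_denoised=True)
noise = F.randn(shape=x_t.shape)
# handles both full-shape and (B, 1, 1, 1) variances by broadcasting the log variance
sigma = F.exp(0.5 * F.broadcast(out.log_var, shape=x_t.shape))
x_prev = out.mean + sigma * noise
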
with nn.parameter_scope('central_bias'):
    central_bias = PF.embed(x_central, vocab_size, 1)
with nn.parameter_scope('context_bias'):
    context_bias = PF.embed(x_context, vocab_size, 1)

dot_product = F.reshape(
    F.batch_matmul(
        F.reshape(central_embedding, shape=(batch_size, 1, embedding_size)),
        F.reshape(context_embedding, shape=(batch_size, embedding_size, 1))),
    shape=(batch_size, 1))

prediction = dot_product + central_bias + context_bias

t = nn.Variable((batch_size, 1))
zero = F.constant(0, shape=(batch_size, 1))
one = F.constant(1, shape=(batch_size, 1))
weight = F.clip_by_value(t / 100, zero, one) ** 0.75
loss = F.sum(weight * ((prediction - F.log(t)) ** 2))

# Create solver.
solver = S.Adam()
solver.set_parameters(nn.get_parameters())

# Create monitor
monitor = M.Monitor('./log')
monitor_loss = M.MonitorSeries("Training loss", monitor, interval=1000)
monitor_valid_loss = M.MonitorSeries("Validation loss", monitor, interval=1)
monitor_time = M.MonitorTimeElapsed("Training time", monitor, interval=1000)

# Create updater
def train_data_feeder():

def parametric_fixed_point_quantize(x, sign=True,
                                    n_init=8, n_min=2, n_max=16,
                                    m_init=1, m_min=-8, m_max=8,
                                    fix_parameters=False):
    """Parametric version of `fixed_point_quantize` where the bitwidth `n`
    and dynamic range `m` are learnable parameters.

    Args:
        x (~nnabla.Variable): N-D array as input.
        sign (bool): keep sign information during quantization.
        n_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for bitwidth parameter.
        n_min (int): lower bound for bitwidth.
        n_max (int): upper bound for bitwidth.
        m_init (:obj:`nnabla.initializer.BaseInitializer` or :obj:`numpy.ndarray`): Initializer for dynamic range.
        m_min (float): lower bound for dynamic range.
        m_max (float): upper bound for dynamic range.
        fix_parameters (bool): When set to `True`, the parameters `n` and `m` will not be updated.

    Returns:
        ~nnabla.Variable: N-D array.
    """
    def clip_scalar(v, min_value, max_value):
        return F.minimum_scalar(F.maximum_scalar(v, min_value), max_value)

    def broadcast_scalar(v, shape):
        return F.broadcast(F.reshape(v, (1,) * len(shape), inplace=False), shape=shape)

    def quantize_pow2(v):
        return 2 ** F.round(F.log(v) / np.log(2.))

    n = get_parameter_or_create("n", (),
                                ConstantInitializer(n_init),
                                need_grad=True,
                                as_need_grad=not fix_parameters)
    m = get_parameter_or_create("m", (),
                                ConstantInitializer(m_init),
                                need_grad=True,
                                as_need_grad=not fix_parameters)

    # ensure that bitwidth is in specified range and an integer
    n_q = F.round(clip_scalar(n, n_min, n_max))
    if sign:
        n_q = n_q - 1

    # ensure that dynamic range is in specified range
    m_q = clip_scalar(m, m_min, m_max)

    # compute step size from dynamic range and make sure that it is a pow2
    d_q = quantize_pow2((2 ** m_q) / (2 ** n_q - 1))

    # compute min/max value that we can represent
    x_max = d_q * (2 ** n_q - 1)
    if sign:
        x_min = -x_max
    else:
        x_min = nn.Variable((1, ), need_grad=False)
        x_min.d = 0.

    # broadcast variables to correct size
    d_q = broadcast_scalar(d_q, shape=x.shape)
    x_min = broadcast_scalar(x_min, shape=x.shape)
    x_max = broadcast_scalar(x_max, shape=x.shape)

    # apply fixed-point quantization
    return d_q * F.round(F.clip_by_value(x, x_min, x_max) / d_q)

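# Worked example of the step-size computation above (plain NumPy, for illustration only):
# with sign=True, n_init=8 and m_init=1, the effective bitwidth is n_q = 8 - 1 = 7, so
# d_q snaps 2**1 / (2**7 - 1) ~= 0.01575 to the nearest power of two, 2**-6 = 0.015625,
# giving a representable range of roughly [-1.984, 1.984].
import numpy as np

n_q = 8 - 1                                               # the sign consumes one bit
d_q = 2.0 ** np.round(np.log2(2 ** 1 / (2 ** n_q - 1)))   # 0.015625, a power of two
x_max = d_q * (2 ** n_q - 1)                              # 1.984375
x = np.array([-3.0, 0.01, 0.5, 3.0])
x_q = d_q * np.round(np.clip(x, -x_max, x_max) / d_q)
print(d_q, x_max, x_q)                                    # quantized values land on the pow2 grid
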
def clamp(val):
    # Here we don't need to clip max
    return F.clip_by_value(val, min=1e-12, max=1e8)

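# Typical use (assumption): flooring at 1e-12 keeps a downstream log or division finite
# even where the input reaches exactly zero. Illustrative check:
import numpy as np
import nnabla as nn
import nnabla.functions as F

prob = nn.Variable.from_numpy_array(np.array([0.0, 0.3, 1.0]))
with nn.auto_forward(True):
    safe_log = F.log(clamp(prob))
print(safe_log.d)   # roughly [-27.6, -1.2, 0.0] instead of [-inf, ...]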