def inplace_function_test_helper(inputs, func, func_args=[], func_kwargs={}, ctx=None, rng=None):
    if rng is None:
        rng = np.random.RandomState(313)
    if ctx is None:
        ctx = nn.Context()
    with nn.context_scope(ctx):
        a_s = [inp * 1.0 for inp in inputs]
        y = func(*(a_s + list(func_args)), inplace=False, **func_kwargs)
        l = F.sum(y)
        a_s_i = [inp * 1.0 for inp in inputs]
        y_i = func(*(a_s_i + list(func_args)), inplace=True, **func_kwargs)
        l_i = F.sum(y_i)
    data = [(rng.randn(*inp.shape), rng.randn(*inp.shape)) for inp in inputs]
    for i in range(len(data)):
        inputs[i].d = data[i][0]
        inputs[i].g = data[i][1]
    l.forward()
    l.backward()
    grads = [inp.g.copy() for inp in inputs]
    for i in range(len(data)):
        inputs[i].d = data[i][0]
        inputs[i].g = data[i][1]
    l_i.forward()
    l_i.backward()
    grads_i = [inp.g.copy() for inp in inputs]
    for g, g_i in zip(grads, grads_i):
        assert np.allclose(g, g_i)
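# Usage sketch (illustrative, not part of the original test suite): exercise the
# helper with F.relu, which accepts an `inplace` flag in nnabla. Assumes numpy,
# nnabla and nnabla.functions are imported as np, nn and F, as the helper does.
def _example_inplace_relu_check():
    x = nn.Variable([2, 3, 4], need_grad=True)
    inplace_function_test_helper([x], F.relu)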
def double_backward_for_global(g_dx0, g_db0, g_dg0, dy, x0, b0, g0, rm, rv, axes, decay_rate, eps):
    # Prerequisite
    # axes reduced and denominator
    axes0 = [a for a in range(x0.ndim)]
    axes = list(set(axes0) - set(axes))

    # (variance + eps) ** (-1/2)
    v_eps_rsqrt1 = (rv + eps) ** (-1.0 / 2.0)

    # wrt. x
    g_x0 = g_dg0 * dy * v_eps_rsqrt1

    # wrt. beta
    # zero, do nothing

    # wrt. gamma
    g_g0 = F.sum(g_dx0 * dy * v_eps_rsqrt1, axes, True)

    # no backward wrt. rm and rv

    # wrt. dy
    g_dy = g_dx0 * g0 * v_eps_rsqrt1 \
        + g_dg0 * (x0 - rm) * v_eps_rsqrt1 + g_db0

    return g_dy, g_x0, None, g_g0
def build_train_graph(self, batch):
    self.solver = S.Adam(self.learning_rate)
    obs, action, reward, terminal, newobs = batch
    # Create input variables
    s = nn.Variable(obs.shape)
    a = nn.Variable(action.shape)
    r = nn.Variable(reward.shape)
    t = nn.Variable(terminal.shape)
    snext = nn.Variable(newobs.shape)
    with nn.parameter_scope(self.name_q):
        q = self.q_builder(s, self.num_actions, test=False)
        self.solver.set_parameters(nn.get_parameters())
    with nn.parameter_scope(self.name_qnext):
        qnext = self.q_builder(snext, self.num_actions, test=True)
    qnext.need_grad = False
    clipped_r = F.minimum_scalar(F.maximum_scalar(
        r, -self.clip_reward), self.clip_reward)
    q_a = F.sum(q * F.one_hot(F.reshape(a, (-1, 1), inplace=False),
                              (q.shape[1],)), axis=1)
    target = clipped_r + self.gamma * (1 - t) * F.max(qnext, axis=1)
    loss = F.mean(F.huber_loss(q_a, target))
    Variables = namedtuple(
        'Variables', ['s', 'a', 'r', 't', 'snext', 'q', 'loss'])
    self.v = Variables(s, a, r, t, snext, q, loss)
    self.sync_models()
    self.built = True
def norm_normalization_backward(inputs, p=None, axes=None, eps=1e-12):
    """
    Args:
      inputs (list of nn.Variable): Incoming grads/inputs to/of the forward function.
      kwargs (dict of arguments): Dictionary of the corresponding function arguments.

    Return:
      list of Variable: Return the gradients wrt inputs of the corresponding function.
    """
    dy = inputs[0]
    x0 = inputs[1]

    if p is None:
        p = 2.0
    axes = list(range(x0.ndim)) if axes is None else force_list(axes)

    x_abs = F.abs(x0)
    x_pow = F.pow_scalar(x_abs, p)
    x_sum = F.sum(x_pow, axes, keepdims=True)
    # x_norm = x_sum ** (1./p)

    # Div2 backward
    dx = dy * x_sum ** (-1. / p)
    dx_norm = -dy * x0 * x_sum ** (-2. / p)
    dx_norm = sum_for_arithmetics(dx_norm, x_sum)

    # Norm backward
    x_sign = no_grad(F.sign(x0))
    dx += dx_norm * x_sum ** (1. / p - 1.) * x_abs ** (p - 1.) * x_sign

    return dx
def get_model(args, num_classes, test=False, channel_last=False,
              with_error=True):
    """
    Create computation graph and variables.
    """
    nn_in_size = 224
    if channel_last:
        image = nn.Variable([args.batch_size, nn_in_size, nn_in_size, 4])
    else:
        image = nn.Variable([args.batch_size, 4, nn_in_size, nn_in_size])
    label = nn.Variable([args.batch_size, 1])
    pred, hidden = model_resnet_nhwc.resnet_imagenet(
        image, num_classes, args.num_layers, args.shortcut_type, test=test,
        tiny=False, channel_last=channel_last)
    pred.persistent = True
    loss = F.mean(loss_function(pred, label, args.label_smoothing))
    error = F.sum(F.top_n_error(pred, label, n=1))
    Model = namedtuple('Model',
                       ['image', 'label', 'pred', 'loss', 'error', 'hidden'])
    return Model(image, label, pred, loss, error, hidden)
def affine_backward(inputs, base_axis=1):
    """
    Args:
      inputs (list of nn.Variable): Incoming grads/inputs to/of the forward function.
      kwargs (dict of arguments): Dictionary of the corresponding function arguments.

    Return:
      list of Variable: Return the gradients wrt inputs of the corresponding function.
    """
    dy = inputs[0]
    x0 = inputs[1]
    w0 = inputs[2]
    base_axis += inputs[0].ndim * (base_axis < 0)
    ctx = nn.get_current_context()
    dfx = AffineDataGrad(ctx, base_axis)
    dfw = AffineFilterGrad(ctx, base_axis)
    dfx.xshape = x0.shape
    dfw.wshape = w0.shape
    dx0 = dfx(dy, w0)
    dw0 = dfw(dy, x0)
    if len(inputs) == 4:
        axes = [i for i in range(0, base_axis)]
        db0 = F.sum(dy, axes, keepdims=False)
        return dx0, dw0, db0
    else:
        return dx0, dw0
def sigma_regularization(ctx, log_var, one):
    with nn.context_scope(ctx):
        h = F.exp(log_var)
        h = F.pow_scalar(h, 0.5)
        b = log_var.shape[0]
        r = F.sum(F.squared_error(h, one)) / b
    return r
def disparityregression(x, maxdisp):
    disp = nn.Variable(x.shape, need_grad=False)
    for i in range(0, maxdisp):
        disp.d[:, :, i, :, :] = i
    dispx = F.mul2(disp, x)
    out = F.sum(dispx, axis=2)
    return out
def norm_backward(inputs, p=None, axes=None, keep_dims=False):
    """
    Args:
      inputs (list of nn.Variable): Incoming grads/inputs to/of the forward function.
      kwargs (dict of arguments): Dictionary of the corresponding function arguments.

    Return:
      list of Variable: Return the gradients wrt inputs of the corresponding function.
    """
    dy = inputs[0]
    x0 = inputs[1]

    if p is None:
        p = 2.0
    axes = list(range(x0.ndim)) if axes is None else force_list(axes)

    x_abs = F.abs(x0)
    x_pow = F.pow_scalar(x_abs, p)
    x_sum = F.sum(x_pow, axes, keepdims=True)

    # Add axis for mul2
    if not keep_dims:
        shape = list(x0.shape)
        for a in axes:
            shape[a] = 1
        dy = dy.reshape(shape)

    x_sign = no_grad(F.sign(x0))
    dx = dy * x_sum ** (1. / p - 1.) * x_abs ** (p - 1.) * x_sign

    return dx
def softmax_cross_entropy_with_label_smoothing(pred, label,
                                               label_smoothing=0.1):
    '''
    Defines softmax activation followed by cross entropy loss with label
    smoothing.

    The label smoothing loss is mixed in with the following weighting:
    `(1 - label_smoothing) * xent_loss + label_smoothing * label_smoothing_loss`

    Args:
        pred (Variable): Logits with a shape of `(batch_size, num_classes)`.
        label (Variable): A class index for each example if a shape of
            `(batch_size, 1)` is given, and a one-hot or probability
            distribution over classes if `(batch_size, num_classes)` is given.
        label_smoothing (float): Coefficient of the label smoothing loss.
            If 0, label smoothing is omitted.

    '''
    logp = None
    if label.shape[1] > 1:
        # If mixup is enabled, we suppose the label shape is
        # (batch_size, num_classes).
        logp = F.log_softmax(pred)
        l = F.sum(-label * logp, axis=1, keepdims=True)
    else:
        l = F.softmax_cross_entropy(pred, label)
    return apply_label_smoothing(l, pred, label_smoothing, logp)
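# Usage sketch (illustrative): hard labels with shape (batch_size, 1). The helper
# `apply_label_smoothing` referenced above is assumed to be defined in this module.
def _example_label_smoothing_loss(batch_size=64, num_classes=1000):
    pred = nn.Variable([batch_size, num_classes])
    label = nn.Variable([batch_size, 1])
    per_example = softmax_cross_entropy_with_label_smoothing(pred, label, 0.1)
    return F.mean(per_example)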
def backward_impl(self, inputs, outputs, prop_down, accum):
    # inputs: [inputs_fwd_graph] + [inputs_bwd_graph] or
    # [inputs_fwd_graph] + [outputs_fwd_graph] + [inputs_bwd_graph]

    # Args
    axes = self.forward_func.info.args["axes"]
    keep_dims = self.forward_func.info.args["keep_dims"]

    # Inputs
    x0 = inputs[0].data
    dy = inputs[1].data
    # Outputs
    dx0 = outputs[0].data
    # Grads of inputs
    g_x0 = inputs[0].grad
    g_dy = inputs[1].grad
    # Grads of outputs
    g_dx0 = outputs[0].grad

    # Computation
    if prop_down[1]:
        g_dy_ = F.sum(g_dx0, axes, keep_dims)
        if accum[1]:
            g_dy += g_dy_
        else:
            g_dy.copy_from(g_dy_)
def backward_impl(self, inputs, outputs, prop_down, accum):
    # inputs: [inputs_fwd_graph] + [inputs_bwd_graph] or
    # [inputs_fwd_graph] + [outputs_fwd_graph] + [inputs_bwd_graph]

    # Args
    axes = self.forward_func.info.args["axes"]

    # Inputs
    x0 = inputs[0].data
    dy = inputs[1].data
    # Outputs
    dx0 = outputs[0].data
    # Grads of inputs
    g_x0 = inputs[0].grad
    g_dy = inputs[1].grad
    # Grads of outputs
    g_dx0 = outputs[0].grad

    # Compute
    # TODO: Optimize by creating max_pooling with indices
    if prop_down[1]:
        # dx0 is not accumulated on the backward graph
        mask = F.not_equal_scalar(dx0, 0.0)
        g_dy_ = F.sum(g_dx0 * mask, axes)
        if accum[1]:
            g_dy += g_dy_
        else:
            g_dy.copy_from(g_dy_)
def jacobian(self, coordinates):
    new_coordinates = self.warp_coordinates(coordinates)
    new_coordinates_x = F.slice(new_coordinates, start=(0, 0, 0),
                                stop=new_coordinates.shape[:2] + (1,))
    grad_x = nn.grad([F.sum(new_coordinates_x)], [coordinates])
    new_coordinates_y = F.slice(new_coordinates, start=(0, 0, 1),
                                stop=new_coordinates.shape[:2] + (2,))
    grad_y = nn.grad([F.sum(new_coordinates_y)], [coordinates])
    gx = F.reshape(grad_x[0], grad_x[0].shape[:-1] +
                   (1,) + grad_x[0].shape[-1:])
    gy = F.reshape(grad_y[0], grad_y[0].shape[:-1] +
                   (1,) + grad_y[0].shape[-1:])
    jacobian = F.concatenate(gx, gy, axis=gy.ndim - 2)
    return jacobian
def kl_divergence(ctx, pred, label, log_var):
    with nn.context_scope(ctx):
        s = F.pow_scalar(F.exp(log_var), 0.5)
        elms = softmax_with_temperature(ctx, label, s) \
            * F.log(F.softmax(pred, axis=1))
        loss = -F.mean(F.sum(elms, axis=1))
    return loss
def siamese_loss(e0, e1, t, margin=1.0, eps=1e-4):
    dist = F.sum(F.squared_error(e0, e1), axis=1)  # Squared distance
    # Contrastive loss
    sim_cost = t * dist
    dissim_cost = (1 - t) * \
        (F.maximum_scalar(margin - (dist + eps) ** (0.5), 0) ** 2)
    return F.mean(sim_cost + dissim_cost)
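# Usage sketch (illustrative): e0/e1 are batches of embeddings and t holds 1 for
# similar pairs and 0 for dissimilar pairs, matching the contrastive loss above.
def _example_siamese_loss(batch_size=32, embed_dim=64):
    e0 = nn.Variable([batch_size, embed_dim])
    e1 = nn.Variable([batch_size, embed_dim])
    t = nn.Variable([batch_size])
    return siamese_loss(e0, e1, t, margin=1.0)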
def build_model():
    x = nn.Variable((batch_size, sentence_length_source))
    input_mask = F.sign(
        F.reshape(F.slice(x), (batch_size, sentence_length_source, 1)))
    y = nn.Variable((batch_size, sentence_length_target))

    enc_input = time_distributed(PF.embed)(
        x, vocab_size_source, embedding_size, name='enc_embeddings')  # *input_mask
    # -> (batch_size, sentence_length_source, embedding_size)
    dec_input = time_distributed(PF.embed)(
        y, vocab_size_target, embedding_size, name='dec_embeddings')
    # -> (batch_size, sentence_length_target, embedding_size)

    # encoder
    with nn.parameter_scope('encoder'):
        output, c, h = LSTMEncoder(enc_input, hidden, return_sequences=True,
                                   return_state=True)
        # -> (batch_size, sentence_length_source, hidden),
        #    (batch_size, hidden), (batch_size, hidden)

    # decoder
    output = LSTMAttentionDecoder(dec_input, output, initial_state=(c, h),
                                  return_sequences=True, name='decoder')
    # -> (batch_size, sentence_length_target, hidden)
    output = time_distributed(PF.affine)(
        output, vocab_size_target, name='output')
    # -> (batch_size, sentence_length_target, vocab_size_target)

    t = F.reshape(F.slice(y), (batch_size, sentence_length_target, 1))

    entropy = time_distributed_softmax_cross_entropy(output, t)

    mask = F.sum(F.sign(t), axis=2)  # do not predict 'pad'.
    count = F.sum(mask, axis=1)

    entropy *= mask
    loss = F.mean(F.sum(entropy, axis=1) / count)
    return x, y, loss
def mnist_lenet_siamese(x0, x1, test=False):
    """Siamese network on top of the LeNet feature extractor; returns the
    squared distance between the two embeddings."""
    h0 = mnist_lenet_feature(x0, test)
    h1 = mnist_lenet_feature(x1, test)  # share weights
    # h = (h0 - h1) ** 2  # equivalent
    h = F.squared_error(h0, h1)
    p = F.sum(h, axis=1)
    return p
def deconvolution_backward(inputs, base_axis=1, pad=None, stride=None,
                           dilation=None, group=1, channel_last=False,
                           output_padding=None):
    """
    Args:
      inputs (list of nn.Variable): Incoming grads/inputs to/of the forward function.
      kwargs (dict of arguments): Dictionary of the corresponding function arguments.

    Return:
      list of Variable: Return the gradients wrt inputs of the corresponding function.
    """
    dy = inputs[0]
    x0 = inputs[1]
    w0 = inputs[2]
    base_axis += x0.ndim * (base_axis < 0)
    ctx = nn.get_current_context()
    dfx = DeconvolutionDataGrad(ctx, base_axis, pad, stride, dilation, group,
                                channel_last, output_padding)
    dfw = DeconvolutionFilterGrad(ctx, base_axis, pad, stride, dilation, group,
                                  channel_last, output_padding)
    dfx.xshape = x0.shape
    dfw.wshape = w0.shape
    dx0 = dfx(dy, w0)
    dw0 = dfw(dy, x0)
    if len(inputs) == 4:
        if channel_last:
            axes = [i for i in range(dy.ndim - 1)]
        else:
            axes = [i for i in range(0, base_axis)] + \
                [i for i in range(base_axis + 1, dy.ndim)]
        db0 = F.sum(dy, axes, keepdims=False)
        return dx0, dw0, db0
    else:
        return dx0, dw0
def sum(xs, shape=None, axis=None):
    assert (len(xs) > 0 or shape is not None)
    if len(xs) == 0:
        x = nn.Variable(shape)
        x.data.data = 0
        return x
    else:
        return F.sum(stack(xs), axis=axis)
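# Usage sketch (illustrative): this `sum` (which shadows the builtin) adds a list
# of graph variables via the `stack` helper assumed to be defined in the same
# module; an empty list falls back to a zero-filled variable of `shape`.
def _example_graph_sum():
    xs = [nn.Variable([8, 16]) for _ in range(3)]
    total = sum(xs, axis=0)         # -> Variable of shape (8, 16)
    zeros = sum([], shape=(8, 16))  # -> zero-initialized Variable of shape (8, 16)
    return total, zeros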
def backward_impl(self, inputs, outputs, prop_down, accum):
    # inputs: [inputs_fwd_graph] + [inputs_bwd_graph] or
    # [inputs_fwd_graph] + [outputs_fwd_graph] + [inputs_bwd_graph]

    axis = self.forward_func.info.args["axis"]

    # Inputs
    x0 = inputs[0].data
    y0 = inputs[1].data
    dy = inputs[2].data
    # Outputs
    dx0 = outputs[0].data
    # Grads of inputs
    g_x0 = inputs[0].grad
    g_y0 = inputs[1].grad
    g_dy = inputs[2].grad
    # Grads of outputs
    g_dx0 = outputs[0].grad

    # w.r.t. x0
    if prop_down[0]:
        gdx_y = g_dx0 * y0
        gdx_dy_y = gdx_y * dy
        dy_y = dy * y0
        gdx_y_sum = F.sum(gdx_y, axis, True)
        dy_y_sum = F.sum(dy_y, axis, True)
        gdx_dy_y_sum = F.sum(gdx_dy_y, axis, True)

        t1 = gdx_dy_y
        t2 = y0 * gdx_dy_y_sum
        t3 = gdx_y_sum * (y0 * dy_y_sum - dy_y)
        t4 = dy_y_sum * (y0 * gdx_y_sum - gdx_y)
        g_x0_ = t1 - t2 + t3 + t4
        if accum[0]:
            g_x0 += g_x0_
        else:
            g_x0.copy_from(g_x0_)

    # w.r.t. dy
    if prop_down[2]:
        si = nn.Variable(x0.shape).apply(data=x0, grad=g_dy, need_grad=True)
        so = nn.Variable(dx0.shape).apply(data=y0, grad=g_dx0)
        self.forward_func.backward([si], [so], accum=[accum[2]])
def backward_impl(self, inputs, outputs, prop_down, accum):
    # inputs: [inputs_fwd_graph] + [inputs_bwd_graph] or
    # [inputs_fwd_graph] + [outputs_fwd_graph] + [inputs_bwd_graph]

    # Args
    axis = self.forward_func.info.args["axis"]

    # Inputs
    x0 = inputs[0].data  # logits
    t0 = inputs[1].data  # labels
    dz = inputs[2].data  # grad_input
    # Outputs
    dx0 = outputs[0].data
    dt0 = outputs[1].data
    # Grads of inputs
    g_x0 = inputs[0].grad
    g_t0 = inputs[1].grad
    g_dz = inputs[2].grad
    # Grads of outputs
    g_dx0 = outputs[0].grad
    g_dt0 = outputs[1].grad

    # Computation
    ## w.r.t. x0
    if prop_down[0]:
        # gradient is the backward of softmax with (g_dx0 * dz) as the
        # incoming gradient
        si = nn.Variable(x0.shape).apply(data=x0, need_grad=True)
        si.grad.fill(0.0)
        so = F.softmax(si, axis)
        if not nn.get_auto_forward():
            so.forward()
        so.backward(g_dx0 * dz, clear_buffer=False)
        g_x0_ = si.grad
        if accum[0]:
            g_x0 += g_x0_
        else:
            g_x0.copy_from(g_x0_)

    ## w.r.t. t0 is not required

    ## w.r.t. dz
    if prop_down[2]:
        # Unstable implementation if dividing by dz:
        ## g_dz_ = g_dx0 * dx0 / dz
        ## g_dz_ = F.sum(g_dz_, axis)
        shape = dz.shape if dz.shape != [] else [1]
        si = nn.Variable(x0.shape).apply(data=x0, need_grad=True)
        ti = nn.Variable(t0.shape).apply(data=t0)
        o = nn.Variable(shape)
        o.grad.fill(1.0)
        self.forward_func.backward([si, ti], [o], [False, False])

        # Sum g_dx0_i * (y_hat_i - y_i) over i
        g_dz_ = F.sum(g_dx0 * si.grad, axis)
        if accum[2]:
            g_dz += g_dz_
        else:
            g_dz.copy_from(g_dz_)
def detect_keypoint(x, block_expansion, num_kp, num_channels, max_features,
                    num_blocks, temperature, estimate_jacobian=False,
                    scale_factor=1, single_jacobian_map=False, pad=0,
                    test=False, comm=None):
    if scale_factor != 1:
        x = anti_alias_interpolate(x, num_channels, scale_factor)

    with nn.parameter_scope("hourglass"):
        feature_map = hourglass(x, block_expansion, num_blocks=num_blocks,
                                max_features=max_features, test=test, comm=comm)

    with nn.parameter_scope("keypoint_detector"):
        inmaps, outmaps = feature_map.shape[1], num_kp
        k_w = I.calc_normal_std_he_forward(
            inmaps, outmaps, kernel=(7, 7)) / np.sqrt(2.)
        k_b = I.calc_normal_std_he_forward(inmaps, outmaps) / np.sqrt(2.)
        w_init = I.UniformInitializer((-k_w, k_w))
        b_init = I.UniformInitializer((-k_b, k_b))
        prediction = PF.convolution(feature_map, outmaps=num_kp,
                                    kernel=(7, 7), pad=(pad, pad),
                                    w_init=w_init, b_init=b_init)

    final_shape = prediction.shape

    heatmap = F.reshape(prediction, (final_shape[0], final_shape[1], -1))
    heatmap = F.softmax(heatmap / temperature, axis=2)
    heatmap = F.reshape(heatmap, final_shape, inplace=False)

    out = gaussian2kp(heatmap)  # {"value": value}, keypoint positions.

    if estimate_jacobian:
        if single_jacobian_map:
            num_jacobian_maps = 1
        else:
            num_jacobian_maps = num_kp

        with nn.parameter_scope("jacobian_estimator"):
            jacobian_map = PF.convolution(
                feature_map, outmaps=4 * num_jacobian_maps,
                kernel=(7, 7), pad=(pad, pad),
                w_init=I.ConstantInitializer(0),
                b_init=np.array([1, 0, 0, 1] * num_jacobian_maps))
        jacobian_map = F.reshape(
            jacobian_map, (final_shape[0], num_jacobian_maps, 4,
                           final_shape[2], final_shape[3]))

        heatmap = F.reshape(
            heatmap, heatmap.shape[:2] + (1,) + heatmap.shape[2:],
            inplace=False)

        jacobian = heatmap * jacobian_map
        jacobian = F.sum(jacobian, axis=(3, 4))
        jacobian = F.reshape(
            jacobian, (jacobian.shape[0], jacobian.shape[1], 2, 2),
            inplace=False)
        out['jacobian'] = jacobian  # jacobian near each keypoint.

    # out is a dictionary containing {"value": value, "jacobian": jacobian}
    return out
def sr_loss_with_uncertainty(ctx, pred0, pred1, log_var0, log_var1):
    # TODO: squared error/absolute error
    s0 = F.exp(log_var0)
    s1 = F.exp(log_var1)
    squared_error = F.squared_error(pred0, pred1)
    b = pred0.shape[0]
    with nn.context_scope(ctx):
        loss_sr = F.sum(squared_error * (1 / s0 + 1 / s1)
                        + (s0 / s1 + s1 / s0)) * 0.5 / b
    return loss_sr
def er_loss(ctx, pred):
    with nn.context_scope(ctx):
        bs = pred.shape[0]
        d = np.prod(pred.shape[1:])
        denominator = bs * d
        pred_normalized = F.softmax(pred)
        pred_log_normalized = F.log(F.softmax(pred))
        loss_er = -F.sum(pred_normalized * pred_log_normalized) / denominator
    return loss_er
def sum_grad_norm(params):
    norm = nn.NdArray()
    norm.zero()
    for p in params:
        assert isinstance(p, nn.Variable) and not p.grad.clear_called
        norm += F.sum(p.grad ** 2)
    return np.sqrt(norm.data)
def _build(self):
    # infer variable
    self.infer_obs_t = nn.Variable((1, 4, 84, 84))
    # inference output
    self.infer_q_t = self.q_function(self.infer_obs_t, self.num_actions,
                                     scope='q_func')

    # train variables
    self.obss_t = nn.Variable((self.batch_size, 4, 84, 84))
    self.acts_t = nn.Variable((self.batch_size, 1))
    self.rews_tp1 = nn.Variable((self.batch_size, 1))
    self.obss_tp1 = nn.Variable((self.batch_size, 4, 84, 84))
    self.ters_tp1 = nn.Variable((self.batch_size, 1))
    self.weights = nn.Variable((self.batch_size, 1))

    # training output
    q_t = self.q_function(self.obss_t, self.num_actions, scope='q_func')
    q_tp1 = self.q_function(self.obss_tp1, self.num_actions,
                            scope='target_q_func')

    # select one dimension
    a_t_one_hot = F.one_hot(self.acts_t, (self.num_actions,))
    q_t_selected = F.sum(q_t * a_t_one_hot, axis=1, keepdims=True)
    q_tp1_best = F.max(q_tp1, axis=1, keepdims=True)

    # loss calculation
    y = self.rews_tp1 + self.gamma * q_tp1_best * (1.0 - self.ters_tp1)
    self.td = q_t_selected - y
    self.loss = F.sum(F.huber_loss(q_t_selected, y) * self.weights)
    self.loss_sink = F.sink(self.td, self.loss)

    # optimizer
    self.solver = S.RMSprop(self.lr, 0.95, 1e-2)

    # weights and biases
    with nn.parameter_scope('q_func'):
        self.params = nn.get_parameters()
    with nn.parameter_scope('target_q_func'):
        self.target_params = nn.get_parameters()

    # set q function parameters to solver
    self.solver.set_parameters(self.params)
def _spectral_norm_outer_most_dim_backward(dw_sn, w, u, itr=1, eps=1e-12):
    # Forward recomputation

    w_shape = w.shape
    d0 = np.prod(w.shape[0:-1])  # In
    d1 = w.shape[-1]             # Out
    w = F.reshape(w, [d0, d1])
    u = F.reshape(u, [d1, 1])

    # Power method
    for _ in range(itr):
        # v
        v = F.affine(w, u)
        v = v / ((F.sum(v ** 2.0, keepdims=True) + eps) ** 0.5)
        v = F.reshape(v, [1, d0])
        # u
        u = F.affine(v, w)
        u = u / ((F.sum(u ** 2.0, keepdims=True) + eps) ** 0.5)
        u = F.reshape(u, [d1, 1])

    # No grad
    u = no_grad(u)
    v = no_grad(v)

    # Spectral normalization
    vw = F.affine(v, w)
    sigma = F.affine(vw, u)
    w_sn = w / sigma
    # The following reshape is not necessary for gradient calculation
    # w_sn = F.reshape(w_sn, w_shape)

    # Backward for spectral norm
    dw_sn = dw_sn.reshape(w.shape)

    # Sum for broadcast backward
    S = sum_for_arithmetics(dw_sn * w_sn, sigma)

    # Add batch axis
    S = S.reshape((1,) + S.shape)
    u = u.reshape((1,) + u.shape)
    v = v.reshape((1,) + v.shape)
    m = F.batch_matmul(v, S, transpose_a=True)
    m = F.batch_matmul(m, u, transpose_b=True)
    # Remove batch axis
    m = m.reshape((m.shape[1], m.shape[2]))

    dw = (dw_sn - m) / sigma
    dw = dw.reshape(w_shape)

    return dw, None
def mask_weight(a, b):
    # quite different from the definition in the paper
    merged_mask = F.concatenate(a, b, axis=1)
    summed_mask = F.sum((merged_mask + 1) / 2, axis=1, keepdims=True)
    clipped = F.clip_by_value(summed_mask,
                              F.constant(0, shape=summed_mask.shape),
                              F.constant(1, shape=summed_mask.shape))
    z = clipped * 2 - 1
    mask = (1 - z) / 2
    return mask
def call(self, input):
    if self._mode == 'full':
        out = F.stack(*[op(input) for op in self._ops], axis=0)
        out = F.mul2(out, F.softmax(self._alpha, axis=0))
        return F.sum(out, axis=0)
    # update active index
    self._update_active_index()
    return self._ops[self._active](input)
def _calc_gradient_penalty(real, fake, discriminator):
    alpha = F.rand(shape=(1, 1, 1, 1))
    interpolates = alpha * real + (1.0 - alpha) * fake
    interpolates.need_grad = True

    disc_interpolates = discriminator(x=interpolates)

    grads = nn.grad([disc_interpolates], [interpolates])
    norms = [F.sum(g ** 2.0, axis=1) ** 0.5 for g in grads]
    return sum([F.mean((norm - 1.0) ** 2.0) for norm in norms])
def gaussian2kp(heatmap):
    shape = heatmap.shape
    heatmap = F.reshape(heatmap, shape + (1,), inplace=False)
    grid = make_coordinate_grid(shape[2:])
    grid = F.reshape(grid, (1, 1) + grid.shape)

    value = F.sum(heatmap * grid, axis=(2, 3))
    kp = {'value': value}

    return kp
def callback(f):
    # Register this function the first time it is seen.
    self._add_key(f, self.key_to_stat_bwd)

    # Apply the callback to check whether the outputs of this function
    # contain NaN/Inf values or not.
    nan = []
    if self.trace_nan:
        nan = [F.sum(F.isnan(i.grad)) for i in f.inputs]

    inf = []
    if self.trace_inf:
        inf = [F.sum(F.isinf(i.grad)) for i in f.inputs]

    self.key_to_stat_bwd[f].update({
        "inf": inf,
        "nan": nan,
        # rank might be changed between each iteration.
        "rank": f.rank,
    })
def kl_divergence(ctx, pred, label):
    with nn.context_scope(ctx):
        elms = F.softmax(label, axis=1) * F.log(F.softmax(pred, axis=1))
        loss = -F.mean(F.sum(elms, axis=1))
    return loss
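# Usage sketch (illustrative): distillation-style KL term between teacher logits
# (`label`) and student logits (`pred`), evaluated under a given context.
def _example_kl_divergence(batch_size=16, num_classes=10):
    ctx = nn.Context()  # or one obtained from nnabla.ext_utils.get_extension_context
    pred = nn.Variable([batch_size, num_classes])
    label = nn.Variable([batch_size, num_classes])
    return kl_divergence(ctx, pred, label)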
def ce_loss_soft(ctx, pred, target):
    with nn.context_scope(ctx):
        # TODO: divide or not
        loss = -F.mean(F.sum(F.softmax(target) * F.log(F.softmax(pred)),
                             axis=1))
    return loss
def main():
    # Get arguments
    args = get_args()
    data_file = "https://raw.githubusercontent.com/tomsercu/lstm/master/data/ptb.train.txt"
    model_file = args.work_dir + "model.h5"

    # Load Dataset
    itow, wtoi, dataset = load_ptbset(data_file)

    # Computation environment settings
    from nnabla.contrib.context import extension_context
    extension_module = args.context
    if args.context is None:
        extension_module = 'cpu'
    logger.info("Running in %s" % extension_module)
    ctx = extension_context(extension_module, device_id=args.device_id)
    nn.set_default_context(ctx)

    # Create data provider
    n_word = len(wtoi)
    n_dim = args.embed_dim
    batchsize = args.batchsize
    half_window = args.half_window_length
    n_negative = args.n_negative_sample

    di = DataIteratorForEmbeddingLearning(
        batchsize=batchsize,
        half_window=half_window,
        n_negative=n_negative,
        dataset=dataset)

    # Create model
    # - Real batch size including context samples and negative samples
    size = batchsize * (1 + n_negative) * (2 * (half_window - 1))

    # Model for learning
    # - input variables
    xl = nn.Variable((size,))  # variable for word
    yl = nn.Variable((size,))  # variable for context

    # Embed layers for word embedding function
    # - f_embed : word index x to get y, the n_dim vector
    # -- for each sample in a minibatch
    hx = PF.embed(xl, n_word, n_dim, name="e1")  # feature vector for word
    hy = PF.embed(yl, n_word, n_dim, name="e1")  # feature vector for context
    hl = F.sum(hx * hy, axis=1)

    # -- Approximated likelihood of context prediction
    # pos: word context, neg: negative samples
    tl = nn.Variable([size, ], need_grad=False)
    loss = F.sigmoid_cross_entropy(hl, tl)
    loss = F.mean(loss)

    # Model for test of searching similar words
    xr = nn.Variable((1,), need_grad=False)
    hr = PF.embed(xr, n_word, n_dim, name="e1")  # feature vector for test

    # Create solver
    solver = S.Adam(args.learning_rate)
    solver.set_parameters(nn.get_parameters())

    # Create monitor.
    monitor = M.Monitor(args.work_dir)
    monitor_loss = M.MonitorSeries(
        "Training loss", monitor, interval=args.monitor_interval)
    monitor_time = M.MonitorTimeElapsed(
        "Training time", monitor, interval=args.monitor_interval)

    # Do training
    max_epoch = args.max_epoch
    for epoch in range(max_epoch):

        # iteration per epoch
        for i in range(di.n_batch):

            # get minibatch
            xi, yi, ti = di.next()

            # learn
            solver.zero_grad()
            xl.d, yl.d, tl.d = xi, yi, ti
            loss.forward(clear_no_need_grad=True)
            loss.backward(clear_buffer=True)
            solver.update()

            # monitor
            itr = epoch * di.n_batch + i
            monitor_loss.add(itr, loss.d)
            monitor_time.add(itr)

    # Save model
    nn.save_parameters(model_file)

    # Evaluate by similarity
    max_check_words = args.max_check_words
    for i in range(max_check_words):

        # prediction
        xr.d = i
        hr.forward(clear_buffer=True)
        h = hr.d

        # similarity calculation
        w = nn.get_parameters()['e1/embed/W'].d
        s = np.sqrt((w * w).sum(1))
        w /= s.reshape((s.shape[0], 1))
        similarity = w.dot(h[0]) / s[i]

        # for understanding
        output_similar_words(itow, i, similarity)