def compute_loss(self, input_vocab, output_vocab, window_words, hidden_states):
    g, rnn_distribution, a = self.decode_one_step(
        input_vocab, window_words, hidden_states)
    # define p_vocab as 0 if output word is not in vocab
    p_vocab = F.select_item(
        rnn_distribution,
        xp.array([self.vocab[output_vocab]], dtype=xp.int32)) \
        if output_vocab in self.vocab \
        else Variable(xp.array([0.0], dtype=xp.float32))
    # compute cross entropy
    indexes = [i for i, x in enumerate(window_words) if x == output_vocab]
    exist_var = Variable(xp.array([0], dtype=xp.float32))
    for idx in indexes:
        exist_var += F.select_item(a, xp.array([idx], dtype=xp.int32))
    p_ptr = F.cast(exist_var, xp.float32) if indexes \
        else Variable(xp.array([0.0], dtype=xp.float32))
    cross_entropy = -F.log(
        F.linear_interpolate(g, p_vocab, p_ptr)
        + Variable(xp.array([0.01], dtype=xp.float32)))
    # compute attention loss
    attention_loss = F.cast(-F.log(g + exist_var), xp.float32) if indexes \
        else Variable(xp.array([0.0], dtype=xp.float32))
    return cross_entropy + attention_loss
def compute_double_q_learning_loss(self, l_obs, l_act, l_rew, l_next_obs, l_done):
    """
    :param l_obs: A chainer variable holding a list of observations.
        Should be of shape N * |S|.
    :param l_act: A chainer variable holding a list of actions.
        Should be of shape N.
    :param l_rew: A chainer variable holding a list of rewards.
        Should be of shape N.
    :param l_next_obs: A chainer variable holding a list of observations at
        the next time step. Should be of shape N * |S|.
    :param l_done: A chainer variable holding a list of binary values
        (indicating whether episode ended after this time step).
        Should be of shape N.
    :return: A chainer variable holding a scalar loss.
    """
    # Hint: You may want to make use of the following fields:
    #   self._discount, self._q, self._qt
    # Hint2: Q-function can be called by self._q.forward(argument)
    # Hint3: You might also find
    #   https://docs.chainer.org/en/stable/reference/generated/chainer.functions.select_item.html
    #   useful
    "*** YOUR CODE HERE ***"
    reward = F.cast(l_rew, np.float32)
    q_forwarded = self._q.forward(l_next_obs)
    qt_forwarded = self._qt.forward(l_next_obs)
    y_non_terminal = reward + self._discount * F.select_item(
        qt_forwarded, F.argmax(q_forwarded, axis=1))
    y_terminal = reward
    y = F.select_item(F.stack([y_non_terminal, y_terminal], axis=1),
                      F.cast(l_done, np.int32))
    Q = F.select_item(self._q.forward(l_obs), l_act)
    return F.mean(F.square(y - Q))
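# A minimal, self-contained sketch (not part of the original assignment) of
# how F.argmax, F.select_item and F.stack compose into the double-DQN target
# above; the network outputs are stand-in toy arrays.
import numpy as np
import chainer.functions as F

rng = np.random.RandomState(0)
q_next = rng.randn(4, 3).astype(np.float32)   # stand-in for self._q.forward(l_next_obs)
qt_next = rng.randn(4, 3).astype(np.float32)  # stand-in for self._qt.forward(l_next_obs)
rew = rng.randn(4).astype(np.float32)
done = np.array([0, 1, 0, 0], dtype=np.int32)
discount = 0.99

best_act = F.argmax(q_next, axis=1)  # the online net picks the action ...
y_non_term = F.select_item(qt_next, best_act) * discount + rew  # ... the target net evaluates it
# per row, pick the non-terminal target unless done == 1
y = F.select_item(F.stack([y_non_term, rew], axis=1), done)
print(y.array)  # per-sample targets, shape (4,)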
def compute_double_q_learning_loss(self, l_obs, l_act, l_rew, l_next_obs, l_done):
    """
    :param l_obs: A chainer variable holding a list of observations.
        Should be of shape N * |S|.
    :param l_act: A chainer variable holding a list of actions.
        Should be of shape N.
    :param l_rew: A chainer variable holding a list of rewards.
        Should be of shape N.
    :param l_next_obs: A chainer variable holding a list of observations at
        the next time step. Should be of shape N * |S|.
    :param l_done: A chainer variable holding a list of binary values
        (indicating whether episode ended after this time step).
        Should be of shape N.
    :return: A chainer variable holding a scalar loss.
    """
    # Hint: You may want to make use of the following fields:
    #   self._discount, self._q, self._qt
    # Hint2: Q-function can be called by self._q.forward(argument)
    # Hint3: You might also find
    #   https://docs.chainer.org/en/stable/reference/generated/chainer.functions.select_item.html
    #   useful
    # TODO: replace this line
    feed_forward_learner = self._q.forward(l_obs)
    q_learner = F.select_item(feed_forward_learner, l_act)
    action_q_values = self._q.forward(l_next_obs)
    best_action = F.argmax(action_q_values, axis=1)
    feed_forward_target = self._qt.forward(l_next_obs)
    q_target = F.select_item(feed_forward_target, best_action)
    terminate = F.cast(l_done, bool)
    l_rew = F.cast(l_rew, "float32")
    # taking .data detaches the target, so no gradient flows through it
    final_target = F.where(terminate, l_rew,
                           l_rew + self._discount * q_target).data
    loss = F.mean_squared_error(final_target, q_learner)
    return loss
def compute_double_q_learning_loss(self, l_obs, l_act, l_rew, l_next_obs, l_done):
    """
    :param l_obs: A chainer variable holding a list of observations.
        Should be of shape N * |S|.
    :param l_act: A chainer variable holding a list of actions.
        Should be of shape N.
    :param l_rew: A chainer variable holding a list of rewards.
        Should be of shape N.
    :param l_next_obs: A chainer variable holding a list of observations at
        the next time step. Should be of shape N * |S|.
    :param l_done: A chainer variable holding a list of binary values
        (indicating whether episode ended after this time step).
        Should be of shape N.
    :return: A chainer variable holding a scalar loss.
    """
    # Hint: You may want to make use of the following fields:
    #   self._discount, self._q, self._qt
    # Hint2: Q-function can be called by self._q.forward(argument)
    # Hint3: You might also find
    #   https://docs.chainer.org/en/stable/reference/generated/chainer.functions.select_item.html
    #   useful
    obs_q_value = F.select_item(self._q.forward(l_obs), l_act)
    target_q_value = np.zeros(l_done.shape[0])
    for i in range(l_done.shape[0]):
        if l_done[i] == True:
            target_q_value[i] = l_rew[i]
        else:
            q_value_next = self._q.forward(
                F.expand_dims(l_next_obs[i], axis=0))
            max_idx = F.argmax(q_value_next)
            target_value = self._qt.forward(
                F.expand_dims(l_next_obs[i], axis=0))
            max_value = F.select_item(target_value, np.array([max_idx.data]))
            target_q_value[i] = l_rew[i] + self._discount * max_value.data
    loss = F.mean_squared_error(F.cast(target_q_value, np.float32),
                                F.cast(obs_q_value, np.float32))
    return loss
def compute_double_q_learning_loss(self, l_obs, l_act, l_rew, l_next_obs, l_done):
    """
    :param l_obs: A chainer variable holding a list of observations.
        Should be of shape N * |S|.
    :param l_act: A chainer variable holding a list of actions.
        Should be of shape N.
    :param l_rew: A chainer variable holding a list of rewards.
        Should be of shape N.
    :param l_next_obs: A chainer variable holding a list of observations at
        the next time step. Should be of shape N * |S|.
    :param l_done: A chainer variable holding a list of binary values
        (indicating whether episode ended after this time step).
        Should be of shape N.
    :return: A chainer variable holding a scalar loss.
    """
    # Hint: You may want to make use of the following fields:
    #   self._discount, self._q, self._qt
    # Hint2: Q-function can be called by self._q.forward(argument)
    # Hint3: You might also find
    #   https://docs.chainer.org/en/stable/reference/generated/chainer.functions.select_item.html
    #   useful
    # loss = C.Variable(np.array([0.]))  # TODO: replace this line
    l_rew = F.cast(l_rew, np.float32)
    q_future = self._q.forward(l_next_obs)
    qt_future = self._qt.forward(l_next_obs)
    future_rew = l_rew + self._discount * F.select_item(
        qt_future, F.argmax(q_future, axis=1))
    target = F.select_item(F.stack([future_rew, l_rew], axis=1),
                           F.cast(l_done, np.int32))
    y = F.select_item(self._q.forward(l_obs), l_act)
    return F.mean(F.square(y - target))
def compute_G(self, in_data, out_grad_data):
    gy = out_grad_data[0]
    xp = cuda.get_array_module(gy)
    gy = gy.transpose(0, 2, 3, 1)  # NCHW -> NHWC
    n, ho, wo, _ = gy.shape
    gy = gy.reshape(n * ho * wo, -1)
    gy_scale = n
    if self._loss_scale is not None:
        gy_scale *= 1.0 / self._loss_scale
    if self.diagonalize:
        if gy.dtype == xp.float16:
            gy = gy_scale * cast(gy, xp.float32).data
        else:
            gy = gy_scale * gy
        G = xp.diag((gy * gy).mean(axis=0))
    else:
        G_scale = 1 / (n * ho * wo) * (gy_scale ** 2)
        if gy.dtype == xp.float16:
            gy = cast(gy, xp.float32).data
        G = gy.T.dot(gy) * G_scale
        diag = getattr(self.link, 'diag', False)
        if diag:
            G = xp.diag(xp.diag(G))
    return G
def align_speaker(ys, ts):
    """Match shapes, since the number of speakers reported can differ.

    Args:
        ys: B-length list of predictions
        ts: B-length list of targets

    Returns:
        ys: Aligned B-length list of predictions
        ts: Aligned B-length list of targets
    """
    num_speakers = [max(y.shape[1], t.shape[1]) for y, t in zip(ys, ts)]
    ys = [
        F.pad(y, ((0, 0), (0, n_spk - y.shape[1])),
              'constant', constant_values=0)
        for y, n_spk in zip(ys, num_speakers)
    ]
    ts = [
        F.cast(
            F.pad(F.cast(t, 'f'), ((0, 0), (0, n_spk - t.shape[1])),
                  'constant', constant_values=0),
            'i').array
        for t, n_spk in zip(ts, num_speakers)
    ]
    return ys, ts
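# Quick usage sketch for align_speaker (toy shapes, assuming the function is
# importable as defined above): predictions and labels that disagree on the
# speaker dimension are zero-padded to a common width per utterance.
import numpy as np

ys = [np.zeros((5, 2), dtype=np.float32), np.zeros((3, 3), dtype=np.float32)]
ts = [np.ones((5, 3), dtype=np.int32), np.ones((3, 2), dtype=np.int32)]
ys_al, ts_al = align_speaker(ys, ts)
print([y.shape for y in ys_al])  # [(5, 3), (3, 3)]
print([t.shape for t in ts_al])  # [(5, 3), (3, 3)]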
def __call__(self, x, t):
    x = chainer.Variable(self.xp.asarray(x))
    t = chainer.Variable(self.xp.asarray(t))
    batchsize = x.data.shape[0]
    self.reset_state()

    # initial l
    l = np.random.uniform(-1, 1, size=(batchsize, 2)).astype(np.float32)
    l = chainer.Variable(self.xp.asarray(l))
    sum_ln_pi = Variable(self.xp.zeros((batchsize, 1)))
    sum_ln_pi = F.cast(sum_ln_pi, 'float32')
    l, ln_pi, y, b = self.forward(l.shape[0], x, l, first=True)
    for i in range(1, self.n_steps):
        l, ln_pi, y, b = self.forward(l.shape[0], x, l)
        ln_pi = F.cast(ln_pi, 'float32')
        sum_ln_pi += ln_pi

    y = y + 0.00000001
    self.loss_action = F.softmax_cross_entropy(y, t)
    self.loss = self.loss_action
    self.accuracy = F.accuracy(y, t)
    reporter.report({'accuracy': self.accuracy}, self)
    reporter.report({'cross_entropy_loss': self.loss}, self)
    self.y = F.argmax(y, axis=1)

    if chainer.global_config.train:
        conditions = self.xp.argmax(y.data, axis=1) == t.data
        r = self.xp.where(conditions, 1., 0.).astype(self.xp.float32)
        r = self.xp.expand_dims(r, 1)
        # squared error between reward and baseline
        self.loss_baseline = F.mean_squared_error(r, b)
        self.loss += self.loss_baseline
        # loss with REINFORCE rule
        mean_ln_pi = sum_ln_pi / (self.n_steps - 1)
        self.reinforce_loss = F.sum(-mean_ln_pi * (r - b)) / batchsize
        self.loss += self.reinforce_loss
        reporter.report({'cross_entropy_loss': self.loss_action}, self)
        reporter.report({'train_accuracy': self.accuracy}, self)
    return self.loss
def compute_F(self, in_data, out_grad_data):
    x = in_data[0]
    gy = out_grad_data[0]
    ndim = len(x.shape)
    if ndim not in (2, 4):
        raise RuntimeError(
            'len(x.shape) must be 2 or 4, not {}.'.format(ndim))
    xp = cuda.get_array_module(x)
    n = x.shape[0]
    gy_scale = n
    if self._loss_scale is not None:
        gy_scale *= 1.0 / self._loss_scale
    # Re-compute BN forward with gamma=1 and beta=0
    avg_mean = self.link.avg_mean
    _gamma = xp.ones(avg_mean.shape, dtype=x.dtype)
    _beta = xp.zeros(avg_mean.shape, dtype=x.dtype)
    h = batch_normalization(x, _gamma, _beta, eps=self.link.eps).data
    if ndim == 2:
        gy = gy_scale * gy
        gyh = gy * h
    elif ndim == 4:
        # data layout of gy: NCHW
        h = h.transpose(0, 2, 3, 1)
        gy = gy.transpose(0, 2, 3, 1)
        # data layout of gy: NHWC
        gy = gy * gy_scale  # copy
        gyh = gy * h
        gyh = gyh.sum(axis=(1, 2))
        gy = gy.sum(axis=(1, 2))
        # data layout of gy: NC
    if self.link.beta is None:
        grad = gyh
    elif self.link.gamma is None:
        grad = gy
    else:
        grad = xp.hstack((gyh, gy))
    if self.diagonalize:
        if grad.dtype == xp.float16:
            grad = cast(grad, xp.float32).data
        F = xp.diag((grad * grad).mean(axis=0))
    else:
        F_scale = 1 / n
        if grad.dtype == xp.float16:
            grad = cast(grad, xp.float32).data
        F = grad.T.dot(grad) * F_scale
    return F
def test_forward_no_cast_grad(self):
    # This test would fail if F.cast does not create new function nodes
    # for no-op casts
    x = chainer.Variable(self.x)
    y1 = functions.cast(x, self.dtype)
    y2 = functions.cast(x, self.dtype)
    z = y1 + y2
    gy1, gy2 = chainer.grad([z], [y1, y2], [numpy.ones_like(z.data)])
    assert gy1.dtype == self.dtype
    assert gy2.dtype == self.dtype
    numpy.testing.assert_array_equal(gy1.data, numpy.ones_like(y1.data))
    numpy.testing.assert_array_equal(gy2.data, numpy.ones_like(y2.data))
def bger(x, y):
    """Batch outer product.

    :param x: batch of vectors, shape (B, N)
    :param y: batch of vectors, shape (B, M)
    :return: batch of outer products, shape (B, N, M)
    """
    if x.dtype == 'int' and y.dtype == 'int':
        # integer inputs are routed through float32 for the matmul
        x_float = F.cast(x, 'float32')
        y_float = F.cast(y, 'float32')
        res_float = F.expand_dims(x_float, 2) @ F.expand_dims(y_float, 1)
        return F.cast(res_float, 'int')
    return F.expand_dims(x, 2) @ F.expand_dims(y, 1)
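# Usage sketch for bger (hypothetical toy inputs): (B, N) x (B, M) -> (B, N, M).
import numpy as np

x = np.arange(6, dtype=np.float32).reshape(2, 3)
y = np.ones((2, 4), dtype=np.float32)
print(bger(x, y).shape)  # (2, 3, 4)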
def forward_chainer(self, inputs):
    if len(inputs) == 2:
        (x, w), b = inputs, None
    else:
        x, w, b = inputs
    if x.dtype.kind != 'f':
        x = F.cast(x, 'float64')
    if w.dtype.kind != 'f':
        w = F.cast(w, 'float64')
    if b is not None and b.dtype.kind != 'f':
        b = F.cast(b, 'float64')
    y = F.convolution_nd(x, w, b, self.stride, self.pad, self.cover_all)
    y = F.cast(y, self.out_dtype)
    return y,
def forward(self, x):
    """Normalize input and scale it.

    Args:
        x (chainer.Variable): A variable holding a 4-dimensional array.
            Its :obj:`dtype` is :obj:`numpy.float32`.

    Returns:
        chainer.Variable:
            The shape and :obj:`dtype` are same as those of input.
    """
    x = F.cast(x, 'float32')
    x = F.normalize(x, eps=self.eps, axis=1)
    scale = F.broadcast_to(self.scale[:, np.newaxis, np.newaxis], x.shape)
    return F.cast(x * scale, chainer.get_dtype())
def __call__(self, x):
    x = F.cast(x, "int")
    if self.id_trans_fn is None:
        words = self.embed(x)
    else:
        words = self.embed(self.id_trans_fn(x))
    return words
def predict(self, imgs):
    """Conduct semantic segmentations from images.

    Args:
        imgs (iterable of numpy.ndarray): Arrays holding images.
            All images are in CHW and RGB format and the range of their
            values are :math:`[0, 255]`.

    Returns:
        list of numpy.ndarray:
            List of integer labels predicted from each image in the input
            list.
    """
    labels = []
    for img in imgs:
        C, H, W = img.shape
        with chainer.using_config('train', False), \
                chainer.function.no_backprop_mode():
            x = F.cast(self.xp.asarray(img[np.newaxis]), self.dtype)
            score = self.forward(x)[0].array.astype(np.float32)
        score = chainer.backends.cuda.to_cpu(score)
        if score.shape != (C, H, W):
            dtype = score.dtype
            score = resize(score, (H, W)).astype(dtype)
        label = np.argmax(score, axis=0).astype(np.int32)
        labels.append(label)
    return labels
def _forward(self, x):
    h = F.cast(x, self.dtype)
    h = self.feature_layer(h)
    # self.last_activation = F.sigmoid
    h = self.feature_layer.last_activation(self.lastconv(h))
    # x.shape, h.shape = (None, 3, 224, 224), (None, 440, 7, 7)
    return h
def compute_features(self, obs):
    obs = F.cast(obs, np.float32)
    obs = F.transpose(obs, (0, 3, 1, 2))  # NHWC -> NCHW
    h1 = F.relu(self.conv1(obs))
    h2 = F.relu(self.conv2(h1))
    h3 = F.relu(self.fc(h2))
    return h3
def recalculate_bn_statistics(model, batchsize, dtype='float32'):
    print('==> Recalculating BN statistics (batchsize={}) ...'.format(
        batchsize))
    train = CamVidDataset(split='train')
    it = chainer.iterators.SerialIterator(train, batchsize,
                                          repeat=False, shuffle=False)
    bn_avg_mean = defaultdict(np.float32)
    bn_avg_var = defaultdict(np.float32)

    if dtype == 'mixed16':
        dtype = 'float16'

    n_iter = 0
    for batch in it:
        imgs, _ = concat_examples(batch)
        model(F.cast(model.xp.array(imgs), dtype))
        for name, link in model.namedlinks():
            if name.endswith('_bn'):
                bn_avg_mean[name] += link.avg_mean
                bn_avg_var[name] += link.avg_var
        n_iter += 1

    for name, link in model.namedlinks():
        if name.endswith('_bn'):
            link.avg_mean = bn_avg_mean[name] / n_iter
            link.avg_var = bn_avg_var[name] / n_iter
    return model
def _decode_multiple(self, s, z=None, decode_num=10):
    if z is None:
        xp = chainer.backend.get_array_module(s)
        z = chainer.Variable(
            xp.random.normal(
                0, 1, size=(s.shape[0], decode_num, self._latent_dim)))
        z = F.cast(z, typ=xp.float32)
        z = F.clip(z, -0.5, 0.5)
    s = F.expand_dims(s, axis=0)
    s = F.repeat(s, repeats=decode_num, axis=0)
    s = F.transpose(s, axes=(1, 0, 2))
    x = F.concat((s, z), axis=2)
    x = F.reshape(x, shape=(-1, x.shape[-1]))
    h = self._linear3(x)
    h = F.relu(h)
    h = self._linear4(h)
    h = F.relu(h)
    h = self._linear5(h)
    h = F.reshape(h, shape=(-1, decode_num, h.shape[-1]))
    return F.tanh(h), h
def __call__(self, x):
    heatmap = x
    vector_dim = 2
    batch = heatmap.shape[0]
    channels = heatmap.shape[1]
    in_size = x.shape[2:]
    heatmap_vector = F.reshape(heatmap, shape=(batch, channels, -1))
    indices = F.cast(
        F.expand_dims(F.argmax(heatmap_vector, axis=vector_dim),
                      axis=vector_dim),
        np.float32)
    scores = F.max(heatmap_vector, axis=vector_dim, keepdims=True)
    scores_mask = (scores.array > 0.0).astype(np.float32)
    pts_x = (indices.array % in_size[1]) * scores_mask
    pts_y = (indices.array // in_size[1]) * scores_mask
    pts = F.concat((pts_x, pts_y, scores), axis=vector_dim).array
    for b in range(batch):
        for k in range(channels):
            hm = heatmap[b, k, :, :].array
            px = int(pts_x[b, k])
            py = int(pts_y[b, k])
            if (0 < px < in_size[1] - 1) and (0 < py < in_size[0] - 1):
                pts[b, k, 0] += np.sign(hm[py, px + 1] - hm[py, px - 1]) * 0.25
                pts[b, k, 1] += np.sign(hm[py + 1, px] - hm[py - 1, px]) * 0.25
    return pts
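# Toy illustration (not from the original code) of the quarter-pixel
# refinement in the loop above: the argmax location is nudged by 0.25 toward
# the larger neighbouring heatmap value.
import numpy as np

hm = np.array([[0.0, 0.2, 0.0],
               [0.1, 1.0, 0.6],
               [0.0, 0.3, 0.0]], dtype=np.float32)
py, px = np.unravel_index(np.argmax(hm), hm.shape)
x = px + np.sign(hm[py, px + 1] - hm[py, px - 1]) * 0.25
y = py + np.sign(hm[py + 1, px] - hm[py - 1, px]) * 0.25
print(x, y)  # 1.25 1.25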
def test_forward_no_cast_variable(self):
    # If backprop is disabled, it's safe to simply return the input
    # variable for no-op casts.
    x = chainer.Variable(self.x)
    with chainer.using_config('enable_backprop', False):
        y = functions.cast(x, self.dtype)
    assert y is x
def intersect_DIFF(self, ro, rd, t0, t1):
    xp = chainer.backend.get_array_module(ro)
    BB, _, H, W = ro.shape[:4]
    p0 = F.broadcast_to(self.p0.reshape((1, 3, 1, 1)), (BB, 3, H, W))
    p1 = F.broadcast_to(self.p1.reshape((1, 3, 1, 1)), (BB, 3, H, W))
    p2 = F.broadcast_to(self.p2.reshape((1, 3, 1, 1)), (BB, 3, H, W))
    fn = F.broadcast_to(self.fn.reshape((1, 3, 1, 1)), (BB, 3, H, W))
    face_id = F.broadcast_to(self.id, (BB, 1, H, W))
    eps = F.broadcast_to(self.eps.reshape((1, 1, 1, 1)), (BB, 1, H, W))

    # ray-plane intersection: t = dot(p0 - ro, fn) / dot(rd, fn)
    aa = p0 - ro
    A = vdot(aa, fn)
    B = vdot(rd, fn)
    B = F.where(xp.abs(B.data) < eps.data, eps, B)
    tx = A / B
    p = ro + tx * rd
    # inside-triangle test via consistent orientation of the three edges
    n01 = vcross(p0 - p, p1 - p)
    n12 = vcross(p1 - p, p2 - p)
    n20 = vcross(p2 - p, p0 - p)
    MASK_P = is_positive(vdot(n01, n12))
    MASK_Q = is_positive(vdot(n12, n20))
    MASK_R = is_positive(vdot(n20, n01))
    MASK_B = is_positive(F.absolute(B))
    MASK_T0 = is_positive(tx - t0)
    MASK_T1 = is_positive(t1 - tx)
    b = F.cast(MASK_P * MASK_Q * MASK_R * MASK_B * MASK_T0 * MASK_T1, 'bool')
    t = F.where(b, tx, t1)
    p = ro + t * rd
    bn = F.cast(is_positive(vdot(rd, fn)), 'bool')
    n = F.where(bn, -fn, fn)
    return {'b': b, 't': t, 'p': p, 'n': n, 'face_id': face_id}
def logli(self, a):
    a = F.cast(a, np.float32)
    # transform back to standard normal
    zs = (a - self.means) * F.exp(-self.log_stds)
    # log-density of the diagonal Gaussian N(means, diag(exp(log_stds))^2):
    # log f(a) = -sum(log_stds) - 0.5 * |z|^2 - 0.5 * n * log(2*pi),
    # where z = (a - means) / exp(log_stds) is standard normal
    return - F.sum(self.log_stds, axis=-1) - \
        0.5 * F.sum(F.square(zs), axis=-1) - \
        0.5 * self.means.shape[-1] * np.log(2 * np.pi)
def __call__(self, x):
    h = F.cast(x, np.float16)
    h = F.relu(self.bn2(self.conv1(h)))
    h = self.res3(h)
    h = self.res4(h)
    h = self.res5(h)
    h = self.res6(h)
    h = F.average_pooling_2d(h, h.shape[2:])
    h = self.fc7(h)
    return h
def backward(self, indexes, grad_outputs):
    """The gradient for the output will be scaled."""
    x, W = self.get_retained_inputs()
    gy, = grad_outputs
    s_gy, u_gy = self.ada_loss.loss_scaling(gy, W)

    # Actual gradient calculation
    ret = []
    with chainer.using_config('use_ideep', self._config_use_ideep):
        if 0 in indexes:
            gx, = linear.LinearGradData().apply((W, s_gy))
            ret.append(F.cast(gx, x.dtype))
        if 1 in indexes:
            gW, = linear.LinearGradWeight(W.dtype).apply((x, u_gy))
            ret.append(F.cast(gW, W.dtype))
        if 2 in indexes:
            gb = chainer.functions.sum(u_gy, axis=0)
            ret.append(gb)
    return ret
def sampling(self, dist: ArrayLike, maximum=True):
    xp = self.xp
    if maximum:
        sampled = xp.argmax(F.softmax(dist, axis=1).data, axis=1)
    else:
        # Gumbel-max trick: argmax(log p + Gumbel noise) samples from p
        dist = F.cast(dist, xp.float64)
        prob = F.softmax(dist, axis=1).data
        sampled = xp.argmax(xp.log(prob) + xp.random.gumbel(size=prob.shape),
                            axis=1)
    return sampled
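# Standalone check (toy numbers) of the Gumbel-max trick used in the
# non-maximum branch above: argmax(log p + Gumbel noise) draws samples
# distributed according to p.
import numpy as np

rng = np.random.RandomState(0)
logits = np.array([2.0, 1.0, 0.1])
prob = np.exp(logits) / np.exp(logits).sum()

counts = np.zeros_like(prob)
for _ in range(10000):
    g = rng.gumbel(size=prob.shape)
    counts[np.argmax(np.log(prob) + g)] += 1
print(prob, counts / counts.sum())  # the two should roughly agree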
def __call__(self, p, p_mask=None):
    xp.cuda.Device(self._device_id).use()
    p_len = p.shape[1]
    p_aug_i = F.tile(F.expand_dims(p, 2), (1, 1, p_len, 1))
    p_aug_j = F.tile(F.expand_dims(p, 1), (1, p_len, 1, 1))

    if p_mask is None:
        self_mask = None
    else:
        p_mask_aug_i = F.tile(F.expand_dims(p_mask, 2), (1, 1, p_len, 1))
        p_mask_aug_i = xp.any(F.cast(p_mask_aug_i, 'bool').data, axis=3)
        p_mask_aug_j = F.tile(F.expand_dims(p_mask, 1), (1, p_len, 1, 1))
        p_mask_aug_j = xp.any(F.cast(p_mask_aug_j, 'bool').data, axis=3)
        self_mask = p_mask_aug_i & p_mask_aug_j

    h_logits = get_logits(self.logits_linear, [p_aug_i, p_aug_j],
                          self_mask)  # -> (N, 48, 48)
    self_att = softsel(p_aug_j, h_logits)  # -> (N, 48, 448)
    out = self.fuse_gate(p, self_att)
    return out
def compute_kfgrads(self, W, b, invs):
    xp = cuda.get_array_module(W.data)
    A_inv, G_inv = invs
    grad = W.grad
    if b is not None:
        grad = xp.column_stack([grad, b.grad])
    out_dtype = grad.dtype
    if A_inv.dtype != grad.dtype:
        grad = cast(grad, A_inv.dtype).data
    kfgrad = xp.dot(xp.dot(G_inv, grad), A_inv)
    return kfgrad.astype(out_dtype)
def compute_A(self, in_data):
    x = in_data[0]
    xp = cuda.get_array_module(x)
    n, _ = x.shape
    if self.link.b is not None:
        # append a ones column so the bias is covered by the same factor
        ones = xp.ones(n, dtype=x.dtype)
        x = xp.column_stack((x, ones))
    if x.dtype == xp.float16:
        x = cast(x, xp.float32).data
    A = (x * x).mean(axis=0) if self.diagonalize else x.T.dot(x) * (1 / n)
    return A
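# Toy check (assuming the same conventions as compute_A above) of the K-FAC
# activation factor: a ones column is appended for the bias and the factor is
# the uncentered second moment x^T x / n.
import numpy as np

n = 6
x = np.random.randn(n, 3).astype(np.float32)
x_aug = np.column_stack((x, np.ones(n, dtype=np.float32)))
A = x_aug.T.dot(x_aug) * (1 / n)  # matches the non-diagonal branch
print(A.shape)  # (4, 4)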
def backward(self, indexes, grad_outputs):
    """The gradient for the output will be scaled."""
    x, W = self.get_retained_inputs()
    gy, = grad_outputs
    gy_, prev_scale = self.ada_loss.loss_scaling(gy, W)

    ret = []
    with chainer.using_config('use_ideep', self._config_use_ideep):
        if 0 in indexes:
            gx, = linear.LinearGradData().apply((W, gy_))
            self.ada_loss.set_loss_scale(
                gx, self.ada_loss.grad_loss_scale(gy_))
            ret.append(F.cast(gx, x.dtype))
        if 1 in indexes:
            gW, = linear.LinearGradWeight(W.dtype).apply((x, gy))
            gW_ = self.ada_loss.get_unscaled_gradient(gW, prev_scale)
            ret.append(F.cast(gW_, W.dtype))
        if 2 in indexes:
            gb = chainer.functions.sum(gy, axis=0)
            gb_ = self.ada_loss.get_unscaled_gradient(gb, prev_scale)
            ret.append(gb_)
    return ret
def test_forward_no_cast_variable(self):
    x = chainer.Variable(self.x)
    y = functions.cast(x, self.dtype)
    self.assertIs(y, x)
def test_forward_no_cast_array(self):
    y = functions.cast(self.x, self.dtype)
    self.assertIsInstance(y, chainer.Variable)
    self.assertIs(y.data, self.x)
def check_forward_no_cast(self, x_data):
    y = functions.cast(x_data, self.dtype)
    self.assertIsInstance(y, chainer.Variable)
    self.assertIs(y.data, x_data)
def func(x):
    return functions.cast(x, self.out_type)
def check_forward(self, x_data):
    x = chainer.Variable(x_data)
    y = functions.cast(x, self.out_type)
    self.assertEqual(y.data.shape, x.data.shape)
    self.assertEqual(y.data.dtype, self.out_type)
def check_forward(self, x_data):
    x = chainer.Variable(x_data)
    y = functions.cast(x, self.out_type)
    assert y.data.shape == x.data.shape
    assert y.data.dtype == self.out_type
def compute_features(self, obs):
    obs = F.cast(obs, np.float32)
    h = obs
    for link in self.feature_links().values():
        h = self.hidden_nonlinearity(link(h))
    return h
def __call__(self, x, t):
    return Alex.__call__(self, F.cast(x, self.dtype), t)
def __call__(self, from_tensor, to_tensor, attention_mask=None,
             do_return_2d_tensor=False):
    """
    Args:
        from_tensor: float Tensor of shape
            [batch_size, from_seq_length, from_width].
        to_tensor: float Tensor of shape
            [batch_size, to_seq_length, to_width].
        attention_mask: (optional) int32 Tensor of shape
            [batch_size, from_seq_length, to_seq_length]. The values should
            be 1 or 0. The attention scores will effectively be set to
            -infinity for any positions in the mask that are 0, and will be
            unchanged for positions that are 1.
        do_return_2d_tensor: bool. If True, the output will be of shape
            [batch_size * from_seq_length,
            num_attention_heads * size_per_head]. If False, the output will
            be of shape
            [batch_size, from_seq_length,
            num_attention_heads * size_per_head].

    Returns:
        float Tensor of shape [batch_size, from_seq_length,
        num_attention_heads * size_per_head]. (If `do_return_2d_tensor` is
        true, this will be of shape [batch_size * from_seq_length,
        num_attention_heads * size_per_head]).
    """
    def transpose_for_scores(input_tensor, batch_size, num_attention_heads,
                             seq_length, width):
        output_tensor = F.reshape(
            input_tensor,
            (batch_size, seq_length, num_attention_heads, width))
        output_tensor = F.transpose(output_tensor, [0, 2, 1, 3])
        return output_tensor

    from_shape = from_tensor.shape
    to_shape = to_tensor.shape
    if len(from_shape) != len(to_shape):
        raise ValueError(
            "The rank of `from_tensor` must match the rank of `to_tensor`.")

    if len(from_shape) == 3:
        batch_size = from_shape[0]
        from_seq_length = from_shape[1]
        to_seq_length = to_shape[1]
    elif len(from_shape) == 2:
        # TODO right?
        assert attention_mask is not None
        batch_size = attention_mask.shape[0]
        from_seq_length = attention_mask.shape[1]
        to_seq_length = attention_mask.shape[2]
        if (batch_size is None or from_seq_length is None
                or to_seq_length is None):
            raise ValueError(
                "When passing in rank 2 tensors to attention_layer, the "
                "values for `batch_size`, `from_seq_length`, and "
                "`to_seq_length` must all be specified.")

    # Scalar dimensions referenced here:
    #   B = batch size (number of sequences)
    #   F = `from_tensor` sequence length
    #   T = `to_tensor` sequence length
    #   N = `num_attention_heads`
    #   H = `size_per_head`
    from_tensor_2d = reshape_to_matrix(from_tensor)
    to_tensor_2d = reshape_to_matrix(to_tensor)

    # `query_layer` = [B*F, N*H]
    query_layer = self.query(from_tensor_2d)
    # `key_layer` = [B*T, N*H]
    key_layer = self.key(to_tensor_2d)
    # `value_layer` = [B*T, N*H]
    value_layer = self.value(to_tensor_2d)

    # `query_layer` = [B, N, F, H]
    query_layer = transpose_for_scores(
        query_layer, batch_size, self.num_attention_heads,
        from_seq_length, self.size_per_head)
    # `key_layer` = [B, N, T, H]
    key_layer = transpose_for_scores(
        key_layer, batch_size, self.num_attention_heads,
        to_seq_length, self.size_per_head)

    # Take the dot product between "query" and "key" to get the raw
    # attention scores.
    # `attention_scores` = [B, N, F, T]
    attention_scores = F.matmul(query_layer, key_layer, transb=True)
    attention_scores *= 1.0 / np.sqrt(self.size_per_head)

    if attention_mask is not None:
        # `attention_mask` = [B, 1, F, T]
        attention_mask = attention_mask[:, None]
        # Since attention_mask is 1.0 for positions we want to attend and
        # 0.0 for masked positions, this operation will create a tensor
        # which is 0.0 for positions we want to attend and -10000.0 for
        # masked positions.
        adder = F.cast(1.0 - attention_mask, 'float32') * -10000.0
        # Since we are adding it to the raw scores before the softmax, this
        # is effectively the same as removing these entirely.
        attention_scores += F.broadcast_to(adder, attention_scores.shape)

    # Normalize the attention scores to probabilities.
    # `attention_probs` = [B, N, F, T]
    # (the default softmax axis is -1 in tf but 1 in chainer)
    attention_probs = F.softmax(attention_scores, axis=3)

    # This is actually dropping out entire tokens to attend to, which might
    # seem a bit unusual, but is taken from the original Transformer paper.
    attention_probs = F.dropout(
        attention_probs, self.attention_probs_dropout_prob)

    # `value_layer` = [B, T, N, H]
    value_layer = F.reshape(
        value_layer,
        [batch_size, to_seq_length, self.num_attention_heads,
         self.size_per_head])
    # `value_layer` = [B, N, T, H]
    value_layer = F.transpose(value_layer, [0, 2, 1, 3])

    # `context_layer` = [B, N, F, H]
    context_layer = F.matmul(attention_probs, value_layer)  # right?
    # `context_layer` = [B, F, N, H]
    context_layer = F.transpose(context_layer, [0, 2, 1, 3])

    if do_return_2d_tensor:
        # `context_layer` = [B*F, N*H]
        context_layer = F.reshape(
            context_layer,
            [batch_size * from_seq_length,
             self.num_attention_heads * self.size_per_head])  # right?
    else:
        # `context_layer` = [B, F, N*H]
        context_layer = F.reshape(
            context_layer,
            [batch_size, from_seq_length,
             self.num_attention_heads * self.size_per_head])
    return context_layer
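# Shape walk-through (toy sizes, not from the original code) of the
# transpose_for_scores reshaping above: a [B*S, N*H] projection becomes the
# [B, N, S, H] layout that F.matmul expects for per-head attention scores.
import numpy as np
import chainer.functions as F

B, S, N, H = 2, 5, 4, 8
proj = np.zeros((B * S, N * H), dtype=np.float32)  # e.g. output of self.query
heads = F.transpose(F.reshape(proj, (B, S, N, H)), (0, 2, 1, 3))
print(heads.shape)  # (2, 4, 5, 8)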
def forward(self, x, t):
    return GoogLeNetBN.forward(self, F.cast(x, self.dtype), t)
def forward(self, x, t):
    return Alex.forward(self, F.cast(x, self.dtype), t)
def forward(self, inputs, device):
    x, = inputs
    function = getattr(functions, self.function_name)
    y = function(x, axis=self.axis)
    y = functions.cast(y, numpy.int64)
    return y,
def forward(self, inputs, device):
    p, x, y = inputs
    ret = functions.linear_interpolate(p, x, y)
    ret = functions.cast(ret, numpy.float64)
    return ret,
def __call__(self, x, t):
    return GoogLeNetBN.__call__(self, F.cast(x, self.dtype), t)
def check_forward_no_cast(self, x_data):
    y = functions.cast(x_data, self.dtype)
    assert isinstance(y, chainer.Variable)
    assert y.data is x_data
def test_forward_no_cast_array(self):
    y = functions.cast(self.x, self.dtype)
    assert isinstance(y, chainer.Variable)
    assert y.data is self.x