def _encode(self, x_list):
    batch_size = len(x_list[0])
    source_length = len(x_list)
    # Encoding
    fc = bc = f = b = _zeros((batch_size, self.hidden_size))
    i_list = [self.x_i(_mkivar(x)) for x in x_list]
    f_list = []
    b_list = []
    for i in i_list:
        fc, f = F.lstm(fc, self.i_f(i) + self.f_f(f))
        f_list.append(f)
    for i in reversed(i_list):
        bc, b = F.lstm(bc, self.i_b(i) + self.b_b(b))
        b_list.append(b)
    b_list.reverse()
    # Making the concatenated matrix
    # {f,b}_mat: shape = [batch, srclen, hidden]
    f_mat = F.concat([F.expand_dims(f, 1) for f in f_list], 1)
    b_mat = F.concat([F.expand_dims(b, 1) for b in b_list], 1)
    # fb_mat: shape = [batch, srclen, 2 * hidden]
    fb_mat = F.concat([f_mat, b_mat], 2)
    # fbe_mat: shape = [batch * srclen, atten]
    fbe_mat = self.fb_e(
        F.reshape(fb_mat, [batch_size * source_length, 2 * self.hidden_size]))
    return fb_mat, fbe_mat, fc, bc, f_list[-1], b_list[0]
def clipped_loss(x, t):
    diff = x - t
    abs_loss = abs(diff)
    squared_loss = diff ** 2
    abs_loss = F.expand_dims(abs_loss, 1)
    squared_loss = F.expand_dims(squared_loss, 1)
    return F.sum(F.min(F.concat((abs_loss, squared_loss), axis=1), axis=1))
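# A toy check of clipped_loss (a sketch; assumes Chainer/NumPy imported as
# below and made-up values). Taking the elementwise minimum of |d| and d**2
# keeps the quadratic penalty near zero and linear growth for |d| > 1,
# similar in spirit to a Huber loss.
import numpy as np
import chainer.functions as F

x = np.array([0.5, 2.0, -3.0], dtype=np.float32)
t = np.zeros(3, dtype=np.float32)
# per-element min(|d|, d**2) = [0.25, 2.0, 3.0]; summed -> 5.25
print(clipped_loss(x, t).array)  # 5.25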
def __call__(self, x):
    input_shape = x.shape
    x = F.average_pooling_2d(x, ksize=x.shape[2:])
    x = F.reshape(x, shape=(x.shape[0], -1))
    x = self.init_fc(x)
    x = self.main_fcs(x)
    x = self.final_fc(x)
    x = F.broadcast_to(
        F.expand_dims(F.expand_dims(x, axis=2), axis=3), input_shape)
    return x
def attention_layer(self, features, features_proj, Xp):
    h = F.expand_dims(self.w_att(Xp), 1)
    features_proj = F.normalize(features_proj, axis=-1)
    h = F.normalize(h, axis=-1)
    # (N, self.D, self.C) + (N, 1, self.C)
    h_att = F.relu(features_proj + F.broadcast_to(h, features_proj.shape))
    # (N x self.D, self.C) -> (N x self.D, 1)
    out_att = self.w(F.reshape(h_att, (-1, self.C)))
    out_att = F.reshape(out_att, (-1, self.D))  # (N, self.D)
    alpha = F.softmax(out_att)  # (N, self.D)
    # (N, self.C, self.D) * (N, 1, self.D)
    context = F.sum(
        features * F.broadcast_to(F.expand_dims(alpha, 1), features.shape),
        axis=2)
    return context, alpha
def __call__(self, x, t, index):
    h = self.predict(x)
    self.history = np.append(
        self.history, np.array([np.mean(h.data, axis=0)]), axis=0)
    h = F.select_item(h, index)  # choose the action[index] in each column
    error_abs = abs(h - t)
    # error_abs > 1 <=> error_abs ** 2 > error_abs, and
    # error_abs < 1 <=> error_abs ** 2 < error_abs, so taking the minimum
    # clips the squared loss to linear growth for large errors.
    error = F.concat(
        (F.expand_dims(error_abs ** 2, 1), F.expand_dims(error_abs, 1)),
        axis=1)
    self.loss = F.sum(F.min(error, axis=1)) / np.float32(len(error_abs))
    return self.loss
def __call__(self, x):
    w = F.average_pooling_2d(x, ksize=x.shape[2:])
    w = self.fc1(w)
    if self.use_conv2:
        w = self.activ(w)
        w = self.fc2(w)
    w = self.sigmoid(w)
    w = F.broadcast_to(
        F.expand_dims(F.expand_dims(w, axis=2), axis=3), x.shape)
    x = x * w
    return x
def pool(self, WX, skip_mask=None):
    Z, F, O, I = None, None, None, None

    # f-pooling
    if len(self._pooling) == 1:
        assert len(WX) == 2
        Z, F = WX
        Z = functions.tanh(Z)
        F = self.zoneout(F)

    # fo-pooling
    if len(self._pooling) == 2:
        assert len(WX) == 3
        Z, F, O = WX
        Z = functions.tanh(Z)
        F = self.zoneout(F)
        O = functions.sigmoid(O)

    # ifo-pooling
    if len(self._pooling) == 3:
        assert len(WX) == 4
        Z, F, O, I = WX
        Z = functions.tanh(Z)
        F = self.zoneout(F)
        O = functions.sigmoid(O)
        I = functions.sigmoid(I)

    assert Z is not None
    assert F is not None

    T = Z.shape[2]
    for t in range(T):
        zt = Z[:, :, t]
        ft = F[:, :, t]
        ot = 1 if O is None else O[:, :, t]
        it = 1 - ft if I is None else I[:, :, t]
        # will be used for seq2seq to skip PAD
        xt = 1 if skip_mask is None else skip_mask[:, t, None]

        if self.ct is None:
            self.ct = (1 - ft) * zt * xt
        else:
            self.ct = ft * self.ct + it * zt * xt
        self.ht = self.ct if O is None else ot * self.ct

        if self.H is None:
            self.H = functions.expand_dims(self.ht, 2)
        else:
            self.H = functions.concat(
                (self.H, functions.expand_dims(self.ht, 2)), axis=2)

    if self._test:
        self.H.unchain_backward()

    return self.H
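# A pure-NumPy sketch of the f-pooling recurrence above,
# c_t = f_t * c_{t-1} + (1 - f_t) * z_t, with made-up single-channel
# values and the O/I gates omitted.
import numpy as np

Z = np.array([[1.0, 2.0, 3.0]])   # (batch, time)
Fg = np.array([[0.5, 0.5, 0.5]])  # forget gates
c = np.zeros(1)
for t in range(Z.shape[1]):
    c = Fg[:, t] * c + (1 - Fg[:, t]) * Z[:, t]
print(c)  # [2.125]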
def __call__(self, x, enc_out=None, mask=None):
    """
    args
        x: paralleled main features in the model
            Variable in (batch, hidden_dim, length)
        enc_out: hidden features from the encoder
            Variable in (batch, hidden_dim, length)
        mask: padding-mask or future-mask
            xp-array in (batch, length, length)
            an element takes 'False' when pad/future, otherwise 'True'
    returns
    """
    # ksize-1 convolution results in parallel linear projections
    if self.self_attention:
        qkv = F.squeeze(self.W(F.expand_dims(x, axis=3)), axis=3)
        query, key, value = F.split_axis(qkv, 3, axis=1)
    else:
        query = F.squeeze(self.W_Q(F.expand_dims(x, axis=3)), axis=3)
        kv = F.squeeze(self.W_KV(F.expand_dims(enc_out, axis=3)), axis=3)
        key, value = F.split_axis(kv, 2, axis=1)

    # make q, k, v into (batch * parallel, dim / parallel, length) shape
    query = F.concat(F.split_axis(query, self.parallel_num, axis=1), axis=0)
    key = F.concat(F.split_axis(key, self.parallel_num, axis=1), axis=0)
    value = F.concat(F.split_axis(value, self.parallel_num, axis=1), axis=0)
    mask = self.xp.concatenate([mask] * self.parallel_num, axis=0)

    attention_weight = F.batch_matmul(query, key, transa=True) * self.scale
    attention_weight = F.where(
        mask, attention_weight,
        self.xp.full(attention_weight.shape, -np.inf, dtype=np.float32))
    attention_weight = F.softmax(attention_weight, axis=2)
    attention_weight = F.dropout(attention_weight, self.dropout_rate)
    attention_weight = F.where(
        self.xp.isnan(attention_weight.data),
        self.xp.full(attention_weight.shape, 0, dtype=np.float32),
        attention_weight)
    self.attention_weight = copy.deepcopy(attention_weight.data)

    # attention: (batch, q-length, k-length) -> (batch, 1, q-length, k-length)
    # value: (batch, dim/parallel, k-length) -> (batch, dim/parallel, 1, k-length)
    attention_weight, value = F.broadcast(
        attention_weight[:, None], value[:, :, None])
    weighted_sum = F.sum(attention_weight * value, axis=3)
    weighted_sum = F.concat(
        F.split_axis(weighted_sum, self.parallel_num, axis=0), axis=1)
    weighted_sum = F.squeeze(
        self.linear(F.expand_dims(weighted_sum, axis=3)), axis=3)
    return weighted_sum
def kl_div(mu1, lv1, lv2):
    # KL divergence between the given normal and the prior N(0, s2);
    # the prior assumes a zero mean:
    # ln(s2) - ln(s1) + (s1^2 + (u1 - u2)^2) / (2 * s2^2) - 0.5
    if len(lv1.shape) == 2:
        lv1 = F.expand_dims(lv1, 0)
        mu1 = F.expand_dims(mu1, 0)
    lv2 = F.broadcast_to(lv2, lv1.shape)
    v12 = F.exp(lv1) ** 2.0
    v22 = F.exp(lv2) ** 2.0
    return lv2 - lv1 + .5 * v12 / v22 + .5 * mu1 ** 2. / v22 - .5
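# A quick numerical sanity check of kl_div (a sketch with made-up scalars):
# with mu1 = 0 and equal log-sigmas, KL(N(0, 1) || N(0, 1)) must be zero.
import numpy as np
import chainer.functions as F

mu1 = np.zeros((1, 1), dtype=np.float32)
lv1 = np.zeros((1, 1), dtype=np.float32)  # log sigma_1 = 0
lv2 = np.zeros((1, 1), dtype=np.float32)  # log sigma_2 = 0
print(kl_div(mu1, lv1, lv2).array)        # [[[0.]]]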
def get_gcam(self, end_output, activations, shape, label):
    self.cleargrads()
    class_id = self.set_init_grad(end_output, label)
    end_output.backward(retain_grad=True)
    grad = activations.grad_var
    grad = F.average_pooling_2d(grad, (grad.shape[-2], grad.shape[-1]), 1)
    grad = F.expand_dims(
        F.reshape(grad, (grad.shape[0] * grad.shape[1],
                         grad.shape[2], grad.shape[3])), 0)
    weights = activations
    weights = F.expand_dims(
        F.reshape(weights, (weights.shape[0] * weights.shape[1],
                            weights.shape[2], weights.shape[3])), 0)
    gcam = F.resize_images(
        F.relu(F.convolution_2d(weights, grad, None, 1, 0)), shape)
    return gcam, class_id
def calcLoss(G, X, S):
    GXr = G * xp.real(X).astype(np.float32)
    GXi = G * xp.imag(X).astype(np.float32)
    Sr = xp.real(S).astype(np.float32)
    Si = xp.imag(S).astype(np.float32)
    gxL = [F.expand_dims(
        iDGTcf.iDGT(GXr[ii], GXi[ii], windowDG, shiftLenG, fftLenG), axis=0)
        for ii in range(len(G))]
    sL = [F.expand_dims(
        iDGTcf.iDGT(Sr[ii], Si[ii], windowDG, shiftLenG, fftLenG), axis=0)
        for ii in range(len(G))]
    gx = F.vstack(gxL)
    s = F.vstack(sL)
    loss = F.mean_absolute_error(gx, s)
    return loss
def __call__(self, x):
    att1 = F.average_pooling_2d(x, ksize=x.shape[2:])
    att1 = self.mlp(att1)
    att2 = F.max_pooling_2d(x, ksize=x.shape[2:])
    att2 = self.mlp(att2)
    att = att1 + att2
    att = F.sigmoid(att)
    att = F.broadcast_to(
        F.expand_dims(F.expand_dims(att, axis=2), axis=3), x.shape)
    x = x * att
    return x
def __call__(self, X, ht_enc, H_enc, skip_mask=None, test=False):
    self._test = test
    WX = self.W(X)
    Vh = self.V(ht_enc)
    Vh, WX = functions.broadcast(functions.expand_dims(Vh, axis=2), WX)

    # f-pooling
    Z, F, O = functions.split_axis(WX + Vh, 3, axis=1)
    Z = functions.tanh(Z)
    F = self.zoneout(F)
    O = functions.sigmoid(O)
    T = Z.shape[2]

    # compute ungated hidden states
    self.contexts = []
    for t in range(T):
        z = Z[:, :, t]
        f = F[:, :, t]
        if t == 0:
            ct = (1 - f) * z
        else:
            ct = f * self.contexts[-1] + (1 - f) * z
        self.contexts.append(ct)

    if skip_mask is not None:
        assert skip_mask.shape[1] == H_enc.shape[2]
        softmax_getas = (skip_mask == 0) * -1e6

    # compute attention weights (eq.8)
    H_enc = functions.swapaxes(H_enc, 1, 2)
    for t in range(T):
        ct = self.contexts[t]
        geta = 0 if skip_mask is None else softmax_getas[..., None]  # to skip PAD
        mask = 1 if skip_mask is None else skip_mask[..., None]  # to skip PAD
        alpha = functions.batch_matmul(H_enc, ct) + geta
        alpha = functions.softmax(alpha) * mask
        alpha = functions.broadcast_to(alpha, H_enc.shape)  # copy
        kt = functions.sum(alpha * H_enc, axis=1)
        ot = O[:, :, t]
        self.ht = ot * self.o(functions.concat((kt, ct), axis=1))

        if test:
            self.ht.unchain_backward()

        if t == 0:
            self.H = functions.expand_dims(self.ht, 2)
        else:
            self.H = functions.concat(
                (self.H, functions.expand_dims(self.ht, 2)), axis=2)

    return self.H
def __call__(self, xs, ys):
    eos = self.xp.array([EOS], 'i')
    xs = [self.denoiseInput(x[::-1], self.denoising_rate) for x in xs]  # denoising
    ys_d = [self.denoiseInput(y, self.word_dropout) for y in ys]  # word dropout
    ys_in = [F.concat([eos, y], axis=0) for y in ys_d]
    ys_out = [F.concat([y, eos], axis=0) for y in ys]

    # Both xs and ys_in are lists of arrays.
    exs = sequence_embed(self.embed_x, xs)
    eys = sequence_embed(self.embed_y, ys_in)

    batch = len(xs)
    # None represents a zero vector in an encoder.
    hx, at = self.encoder(None, exs)  # layer x batch x n_units
    hx_t = F.transpose(hx, (1, 0, 2))  # batch x layer x n_units
    mu = self.W_mu(hx_t)  # batch x n_latent
    ln_var = self.W_ln_var(hx_t)

    rec_loss = 0
    concat_ys_out = F.concat(ys_out, axis=0)
    for _ in range(self.k):
        z = F.gaussian(mu, ln_var)
        z_e = F.expand_dims(z, 2)  # batch x n_latent x 1
        Wz = self.W_h(z_e)  # batch x (layer x unit)
        hys = F.split_axis(Wz, self.n_layers, 1)  # layer x batch x unit
        c_hy = F.concat(
            [F.expand_dims(hy, 0) for hy in hys], 0)  # layer x batch x unit
        _, os = self.decoder(c_hy, eys)
        concat_os = F.concat(os, axis=0)
        rec_loss += F.sum(
            F.softmax_cross_entropy(
                self.W(concat_os), concat_ys_out,
                reduce='no')) / (self.k * batch)

    latent_loss = self.C * F.gaussian_kl_divergence(mu, ln_var) / batch
    loss = rec_loss + latent_loss
    chainer.report({'loss': loss.data}, self)
    n_words = concat_ys_out.shape[0]
    perp = self.xp.exp(loss.data * batch / n_words)
    chainer.report({'perp': perp}, self)
    return loss
def translate(self, xs, max_length=100):
    xs = numpy.insert(xs, 0, 2)
    xs = numpy.append(xs, 0)
    with chainer.no_backprop_mode(), chainer.using_config('train', False):
        exs = self.embed_x(
            Variable(self.xp.array(xs, dtype=self.xp.int32)))
        h = F.expand_dims(exs, axis=0)
        h = F.expand_dims(h, axis=0)
        h = F.transpose(h, (0, 1, 3, 2))
        for i in range(self.stack):
            h = self.gcnn[i](h)
        h = F.squeeze(h, axis=1)
        h = F.squeeze(h, axis=0)
        h = F.transpose(h, (1, 0))
        ys = self.xp.full(1, 2, self.xp.int32)
        result = []
        hx = None
        cx = None
        hx2 = None
        cx2 = None
        for i in range(max_length):
            eys = self.embed_y(ys)
            eyys = self.embed_yy(ys)
            eys2 = [eys]
            eyys2 = [eyys]
            hx, cx, ss = self.decoder(hx, cx, eys2)
            hx2, cx2, ss2 = self.decoder2(hx2, cx2, eyys2)
            batch_A = F.matmul(h, ss[0], transb=True) * self.scale_score
            batch_A = F.softmax(batch_A, axis=0)
            if self.weight:
                with open("weight/wei.txt", "a", encoding="utf-8") as f:
                    for j in range(len(batch_A)):
                        f.write(str(batch_A[j][0].data) + "\n")
                    f.write("--------------\n")
            s = F.matmul(batch_A, h, transa=True)
            t = self.We(s) + self.Ws(ss2[0])
            ys = self.xp.argmax(t.data, axis=1).astype(self.xp.int32)
            if ys[0] == 0:
                break
            result.append(ys)
    result = cuda.to_cpu(
        self.xp.concatenate([self.xp.expand_dims(x, 0) for x in result]).T)

    # Remove EOS tags
    outs = []
    for y in result:
        inds = numpy.argwhere(y == EOS)
        if len(inds) > 0:
            y = y[:inds[0, 0]]
        outs.append(y)
    return outs
def __call__(self, x, hc=None):
    w = F.average_pooling_2d(x, ksize=x.shape[2:])
    w = w.reshape((w.shape[0], -1))
    if hc is None:
        h = [self.xp.zeros_like(w.array, dtype=w.dtype)] * self.num_layers
        c = [self.xp.zeros_like(w.array, dtype=w.dtype)] * self.num_layers
    else:
        h, c = hc
    h, c = self.lstm(w, h, c)
    w = F.expand_dims(F.expand_dims(h[-1], axis=-1), axis=-1)
    x = x * w
    return x, (h, c)
def loss_information(enc, x):
    p_logit = enc(x)
    p = F.sigmoid(p_logit)
    p_ave = F.sum(p, axis=0) / x.data.shape[0]

    cond_ent = F.sum(
        -p * F.log(p + 1e-8) - (1 - p) * F.log(1 - p + 1e-8)) / p.data.shape[0]
    marg_ent = F.sum(
        -p_ave * F.log(p_ave + 1e-8) - (1 - p_ave) * F.log(1 - p_ave + 1e-8))

    p_ave = F.reshape(p_ave, (1, len(p_ave.data)))
    p_ave_separated = F.separate(p_ave, axis=1)
    p_separated = F.separate(F.expand_dims(p, axis=2), axis=1)

    p_ave_list_i = []
    p_ave_list_j = []
    p_list_i = []
    p_list_j = []
    for i in range(n_bit - 1):
        p_ave_list_i.extend(list(p_ave_separated[i + 1:]))
        p_list_i.extend(list(p_separated[i + 1:]))
        p_ave_list_j.extend([p_ave_separated[i] for n in range(n_bit - i - 1)])
        p_list_j.extend([p_separated[i] for n in range(n_bit - i - 1)])

    p_ave_pair_i = F.expand_dims(F.concat(tuple(p_ave_list_i), axis=0), axis=1)
    p_ave_pair_j = F.expand_dims(F.concat(tuple(p_ave_list_j), axis=0), axis=1)
    p_pair_i = F.expand_dims(F.concat(tuple(p_list_i), axis=1), axis=2)
    p_pair_j = F.expand_dims(F.concat(tuple(p_list_j), axis=1), axis=2)

    p_pair_stacked_i = F.concat(
        (p_pair_i, 1 - p_pair_i, p_pair_i, 1 - p_pair_i), axis=2)
    p_pair_stacked_j = F.concat(
        (p_pair_j, p_pair_j, 1 - p_pair_j, 1 - p_pair_j), axis=2)
    p_ave_pair_stacked_i = F.concat(
        (p_ave_pair_i, 1 - p_ave_pair_i, p_ave_pair_i, 1 - p_ave_pair_i),
        axis=1)
    p_ave_pair_stacked_j = F.concat(
        (p_ave_pair_j, p_ave_pair_j, 1 - p_ave_pair_j, 1 - p_ave_pair_j),
        axis=1)

    p_product = F.sum(p_pair_stacked_i * p_pair_stacked_j, axis=0) / len(p.data)
    p_ave_product = p_ave_pair_stacked_i * p_ave_pair_stacked_j

    pairwise_mi = 2 * F.sum(
        p_product * F.log((p_product + 1e-8) / (p_ave_product + 1e-8)))
    return cond_ent, marg_ent, pairwise_mi
def __call__(self, h):
    shape = h.shape
    h = F.reshape(h, (shape[0], np.prod(shape[1:])))
    h_ns = F.batch_l2_norm_squared(h)
    bs = shape[0]
    h0 = F.broadcast_to(F.expand_dims(h_ns, 0), (bs, bs))
    h1 = F.broadcast_to(F.expand_dims(h_ns, 1), (bs, bs))
    hh = F.linear(h, h)
    D = h0 + h1 - 2 * hh
    D = F.sum(D) / np.prod(h.shape)
    return D
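# The identity behind the block above: with hh = h h^T, the matrix
# h0 + h1 - 2 * hh holds all pairwise squared distances. A toy check:
import numpy as np
import chainer.functions as F

h = np.array([[0., 0.], [3., 4.]], dtype=np.float32)
h_ns = F.batch_l2_norm_squared(h)  # [0., 25.]
h0 = F.broadcast_to(F.expand_dims(h_ns, 0), (2, 2))
h1 = F.broadcast_to(F.expand_dims(h_ns, 1), (2, 2))
D = h0 + h1 - 2 * F.linear(h, h)
print(D.array)  # [[0., 25.], [25., 0.]]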
def __call__(self, x):
    w = F.average_pooling_2d(x, ksize=x.shape[2:])
    if not self.use_conv:
        w = F.reshape(w, shape=(w.shape[0], -1))
    w = self.conv1(w) if self.use_conv else self.fc1(w)
    w = self.activ(w)
    w = self.conv2(w) if self.use_conv else self.fc2(w)
    w = self.sigmoid(w)
    if not self.use_conv:
        w = F.expand_dims(F.expand_dims(w, axis=2), axis=3)
    x = x * w
    return x
def compute_attention(self, query, key):
    """
    :param query: with shape of (mb, N_1, hidden_dim)
    :param key: with shape of (mb, N_2, hidden_dim)
    :return: attn: attention weights (mb, N_1, N_2)
    """
    energy_layer = self.energy_layer
    mb, N_1, hidden_dim = query.shape
    N_2 = key.shape[1]

    query_real, query_imag = self.fourier_transform(query)
    key_real, key_imag = self.fourier_transform(key)

    # query_*: (mb, N_1, hidden_dim) -> (mb * N_1 * N_2, hidden_dim)
    query_real = functions.reshape(
        functions.tile(functions.expand_dims(query_real, axis=2),
                       reps=(1, 1, N_2, 1)),
        shape=(mb * N_1 * N_2, hidden_dim))
    query_imag = functions.reshape(
        functions.tile(functions.expand_dims(query_imag, axis=2),
                       reps=(1, 1, N_2, 1)),
        shape=(mb * N_1 * N_2, hidden_dim))
    # key_*: (mb, N_2, hidden_dim) -> (mb * N_1 * N_2, hidden_dim)
    key_real = functions.reshape(
        functions.tile(functions.expand_dims(key_real, axis=1),
                       reps=(1, N_1, 1, 1)),
        shape=(mb * N_1 * N_2, hidden_dim))
    key_imag = functions.reshape(
        functions.tile(functions.expand_dims(key_imag, axis=1),
                       reps=(1, N_1, 1, 1)),
        shape=(mb * N_1 * N_2, hidden_dim))

    # energy: (mb * N_1 * N_2, 1) -> (mb, N_1, N_2)
    energy = self.activation(
        energy_layer(key_real, query_real) +
        energy_layer(key_imag, query_imag))
    energy = functions.reshape(energy, (mb, N_1, N_2))
    return energy
def __call__(self, x0, x1, l0, l1, train=True):
    """Forward computation.

    Args:
        x0: Chainer variable in shape (B, T0) where B is the batch size,
            T is the number of tokens in each data. Each element should be
            given as the index of embedding.
        x1: Chainer variable in shape (B, T1)
    Returns:
    """
    t0 = x0.shape[1]
    t1 = x1.shape[1]

    # a: (B, T0, M)
    a = self.emb(x0)
    # b: (B, T1, M)
    b = self.emb(x1)
    if not self._train_embedding:
        a.unchain_backward()
        b.unchain_backward()
    a = self._token_wise_linear(a, self.emb_proj, l0, train, self.xp)
    b = self._token_wise_linear(b, self.emb_proj, l1, train, self.xp)

    # Apply perceptron layer to each feature vector ... eq. 1
    # (B, Ti, M) -> (B * Ti, M) -> (B * Ti, F) -> (B, Ti, F)
    a_f = self._token_wise_linear(a, self.f, l0, train, self.xp)
    b_f = self._token_wise_linear(b, self.f, l1, train, self.xp)

    # for each batch b, calculate a_f[b] . b_f[b]^T
    # e: (B, T0, T1)
    e = F.batch_matmul(a_f, b_f, transb=True)
    # att_*: (B, T0, T1)
    att_b, att_a = self._length_aware_softmax(e, l0, l1, self.xp)

    # sum((B, T0, T1) . (B, T0, T1, M)) -> beta: (B, T0, M) ... eq. 2
    b_tiled = F.tile(F.expand_dims(b, 1), (1, t0, 1, 1))
    att_b = F.expand_dims(att_b, 3)
    beta = F.sum(F.broadcast_to(att_b, b_tiled.shape) * b_tiled, axis=2)

    # sum((B, T0, T1) . (B, T0, T1, M)) -> alpha: (B, T1, M) ... eq. 2
    a_tiled = F.tile(F.expand_dims(a, 2), (1, 1, t1, 1))
    att_a = F.expand_dims(att_a, 3)
    alpha = F.sum(F.broadcast_to(att_a, a_tiled.shape) * a_tiled, axis=1)

    # Make comparison, [(B, Ti, M), (B, Ti, M)] -> (B, M')
    v1 = self._compare(a, beta, l0, train, self.xp)
    v2 = self._compare(b, alpha, l1, train, self.xp)

    # (B, M' + M') -> (B, n_class) ... eq. 4 & 5
    v = F.concat((v1, v2), axis=1)
    y = self.h(v)
    return y
def mixture_of_discretized_logistics_nll(x, y):
    """
    Args:
        x: (b, c, n, n)
        y: (b, 10*n_mix, n, n)
    """
    xp = get_array_module(x)
    n_mix = y.shape[1] // 10
    logit_prob = y[:, :n_mix, :, :]
    y = F.reshape(y[:, n_mix:, :, :], x.shape + (n_mix * 3,))
    mean = y[:, :, :, :, 0:n_mix]
    log_scale = y[:, :, :, :, n_mix:2 * n_mix]
    log_scale = F.maximum(log_scale, -7 * xp.ones(log_scale.shape, dtype='f'))
    coeff = F.tanh(y[:, :, :, :, 2 * n_mix:3 * n_mix])

    x = xp.repeat(xp.expand_dims(x, 4), n_mix, 4)
    m1 = F.expand_dims(mean[:, 0, :, :, :], 1)
    m2 = F.expand_dims(
        mean[:, 1, :, :, :] + coeff[:, 0, :, :, :] * x[:, 0, :, :, :], 1)
    m3 = F.expand_dims(
        (mean[:, 2, :, :, :] + coeff[:, 1, :, :, :] * x[:, 0, :, :, :] +
         coeff[:, 2, :, :, :] * x[:, 1, :, :, :]), 1)
    mean = F.concat([m1, m2, m3])

    centered_x = x - mean
    inv_std = F.exp(-log_scale)
    max_in = inv_std * (centered_x + 1. / 255.)
    cdf_max = F.sigmoid(max_in)
    min_in = inv_std * (centered_x - 1. / 255.)
    cdf_min = F.sigmoid(min_in)
    log_cdf_max = max_in - F.softplus(max_in)  # 0
    log_one_minus_cdf_min = -F.softplus(min_in)  # 255
    cdf_delta = cdf_max - cdf_min  # 0 ~ 255
    mid_in = inv_std * centered_x
    log_pdf_mid = mid_in - log_scale - 2. * F.softplus(mid_in)  # mid

    log_prob = F.where(
        x < -0.999, log_cdf_max,
        F.where(
            x > 0.999, log_one_minus_cdf_min,
            F.where(
                cdf_delta.array > 1e-5,
                F.log(F.maximum(
                    cdf_delta, xp.ones(cdf_delta.shape, dtype='f') * 1e-12)),
                log_pdf_mid - xp.log(127.5))))

    log_prob = F.transpose(F.sum(log_prob, 1), (0, 3, 1, 2))
    log_prob = log_prob + log_prob_from_logit(logit_prob)
    loss = F.logsumexp(log_prob, 1)
    loss = F.sum(loss, axis=(1, 2))
    return -F.mean(loss)
def __call__(self):
    """Returns the weight-normalized weight matrix.

    Returns:
        ~chainer.Variable: ``g * W / ||W||``, where the norm is taken
        row-wise over ``W``.
    """
    norm = F.batch_l2_norm_squared(self.W) ** 0.5
    norm_broadcasted = F.broadcast_to(
        F.expand_dims(norm, 1), self.W.data.shape)
    g_broadcasted = F.broadcast_to(
        F.expand_dims(self.g, 1), self.W.data.shape)
    return g_broadcasted * self.W / norm_broadcasted
def __call__(self, edge, node, triplet):
    num_atom = edge.shape[1]
    hn1 = F.tile(F.expand_dims(self.Wn1(node), 1), (1, num_atom, 1, 1))
    hn2 = F.tile(F.expand_dims(self.Wn1(node), 2), (1, 1, num_atom, 1))
    ht1 = self.Wt2(F.sum(zero_plus(self.Wt1(triplet)), axis=1))
    concat = F.concat([hn1, hn2, ht1, edge], axis=3)
    add = zero_plus(self.We2(zero_plus(self.We1(concat))))
    return edge + self.bn(add)
def bger(x, y):
    """Batch outer product.

    :param x: (B, M) batch of vectors
    :param y: (B, N) batch of vectors
    :return: (B, M, N) batch of outer products
    """
    if x.dtype == 'int' and y.dtype == 'int':
        x_float = F.cast(x, 'float32')
        y_float = F.cast(y, 'float32')
        res_float = F.expand_dims(x_float, 2) @ F.expand_dims(y_float, 1)
        return F.cast(res_float, 'int')
    return F.expand_dims(x, 2) @ F.expand_dims(y, 1)
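# A shape-level usage sketch for bger with made-up arrays: each row of x
# is multiplied against the matching row of y, giving one M x N outer
# product per batch element.
import numpy as np
import chainer.functions as F

x = np.arange(6, dtype=np.float32).reshape(2, 3)  # (B, M)
y = np.ones((2, 4), dtype=np.float32)             # (B, N)
print(bger(x, y).shape)                           # (2, 3, 4)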
def scoreDot(self, atts, ys):
    xs = [self.W3(att) for att in atts]
    xs_T = [F.transpose(x, (1, 0)) for x in xs]
    dots = [F.matmul(y, x) for x, y in zip(xs_T, ys)]
    aws = [F.softmax(dot, 1) for dot in dots]
    cts = []
    for x, aw in zip(xs, aws):  # split batch
        aw = F.expand_dims(aw, 1)
        x = F.tile(F.expand_dims(x, 0), (aw.shape[0], 1, 1))
        ct = F.batch_matmul(aw, x)
        cts.append(F.reshape(ct, (ct.shape[0], ct.shape[2])))
    ds = [F.tanh(self.Wc1(ct) + self.Wc2(y)) for y, ct in zip(ys, cts)]
    return ds
def __call__(self, xs):
    xs = chainer.dataset.convert.concat_examples(xs, padding=0)
    xs = F.transpose(xs, (1, 0, 2))
    for i in range(len(xs)):
        if i == 0:
            hs = self.l1(xs[0])
            hs = F.expand_dims(hs, 0)
        else:
            hw = self.l1(xs[i])
            hw = F.expand_dims(hw, 0)
            hs = F.concat((hs, hw), axis=0)
    h = F.mean(hs, axis=0)
    return h
def __call__(self, input_data, hx=None):
    if np.any(hx):
        hx = hx.reshape(1, -1, self.h1.out_size)
    input_x = [Variable(x) for x in input_data]
    hx, cx, y = self.h1(hx, None, input_x)
    y2 = [F.concat(x, axis=0)
          for x in F.pad_sequence(y, length=17, padding=0.)]
    y2 = F.concat([F.expand_dims(x, axis=0) for x in y2], axis=0)
    out = self.hy(
        F.concat([F.expand_dims(item[-1], axis=0) for item in y], axis=0))
    atn = self.atn(y2)
    return F.concat(
        [F.expand_dims(a * o, axis=0) for a, o in zip(atn, out)], axis=0)
def __call__(self, x, pid):
    x = self.bn(x)
    x = F.swapaxes(x, axis1=1, axis2=3)
    y = F.expand_dims(F.expand_dims(pid, axis=-1), axis=-1)
    y = F.tile(y, reps=(1, 1, self.audio_window_size, 1))
    x = F.concat((x, y), axis=1)
    x = self.branch(x)
    x = F.reshape(x, shape=(x.shape[0], -1))
    x = F.concat((x, pid), axis=1)
    x = self.fc1(x)
    x = F.tanh(x)
    x = self.fc2(x)
    return x
def __call__(self, x):
    """Applies the linear layer.

    Args:
        x (~chainer.Variable): Batch of input vectors.

    Returns:
        ~chainer.Variable: Output of the linear layer.
    """
    norm = F.batch_l2_norm_squared(self.W) ** 0.5
    norm_broadcasted = F.broadcast_to(
        F.expand_dims(norm, 1), self.W.data.shape)
    g_broadcasted = F.broadcast_to(
        F.expand_dims(self.g, 1), self.W.data.shape)
    return F.linear(x, g_broadcasted * self.W / norm_broadcasted, self.b)
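# The reparameterization above decouples each output row's direction
# W / ||W|| from a learned scale g, as in weight normalization
# (Salimans & Kingma, 2016). A standalone numeric sketch with made-up
# shapes (not the class's own attributes):
import numpy as np
import chainer.functions as F

W = np.random.randn(4, 3).astype(np.float32)  # (out_size, in_size)
g = np.ones(4, dtype=np.float32)              # per-output scale
x = np.random.randn(2, 3).astype(np.float32)

norm = F.batch_l2_norm_squared(W) ** 0.5      # row-wise ||W||
W_n = F.broadcast_to(F.expand_dims(g, 1), W.shape) * W \
    / F.broadcast_to(F.expand_dims(norm, 1), W.shape)
y = F.linear(x, W_n)  # every row of W_n now has norm g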
def __call__(self, imgs, questions):
    feat = self.feat_extractor(imgs)

    # Append relative coordinates to each location in the feature maps.
    n, c, h, w = feat.shape
    spatial_area = h * w

    xp = self.xp
    coords_h = xp.linspace(-1, 1, h, dtype=feat.dtype)
    coords_w = xp.linspace(-1, 1, w, dtype=feat.dtype)
    coords_hh, coords_ww = xp.meshgrid(coords_h, coords_w)
    coords_hh = coords_hh[None]
    coords_ww = coords_ww[None]
    coords = xp.concatenate((coords_hh, coords_ww), axis=0)
    coords = coords.reshape(2, -1)
    coords = coords[None]  # (1, 2, spatial_area)
    coords = xp.repeat(coords, n, axis=0)
    # Coordinates may be cached here but the performance gain is not
    # significant so it is skipped in favor of readability.

    feat = feat.reshape(n, c, spatial_area)
    h = F.concat((feat, coords), axis=1)  # (n, c + 2, spatial_area)

    # Create coordinate pairs (differentiable meshgrid).
    h_hh = F.expand_dims(h, 2)
    h_ww = F.expand_dims(h, 3)
    h_hh = F.repeat(h_hh, spatial_area, axis=2)
    h_ww = F.repeat(h_ww, spatial_area, axis=3)
    h = F.concat((h_hh, h_ww), axis=1)

    # Append questions to each coordinate pair.
    questions = questions.astype(imgs.dtype)
    questions = questions[:, :, None, None]
    questions = F.tile(questions, (1, 1, spatial_area, spatial_area))
    h = F.concat((h, questions), axis=1)
    # (n, (c + 2) * 2 + questions_length, spatial_area, spatial_area)

    # g.
    h = F.transpose(h, (0, 2, 3, 1))
    h = F.reshape(h, (n * spatial_area * spatial_area, -1))
    h = self.g(h)
    h = F.reshape(h, (n, spatial_area * spatial_area, -1))
    h = F.sum(h, axis=1)

    h = self.f(h)

    # Logits.
    h = self.fc(h)
    return h
def forward_rnn_encode_proj(self, X):
    # Reset the rnn state
    self.reset_rnn_state()
    # Get input shape
    in_size, batch_size, in_dim = X.shape
    enc_states = X
    for currL in range(len(self.rnn_enc)):
        for i in range(in_size):
            temp_f = F.expand_dims(
                F.dropout(self[self.rnn_enc[currL]](enc_states[i]),
                          ratio=self.cfg["dropout"]["rnn"]), 0)
            # if bi-directional
            if self.bi_rnn:
                temp_r = F.expand_dims(
                    F.dropout(self[self.rnn_rev_enc[currL]](enc_states[-1]),
                              ratio=self.cfg["dropout"]["rnn"]), 0)
            if i > 0:
                h_fwd = F.concat((h_fwd, temp_f), axis=0)
                if self.bi_rnn:
                    h_rev = F.concat((h_rev, temp_r), axis=0)
            else:
                h_fwd = temp_f
                if self.bi_rnn:
                    h_rev = temp_r
        # end current rnn layer

        if self.bi_rnn:
            h_rev = F.flipud(h_rev)
            rnn_states = F.concat((h_fwd, h_rev), axis=2)
        else:
            rnn_states = h_fwd

        # Apply linear projection between rnn layers
        if currL < (len(self.rnn_enc) - 1):
            for i in range(0, in_size):
                currH = F.relu(self[f"enc_proj{currL}_bn"](
                    self[f"enc_proj{currL}"](rnn_states[i])))
                if i > 0:
                    enc_states = F.concat(
                        (enc_states, F.expand_dims(currH, 0)), axis=0)
                else:
                    enc_states = F.expand_dims(currH, 0)
            # end for all hidden states
    # end all layers

    # Make the batch size the first dimension
    self.enc_states = F.swapaxes(enc_states, 0, 1)
def __call__(self, xi):
    hc0 = F.leaky_relu(self.c0(xi))
    hc1 = F.leaky_relu(self.bnc1(self.c1(hc0), test=not self.train))
    hc2 = F.leaky_relu(self.bnc2(self.c2(hc1), test=not self.train))
    hc3 = F.leaky_relu(self.bnc3(self.c3(hc2), test=not self.train))
    hc4 = F.leaky_relu(self.bnc4(self.c4(hc3), test=not self.train))
    hc5 = F.leaky_relu(self.bnc5(self.c5(hc4), test=not self.train))
    hc6 = F.leaky_relu(self.bnc6(self.c6(hc5), test=not self.train))
    hc7 = F.leaky_relu(self.bnc7(self.c7(hc6), test=not self.train))
    hc8 = F.leaky_relu(self.bnc8(self.c8(hc7), test=not self.train))

    h = F.expand_dims(hc8, 2)
    h = F.relu(F.dropout(self.bndc00(self.dc00(h), test=not self.train),
                         0.5, train=self.train_dropout))
    hc7 = F.expand_dims(hc7, 2)
    hc7 = F.broadcast_to(
        hc7, hc7.data.shape[:2] + (h.data.shape[2],) + hc7.data.shape[3:])
    h = F.concat((h, hc7), 1)
    h = F.relu(F.dropout(self.bndc0(self.dc0(h), test=not self.train),
                         0.5, train=self.train_dropout))
    hc6 = F.expand_dims(hc6, 2)
    hc6 = F.broadcast_to(
        hc6, hc6.data.shape[:2] + (h.data.shape[2],) + hc6.data.shape[3:])
    h = F.concat((h, hc6), 1)
    h = F.relu(F.dropout(self.bndc1(self.dc1(h), test=not self.train),
                         0.5, train=self.train_dropout))
    hc5 = F.expand_dims(hc5, 2)
    hc5 = F.broadcast_to(
        hc5, hc5.data.shape[:2] + (h.data.shape[2],) + hc5.data.shape[3:])
    h = F.concat((h, hc5), 1)
    h = F.relu(self.bndc2(self.dc2(h), test=not self.train))
    hc4 = F.expand_dims(hc4, 2)
    hc4 = F.broadcast_to(
        hc4, hc4.data.shape[:2] + (h.data.shape[2],) + hc4.data.shape[3:])
    h = F.concat((h, hc4), 1)
    h = F.relu(self.bndc3(self.dc3(h), test=not self.train))
    hc3 = F.expand_dims(hc3, 2)
    hc3 = F.broadcast_to(
        hc3, hc3.data.shape[:2] + (h.data.shape[2],) + hc3.data.shape[3:])
    h = F.concat((h, hc3), 1)
    h = F.relu(self.bndc4(self.dc4(h), test=not self.train))
    hc2 = F.expand_dims(hc2, 2)
    hc2 = F.broadcast_to(
        hc2, hc2.data.shape[:2] + (h.data.shape[2],) + hc2.data.shape[3:])
    h = F.concat((h, hc2), 1)
    h = F.relu(self.bndc5(self.dc5(h), test=not self.train))
    hc1 = F.expand_dims(hc1, 2)
    hc1 = F.broadcast_to(
        hc1, hc1.data.shape[:2] + (h.data.shape[2],) + hc1.data.shape[3:])
    h = F.concat((h, hc1), 1)
    h = F.relu(self.bndc6(self.dc6(h), test=not self.train))
    hc0 = F.expand_dims(hc0, 2)
    hc0 = F.broadcast_to(
        hc0, hc0.data.shape[:2] + (h.data.shape[2],) + hc0.data.shape[3:])
    h = F.concat((h, hc0), 1)
    h = self.dc7(h)

    xi_ = F.expand_dims(xi, 2)
    xi_ = F.broadcast_to(xi_, h.data.shape)
    h = F.sigmoid(h + xi_)
    return h
def forward_one_step(self, X, ht_enc, H_enc, skip_mask):
    pad = self._kernel_size - 1
    WX = self.W(X)[:, :, -pad - 1, None]
    Vh = self.V(ht_enc)
    Vh, WX = functions.broadcast(functions.expand_dims(Vh, axis=2), WX)

    # f-pooling
    Z, F, O = functions.split_axis(WX + Vh, 3, axis=1)
    Z = functions.tanh(Z)
    F = self.zoneout(F)
    O = functions.sigmoid(O)
    T = Z.shape[2]

    # compute ungated hidden states
    for t in range(T):
        z = Z[..., t]
        f = F[..., t]
        if self.contexts is None:
            ct = (1 - f) * z
            self.contexts = [ct]
        else:
            ct = f * self.contexts[-1] + (1 - f) * z
            self.contexts.append(ct)

    if skip_mask is not None:
        assert skip_mask.shape[1] == H_enc.shape[2]
        softmax_bias = (skip_mask == 0) * -1e6

    # compute attention weights (eq.8)
    H_enc = functions.swapaxes(H_enc, 1, 2)
    for t in range(T):
        ct = self.contexts[t - T]
        bias = 0 if skip_mask is None else softmax_bias[..., None]  # to skip PAD
        mask = 1 if skip_mask is None else skip_mask[..., None]  # to skip PAD
        alpha = functions.batch_matmul(H_enc, ct) + bias
        alpha = functions.softmax(alpha) * mask
        alpha = functions.broadcast_to(alpha, H_enc.shape)  # copy
        kt = functions.sum(alpha * H_enc, axis=1)
        ot = O[..., t]
        self.ht = ot * self.o(functions.concat((kt, ct), axis=1))

        if self.H is None:
            self.H = functions.expand_dims(self.ht, 2)
        else:
            self.H = functions.concat(
                (self.H, functions.expand_dims(self.ht, 2)), axis=2)

    return self.H
def __call__(self, x, y):
    """
    Parameters
    -----------------
    x: Variable
        Feature of unlabeled samples.
    y: Variable
        Feature of unlabeled samples.
    """
    g, x, y = F.broadcast(*[self.gamma, x, y])
    x_g = x * g
    y_g = y * g
    x_g_norm = F.sum(x_g ** 2, axis=1)
    y_g_norm = F.sum(y_g ** 2, axis=1)
    x_g_y_g = F.linear(x_g, y_g)
    x_g_norm, x_g_y_g, y_g_norm = F.broadcast(
        *[x_g_norm, x_g_y_g, F.expand_dims(y_g_norm, 1)])
    # i.e. F.exp(-(x_g_norm - 2 * x_g_y_g + y_g_norm))
    return F.exp(-x_g_norm + 2 * x_g_y_g - y_g_norm)
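# The exponent above is the expansion -||x - y||^2 = -||x||^2 + 2 x.y - ||y||^2,
# so the return value is a Gaussian (RBF) kernel over the scaled features.
# A scalar check with made-up vectors:
import numpy as np

x = np.array([1.0, 2.0])
y = np.array([3.0, 5.0])
lhs = -np.dot(x, x) + 2 * np.dot(x, y) - np.dot(y, y)
assert lhs == -np.sum((x - y) ** 2)  # both are -13.0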
def __call__(self, x, y):
    """
    Parameters
    -----------------
    x: Variable
        Feature of unlabeled samples.
    y: Variable
        Feature of unlabeled samples.
    """
    g, x, y = F.broadcast(*[self.gamma, x, y])
    x_g = x * g
    y_g = y * g
    x_g_norm = F.sum(x_g ** 2, axis=1)
    y_g_norm = F.sum(y_g ** 2, axis=1)
    x_g_y_g = F.linear(x_g, y_g)
    x_g_norm, x_g_y_g, y_g_norm = F.broadcast(
        *[x_g_norm, x_g_y_g, F.expand_dims(y_g_norm, 1)])
    # i.e. F.exp(-(x_g_norm - 2 * x_g_y_g + y_g_norm))
    u = x_g_norm - 2 * x_g_y_g + y_g_norm
    print(np.min(u.data))
    print(len(np.where(u.data < 0)[0]), np.prod(u.data.shape))
    time.sleep(0.5)
    return F.exp(-x_g_norm + 2 * x_g_y_g - y_g_norm)
def create_encoder_states_matrix(self, hs):
    batch_size, dim = hs[0].data.shape
    # [(batch_size, 1, dim)]
    hs_3d = list(map(lambda h: F.expand_dims(h, 1), hs))
    # (batch_size, input_length, dim)
    hs_3d_concat = F.concat(hs_3d, axis=1)
    # (batch_size * input_length, dim)
    hs_3d_concat_linear = self.decoder.phi2_linear(
        F.reshape(hs_3d_concat, (-1, dim)))
    # (batch_size, input_length, dim)
    hs_3d_concat_linear_tanh = F.tanh(
        F.reshape(hs_3d_concat_linear, (batch_size, -1, dim)))
    return hs_3d_concat_linear_tanh
def check_forward(self, x_data):
    x = chainer.Variable(x_data)
    y = functions.expand_dims(x, self.axis)
    self.assertEqual(y.data.shape, self.out_shape)
    y_expect = numpy.expand_dims(cuda.to_cpu(x_data), self.axis)
    self.assertEqual(y.data.dtype, self.dtype)
    numpy.testing.assert_array_equal(cuda.to_cpu(y.data), y_expect)
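# The test checks F.expand_dims against numpy.expand_dims; the same
# equivalence in a standalone sketch with a toy array:
import numpy
import chainer
from chainer import functions

x = numpy.arange(6, dtype=numpy.float32).reshape(2, 3)
y = functions.expand_dims(chainer.Variable(x), 1)
assert y.shape == (2, 1, 3)
numpy.testing.assert_array_equal(y.array, numpy.expand_dims(x, 1))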
def __call__(self, x, y):
    """
    Parameters
    -----------------
    x: Variable
        Feature of unlabeled samples.
    y: Variable
        Feature of unlabeled samples.
    """
    g = F.broadcast_to(
        F.gaussian(
            np.array([0], dtype=np.float32),
            np.array([np.exp(1)], dtype=np.float32)),
        x.shape)
    x_g = x * g
    y_g = y * g
    x_g_norm = F.sum(x_g ** 2, axis=1)
    y_g_norm = F.sum(y_g ** 2, axis=1)
    x_g_y_g = F.linear(x_g, y_g)
    x_g_norm, x_g_y_g, y_g_norm = F.broadcast(
        *[x_g_norm, x_g_y_g, F.expand_dims(y_g_norm, 1)])
    # i.e. F.exp(-(x_g_norm - 2 * x_g_y_g + y_g_norm))
    return F.exp(-x_g_norm + 2 * x_g_y_g - y_g_norm)
def ordinal_loss(y, mask):
    xp = cuda.get_array_module(y.data)
    volatile = y.volatile
    b, c, n = y.data.shape
    max_y = F.broadcast_to(F.max(y, axis=1, keepdims=True), y.data.shape)
    y = y - max_y
    sum_y = F.broadcast_to(F.expand_dims(F.sum(y, axis=1), 1), y.data.shape)
    down_tri = np.tri(c, dtype=np.float32)
    up_tri = down_tri.T
    w1 = Variable(xp.asarray(down_tri.reshape(c, c, 1, 1)), volatile=volatile)
    w2 = Variable(xp.asarray(up_tri.reshape(c, c, 1, 1)), volatile=volatile)
    h = F.exp(F.expand_dims(y, -1))
    h1 = F.convolution_2d(h, w1)
    h1 = F.convolution_2d(F.log(h1), w1)
    h2 = F.convolution_2d(h, w2)
    h2 = F.convolution_2d(F.log(h2), w2)
    h = F.reshape(h1 + h2, (b, c, n))
    return F.sum((h - sum_y - y) * mask) / b
def __call__(self, embeded_x, m_prev, h_prev, x):
    batch_size = embeded_x.shape[0]
    lstm_in = self.W(embeded_x) + self.U(h_prev)
    m_tmp, h_tmp = F.lstm(m_prev, lstm_in)
    # flags indicating whether the previous output should be fed:
    # padded positions keep their previous state
    feed_prev = F.broadcast_to(
        F.expand_dims(x.data != IGNORE_LABEL, -1),
        (batch_size, self.hidden_size))
    m = F.where(feed_prev, m_tmp, m_prev)
    h = F.where(feed_prev, h_tmp, h_prev)
    return m, h
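# A toy sketch of the masking idiom above (made-up values; IGNORE_LABEL = -1
# is an assumption): rows whose input token equals IGNORE_LABEL keep their
# previous state instead of the freshly computed one.
import numpy as np
import chainer.functions as F

IGNORE_LABEL = -1
x = np.array([3, IGNORE_LABEL], dtype=np.int32)  # (batch,)
h_prev = np.zeros((2, 4), dtype=np.float32)
h_tmp = np.ones((2, 4), dtype=np.float32)
feed_prev = np.broadcast_to((x != IGNORE_LABEL)[:, None], (2, 4))
h = F.where(feed_prev, h_tmp, h_prev)
print(h.array)  # row 0 becomes ones, row 1 stays zeros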
def check_backward(self, x_data, y_grad):
    x = chainer.Variable(x_data)
    y = functions.expand_dims(x, self.axis)
    y.grad = y_grad
    y.backward()

    func = y.creator
    f = lambda: func.forward((x_data,))
    gx, = gradient_check.numerical_grad(f, (x_data,), (y_grad,))

    gradient_check.assert_allclose(cuda.to_cpu(x.grad), cuda.to_cpu(gx))
def __call__(self, x):
    """Applies the linear layer.

    Args:
        x (~chainer.Variable): Batch of input vectors.

    Returns:
        ~chainer.Variable: Output of the linear layer.
    """
    batch_size = x.data.shape[0]
    batch_W = F.concat([F.expand_dims(self.W, 0)] * batch_size, 0)
    return F.reshape(F.batch_matmul(x, batch_W), x.data.shape[:-1])
def __call__(self, h, h_gen=None, test=False):
    if h_gen is not None:
        # Restrict the decoder with the input image: randomly mix
        # channels of h and h_gen.
        h_stacked = ()
        for i in range(h_gen.shape[1]):
            if np.random.randint(2) == 0:
                h_stacked += (F.expand_dims(h[:, i, :, :], axis=1),)
            else:
                h_stacked += (F.expand_dims(h_gen[:, i, :, :], axis=1),)
        h = F.concat(h_stacked)

    h = self.deconv0(h)  # 7x7 -> 14x14
    h = self.bn0(h, test)
    h = self.act(h)
    h = self.deconv1(h)  # 14x14 -> 28x28
    h = F.tanh(h)
    return h
def setUp(self):
    self.x1 = numpy.random.uniform(
        .5, 1, (batch_size, m, k)).astype(numpy.float32)
    self.x2 = numpy.random.uniform(
        .5, 1, (k, n)).astype(numpy.float32)
    self.gy = numpy.random.uniform(
        -1, 1, (batch_size, m, n)).astype(numpy.float32)
    self.op = lambda x, y: F.batch_matmul(
        x, F.broadcast_to(F.expand_dims(y, 0), (batch_size, k, n)))
    self.forward_answer = numpy.array([
        numpy.dot(self.x1[i], self.x2)
        for i in six.moves.range(batch_size)])
def batch_rodrigues(theta):
    """
    Theta is N x 3
    """
    batch_size = theta.shape[0]
    xp = theta.xp

    angle = F.expand_dims(F.sqrt(F.batch_l2_norm_squared(theta + 1e-8)), -1)
    r = F.expand_dims(theta / F.tile(angle, 3), -1)

    angle = F.expand_dims(angle, -1)
    cos = F.cos(angle)
    sin = F.sin(angle)
    cos = F.tile(cos, (3, 3))
    sin = F.tile(sin, (3, 3))

    outer = F.matmul(r, r, transb=True)

    eyes = F.tile(F.expand_dims(
        Variable(xp.array(xp.eye(3), 'f')), 0), (batch_size, 1, 1))
    R = cos * eyes + (1 - cos) * outer + sin * batch_skew(r, batch_size)
    return R
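# batch_rodrigues implements the Rodrigues formula
# R = cos(a) I + (1 - cos(a)) r r^T + sin(a) [r]_x. A pure-NumPy sanity
# sketch (independent of the batch_skew helper, toy axis-angle input):
import numpy as np

theta = np.array([0., 0., np.pi / 2])  # 90-degree turn about z
angle = np.linalg.norm(theta)
r = theta / angle
K = np.array([[0., -r[2], r[1]],
              [r[2], 0., -r[0]],
              [-r[1], r[0], 0.]])
R = np.cos(angle) * np.eye(3) + (1 - np.cos(angle)) * np.outer(r, r) \
    + np.sin(angle) * K
print(np.round(R @ np.array([1., 0., 0.]), 3))  # [0., 1., 0.]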
def __call__(self, x, y):
    """
    Parameters
    -----------------
    x: Variable
        Feature of unlabeled samples.
    y: Variable
        Feature of unlabeled samples.
    """
    g = self.gamma ** 2
    z = F.expand_dims((x - y) ** 2, axis=0)
    o = F.exp(-F.linear(z, g))
    return o
def __call__(self, y, m_prev, s_prev, h_forward, h_backword, enable,
             disable_value):
    # m is the memory cell of the lstm, s is the previous hidden output
    # calculate attention
    c = self._attention(h_forward, h_backword, s_prev, enable, disable_value)

    # decode once
    embeded_y = self.E(y)
    batch_size = y.shape[0]
    lstm_in = self.W(embeded_y) + self.U(s_prev) + self.C(c)
    m_tmp, s_tmp = F.lstm(m_prev, lstm_in)
    feed_prev = F.broadcast_to(
        F.expand_dims(y.data != IGNORE_LABEL, -1),
        (batch_size, self.hidden_size))
    m = F.where(feed_prev, m_tmp, m_prev)
    s = F.where(feed_prev, s_tmp, s_prev)
    t = self.U_o(s) + self.V_o(embeded_y) + self.C_o(c)
    return self.W_o(t), m, s
def __call__(self, x_u_0, x_u_1):
    """
    Parameters
    -----------------
    x_u_0: Variable
        Feature of unlabeled samples.
    x_u_1: Variable
        Feature of unlabeled samples.
    """
    ffnn_u_0 = self.layers["ffnn_u_0"]
    ffnn_u_1 = self.layers["ffnn_u_1"]
    f_0 = F.softmax(ffnn_u_0(x_u_0))
    f_1 = F.softmax(ffnn_u_1(x_u_1))
    mid_outputs_0 = ffnn_u_0.mid_outputs
    mid_outputs_1 = ffnn_u_1.mid_outputs

    L = len(self.dims[1:])
    similarities = list(self.similarities.values())

    # Efficient computation
    # sample similarity W^l summed over l
    W = 0
    for l in range(L):
        W += similarities[l](mid_outputs_0[l], mid_outputs_1[l])

    # class similarity
    f_0_norm = F.sum(f_0 ** 2, axis=1)
    f_1_norm = F.sum(f_1 ** 2, axis=1)
    f_0_f_1 = F.linear(f_0, f_1)
    f_0_norm, f_0_f_1, f_1_norm = F.broadcast(
        *[f_0_norm, f_0_f_1, F.expand_dims(f_1_norm, 1)])
    F_ = f_0_norm - 2 * f_0_f_1 + f_1_norm

    print(np.max(F_.data))
    print(np.min(F_.data))
    print(len(np.where(F_.data < 0)[0]), np.prod(F_.data.shape))

    loss = F.sum(W * F_) / (self.batch_size * 2)
    self.loss = loss
    return loss
def _context(self, p, fb_mat, fbe_mat):
    batch_size, source_length, _ = fb_mat.data.shape
    # {pe,e}_mat: shape = [batch * srclen, atten]
    pe_mat = F.reshape(
        F.broadcast_to(
            F.expand_dims(self.p_e(p), 1),
            [batch_size, source_length, self.atten_size]),
        [batch_size * source_length, self.atten_size])
    e_mat = F.tanh(fbe_mat + pe_mat)
    # a_mat: shape = [batch, srclen]
    a_mat = F.softmax(
        F.reshape(self.e_a(e_mat), [batch_size, source_length]))
    # q: shape = [batch, 2 * hidden]
    q = F.reshape(
        F.batch_matmul(a_mat, fb_mat, transa=True),
        [batch_size, 2 * self.hidden_size])
    return q
def _attention(self, h_forward, h_backword, s, enable, disable_value):
    batch_size = s.shape[0]
    sentence_size = len(h_forward)
    hidden_size = self.hidden_size
    xp = self.xp
    weighted_s = F.broadcast_to(
        F.expand_dims(self.W_a(s), axis=1),
        (batch_size, sentence_size, hidden_size))
    h = F.concat((F.concat(h_forward, axis=0),
                  F.concat(h_backword, axis=0)))
    weighted_h = F.reshape(
        self.U_a(h), (batch_size, sentence_size, hidden_size))
    e = self.v_a(F.reshape(
        F.tanh(weighted_s + weighted_h),
        (batch_size * sentence_size, hidden_size)))
    e = F.where(
        enable, F.reshape(e, (batch_size, sentence_size)), disable_value)
    alpha = F.softmax(e)
    c = F.batch_matmul(
        F.reshape(h, (batch_size, 2 * hidden_size, sentence_size)), alpha)
    return F.reshape(c, (batch_size, 2 * hidden_size))
def proportions(self, doc_ids, softmax=False):
    """Given an array of document indices, return a vector
    for each document of just the unnormalized topic weights.

    Returns:
        doc_weights : chainer.Variable
            Two dimensional topic weights of each document.
    """
    w = self.weights(doc_ids)
    if softmax:
        size = w.data.shape
        mask = self.xp.random.random_integers(0, 1, size=size)
        y = (F.softmax(w * self.temperature) *
             Variable(mask.astype('float32')))
        norm, y = F.broadcast(F.expand_dims(F.sum(y, axis=1), 1), y)
        return y / (norm + 1e-7)
    else:
        return w
def __call__(self, x):
    xp = chainer.cuda.get_array_module(x.data)
    batchsize = x.shape[0]
    if self.train_weights == False and self.initial_T is not None:
        self.T.W.data = self.initial_T

    M = F.reshape(self.T(x), (-1, self.num_kernels, self.ndim_kernel))
    M = F.expand_dims(M, 3)
    M_T = F.transpose(M, (3, 1, 2, 0))
    M, M_T = F.broadcast(M, M_T)

    norm = F.sum(abs(M - M_T), axis=2)
    eraser = F.broadcast_to(
        xp.eye(batchsize, dtype=x.dtype).reshape((batchsize, 1, batchsize)),
        norm.shape)
    c_b = F.exp(-(norm + 1e6 * eraser))
    o_b = F.sum(c_b, axis=2)

    if self.train_weights == False:
        self.initial_T = self.T.W.data

    return F.concat((x, o_b), axis=1)
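# The block above is minibatch discrimination (Salimans et al., 2016):
# o_b exposes how close each sample is to the rest of the batch so a GAN
# discriminator can detect mode collapse. A shape-level sketch with
# made-up sizes (self-distance eraser omitted for brevity):
import numpy as np
import chainer.functions as F

batchsize, num_kernels, ndim_kernel = 4, 16, 5
Mx = np.random.randn(batchsize, num_kernels, ndim_kernel).astype(np.float32)
M = F.expand_dims(Mx, 3)            # (4, 16, 5, 1)
M_T = F.transpose(M, (3, 1, 2, 0))  # (1, 16, 5, 4)
M, M_T = F.broadcast(M, M_T)
norm = F.sum(abs(M - M_T), axis=2)  # (4, 16, 4) pairwise L1 distances
o_b = F.sum(F.exp(-norm), axis=2)   # (4, 16) batch-closeness features
print(o_b.shape)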
def _calc_rpn_loss_bbox(self, rpn_bbox_pred, bbox_reg_targets, inds_inside):
    # rpn_bbox_pred has the shape of (1, 4 x n_anchors, feat_h, feat_w)
    n_anchors = self.proposal_layer._num_anchors
    # Reshape it into (4, A, K)
    rpn_bbox_pred = rpn_bbox_pred.reshape(4, n_anchors, -1)
    # Transpose it into (K, A, 4)
    rpn_bbox_pred = rpn_bbox_pred.transpose(2, 1, 0)
    # Reshape it into (K x A, 4)
    rpn_bbox_pred = rpn_bbox_pred.reshape(-1, 4)
    # Keep the number of bbox
    n_bbox = rpn_bbox_pred.shape[0]
    # Select bbox and ravel it
    rpn_bbox_pred = F.flatten(rpn_bbox_pred[inds_inside])
    # Create batch dimension
    rpn_bbox_pred = F.expand_dims(rpn_bbox_pred, 0)
    # Ravel the targets and create batch dimension
    bbox_reg_targets = bbox_reg_targets.ravel()[None, :]
    # Calc Smooth L1 Loss (when delta=1, huber loss is SmoothL1Loss)
    rpn_loss_bbox = F.huber_loss(rpn_bbox_pred, bbox_reg_targets, self._delta)
    rpn_loss_bbox /= n_bbox
    return rpn_loss_bbox.reshape(())
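# F.huber_loss with delta=1 is the smooth L1 loss the comment refers to:
# quadratic for |d| <= 1, linear beyond. A toy check (made-up numbers;
# the default reduce option sums along the second axis):
import numpy as np
import chainer.functions as F

pred = np.array([[0.5, 2.0]], dtype=np.float32)
target = np.zeros((1, 2), dtype=np.float32)
# 0.5 * 0.5**2 + (2.0 - 0.5) = 0.125 + 1.5
print(F.huber_loss(pred, target, delta=1.0).array)  # [1.625]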
def __call__(self, beta, theta, get_skin=False, with_a=False):
    batch_size = beta.shape[0]

    # 1. Add shape blend shapes
    # (N x 10) x (10 x 6890*3) = N x 6890 x 3
    self.beta_shapedirs = F.matmul(beta, self.shapedirs)
    v_shaped = F.reshape(
        F.matmul(beta, self.shapedirs),
        [-1, self.size[0], self.size[1]]) + \
        F.repeat(self.v_template[None, ], batch_size, axis=0)
    self.v_shaped = v_shaped

    # 2. Infer shape-dependent joint locations.
    Jx = F.matmul(v_shaped[:, :, 0], self.J_regressor)
    Jy = F.matmul(v_shaped[:, :, 1], self.J_regressor)
    Jz = F.matmul(v_shaped[:, :, 2], self.J_regressor)
    J = F.stack([Jx, Jy, Jz], axis=2)
    self.J = J

    # 3. Add pose blend shapes
    # N x 24 x 3 x 3
    Rs = F.reshape(
        batch_rodrigues(F.reshape(theta, [-1, 3])), [-1, 24, 3, 3])
    self.Rs = Rs
    # Ignore global rotation.
    pose_feature = F.reshape(
        Rs[:, 1:, :, :] - F.repeat(F.repeat(
            Variable(self.xp.array(self.xp.eye(3), 'f'))[None, ],
            23, axis=0)[None, ], batch_size, axis=0),
        [-1, 207])
    self.pose_feature = pose_feature

    # (N x 207) x (207, 20670) -> N x 6890 x 3
    v_posed = F.reshape(
        F.matmul(pose_feature, self.posedirs),
        [-1, self.size[0], self.size[1]]) + v_shaped

    # 4. Get the global joint location
    self.J_transformed, A = batch_global_rigid_transformation(
        Rs, J, self.parents)

    # 5. Do skinning:
    # W is N x 6890 x 24
    W = F.reshape(
        F.tile(self.weights, (batch_size, 1)), [batch_size, -1, 24])
    # (N x 6890 x 24) x (N x 24 x 16)
    T = F.reshape(
        F.matmul(W, F.reshape(A, [batch_size, 24, 16])),
        [batch_size, -1, 4, 4])
    v_posed_homo = F.concat(
        [v_posed, self.xp.ones([batch_size, v_posed.shape[1], 1], 'f')], 2)
    v_homo = F.matmul(T, F.expand_dims(v_posed_homo, -1))
    verts = v_homo[:, :, :3, 0]

    # Get cocoplus or lsp joints:
    joint_x = F.matmul(verts[:, :, 0], self.joint_regressor)
    joint_y = F.matmul(verts[:, :, 1], self.joint_regressor)
    joint_z = F.matmul(verts[:, :, 2], self.joint_regressor)
    joints = F.stack([joint_x, joint_y, joint_z], axis=2)

    return verts, joints, Rs, A
def logsoftmax_no_mask(x, mask, zero_pad, axis):
    x_logsumexp = logsumexp(x, mask, zero_pad, axis)
    return x - F.broadcast_to(F.expand_dims(x_logsumexp, 1), x.shape)
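# The subtraction above is the log-softmax identity
# log p_i = x_i - logsumexp(x); a standalone check with Chainer's
# built-ins (toy input, no masking):
import numpy as np
import chainer.functions as F

x = np.array([[1.0, 2.0, 3.0]], dtype=np.float32)
lse = F.logsumexp(x, axis=1)
manual = x - F.broadcast_to(F.expand_dims(lse, 1), x.shape)
np.testing.assert_allclose(manual.array, F.log_softmax(x).array, rtol=1e-6)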
def test_invalid_dim(self):
    x = chainer.Variable(self.x)
    with self.assertRaises(chainer.utils.type_check.InvalidType):
        functions.expand_dims(x, self.x.ndim + 1)
    with self.assertRaises(chainer.utils.type_check.InvalidType):
        functions.expand_dims(x, -self.x.ndim - 2)
def __call__(self, h, h_new, g, step=0):
    """
    Describes the module for a single layer update.
    Do not forget to reset the GRU for each batch...

    :param h: minibatch by num_nodes by hidden_dim numpy array.
        current local node hidden states as input of the vanilla GNN
    :param h_new: minibatch by num_nodes by hidden_dim numpy array.
        updated local node hidden states as output from the vanilla GNN
    :param adj: minibatch by bond_types by num_nodes by num_nodes 1/0 array.
        Adjacency matrices over several bond types
    :param g: minibatch by hidden_dim_super numpy array.
        current super node hidden state
    :param step: integer, the layer index
    :return: updated h and g
    """
    xp = self.xp
    # (minibatch, atom, ch)
    mb, atom, ch = h.shape
    out_ch = ch

    #
    # Transmitter unit: inter-module message passing
    #
    # non-linear update of the super node
    g_new = functions.relu(self.F_super[step](g))

    # original --> super transmission
    h1 = functions.expand_dims(h, 2)
    # h1.shape == (mb, atom, 1, ch)
    h1 = functions.broadcast_to(h1, [mb, atom, self.n_heads, ch])
    h1 = functions.reshape(h1, [mb, atom, self.n_heads * ch])
    # h1.shape == (mb, atom, self.n_heads * ch)

    h_j = functions.expand_dims(h, 1)
    h_j = functions.broadcast_to(h_j, (mb, self.n_heads, atom, ch))
    # h_j.shape == (mb, self.n_heads, atom, ch)

    # expand h_super
    g_extend = functions.expand_dims(g, 1)
    # g_extend.shape == (mb, 1, self.hidden_dim_super)
    g_extend = functions.broadcast_to(
        g_extend, (mb, self.n_heads, self.hidden_dim_super))
    # g_extend.shape == (mb, self.n_heads, self.hidden_dim_super)
    g_extend = functions.expand_dims(g_extend, 2)
    # g_extend.shape == (mb, self.n_heads, 1, self.hidden_dim_super)

    # update for attention-message B h_i
    # mb, atom, n_heads * ch
    Bh_i = self.B[step](h1)
    # Bh_i.shape == (mb, atom, self.n_heads * self.hidden_dim_super)
    # mb, atom, num_head, ch
    Bh_i = functions.reshape(
        Bh_i, [mb, atom, self.n_heads, self.hidden_dim_super])
    # mb, num_head, atom, ch
    Bh_i = functions.transpose(Bh_i, [0, 2, 1, 3])
    # Bh_i.shape == (mb, self.n_heads, atom, self.hidden_dim_super)

    # take g^{T} * B * h_i, indexed by i
    # mb, self.n_heads, atom(i)
    b_hi = functions.matmul(g_extend, Bh_i, transb=True)
    # This reduces the last hidden_dim_super axis
    # b_hi.shape == (mb, self.n_heads, 1, atom)

    # softmax. sum/normalize over the last axis.
    # mb, self.n_heads, atom(i-normalized)
    attention_i = functions.softmax(b_hi, axis=3)
    if self.dropout_ratio > 0.0:
        attention_i = functions.dropout(
            attention_i, ratio=self.dropout_ratio)
    # attention_i.shape == (mb, self.n_heads, 1, atom)

    # element-wise product --> sum over i
    # mb, num_head, hidden_dim_super
    attention_sum = functions.matmul(attention_i, h_j)
    # attention_sum.shape == (mb, self.n_heads, 1, ch)
    attention_sum = functions.reshape(
        attention_sum, (mb, self.n_heads * ch))
    # attention_sum.shape == (mb, self.n_heads * ch)

    # weighting h for different heads
    h_trans = self.V_super[step](attention_sum)
    # h_trans.shape == (mb, self.n_heads * ch)
    # compress heads
    h_trans = self.W_super[step](h_trans)
    h_trans = functions.tanh(h_trans)
    # h_trans.shape == (mb, self.hidden_dim_super)

    # g_trans: super --> original transmission, for local updates
    g_trans = self.F_super[step](g)
    g_trans = functions.tanh(g_trans)
    # g_trans.shape == (mb, self.hidden_dim)
    g_trans = functions.expand_dims(g_trans, 1)
    # g_trans.shape == (mb, 1, self.hidden_dim)
    g_trans = functions.broadcast_to(g_trans, (mb, atom, self.hidden_dim))
    # g_trans.shape == (mb, atom, self.hidden_dim)

    #
    # Warp Gate unit
    #
    z_local = self.H_local[step](h_new) + self.G_local[step](g_trans)
    z_local = functions.broadcast_to(z_local, (mb, atom, self.hidden_dim))
    if self.dropout_ratio > 0.0:
        z_local = functions.dropout(z_local, ratio=self.dropout_ratio)
    z_local = functions.sigmoid(z_local)
    merged_h = (1.0 - z_local) * h_new + z_local * g_trans
    # merged_h.shape == (mb, atom, ch)

    z_super = self.H_super[step](h_trans) + self.G_super[step](g_new)
    z_super = functions.broadcast_to(z_super, (mb, self.hidden_dim_super))
    if self.dropout_ratio > 0.0:
        z_super = functions.dropout(z_super, ratio=self.dropout_ratio)
    z_super = functions.sigmoid(z_super)
    merged_g = (1.0 - z_super) * h_trans + z_super * g_new
    # merged_g.shape == (mb, self.hidden_dim_super)

    #
    # Self recurrent
    #
    out_h = functions.reshape(merged_h, (mb * atom, self.hidden_dim))
    out_h = self.GRU_local(out_h)
    out_h = functions.reshape(out_h, (mb, atom, self.hidden_dim))

    out_g = self.GRU_super(merged_g)

    return out_h, out_g
def check_backward(self, x_data):
    x = chainer.Variable(x_data)
    y = functions.expand_dims(x, self.axis)
    y.grad = y.data
    y.backward()
    testing.assert_allclose(x.data, x.grad, atol=0, rtol=0)