def original(self, enc_hs, dec_z, att_prev, scaling=2.0):
    '''AttLoc forward

    :param enc_hs: list of encoder hidden state sequences
    :param dec_z: decoder hidden state
    :param att_prev: previous attention weights
    :param scaling: scaling factor to sharpen the attention distribution
    :return: context vector and attention weights
    '''
    batch = len(enc_hs)
    # pre-compute all h outside the decoder loop
    if self.pre_compute_enc_h is None:
        self.enc_h = F.pad_sequence(enc_hs)  # utt x frame x hdim
        self.h_length = self.enc_h.shape[1]
        # utt x frame x att_dim
        self.pre_compute_enc_h = linear_tensor(self.mlp_enc, self.enc_h)

    if dec_z is None:
        dec_z = chainer.Variable(self.xp.zeros(
            (batch, self.dunits), dtype=np.float32))
    else:
        dec_z = F.reshape(dec_z, (batch, self.dunits))

    # initialize attention weight with uniform dist.
    if att_prev is None:
        att_prev = [self.xp.full(
            hh.shape[0], 1.0 / hh.shape[0], dtype=np.float32) for hh in enc_hs]
        att_prev = [chainer.Variable(att) for att in att_prev]
        att_prev = F.pad_sequence(att_prev)

    # TODO(watanabe) use <chainer variable>.reshape(), instead of F.reshape()
    # att_prev: utt x frame -> utt x 1 x 1 x frame -> utt x att_conv_chans x 1 x frame
    att_conv = self.loc_conv(
        F.reshape(att_prev, (batch, 1, 1, self.h_length)))
    # att_conv: utt x att_conv_chans x 1 x frame -> utt x frame x att_conv_chans
    att_conv = F.swapaxes(F.squeeze(att_conv, axis=2), 1, 2)
    # att_conv: utt x frame x att_conv_chans -> utt x frame x att_dim
    att_conv = linear_tensor(self.mlp_att, att_conv)

    # dec_z_tiled: utt x frame x att_dim
    dec_z_tiled = F.broadcast_to(
        F.expand_dims(self.mlp_dec(dec_z), 1), self.pre_compute_enc_h.shape)

    # dot with gvec
    # utt x frame x att_dim -> utt x frame
    # TODO(watanabe) use batch_matmul
    e = F.squeeze(linear_tensor(self.gvec, F.tanh(
        att_conv + self.pre_compute_enc_h + dec_z_tiled)), axis=2)
    # Applying a minus-large-number filter to zero out the probability of the padded
    # area simply degrades performance, so that implementation was abandoned.
    # Apply scaling to sharpen the attention distribution
    w = F.softmax(scaling * e)

    # weighted sum over frames
    # utt x hdim
    c = F.sum(self.enc_h * F.broadcast_to(
        F.expand_dims(w, 2), self.enc_h.shape), axis=1)

    return c, w
def loss_Critic(self, dis_c, alpha, dis_y1, dis_y2):
    xp = chainer.backend.get_array_module(dis_c.data)
    batchsize = len(dis_c)
    loss = F.sum((F.squeeze(dis_c) - F.squeeze(alpha)) ** 2) / batchsize
    loss_ = F.sum(dis_y1 ** 2) / batchsize
    loss_ += F.sum(dis_y2 ** 2) / batchsize
    loss += loss_ / 2
    chainer.report({'Critic_loss': loss})
    return loss
def cos_sim(x, y):
    # squeeze raises an error when batchsize == 1, so the trailing singleton
    # axes are removed one at a time with an explicit axis argument
    if len(x.shape) > 2:
        norm_x = F.normalize(F.squeeze(F.squeeze(x, axis=(2,)), axis=(2,)))
        norm_y = F.normalize(F.squeeze(F.squeeze(y, axis=(2,)), axis=(2,)))
    else:
        norm_x = F.normalize(x)
        norm_y = F.normalize(y)
    return F.batch_matmul(norm_x, norm_y, transa=True)
def __call__(self, x_block, y_in_block, y_out_block):
    batch = len(x_block)

    # embed
    ex_block = F.dropout(
        self.make_input_embedding(self.embed_x, x_block), self.dropout)
    ey_block = F.dropout(
        self.make_input_embedding(self.embed_y, y_in_block), self.dropout)
    eyy_block = F.dropout(
        self.make_input_embedding(self.embed_yy, y_in_block), self.dropout)
    eys = F.transpose(ey_block, (0, 2, 1))
    eyys = F.transpose(eyy_block, (0, 2, 1))

    # gcnn
    h = F.expand_dims(ex_block, axis=1)
    for i in range(self.stack):
        h = self.gcnn[i](h)
    h = F.dropout(F.squeeze(h, axis=1), self.dropout)

    # NStepLSTM decoders
    eys2 = [i for i in eys]
    eyys2 = [i for i in eyys]
    _, _, oss = self.decoder(None, None, eys2)
    _, _, oss2 = self.decoder2(None, None, eyys2)
    ss = F.stack(oss, axis=0)
    ss2 = F.stack(oss2, axis=0)

    # make mask
    mask = (y_in_block[:, :, None] >= 0) * self.xp.ones(
        (self.batch, 1, self.n_units), dtype=bool)
    ss = F.where(mask, ss, self.xp.full(ss.shape, 0, 'f'))

    # calculate attention weights
    batch_A = F.batch_matmul(ss, h) * self.scale_score
    mask = (x_block[:, 0:len(x_block[0]) - self.stack * (self.width - 1)][:, None, :] >= 0) * \
        (y_in_block[:, :, None] >= 0)
    batch_A = F.where(
        mask, batch_A, self.xp.full(batch_A.shape, -self.xp.inf, 'f'))
    batch_A = F.softmax(batch_A, axis=2)
    batch_A = F.where(self.xp.isnan(batch_A.data),
                      self.xp.zeros(batch_A.shape, 'f'), batch_A)
    batch_A, h = F.broadcast(batch_A[:, None], h[:, :, None])
    batch_C = F.sum(batch_A * h, axis=3)

    e = F.transpose(batch_C, (0, 2, 1))
    e = F.squeeze(F.concat(F.split_axis(e, self.batch, axis=0), axis=1))
    ss2 = F.squeeze(F.concat(F.split_axis(ss2, self.batch, axis=0), axis=1))
    t = (self.We(e) + self.Ws(ss2))
    t = F.dropout(t, self.dropout)

    concat_ys_out = F.concat(y_out_block, axis=0)
    loss = F.sum(F.softmax_cross_entropy(t, concat_ys_out, reduce='no')) / batch

    chainer.report({'loss': loss.data}, self)
    n_words = concat_ys_out.shape[0]
    perp = self.xp.exp(loss.data * batch / n_words)
    chainer.report({'perp': perp}, self)
    return loss
def _sample_state(self, transision, s_shape=(32, 7), z_shape=(32, 4)):
    s_current = self._Uniform.sample(sample_shape=s_shape)
    s_current = F.squeeze(s_current)
    s_next, _ = transision(s_current)
    z = self._Normal.sample(sample_shape=z_shape)
    z = F.squeeze(z)
    assert s_shape == s_current.shape
    assert s_shape == s_next.shape
    assert z_shape == z.shape
    return s_current, s_next, z
def __call__(self, x, enc_out=None, mask=None):
    """
    args
        x: paralleled main features in the model
           Variable in (batch, hidden_dim, length)
        enc_out: hidden features from Encoder
           Variable in (batch, hidden_dim, length)
        mask: padding-mask or future-mask
           xp-array in (batch, length, length)
           an element takes 'False' when pad/future, otherwise 'True'
    returns
        weighted_sum: attention output
           Variable in (batch, hidden_dim, length)
    """
    # ksize-1-convolution results in parallel linear projections
    if self.self_attention:
        qkv = F.squeeze(self.W(F.expand_dims(x, axis=3)), axis=3)
        query, key, value = F.split_axis(qkv, 3, axis=1)
    else:
        query = F.squeeze(self.W_Q(F.expand_dims(x, axis=3)), axis=3)
        kv = F.squeeze(self.W_KV(F.expand_dims(enc_out, axis=3)), axis=3)
        key, value = F.split_axis(kv, 2, axis=1)

    # make q, k, v into (batch * parallel, dim / parallel, length) shape
    query = F.concat(F.split_axis(query, self.parallel_num, axis=1), axis=0)
    key = F.concat(F.split_axis(key, self.parallel_num, axis=1), axis=0)
    value = F.concat(F.split_axis(value, self.parallel_num, axis=1), axis=0)
    mask = self.xp.concatenate([mask] * self.parallel_num, axis=0)

    attention_weight = F.batch_matmul(query, key, transa=True) * self.scale
    attention_weight = F.where(
        mask, attention_weight,
        self.xp.full(attention_weight.shape, -np.inf, dtype=np.float32))
    attention_weight = F.softmax(attention_weight, axis=2)
    attention_weight = F.dropout(attention_weight, self.dropout_rate)
    attention_weight = F.where(
        self.xp.isnan(attention_weight.data),
        self.xp.full(attention_weight.shape, 0, dtype=np.float32),
        attention_weight)
    self.attention_weight = copy.deepcopy(attention_weight.data)

    # attention: (batch, q-length, k-length) -> (batch, 1, q-length, k-length)
    # value: (batch, dim/parallel, k-length) -> (batch, dim/parallel, 1, k-length)
    attention_weight, value = F.broadcast(
        attention_weight[:, None], value[:, :, None])
    weighted_sum = F.sum(attention_weight * value, axis=3)
    weighted_sum = F.concat(
        F.split_axis(weighted_sum, self.parallel_num, axis=0), axis=1)
    weighted_sum = F.squeeze(self.linear(
        F.expand_dims(weighted_sum, axis=3)), axis=3)
    return weighted_sum
def translate(self, xs, max_length=100):
    xs = numpy.insert(xs, 0, 2)
    xs = numpy.append(xs, 0)
    with chainer.no_backprop_mode(), chainer.using_config('train', False):
        exs = self.embed_x(Variable(self.xp.array(xs, dtype=self.xp.int32)))
        h = F.expand_dims(exs, axis=0)
        h = F.expand_dims(h, axis=0)
        h = F.transpose(h, (0, 1, 3, 2))
        for i in range(self.stack):
            h = self.gcnn[i](h)
        h = F.squeeze(h, axis=1)
        h = F.squeeze(h, axis=0)
        h = F.transpose(h, (1, 0))

        ys = self.xp.full(1, 2, self.xp.int32)
        result = []
        hx = None
        cx = None
        hx2 = None
        cx2 = None
        for i in range(max_length):
            eys = self.embed_y(ys)
            eyys = self.embed_yy(ys)
            eys2 = [eys]
            eyys2 = [eyys]
            hx, cx, ss = self.decoder(hx, cx, eys2)
            hx2, cx2, ss2 = self.decoder2(hx2, cx2, eyys2)
            batch_A = F.matmul(h, ss[0], transb=True) * self.scale_score
            batch_A = F.softmax(batch_A, axis=0)
            if self.weight:
                with open("weight/wei.txt", "a", encoding="utf-8") as f:
                    for j in range(len(batch_A)):
                        f.write(str(batch_A[j][0].data) + "\n")
                    f.write("--------------\n")
            s = F.matmul(batch_A, h, transa=True)
            t = (self.We(s) + self.Ws(ss2[0]))
            ys = self.xp.argmax(t.data, axis=1).astype(self.xp.int32)
            if ys[0] == 0:
                break
            result.append(ys)
        result = cuda.to_cpu(
            self.xp.concatenate([self.xp.expand_dims(x, 0) for x in result]).T)

    # Remove EOS tags
    outs = []
    for y in result:
        inds = numpy.argwhere(y == EOS)
        if len(inds) > 0:
            y = y[:inds[0, 0]]
        outs.append(y)
    return outs
def compute_kl_after_update(loss_func, n=100):
    policy = copy.deepcopy(base_policy)
    optimizer = chainer.optimizers.SGD(1e-4)
    optimizer.setup(policy)
    for _ in range(n):
        distrib = policy(x)
        policy.cleargrads()
        F.squeeze(loss_func(distrib)).backward()
        optimizer.update()
    distrib_after = policy(x)
    return float(another_distrib.kl(distrib_after).array)
def update_Q():
    # Predicted values: Q(s,a)
    y = F.squeeze(Q(obs, action), axis=1)
    # Target values: r + gamma * Q(s,policy(s))
    with chainer.no_backprop_mode():
        next_q = F.squeeze(target_Q(obs_next, target_policy(obs_next)), axis=1)
        target = reward + gamma * (1 - done) * next_q
    loss = F.mean_squared_error(y, target)
    Q.cleargrads()
    loss.backward()
    opt_Q.update()
def bdot(x, y):
    """Batch dot product (not to be confused with the exterior product).

    :param x: batch times n
    :param y: batch times n
    :return: batch-dimensional vector of inner products
    """
    assert x.shape[0] == y.shape[0]
    assert x.shape[1] == y.shape[1]
    xT = F.expand_dims(x, 1)
    y = F.expand_dims(y, 2)
    res = F.squeeze(F.squeeze(xT @ y, axis=1), axis=1)
    return res
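# A minimal usage sketch (hypothetical, not part of the original source), assuming
# the bdot helper above and its chainer.functions import (F) are in scope.
# For inputs of shape (batch, n) it returns the per-sample inner product, shape (batch,).
import numpy as np

x = np.arange(6, dtype=np.float32).reshape(2, 3)  # [[0, 1, 2], [3, 4, 5]]
y = np.ones((2, 3), dtype=np.float32)
print(bdot(x, y))  # expected values: 3. and 12. (0+1+2 and 3+4+5)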
def update_core(self):
    gen_optimizer = self.get_optimizer('opt_gen')
    dis_optimizer = self.get_optimizer('opt_dis')
    xp = self.gen.xp
    opt = self.opt

    batch = self.get_iterator('main').next()
    batchsize = len(batch)

    x = denoise.add_noise(batch, self.opt)
    x = utils.prepare_data_for_cnn(x, opt.maxlen, opt.filter_shape)
    x_org = utils.prepare_data_for_rnn(
        batch, opt.maxlen, opt.sent_len, opt.n_words, is_add_GO=True)
    x = xp.array(x, dtype=np.int32)
    x_org = xp.array(x_org, dtype=np.int32)

    # generator
    syn_sents, prob = self.gen(x, x_org)  # prob: fake data

    # discriminator
    logits_real, H_real = self.dis(x)
    logits_fake, H_fake = self.dis(prob, is_prob=True)

    # one hot vector
    labels_one = xp.ones((batchsize), dtype=xp.int32)  # 1-dim array
    labels_zero = xp.zeros((batchsize), dtype=xp.int32)
    labels_fake = labels_zero  # F.concat([labels_one, labels_zero], axis=1)
    labels_real = labels_one   # F.concat([labels_zero, labels_one], axis=1)

    D_loss = F.softmax_cross_entropy(logits_real, labels_real) + \
        F.softmax_cross_entropy(logits_fake, labels_fake)
    G_loss = compute_MMD_loss(F.squeeze(H_fake), F.squeeze(H_real))

    self.gen.cleargrads()
    G_loss.backward()
    gen_optimizer.update()

    self.dis.cleargrads()
    D_loss.backward()
    dis_optimizer.update()

    H_fake.unchain_backward()
    H_real.unchain_backward()
    prob.unchain_backward()

    chainer.reporter.report({'loss_gen': G_loss})
    chainer.reporter.report({'loss_dis': D_loss})
def bquad(x, Q):
    """Calculate x^T Q x in batch mode.

    :param x: vector, batch times n
    :param Q: batch times n times n
    :return: batch-dimensional vector
    """
    assert x.shape[0] == Q.shape[0], "batch mismatch" + str(x.shape) + ":" + str(Q.shape)
    assert x.shape[1] == Q.shape[1], "mat mul dim mismatch"
    assert Q.shape[2] == Q.shape[1], "Q is not square matrix"
    xT = F.expand_dims(x, 1)
    x_ = F.expand_dims(x, 2)
    res = F.squeeze(F.squeeze(xT @ Q @ x_, axis=1), axis=1)
    assert list(res.shape) == [list(x.shape)[0]]
    return res
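# A minimal usage sketch (hypothetical, not part of the original source), assuming
# the bquad helper above is in scope. With Q set to identity matrices, x^T Q x
# reduces to the squared norm of each row of x.
import numpy as np

x = np.array([[1., 2.], [3., 4.]], dtype=np.float32)
Q = np.stack([np.eye(2, dtype=np.float32)] * 2)  # shape (2, 2, 2)
print(bquad(x, Q))  # expected values: 5. and 25. (1^2+2^2 and 3^2+4^2)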
def bias_correction_policy_gradients(truncation_threshold):
    gs = []
    for sample in mu_samples:
        base_policy.cleargrads()
        loss = acer.compute_policy_gradient_loss(
            action=sample,
            advantage=evaluate_action(sample),
            action_distrib=pi,
            action_distrib_mu=mu,
            action_value=action_value,
            v=0,
            truncation_threshold=truncation_threshold)
        F.squeeze(loss).backward()
        gs.append(extract_gradients_as_single_vector(base_policy))
    return gs
def get_onehot_grad(self, xs, ys=None):
    if ys is None:
        with chainer.using_config('train', False):
            ys = self.predict(xs, argmax=True)
    u, exs_prem = self.encoder.get_grad(xs[0])
    v, exs_hypo = self.encoder.get_grad(xs[1])
    encodings = F.concat((u, v, F.absolute(u - v), u * v), axis=1)
    outputs = self.output(self.mlp(encodings, no_dropout=True))
    loss = F.softmax_cross_entropy(outputs, ys)
    exs = exs_hypo
    lengths = [len(x) for x in xs[1]]
    if isinstance(exs, tuple):
        exs_grad = chainer.grad([loss], exs)
        ex_sections = np.cumsum([ex.shape[0] for ex in exs[:-1]])
        exs = F.concat(exs, axis=0)
        exs_grad = F.concat(exs_grad, axis=0)
        onehot_grad = F.sum(exs_grad * exs, axis=1)
        onehot_grad = F.split_axis(onehot_grad, ex_sections, axis=0)
    else:
        exs_grad = chainer.grad([loss], [exs])[0]
        # (batch_size, n_dim, max_length, 1)
        assert exs_grad.shape == exs.shape
        onehot_grad = F.squeeze(F.sum(exs_grad * exs, 1), 2)
    onehot_grad = [x[:l] for x, l in zip(onehot_grad, lengths)]
    return onehot_grad
def get_onehot_grad(self, xs, ys=None):
    if ys is None:
        with chainer.using_config('train', False):
            ys = self.predict(xs, argmax=True)
        ys = F.expand_dims(ys, axis=1)
        ys = [y for y in ys]
    encodings, exs = self.encoder.get_grad(xs)
    outputs = self.output(encodings)
    concat_truths = F.concat(ys, axis=0)
    loss = F.softmax_cross_entropy(outputs, concat_truths)
    if isinstance(exs, tuple):
        exs_grad = chainer.grad([loss], exs)
        ex_sections = np.cumsum([ex.shape[0] for ex in exs[:-1]])
        exs = F.concat(exs, axis=0)
        exs_grad = F.concat(exs_grad, axis=0)
        onehot_grad = F.sum(exs_grad * exs, axis=1)
        onehot_grad = F.split_axis(onehot_grad, ex_sections, axis=0)
    else:
        exs_grad = chainer.grad([loss], [exs])[0]
        # (batch_size, n_dim, max_length, 1)
        assert exs_grad.shape == exs.shape
        onehot_grad = F.squeeze(F.sum(exs_grad * exs, 1), 2)
    lengths = [len(x) for x in xs]
    onehot_grad = [x[:l] for x, l in zip(onehot_grad, lengths)]
    return onehot_grad
def lstm_first_forward_func(self, xs):
    # xs: (T, F, in_size)
    xp = chainer.cuda.cupy.get_array_module(xs[0].data)
    xs = F.transpose(xs, axes=(1, 0, 2))  # shape = F, T, in_size
    xs = [F.squeeze(e) for e in
          F.split_axis(xs, xs.shape[0], axis=0, force_tuple=True)]
    _, _, hs = self.lstm(xs)  # hs is list of T x D variable
    hs = F.stack(hs)
    box_num, frame, _ = hs.shape
    hs = F.reshape(hs, (-1, hs.shape[-1]))
    hs = F.relu(self.fc2(hs))
    hs = F.reshape(hs, shape=(box_num, frame, -1))
    hs = F.transpose(hs, axes=(1, 0, 2))  # shape = T, F, 1024
    for relation_module_str in self.relation_module_name[
            :len(self.relation_module_name) // 2]:
        hs = getattr(self, relation_module_str)(hs, hs, hs)  # shape = T, F, 1024
    hs = F.reshape(hs, (-1, hs.shape[-1]))
    hs = F.relu(self.fc3(hs))
    hs = F.reshape(hs, shape=(frame, box_num, -1))
    for relation_module_str in self.relation_module_name[
            len(self.relation_module_name) // 2:]:
        hs = getattr(self, relation_module_str)(hs, hs, hs)  # shape = T, F, 1024
    hs = F.reshape(hs, (-1, hs.shape[-1]))
    hs = self.fc4(hs)
    hs = F.reshape(hs, shape=(frame, box_num, self.out_size))
    return hs
def attend(self, encoded_features):
    self.out_lstm.reset_state()
    transformed_encoded_features = F.concat(
        [F.expand_dims(self.transform_encoded_features(feature), axis=1)
         for feature in encoded_features], axis=1)
    concat_encoded_features = F.concat(
        [F.expand_dims(e, axis=1) for e in encoded_features], axis=1)

    lstm_output = self.xp.zeros_like(encoded_features[0])
    outputs = []
    for _ in range(self.num_labels):
        transformed_lstm_output = self.transform_out_lstm_feature(lstm_output)
        attended_feats = []
        for transformed_encoded_feature in F.separate(
                transformed_encoded_features, axis=1):
            attended_feat = transformed_encoded_feature + transformed_lstm_output
            attended_feat = F.tanh(attended_feat)
            attended_feats.append(self.generate_attended_feat(attended_feat))

        attended_feats = F.concat(attended_feats, axis=1)
        alphas = F.softmax(attended_feats, axis=1)

        lstm_input_feature = F.batch_matmul(
            alphas, concat_encoded_features, transa=True)
        lstm_input_feature = F.squeeze(lstm_input_feature, axis=1)
        lstm_output = self.out_lstm(lstm_input_feature)
        outputs.append(lstm_output)
    return outputs
def gcams_to_mask(gcams_from_chainer, class_ids, dataset=None, img=None):
    if len(class_ids) == 0:
        return None
    gcams_np = []
    gcam_aggregate = None
    for i in range(len(gcams_from_chainer)):
        # gcam for class i
        gcams_np.append(cp.asnumpy(
            F.squeeze(gcams_from_chainer[i][0], 0).data))
    print(class_ids)
    for i in range(len(gcams_np)):
        # so earlier indices will have brighter heatmaps
        gcam_np = gcams_np[i]
        print("Max gcam magnitude for {}: ".format(
            class_names[int(class_ids[i]) + 1]) + str(np.max(gcams_np[i])))
        print("Min gcam magnitude for {}: ".format(
            class_names[int(class_ids[i]) + 1]) + str(np.min(gcams_np[i])))
        mask = _gcam_to_mask(gcam_np, int(class_ids[i]))
        assert mask is not None
        cv2.imshow("mask", np.uint8(mask))
        cv2.waitKey(0)
        if gcam_aggregate is None:
            gcam_aggregate = mask
        else:
            gcam_aggregate = gcam_aggregate + mask
    return gcam_aggregate
def __call__(self, x, z):
    """
    Args:
        x (~chainer.Variable): Batch of input vectors.
        z (~chainer.Variable): Batch of context vectors.

    Returns:
        ~chainer.Variable: Output of the context layer.
    """
    if self.has_uninitialized_params:
        with cuda.get_device(self._device_id):
            self._initialize_params(x.size // x.shape[0])

    batch_size = x.shape[0]

    # compute adaptive filter
    W = self.predictor(z)

    # reshape linear W to the correct size
    W = F.reshape(W, [batch_size] + self.shape)

    # add constant W if defined
    if self.constantW:
        W += F.tile(self.C, (batch_size, 1, 1))

    # multiply weights with inputs in batch mode
    y = F.squeeze(F.batch_matmul(W, x), 2)

    # add bias
    y += F.tile(self.b, tuple([batch_size, 1]))

    return y
def masked_self_attention(self, input, adj, step):
    adj = np.sum(adj, axis=1)
    # [mb, atoms, ch]
    mb, atoms, ch = input.shape
    attention_layer_index = 0 if self.attention_tying else step

    # [mb, atoms, hidden_dim]
    h = functions.reshape(input, shape=(mb * atoms, ch))
    h = self.linear_transform_layer[attention_layer_index](h)
    h = functions.reshape(h, shape=(mb, atoms, -1))

    # [mb, atoms, atoms, 2 * hidden_dim]
    a_input = functions.concat(
        [functions.tile(h, reps=(1, 1, atoms)).reshape(mb, atoms * atoms, -1),
         functions.tile(h, reps=(1, atoms, 1))],
        axis=-1).reshape(mb, atoms, atoms, 2 * self.hidden_dim)
    a_input = functions.reshape(
        a_input, shape=(mb * atoms * atoms, 2 * self.hidden_dim))

    # [mb * atoms * atoms, 2 * hidden_dim] => [mb * atoms * atoms, 1] => [mb, atoms, atoms]
    e = functions.leaky_relu(
        functions.reshape(
            functions.squeeze(
                self.neural_network_layer[attention_layer_index](a_input), axis=-1),
            shape=(mb, atoms, atoms)))

    # [mb, atoms, atoms]
    zero_vec = -9e15 * self.xp.ones_like(e, dtype=self.xp.float32)
    # [mb, atoms, atoms]
    attention = functions.where(adj > 0, e, zero_vec)
    # [mb, atoms, atoms]
    attention = functions.softmax(attention, axis=2)
    # [mb, atoms, atoms] * [mb, atoms, hidden_dim] => [mb, atoms, hidden_dim]
    h_prime = functions.matmul(attention, h)
    h_prime = functions.elu(h_prime)
    return h_prime
def forward(self, ws, cs, ls, dep_ts=None):
    ws = map(self.emb_word, ws)
    cs = [F.squeeze(
        F.max_pooling_2d(
            self.conv_char(
                F.expand_dims(
                    self.emb_char(c), 1)), (int(l[0]), 1)))
        for c, l in zip(cs, ls)]
    xs_f = [F.dropout(F.concat([w, c]), 0.5) for w, c in zip(ws, cs)]
    xs_b = [x[::-1] for x in xs_f]

    _, _, hs_f = self.lstm_f(None, None, xs_f)
    _, _, hs_b = self.lstm_b(None, None, xs_b)
    hs_b = [x[::-1] for x in hs_b]
    hs = [F.concat([h_f, h_b]) for h_f, h_b in zip(hs_f, hs_b)]

    dep_ys = [self.biaffine_arc(
        F.elu(F.dropout(self.arc_dep(h), 0.32)),
        F.elu(F.dropout(self.arc_head(h), 0.32))) for h in hs]

    if dep_ts is not None:
        heads = dep_ts
    else:
        heads = [F.argmax(y, axis=1) for y in dep_ys]

    cat_ys = [self.biaffine_tag(
        F.elu(F.dropout(self.rel_dep(h), 0.32)),
        F.elu(F.dropout(self.rel_head(
            F.embed_id(t, h, ignore_label=IGNORE)), 0.32)))
        for h, t in zip(hs, heads)]

    return cat_ys, dep_ys
def encode(self, image, obj, desc, num):
    xp = cuda.cupy
    cuda.get_device(GPU.gpus_to_use[num % GPU.num_gpus]).use()

    obj = np.asarray(obj, dtype=np.float32)
    obj = np.repeat(obj[np.newaxis], image.shape[0], axis=0)
    desc = np.asarray(desc, dtype=np.float32)
    desc = np.repeat(desc[np.newaxis], image.shape[0], axis=0)

    o_in = cuda.to_gpu(obj, GPU.gpus_to_use[num % GPU.num_gpus])
    d_in = cuda.to_gpu(desc, GPU.gpus_to_use[num % GPU.num_gpus])
    x_in = cuda.to_gpu(image, GPU.gpus_to_use[num % GPU.num_gpus])

    att, _, _ = self.enc_models[num % 2](
        Variable(x_in), Variable(o_in), Variable(d_in), train=False)
    att = F.reshape(att, (-1, 1, self.att_size, self.att_size))
    att = F.resize_images(att, (self.image_size, self.image_size))
    cir_z, _, _, _ = self.att_enc_models[num % 2](Variable(x_in) * att, train=False)

    return cir_z, F.squeeze(F.concat((o_in[0], d_in[0]), axis=-1))
def __call__(self, S, h):
    batch_size, src_len, hidden_size = S.data.shape
    S = self.inner_weight(F.reshape(S, (batch_size * src_len, hidden_size)))
    S = F.reshape(S, (batch_size, src_len, hidden_size))
    a = F.softmax(F.squeeze(F.batch_matmul(S, h), axis=2))
    return a
def forward(self, x):
    batchsize = x.shape[0]
    assert self.is_convdim_compatible(x), \
        'kernel dim %d is not compatible to input spatial dim %d' % (
            self.gm.CONV_DIM, len(x.shape) - 2)
    Z = F.reshape(x, self.gm.get_dims(tensor_id=0, expanded=True).tolist())
    image_flags = self.gm.indices2flags(self.gm.get_image_indices())
    filter_flags = self.gm.indices2flags(self.gm.get_filter_indices())
    for tensor_id in range(1, self.gm.num_tensors):
        logging.debug('Next processing:')
        logging.debug(tensor_id)
        sum_flags = self.gm.indices2flags(self.gm.get_sum_indices(tensor_id))
        Z = expanded_einconv(Z, self.get_param(tensor_id), sum_flags,
                             image_flags, filter_flags, self.xp)
        if self.gm.is_relu(tensor_id) and tensor_id < (self.gm.num_tensors - 1):
            Z = F.relu(Z)
        if self.batchnorm:
            Z = self.get_bn(tensor_id)(Z)
    Z = F.squeeze(Z)
    for i, d in enumerate(self.gm.dims[image_flags].tolist()):
        if d == 1:
            Z = F.expand_dims(Z, i + 2)
    if batchsize == 1:
        Z = F.expand_dims(Z, 0)
    return Z
def __call__(self, id, x):
    W = self.W_embedding(id)
    b = F.squeeze(self.b_embedding(id))
    # Reshape the vector to be the right dimensions for 2D conv
    W = F.reshape(W, (self.out_channels, self.in_channels, self.kh, self.kw))
    return F.convolution_2d(x, W, b, self.stride, self.pad)
def predict(self, xs):
    """
    batch: list of split sentences
    """
    xs = [self.extractor.process(x) for x in xs]
    batchsize = len(xs)
    ws, cs, ls = zip(*xs)
    ws = map(self.emb_word, ws)
    cs = [F.squeeze(
        F.max_pooling_2d(
            self.conv_char(F.expand_dims(self.emb_char(c), 1)), (l, 1)))
        for c, l in zip(cs, ls)]
    xs_f = [F.dropout(F.concat([w, c]), self.dropout_ratio, train=self.train)
            for w, c in zip(ws, cs)]
    xs_b = [x[::-1] for x in xs_f]
    cx_f, hx_f, cx_b, hx_b = self._init_state(batchsize)
    _, _, hs_f = self.lstm_f(hx_f, cx_f, xs_f, train=self.train)
    _, _, hs_b = self.lstm_b(hx_b, cx_b, xs_b, train=self.train)
    hs_b = [x[::-1] for x in hs_b]
    ys = [self.linear2(F.relu(self.linear1(F.concat([h_f, h_b]))))
          for h_f, h_b in zip(hs_f, hs_b)]
    return [y.data[1:-1] for y in ys]
def forward(self, equery, vmemory, ememory, mask, iteration=0):
    """Compute an attention over memory given the query."""
    # equery.shape == (..., E)
    # vmemory.shape == (..., Ms, M)
    # ememory.shape == (..., Ms, E)
    # mask.shape == (..., Ms)
    # Setup memory embedding
    eq = F.repeat(equery[..., None, :], vmemory.shape[-2], -2)  # (..., Ms, E)
    # Compute content based attention
    merged = F.concat(
        [eq, ememory, eq * ememory, F.squared_difference(eq, ememory)], -1)  # (..., Ms, 4*E)
    inter = self.att_linear(merged, n_batch_axes=len(vmemory.shape) - 1)  # (..., Ms, E)
    inter = F.tanh(inter)  # (..., Ms, E)
    inter = F.dropout(inter, DROPOUT)  # (..., Ms, E)
    # Split into sentences
    lengths = np.sum(np.any((vmemory != 0), -1), -1)  # (...,)
    mems = [s[..., :l, :] for s, l in
            zip(F.separate(inter, 0), lengths)]  # B x [(M1, E), (M2, E), ...]
    _, bimems = self.att_birnn(None, mems)  # B x [(M1, 2*E), (M2, 2*E), ...]
    bimems = F.pad_sequence(bimems)  # (..., Ms, 2*E)
    att = self.att_score(bimems, n_batch_axes=len(vmemory.shape) - 1)  # (..., Ms, 1)
    att = F.squeeze(att, -1)  # (..., Ms)
    if mask is not None:
        att += mask * MINUS_INF  # (..., Ms)
    return att
def __call__(self, x, t, dataset, train=True):
    # Create variables
    x = Variable(x)
    x.to_gpu(self.gpu_id)
    t = Variable(t)
    t.to_gpu(self.gpu_id)

    # Config mode
    if len(t.shape) == 3:
        config_mode = 'segmentation'
    elif len(t.shape) == 2:
        config_mode = 'recognition'
    else:
        raise ValueError('label format is not supported')

    # Forward
    with chainer.using_config('train', train):
        with chainer.using_config('enable_backprop', train):
            # InceptionV3 backbone
            x = self.predictor(x)
            # Classifiers
            classifier_indx = self.args.dataset.split('+').index(dataset)
            y = self.classifiers[classifier_indx](x, train)
            # Loss
            if config_mode == 'segmentation':
                self.y = F.resize_images(y, t.shape[-2:])  # Upsampling logits
                self.loss = F.softmax_cross_entropy(self.y, t)
            elif config_mode == 'recognition':
                self.y = F.squeeze(F.average_pooling_2d(
                    y, ksize=y.shape[-2:]), axis=(2, 3))  # Global Average Pooling
                self.loss = F.sigmoid_cross_entropy(self.y, t)

    # Backward
    if train:
        # Clear grads for uninitialized params
        self.cleargrads()
        # Backwards
        self.loss.backward()

    # Reporter
    if config_mode == 'segmentation':
        self.y = F.argmax(self.y, axis=1)
        self.y.to_cpu()
        t.to_cpu()
        result = eval_semantic_segmentation(list(self.y.data), list(t.data))
        del result['iou'], result['class_accuracy']
        result.update({'loss': self.loss.data.tolist()})
        self.reporter.update({dataset: result})
    elif config_mode == 'recognition':
        self.reporter.update({
            dataset: {
                'loss': self.loss.data.tolist(),
                'prediction': F.sigmoid(self.y).data.tolist(),
                'groundtruth': t.data.tolist()
            }
        })
def __call__(self, encs, hiddens, batch_size, prev_image, num_masks, color_channels):
    """
    Learn through StatelessCDNA.

    Args:
        encs: An array of computed transformation
        hiddens: An array of hidden layers
        batch_size: Size of mini batches
        prev_image: The image to transform
        num_masks: Number of masks to apply
        color_channels: Output color channels
    Returns:
        transformed: A list of masks to apply on the previous image
    """
    logger = logging.getLogger(__name__)

    enc0, enc1, enc2, enc3, enc4, enc5, enc6 = encs
    hidden1, hidden2, hidden3, hidden4, hidden5, hidden6, hidden7 = hiddens

    img_height = prev_image.shape[2]
    img_width = prev_image.shape[3]

    # CDNA specific
    enc7 = self.enc7(enc6)
    enc7 = F.relu(enc7)
    transformed_list = list([F.sigmoid(enc7)])

    # CDNA specific
    # Predict kernels using linear function of last layer
    cdna_input = F.reshape(hidden5, (int(batch_size), -1))
    cdna_kerns = self.cdna_kerns(cdna_input)

    # Reshape and normalize
    # B x C x H x W => B x NUM_MASKS x 1 x H x W
    cdna_kerns = F.reshape(
        cdna_kerns, (int(batch_size), self.num_masks, 1, DNA_KERN_SIZE, DNA_KERN_SIZE))
    cdna_kerns = F.relu(cdna_kerns - RELU_SHIFT) + RELU_SHIFT
    norm_factor = F.sum(cdna_kerns, (2, 3, 4), keepdims=True)
    cdna_kerns = broadcasted_division(cdna_kerns, norm_factor)

    # Treat the color channel dimension as the batch dimension since the same
    # transformation is applied to each color channel.
    # Treat the batch dimension as the channel dimension so that
    # F.depthwise_convolution_2d can apply a different transformation to each sample.
    cdna_kerns = F.reshape(
        cdna_kerns, (int(batch_size), self.num_masks, DNA_KERN_SIZE, DNA_KERN_SIZE))
    cdna_kerns = F.transpose(cdna_kerns, (1, 0, 2, 3))

    # Swap the batch and channel dimension.
    prev_image = F.transpose(prev_image, (1, 0, 2, 3))

    # Transform the image. Use integer division for the padding so the value
    # stays an int under Python 3.
    transformed = F.depthwise_convolution_2d(
        prev_image, cdna_kerns, stride=(1, 1), pad=DNA_KERN_SIZE // 2)

    # Transpose the dimensions where they belong.
    transformed = F.reshape(
        transformed, (color_channels, int(batch_size), self.num_masks, img_height, img_width))
    transformed = F.transpose(transformed, (2, 1, 0, 3, 4))
    transformed = F.split_axis(transformed, indices_or_sections=self.num_masks, axis=0)
    transformed = [F.squeeze(t, axis=0) for t in transformed]

    transformed_list += transformed

    return transformed_list, enc7
def __call__(self, x, lam):
    h = GRL(lam)(x)
    h = F.dropout(F.relu(self.bn1(self.fc1(h))), 0.2)
    h = self.fc2(h)
    return F.squeeze(h)
def query(self, u):
    xp = cuda.get_array_module(u)
    size = self.m.shape[1]
    inds = xp.arange(size - 1, -1, -1, dtype=numpy.int32)
    tm = self.TA(inds)
    tc = self.TC(inds)
    tm = F.broadcast_to(tm, self.m.shape)
    tc = F.broadcast_to(tc, self.c.shape)
    p = F.softmax(F.batch_matmul(self.m + tm, u))
    o = F.batch_matmul(F.swapaxes(self.c + tc, 2, 1), p)
    o = F.squeeze(o, -1)
    u = o + u
    return u
def forward(self, inputs):
    """
    Parameters
    ----------
    inputs: ``torch.autograd.Variable``
        Shape ``(batch_size, timesteps, 50)`` of character ids representing the
        current batch.

    Returns
    -------
    Dict with keys:
    ``'activations'``: ``List[torch.autograd.Variable]``
        A list of activations at each layer of the network, each of shape
        ``(batch_size, timesteps + 2, embedding_dim)``
    ``'mask'``: ``torch.autograd.Variable``
        Shape ``(batch_size, timesteps + 2)`` long tensor with sequence mask.

    Note that the output tensors all include additional special begin and end
    of sequence markers.
    """
    token_embedding = self._token_embedder.forward(inputs)
    type_representation = token_embedding['token_embedding']
    mask = token_embedding['mask']
    lstm_outputs = self._elmo_lstm.forward(type_representation, mask)

    # Prepare the output. The first layer is duplicated.
    output_tensors = [
        F.concat([type_representation, type_representation], axis=-1)
    ]
    for layer_activations in F.split_axis(
            lstm_outputs, lstm_outputs.shape[0], axis=0):
        output_tensors.append(F.squeeze(layer_activations, 0))

    return {
        'activations': output_tensors,
        'mask': mask,
    }
def check_backward(self, x_data, g_data):
    gradient_check.check_backward(
        lambda x: functions.squeeze(x, self.axis), x_data, g_data,
        **self.check_backward_options)
def forward(self, inputs, device):
    x, = inputs
    return functions.squeeze(x, axis=self.axis),
def forward(self, inputs, batch_lengths, initial_state=None):
    """
    Parameters
    ----------
    inputs : ``torch.FloatTensor``, required.
        A tensor of shape (batch_size, num_timesteps, input_size)
        to apply the LSTM over.
    batch_lengths : ``List[int]``, required.
        A list of length batch_size containing the lengths of the sequences in batch.
    initial_state : ``Tuple[torch.Tensor, torch.Tensor]``, optional, (default = None)
        A tuple (state, memory) representing the initial hidden state and memory
        of the LSTM. The ``state`` has shape (1, batch_size, hidden_size) and
        the ``memory`` has shape (1, batch_size, cell_size).

    Returns
    -------
    output_accumulator : ``torch.FloatTensor``
        The outputs of the LSTM for each timestep. A tensor of shape
        (batch_size, max_timesteps, hidden_size) where for a given batch
        element, all outputs past the sequence length for that batch are
        zero tensors.
    final_state : ``Tuple[torch.FloatTensor, torch.FloatTensor]``
        A tuple (state, memory) representing the initial hidden state and memory
        of the LSTM. The ``state`` has shape (1, batch_size, hidden_size) and
        the ``memory`` has shape (1, batch_size, cell_size).
    """
    batch_size = inputs.shape[0]
    total_timesteps = inputs.shape[1]

    output_accumulator_list = []
    if initial_state is None:
        full_batch_previous_memory = chainer.Variable(
            self.xp.zeros((batch_size, self.cell_size), 'f'))
        full_batch_previous_state = chainer.Variable(
            self.xp.zeros((batch_size, self.hidden_size), 'f'))
    else:
        # first dimension is just (layer * (1 + is_bidirection)), i.e., 1.
        full_batch_previous_state = F.squeeze(initial_state[0], axis=0)
        full_batch_previous_memory = F.squeeze(initial_state[1], axis=0)

    current_length_index = batch_size - 1 if self.go_forward else 0
    if self.recurrent_dropout_probability > 0.0 and \
            (self.training or chainer.config.train):
        dropout_mask = get_dropout_mask(self.recurrent_dropout_probability,
                                        full_batch_previous_state)
    else:
        dropout_mask = None

    for timestep in range(total_timesteps):
        # The index depends on which end we start.
        index = timestep if self.go_forward else total_timesteps - timestep - 1

        # What we are doing here is finding the index into the batch dimension
        # which we need to use for this timestep, because the sequences have
        # variable length, so once the index is greater than the length of this
        # particular batch sequence, we no longer need to do the computation for
        # this sequence. The key thing to recognise here is that the batch inputs
        # must be _ordered_ by length from longest (first in batch) to shortest
        # (last) so initially, we are going forwards with every sequence and as we
        # pass the index at which the shortest elements of the batch finish,
        # we stop picking them up for the computation.
        if self.go_forward:
            while batch_lengths[current_length_index] <= index:
                current_length_index -= 1
        # If we're going backwards, we are _picking up_ more indices.
        else:
            # First conditional: Are we already at the maximum number of elements in the batch?
            # Second conditional: Does the next shortest sequence beyond the current batch
            # index require computation use this timestep?
            while current_length_index < (len(batch_lengths) - 1) and \
                    batch_lengths[current_length_index + 1] > index:
                current_length_index += 1

        # Actually get the slices of the batch which we
        # need for the computation at this timestep.
        # shape (batch_size, cell_size)
        previous_memory = full_batch_previous_memory[0: current_length_index + 1]
        # Shape (batch_size, hidden_size)
        previous_state = full_batch_previous_state[0: current_length_index + 1]
        # Shape (batch_size, input_size)
        timestep_input = inputs[0: current_length_index + 1, index]

        # Do the projections for all the gates all at once.
        # Both have shape (batch_size, 4 * cell_size)
        projected_input = self.input_linearity(timestep_input)
        projected_state = self.state_linearity(previous_state)

        # Main LSTM equations using relevant chunks of the big linear
        # projections of the hidden state and inputs.
        # TODO: split_axis
        # TODO: cuda kernel
        input_gate = F.sigmoid(
            projected_input[:, (0 * self.cell_size):(1 * self.cell_size)] +
            projected_state[:, (0 * self.cell_size):(1 * self.cell_size)])
        forget_gate = F.sigmoid(
            projected_input[:, (1 * self.cell_size):(2 * self.cell_size)] +
            projected_state[:, (1 * self.cell_size):(2 * self.cell_size)])
        memory_init = F.tanh(
            projected_input[:, (2 * self.cell_size):(3 * self.cell_size)] +
            projected_state[:, (2 * self.cell_size):(3 * self.cell_size)])
        output_gate = F.sigmoid(
            projected_input[:, (3 * self.cell_size):(4 * self.cell_size)] +
            projected_state[:, (3 * self.cell_size):(4 * self.cell_size)])
        memory = input_gate * memory_init + forget_gate * previous_memory

        # Here is the non-standard part of this LSTM cell; first, we clip the
        # memory cell, then we project the output of the timestep to a smaller size
        # and again clip it.
        if self.memory_cell_clip_value:
            memory = F.clip(memory,
                            -self.memory_cell_clip_value,
                            self.memory_cell_clip_value)

        # shape (current_length_index, cell_size)
        pre_projection_timestep_output = output_gate * F.tanh(memory)

        # shape (current_length_index, hidden_size)
        timestep_output = self.state_projection(pre_projection_timestep_output)
        if self.state_projection_clip_value:
            timestep_output = F.clip(timestep_output,
                                     -self.state_projection_clip_value,
                                     self.state_projection_clip_value)

        # Only do dropout if the dropout prob is > 0.0 and we are in training mode.
        if dropout_mask is not None:
            timestep_output = timestep_output * \
                dropout_mask[0: current_length_index + 1]

        # We've been doing computation with less than the full batch, so here we create a new
        # variable for the whole batch at this timestep and insert the result for the
        # relevant elements of the batch into it.
        full_batch_previous_memory = F.concat(
            [memory, full_batch_previous_memory[current_length_index + 1:]], axis=0)
        full_batch_previous_state = F.concat(
            [timestep_output, full_batch_previous_state[current_length_index + 1:]], axis=0)
        output_accumulator_list.append(timestep_output)

    # Mimic the pytorch API by returning state in the following shape:
    # (num_layers * num_directions, batch_size, ...). As this
    # LSTM cell cannot be stacked, the first dimension here is just 1.
    final_state = (F.expand_dims(full_batch_previous_state, 0),
                   F.expand_dims(full_batch_previous_memory, 0))
    if not self.go_forward:
        output_accumulator_list = output_accumulator_list[::-1]
    output_accumulator = F.pad_sequence(output_accumulator_list)
    # (batch_size, total_timesteps, self.hidden_size)
    output_accumulator = output_accumulator.transpose((1, 0, 2))

    return output_accumulator, final_state
def check_forward(self, x_data):
    y = functions.squeeze(x_data, axis=self.axis)
    expected = numpy.squeeze(self.x, axis=self.axis)
    testing.assert_allclose(y.data, expected, **self.check_forward_options)
def check_invalid_type(self, x_data):
    with self.assertRaises(ValueError):
        functions.squeeze(x_data, axis=self.axis)
def test_invalid_axis(self):
    with self.assertRaises(TypeError):
        functions.squeeze(self.x, axis='a')
def check_invalid_type(self, x_data):
    with self.assertRaises(type_check.InvalidType):
        functions.squeeze(x_data, axis=self.axis)
def __call__(self, x):
    return functions.squeeze(x, self.axis)