def gaussian_kl_divergence(mean, ln_var):
    """Compute the KL-divergence from a diagonal Gaussian to the standard one.

    ``mean`` holds :math:`\\mu` and ``ln_var`` holds
    :math:`\\log(\\sigma^2)` of a multi-dimensional Gaussian
    :math:`N(\\mu, S)` whose covariance :math:`S` is diagonal with
    :math:`S_{ii} = \\sigma_i^2`. The returned variable represents

    .. math::

        D_{\\mathbf{KL}}(N(\\mu, S) \\| N(0, I)),

    where :math:`I` is the identity matrix. The value is summed over every
    element of the inputs; no axis is averaged.

    Args:
        mean (~chainer.Variable): Mean of the Gaussian, :math:`\\mu`.
        ln_var (~chainer.Variable): Logarithm of the variance of the
            Gaussian, :math:`\\log(\\sigma^2)`.

    Returns:
        ~chainer.Variable: KL-divergence between the given Gaussian and the
        standard Gaussian.

    """
    assert isinstance(mean, variable.Variable)
    assert isinstance(ln_var, variable.Variable)

    n_elements = mean.data.size
    variance = F.exp(ln_var)
    squared_mean_term = F.sum(mean * mean)
    variance_term = F.sum(variance)
    log_variance_term = F.sum(ln_var)
    return (squared_mean_term + variance_term
            - log_variance_term - n_elements) * 0.5
def __call__(self, x, y):
    """Evaluate a scaled Gaussian (RBF) kernel between rows of x and y.

    Entry ``[i, j]`` of the result is
    ``exp(-||gamma * x[i] - gamma * y[j]||^2)``.

    Parameters
    -----------------
    x: Variable
        Feature of unlabeled samples; 2-D, one sample per row.
    y: Variable
        Feature of unlabeled samples; must broadcast against ``x`` and
        ``self.gamma``.

    Returns
    -----------------
    Variable
        Kernel matrix with one row per sample of ``x`` and one column per
        sample of ``y``.
    """
    g, x, y = F.broadcast(self.gamma, x, y)
    x_g = x * g
    y_g = y * g

    x_g_norm = F.sum(x_g ** 2, axis=1)
    y_g_norm = F.sum(y_g ** 2, axis=1)
    x_g_y_g = F.linear(x_g, y_g)  # [i, j] = <x_g[i], y_g[j]>

    # ||x_g[i]||^2 has to vary along rows and ||y_g[j]||^2 along columns so
    # that every term lines up with x_g_y_g[i, j]. The previous layout had
    # the two transposed, which is only correct when x and y are the same.
    x_g_norm, x_g_y_g, y_g_norm = F.broadcast(
        F.expand_dims(x_g_norm, 1), x_g_y_g, F.expand_dims(y_g_norm, 0))

    return F.exp(- x_g_norm + 2 * x_g_y_g - y_g_norm)
def encode_output(self, x_input, layer=1):
    """Apply the encoder selected by ``layer`` and sum its output on axis 1.

    ``layer`` 1, 2 and 3 select ``self.C_1``, ``self.C_2`` and ``self.C_3``
    respectively. Any other value yields ``None``.
    """
    if layer == 1:
        encoder = self.C_1
    elif layer == 2:
        encoder = self.C_2
    elif layer == 3:
        encoder = self.C_3
    else:
        return None
    return F.sum(encoder(x_input), axis=1)
def __call__(self, x, y):
    """Evaluate a Gaussian (RBF) kernel with a randomly drawn scale.

    A single scale ``g`` is sampled with ``F.gaussian`` on every call and
    applied to both inputs; entry ``[i, j]`` of the result is
    ``exp(-||g * x[i] - g * y[j]||^2)``.

    Parameters
    -----------------
    x: Variable
        Feature of unlabeled samples; 2-D, one sample per row.
    y: Variable
        Feature of unlabeled samples; multiplied element-wise by a scale
        of ``x.shape``, so it must match the shape of ``x``.

    Returns
    -----------------
    Variable
        Kernel matrix with one row per sample of ``x`` and one column per
        sample of ``y``.
    """
    # NOTE(review): the second argument of F.gaussian is the log-variance,
    # so the scale is drawn with variance exp(e) -- confirm this is intended.
    g = F.broadcast_to(
        F.gaussian(
            np.array([0], dtype=np.float32),
            np.array([np.exp(1)], dtype=np.float32)),
        x.shape)
    x_g = x * g
    y_g = y * g

    x_g_norm = F.sum(x_g ** 2, axis=1)
    y_g_norm = F.sum(y_g ** 2, axis=1)
    x_g_y_g = F.linear(x_g, y_g)  # [i, j] = <x_g[i], y_g[j]>

    # ||x_g[i]||^2 has to vary along rows and ||y_g[j]||^2 along columns so
    # that every term lines up with x_g_y_g[i, j]. The previous layout had
    # the two transposed, which is only correct when x and y are the same.
    x_g_norm, x_g_y_g, y_g_norm = F.broadcast(
        F.expand_dims(x_g_norm, 1), x_g_y_g, F.expand_dims(y_g_norm, 0))

    return F.exp(- x_g_norm + 2 * x_g_y_g - y_g_norm)
def __call__(self, x, y):
    """Evaluate a scaled Gaussian (RBF) kernel between rows of x and y.

    Entry ``[i, j]`` of the result is
    ``exp(-||gamma * x[i] - gamma * y[j]||^2)``.

    Parameters
    -----------------
    x: Variable
        Feature of unlabeled samples; 2-D, one sample per row.
    y: Variable
        Feature of unlabeled samples; must broadcast against ``x`` and
        ``self.gamma``.

    Returns
    -----------------
    Variable
        Kernel matrix with one row per sample of ``x`` and one column per
        sample of ``y``.
    """
    g, x, y = F.broadcast(self.gamma, x, y)
    x_g = x * g
    y_g = y * g

    x_g_norm = F.sum(x_g ** 2, axis=1)
    y_g_norm = F.sum(y_g ** 2, axis=1)
    x_g_y_g = F.linear(x_g, y_g)  # [i, j] = <x_g[i], y_g[j]>

    # ||x_g[i]||^2 has to vary along rows and ||y_g[j]||^2 along columns so
    # that every term lines up with x_g_y_g[i, j]. The previous layout had
    # the two transposed, which produced negative "squared distances"
    # whenever x and y differed. The debugging prints and the half-second
    # sleep that were probing for those negatives have been removed.
    x_g_norm, x_g_y_g, y_g_norm = F.broadcast(
        F.expand_dims(x_g_norm, 1), x_g_y_g, F.expand_dims(y_g_norm, 0))

    return F.exp(- x_g_norm + 2 * x_g_y_g - y_g_norm)
def bernoulli_nll(x, y):
    """Compute the negative log-likelihood of a Bernoulli distribution.

    The value returned is

    .. math::

        -B(x; p) = -\\sum_i {x_i \\log(p_i) + (1 - x_i)\\log(1 - p_i)},

    where :math:`p = \\sigma(y)` and :math:`\\sigma(\\cdot)` is the sigmoid
    function. It is evaluated from the logits ``y`` through softplus rather
    than by forming :math:`p` explicitly.

    .. note::

       Because the sigmoid is applied inside this function, the output of a
       fully-connected layer (:class:`Linear`) can be passed as ``y``
       directly.

    Args:
        x (~chainer.Variable): Input variable.
        y (~chainer.Variable): Logits parameterising the Bernoulli
            distribution.

    Returns:
        ~chainer.Variable: The negative log-likelihood, summed over all
        elements.

    """
    assert isinstance(x, variable.Variable)
    assert isinstance(y, variable.Variable)

    softplus_term = F.sum(F.softplus(-y))
    logit_term = F.sum(y)
    cross_term = F.sum(y * x)
    return softplus_term + logit_term - cross_term
def __call__(self, x):
    """Run the stacked FC/BN layers and record per-layer mid outputs.

    Parameters
    -----------------
    x: Variable
        Shape is 784 in case of MNIST

    Returns
    -----------------
    Variable
        Activation of the last layer. ``self.mid_outputs`` is replaced by a
        fresh list holding one centred and scaled pre-activation per layer.
    """
    # Reset mid outputs
    self.mid_outputs = []
    mid_outputs = self.mid_outputs

    h = x
    layer_pairs = zip(self.fc_layers.values(), self.bn_layers.values())
    for fc, bn in layer_pairs:
        z = fc(h)
        h = self.act(bn(z, self.test))

        # Batch statistics of the pre-activation, broadcast back to z.
        batch = z.data.shape[0]
        m, _ = F.broadcast(F.sum(z, 0) / batch, z)
        v, _ = F.broadcast(F.sum((z - m) ** 2, 0) / batch, z)

        # TODO: Add non-BN output
        # NOTE(review): this divides by the variance itself (no square root,
        # no epsilon) -- confirm that is intended.
        mid_outputs.append((z - m) / v)
    return h
def __call__(self, d_gen, d=None):
    """Compute the GAN objective from discriminator probabilities.

    Args:
        d_gen: Discriminator output (a probability, it is passed to
            ``F.log(1 - d_gen)``) for generated samples.
        d: Discriminator output for real samples, or ``None`` to evaluate
            only the generated-sample term.

    Returns:
        The batch-averaged ``log(1 - d_gen)`` term, plus the batch-averaged
        ``log(d)`` term when ``d`` is given. The value is not negated here.
    """
    # The batch size is the leading dimension, not the first element of the
    # discriminator output.
    bs_gen = d_gen.shape[0]
    gen_term = F.sum(F.log(1 - d_gen)) / bs_gen

    # Compare against None explicitly: the truth value of an array-like
    # variable does not say whether the argument was supplied.
    if d is None:
        return gen_term

    bs = d.shape[0]
    return F.sum(F.log(d)) / bs + gen_term
def solve(self, x_seq, pos, neg, train=True, variablize=False, onebyone=True):
    """Feed ``x_seq`` through the LSTM and compute a margin ranking loss.

    The final LSTM state is scored against ``pos`` and ``neg`` with
    ``self.calculate_score`` and turned into a hinge loss
    ``relu(margin - pos_score + neg_score)``. When ``onebyone`` and ``train``
    are both true, every intermediate state is additionally scored against
    the projected later inputs and that hinge loss is added.

    Returns:
        tuple: ``(loss, TorFs)`` where ``loss`` is the summed hinge loss and
        ``TorFs`` is the Python ``sum`` of ``accum_loss.data < self.margin``.
    """
    # NOTE(review): the source was whitespace-collapsed; the extent of this
    # branch is reconstructed (pos/neg are wrapped in Variable, so they are
    # assumed to belong to it) -- confirm against the original file.
    if variablize:  # If arguments are just arrays (not variables), make them variables
        x_seq = [chainer.Variable(x, volatile=not train) for x in x_seq]
        x_seq = [F.dropout(x, ratio=self.dropout_ratio, train=train) for x in x_seq]
        pos = self.act1(self.W_candidate(
            F.dropout(chainer.Variable(pos, volatile=not train), ratio=self.dropout_ratio, train=train)))
        neg = self.act1(self.W_candidate(
            F.dropout(chainer.Variable(neg, volatile=not train), ratio=self.dropout_ratio, train=train)))
    if onebyone and train:
        # NOTE(review): the slice keeps at most the first four inputs although
        # the original comment mentions five targets.
        target_x_seq = [self.act1(self.W_candidate(x)) for x in x_seq[:4]]  # 1,2,3,4,5-th targets
        onebyone_loss = 0.
    self.LSTM.reset_state()
    for i, x in enumerate(x_seq):
        h = self.LSTM( F.dropout(x, ratio=self.dropout_ratio, train=train) )
        # Only steps that still have later targets contribute a one-by-one loss.
        if onebyone and train and target_x_seq[i+1:]:
            pos_score, neg_score = self.calculate_score(h, target_x_seq[i+1:], neg, multipos=True)
            onebyone_loss += F.relu( self.margin - pos_score + neg_score )
    # Score the last hidden state against the real candidates.
    pos_score, neg_score = self.calculate_score(h, pos, neg)
    accum_loss = F.relu( self.margin - pos_score + neg_score )
    TorFs = sum(accum_loss.data < self.margin)
    if onebyone and train:
        return F.sum(accum_loss) + F.sum(onebyone_loss), TorFs
    else:
        return F.sum(accum_loss), TorFs
def loss_dis(self, dis, y_fake, y_real):
    """Return the softplus discriminator loss and log it.

    The loss is ``sum(softplus(-y_real)) + sum(softplus(y_fake))`` with each
    sum divided by ``len(y_fake)``. The result is also appended to the
    module-level ``train_loss_dis`` list. ``dis`` is not used here.
    """
    n_samples = len(y_fake)
    real_term = F.sum(F.softplus(-y_real)) / n_samples
    fake_term = F.sum(F.softplus(y_fake)) / n_samples
    total = real_term + fake_term
    train_loss_dis.append(total)
    return total
def loss_dis(self, dis, y_in, y_out):
    """Return the softplus discriminator loss averaged over batch and map.

    Both softplus sums are divided by the batch size and by the last two
    dimensions of ``y_in``. The loss is reported under ``'loss'`` for
    ``dis``.
    """
    n_samples, _, dim2, dim3 = y_in.data.shape

    def _normalised(total):
        return total / n_samples / dim2 / dim3

    in_term = _normalised(F.sum(F.softplus(-y_in)))
    out_term = _normalised(F.sum(F.softplus(y_out)))
    total = in_term + out_term
    chainer.report({'loss': total}, dis)
    return total
def __call__(self, y, hiddens=None, scale=True):
    """Sum the softmax entropies of the hidden outputs and of ``y``.

    For every variable the term is ``-sum(softmax * log_softmax)`` divided
    by the batch size, and additionally by the product of the remaining
    dimensions when ``scale`` is true.

    Args:
        y: Output variable.
        hiddens: Optional iterable of hidden variables to include.
        scale: Whether to divide each term by the per-sample element count.

    Returns:
        The accumulated loss (hidden terms first, then the output term).
    """
    def _entropy_term(v):
        shape = v.data.shape
        term = - F.sum(F.softmax(v) * F.log_softmax(v)) / shape[0]
        if scale:
            term = term / np.prod(shape[1:])
        return term

    ne_loss = 0

    # NE for hiddens
    if hiddens is not None:
        for h in hiddens:
            ne_loss += _entropy_term(h)

    # NE for output
    ne_loss += _entropy_term(y)
    return ne_loss
def loss_dis(self, dis, y_fake, y_real):
    """Return the softplus discriminator loss and report it for ``dis``.

    The loss is ``sum(softplus(-y_real)) + sum(softplus(y_fake))`` with each
    sum divided by ``len(y_fake)``.
    """
    n_samples = len(y_fake)
    real_term = F.sum(F.softplus(-y_real)) / n_samples
    fake_term = F.sum(F.softplus(y_fake)) / n_samples
    total = real_term + fake_term
    chainer.report({'loss': total}, dis)
    return total
def channel_normalize(x, test=False):
    """Standardise a 4-D variable across its channel axis (axis 1).

    For every batch/spatial position the mean and variance over the channels
    are computed, tiled back over the channel axis, and used as
    ``(x - mean) / sqrt(var + 1e-5)``. ``test`` is accepted but not used.
    """
    n_batch, n_channels, dim2, dim3 = x.data.shape
    stat_shape = (n_batch, 1, dim2, dim3)

    def _tile_channels(reduced):
        # Restore the channel axis and repeat the statistic along it.
        single = F.reshape(reduced, stat_shape)
        return F.concat(n_channels * [single])

    mean = _tile_channels(F.sum(x, axis=1) / n_channels)
    centred_sq = (x - mean) ** 2
    var = _tile_channels(F.sum(centred_sq, axis=1) / n_channels)
    return (x - mean) / (var + 1e-5) ** 0.5
def test_backward_case1(self):
    """Check the depth-render gradient against forward finite differences."""
    renderer = neural_renderer.Renderer()
    renderer.image_size = 64
    renderer.anti_aliasing = False
    renderer.perspective = False
    renderer.camera_mode = 'none'

    vertices = cp.array(
        [[-0.9, -0.9, 2.],
         [-0.8, 0.8, 1.],
         [0.8, 0.8, 0.5]],
        'float32')
    faces = cp.array([[0, 1, 2]], 'int32')
    vertices, faces = utils.to_minibatch((vertices, faces))
    vertices = chainer.Variable(vertices)

    def _pixel_loss(verts):
        # Squared error of one depth pixel against 1.
        depth = renderer.render_depth(verts, faces)
        return cf.sum(cf.square(depth[0, 15, 20] - 1))

    # Analytic gradient.
    loss = _pixel_loss(vertices)
    loss.backward()
    grad = vertices.grad.get()

    # Numerical gradient, one perturbed entry at a time.
    grad2 = np.zeros_like(grad)
    eps = 1e-3
    for i in range(3):
        for j in range(3):
            vertices2 = vertices.data.copy()
            vertices2[i, j] += eps
            loss2 = _pixel_loss(vertices2)
            grad2[i, j] = ((loss2 - loss) / eps).data.get()

    chainer.testing.assert_allclose(grad, grad2, atol=1e-3)
def tv_norm(self, x):
    """Return the total-variation style norm of ``x``.

    ``x`` is reshaped to ``(3, 1, in_size, in_size)``, passed through
    ``self.tvh`` and ``self.tvw``, and the summed squared responses are
    raised to the power ``beta / 2``.
    """
    side = self.args.in_size
    planes_shape = (3, 1, side, side)
    diffh = self.tvh(F.reshape(x, planes_shape))
    diffw = self.tvw(F.reshape(x, planes_shape))
    energy = F.sum(diffh ** 2) + F.sum(diffw ** 2)
    return energy ** (self.args.beta / 2.)
def loss_dis2(self, dis2, y_in, y_out):
    """Return the softplus discriminator loss averaged over batch and map.

    Both softplus sums are divided by the batch size and by the last two
    dimensions of ``y_in``. Nothing is reported from here; ``dis2`` is not
    used.
    """
    n_samples, _, dim2, dim3 = y_in.data.shape

    def _normalised(total):
        return total / n_samples / dim2 / dim3

    in_term = _normalised(F.sum(F.softplus(-y_in)))
    out_term = _normalised(F.sum(F.softplus(y_out)))
    return in_term + out_term
def cosine_similarity(x, y, eps=1e-6):
    """Return batched pairwise cosine similarities between rows of x and y.

    ``x`` is ``(n1, n2, n3)`` and ``y`` is ``(n1, m2, n3)``; the result is
    ``(n1, n2, m2)``. ``eps`` is added to the product of squared norms
    before the square root to avoid division by zero.
    """
    n1, n2, _ = x.data.shape
    _, m2, _ = y.data.shape
    out_shape = (n1, n2, m2)

    dots = F.batch_matmul(x, y, transb=True)

    x_sq = F.reshape(F.sum(x * x, axis=2), (n1, n2, 1))
    y_sq = F.reshape(F.sum(y * y, axis=2), (n1, 1, m2))
    x_sq = F.broadcast_to(x_sq, out_shape)
    y_sq = F.broadcast_to(y_sq, out_shape)

    # exp(log(v) / 2) is the square root of v.
    norm_product = F.exp(F.log(x_sq * y_sq + eps) / 2)
    dots /= norm_product
    return dots
def __call__(self, x, y):
    """Return the mixture-of-Gaussians density of ``y`` given ``x``.

    A sigmoid hidden layer produces mixing coefficients (softmax), component
    means and log-variances. The result is the coefficient-weighted sum of
    the isotropic Gaussian densities over the mixture axis.
    """
    h = F.sigmoid(self.l1_(x))
    coef = F.softmax(self.coef_(h))
    mean = F.reshape(self.mean_(h), (-1, self.NUM_MIXTURE, self.OUT_DIM))
    logvar = self.logvar_(h)

    # Repeat the target across the mixture axis so it lines up with mean.
    target = F.reshape(y, (-1, 1, self.OUT_DIM))
    mean, y = F.broadcast(mean, target)

    sq_dist = F.sum((y - mean)**2, axis=2)
    exponent = -0.5*sq_dist*F.exp(-logvar)
    normaliser = (2*np.pi*F.exp(logvar))**(0.5*self.OUT_DIM)
    weighted = coef*F.exp(exponent)/normaliser
    return F.sum(weighted, axis=1)
def logli(self, a):
    """Return the log-density of ``a`` under the diagonal Gaussian.

    With ``z = (a - means) / stds`` and ``n = means.shape[-1]`` the value is
    ``-sum(log_stds) - 0.5 * sum(z ** 2) - 0.5 * n * log(2 * pi)``, reduced
    over the last axis.
    """
    a = F.cast(a, np.float32)

    # transform back to standard normal
    zs = (a - self.means) * F.exp(-self.log_stds)

    n_dims = self.means.shape[-1]
    log_det_term = F.sum(self.log_stds, axis=-1)
    quadratic_term = 0.5 * F.sum(F.square(zs), axis=-1)
    constant_term = 0.5 * n_dims * np.log(2 * np.pi)
    return - log_det_term - quadratic_term - constant_term
def __accuracy(self, y, t):
    """Return the summed squared error of the softmax-expected class index.

    The expected index along axis 1 of ``softmax(y)`` is compared with
    ``t``. Positions where ``t`` is negative are replaced by ``t`` itself so
    they contribute zero error. Differences are scaled by
    ``self.rating_unit`` before squaring. All variables are volatile.
    """
    xp = self.xp
    n_batch, n_classes, n_cols = y.data.shape

    # Class indices 0..n_classes-1 laid out along axis 1 and tiled to y.
    indices = np.tile(
        np.arange(n_classes, dtype=np.float32).reshape((1, -1, 1)),
        (n_batch, 1, n_cols))
    indices = Variable(xp.asarray(indices), volatile=True)

    probs = F.softmax(Variable(y.data, volatile=True))
    expected = F.sum(indices * probs, axis=1)

    valid = Variable(t.data >= 0, volatile=True)
    target = Variable(t.data.astype(np.float32), volatile=True)
    expected = F.where(valid, expected, target)

    return F.sum(((expected - target) * self.rating_unit) ** 2)
def norm_by_freq(self, freq):
    """Standardise ``self.W`` with ``freq``-weighted mean and variance.

    The weighted statistics are taken along axis 0 and broadcast back to the
    shape of ``self.W``; ``1e-6`` is added to the variance before the square
    root. Presumably ``freq`` holds normalised word frequencies -- verify
    against the caller.
    """
    embeddings = self.W
    full_shape = embeddings.shape

    weighted_mean = F.sum(freq * embeddings, axis=0, keepdims=True)
    weighted_mean = F.broadcast_to(weighted_mean, full_shape)

    centred_sq = (embeddings - weighted_mean) ** 2
    weighted_var = F.sum(freq * centred_sq, axis=0, keepdims=True)
    weighted_var = F.broadcast_to(weighted_var, full_shape)

    return (embeddings - weighted_mean) / F.sqrt(1e-6 + weighted_var)
def __call__(self, d_x_gen, d_x_real=None):
    """Return the Wasserstein-style critic or generator loss.

    With ``d_x_real`` the result is the negated difference between the
    batch-mean critic score on real and on generated samples. Without it the
    result is the negated batch-mean score on generated samples.
    """
    bs_d_x_gen = d_x_gen.shape[0]
    gen_mean = F.sum(d_x_gen) / bs_d_x_gen

    if d_x_real is None:
        # to minimize (reverse trick)
        return - gen_mean

    bs_d_x_real = d_x_real.shape[0]
    real_mean = F.sum(d_x_real) / bs_d_x_real
    # to minimize
    return - (real_mean - gen_mean)
def __call__(self, x, z, test=False):
    """Standardise the (optionally linear-mapped) input and combine with z.

    The batch mean and standard deviation are stored, broadcast to the shape
    of the hidden activation, in ``self.mu`` and ``self.vr``. ``test`` is
    accepted but not used.
    """
    h = x if self.nolin else self.lin(x)

    n_samples = h.data.shape[0]
    stat_shape = (1, h.data.shape[1])

    mean = F.sum(h, axis=0) / n_samples
    self.mu = F.broadcast(F.reshape(mean, stat_shape), h)[0]

    centred = h - self.mu
    std = (F.sum(centred * (h - self.mu), axis=0) / n_samples) ** 0.5
    self.vr = F.broadcast(F.reshape(std, stat_shape), h)[0]

    bnh = (h - self.mu) / (self.vr + 1e-7)
    return self.comb(bnh, z)
def forward(self, ids, bow):
    """Compute the bag-of-words reconstruction loss and the Dirichlet term.

    Args:
        ids: Document identifiers, looked up in ``self.proportions``.
        bow: Bag-of-words counts, multiplied with the log-softmax of the
            predicted logits.

    Returns:
        tuple: ``(loss, ld)`` where ``loss`` is the negated sum of
        ``bow * log_softmax(logits)`` and ``ld`` is the value of
        ``dirichlet_likelihood`` on the raw proportions.
    """
    bow, ids = utils.move(self.xp, bow, ids)
    proportions = self.proportions(ids)
    ld = dirichlet_likelihood(proportions)

    # Mix the factor vectors with the softmax-normalised proportions.
    doc = F.matmul(F.softmax(proportions), self.factors())
    logp = F.dropout(self.embedding(doc))

    # Per-document log-likelihood, then summed over the batch. The unused
    # sources/targets/counts lists that used to be created here were
    # removed.
    lpi = F.sum(bow * F.log_softmax(logp), axis=1)
    loss = -F.sum(lpi)
    return loss, ld
def __call__(self, d_x_gen, d_x_real=None):
    """Return the least-squares GAN loss.

    With ``d_x_real`` the result is the discriminator loss: half the
    batch-mean of ``(d_x_real - 1)^2`` plus half the batch-mean of
    ``d_x_gen^2``. Without it the result is the generator loss: half the
    batch-mean of ``(d_x_gen - 1)^2``.
    """
    bs_d_x_gen = d_x_gen.shape[0]

    if d_x_real is None:
        return F.sum(F.square(d_x_gen - 1)) / bs_d_x_gen / 2

    bs_d_x_real = d_x_real.shape[0]
    real_term = F.sum(F.square(d_x_real - 1)) / bs_d_x_real / 2
    fake_term = F.sum(F.square(d_x_gen)) / bs_d_x_gen / 2
    return real_term + fake_term
def __call__(self, d_x_gen, d_x=None):
    """Return the standard (sigmoid cross-entropy) GAN loss from logits.

    Args:
        d_x_gen: Discriminator logits for generated samples.
        d_x: Discriminator logits for real samples, or ``None`` to compute
            the generator loss.

    Returns:
        With ``d_x``: the discriminator loss
        ``-(mean log sigmoid(d_x) + mean log(1 - sigmoid(d_x_gen)))``.
        Without ``d_x``: the generator loss
        ``mean log(1 - sigmoid(d_x_gen))``.
    """
    # TODO: reverse trick
    bs_d_x_gen = d_x_gen.shape[0]

    # log(sigmoid(v)) == -softplus(-v) and log(1 - sigmoid(v)) == -softplus(v).
    # Going through softplus avoids log(0) when the sigmoid saturates, which
    # previously produced -inf losses and NaN gradients for large logits.
    gen_term = F.sum(F.softplus(d_x_gen)) / bs_d_x_gen

    if d_x is None:
        return - gen_term

    bs_d_x = d_x.shape[0]
    real_term = F.sum(F.softplus(-d_x)) / bs_d_x
    return real_term + gen_term  # to minimize
def __call__(self, x, eta, test=False):
    """Apply the linear map, standardise, add noise, then scale and shift.

    The batch mean and standard deviation are stored, broadcast to the shape
    of the hidden activation, in ``self.mu`` and ``self.vr``. Gaussian noise
    scaled by ``eta`` is added to the standardised activation.
    ``test`` is accepted but not used.

    Returns:
        tuple: ``(z, out)`` where ``z`` is the noisy standardised activation
        and ``out`` is ``gamma * (z + beta)``, passed through ``self.act``
        when one is set.
    """
    h = self.lin(x)

    n_samples = h.data.shape[0]
    stat_shape = (1, h.data.shape[1])

    mean = F.sum(h, axis=0) / n_samples
    self.mu = F.broadcast(F.reshape(mean, stat_shape), h)[0]

    std = (F.sum((h - self.mu) * (h - self.mu), axis=0) / n_samples) ** 0.5
    self.vr = F.broadcast(F.reshape(std, stat_shape), h)[0]

    bnh = (h - self.mu) / (self.vr + 1e-7)
    z = bnh + xp.random.randn(x.data.shape[0], self.n_out)*eta

    gamma = F.broadcast(self.gamma.W, z)[0]
    beta = F.broadcast(self.beta.W, z)[0]
    affine = gamma*(z + beta)

    if self.act is None:
        return z, affine
    return z, self.act(affine)
def __call__(self, x):
    """Return the negative (beta-weighted) ELBO of ``x`` and report it.

    ``self.k`` latent samples are drawn from the encoder distribution. The
    reconstruction term is the mean log-probability of ``x`` under the
    decoder and the penalty is the mean KL-divergence between the encoder
    distribution and the prior. ``loss``, ``reconstr`` and ``kl_penalty``
    are reported for this link.
    """
    q_z = self.encoder(x)
    z = q_z.sample(self.k)
    p_x = self.decoder(z)
    p_z = self.prior()

    # Repeat the input once per latent sample.
    tiled_x = F.broadcast_to(x[None, :], (self.k,) + x.shape)
    log_px = p_x.log_prob(tiled_x)
    reconstr = F.mean(F.sum(log_px, axis=-1))

    kl = chainer.kl_divergence(q_z, p_z)
    kl_penalty = F.mean(F.sum(kl, axis=-1))

    loss = - (reconstr - self.beta * kl_penalty)

    observations = (
        ('loss', loss),
        ('reconstr', reconstr),
        ('kl_penalty', kl_penalty),
    )
    for key, value in observations:
        reporter.report({key: value}, self)
    return loss
def __call__(self, y0, y1):
    """Return the symmetrised KL-divergence between two softmax outputs.

    Each direction is summed over all elements and divided by the batch size
    and by the per-sample element count of ``y0``; the two directions are
    then averaged.
    """
    bs = y0.data.shape[0]
    d = np.prod(y0.data.shape[1:])

    p0, p1 = F.softmax(y0), F.softmax(y1)
    log_p0, log_p1 = F.log_softmax(y0), F.log_softmax(y1)

    kl_01 = F.sum(p0 * (log_p0 - log_p1)) / bs / d
    kl_10 = F.sum(p1 * (log_p1 - log_p0)) / bs / d
    return (kl_01 + kl_10) / 2
def loss_gen(self, gen, y_fake, mse):
    """Return ``mse`` plus the alpha-weighted adversarial generator loss.

    The adversarial part is ``sum(softplus(-y_fake)) / len(y_fake)``. The
    total is reported under ``'loss'`` for ``gen``.
    """
    n_samples = len(y_fake)
    adversarial = self.alpha * F.sum(F.softplus(-y_fake)) / n_samples
    total = mse + adversarial
    chainer.report({'loss': total}, gen)
    return total
def add(self, r_hat):
    """Accumulate ``r_hat`` into the running totals and return the mean.

    ``self.sum`` grows by the sum of all elements of ``r_hat`` and
    ``self.n`` by its leading dimension.
    """
    batch_total = F.sum(r_hat).data
    batch_count = r_hat.shape[0]
    self.sum += batch_total
    self.n += batch_count
    return self.sum / self.n
def clustering_loss(x, t, gamma, T=5):
    """Clustering loss function for metric learning.

    The medoid search (greedy initialisation followed by up to ``T``
    refinement rounds) runs on CPU copies of the data, outside the
    computational graph; only the final hinge on the two clustering scores
    is differentiable.

    Args:
        x (~chainer.Variable):
            Feature vectors.
        t (~chainer.Variable):
            Class labels corresponding to x.
        gamma (~float):
            Hyperparameter gamma.
        T (int):
            Maximum number of iterations in Algorithm 2.

    Returns:
        ~chainer.Variable:
            Loss value.

    See:
        `Learnable Structured Clustering Framework for Deep Metric Learning \
        <https://arxiv.org/abs/1612.01213>`_
    """
    if not isinstance(x, chainer.Variable):
        x = chainer.Variable(x)
    if not isinstance(t, chainer.Variable):
        t = chainer.Variable(t)
    t_cpu = chainer.cuda.to_cpu(t.data).ravel()

    batch_size = len(t.data)
    num_classes = len(np.unique(t_cpu))

    v = list(range(batch_size))  # candidate indices not yet chosen as medoids
    s = []  # indices chosen as medoids so far

    # First, search the sub-optimal solution y_PAM of the clustering.
    # Note that this computation is done outside the computational graph.
    # Find an initial medoids of S_PAM by Algorithm 1 in the paper.
    D = distance_matrix(x.data)
    D = cuda.to_cpu(D)
    for _ in range(num_classes):
        # find an element in v which maximise a_function
        a_best = -np.inf
        for i in v:
            distances = D[s + [i]]
            g_s = distances.argmin(axis=0)
            f = -distances[g_s, range(batch_size)].sum()
            if f + gamma < a_best:  # skip if this is hopeless to be the best
                continue
            delta = 1.0 - normalized_mutual_info_score(t_cpu, g_s)
            a = f + gamma * delta
            if a > a_best:
                a_best = a
                i_best = i
        s.append(i_best)
        v.remove(i_best)

        # In order to speed-up by making skip to calculate NMI more frequently,
        # sort v in descending order by distances to their nearest medoid
        D_min = D[s].min(0)  # distance to the nearest medoid for each point
        sorted_order = np.argsort(D_min[v])[::-1]
        v = np.array(v)[sorted_order].tolist()

    # Refine S_PAM by Algorithm 2
    a_previous = a_best
    # NOTE(review): the loop variable rebinds the parameter `t`; only `t_cpu`
    # is read from here on, so the labels themselves are not affected.
    for t in range(T):
        np.random.shuffle(s)
        y_pam = np.array(s)[D[s].argmin(axis=0)]
        # since a column of D may have multiple zeros due to numerical errors,
        # ensure y_pam[j] == j, for each j \in s
        y_pam[s] = s
        # Iterate over a copy because s is rebound inside the loop.
        for k in copy.copy(s):
            js = np.argwhere(y_pam == k).ravel()
            # A cluster holding a single point has no alternative medoid.
            if len(js) == 1:
                continue
            D_k = D[:, js][js]
            fs = -D_k.sum(axis=1)
            j_max = fs.argmax()
            f_max = fs[j_max]
            s_except_k = copy.copy(s)
            s_except_k.remove(k)
            a_best = -np.inf
            for j, f in zip(js, fs):
                if f + gamma < f_max:
                    continue
                g_s_j = D[s_except_k + [j]].argmin(axis=0)
                delta = 1.0 - normalized_mutual_info_score(t_cpu, g_s_j)
                a = f + gamma * delta
                if a > a_best:
                    a_best = a
                    j_best = j
            s = s_except_k + [j_best]

        # stop if the score did not improve from the previous step
        distances = D[s]
        g_s = distances.argmin(axis=0)
        f = -distances[g_s, range(batch_size)].sum()
        delta = 1.0 - normalized_mutual_info_score(t_cpu, g_s)
        a = f + gamma * delta
        if a == a_previous:
            break
        a_previous = a
    s_pam = s

    # Here, compute the loss with S_PAM and its corresponding delta.
    y_pam = np.asarray(s_pam)[D[s_pam].argmin(axis=0)].tolist()
    y_star = np.empty_like(t_cpu)
    for c in np.unique(t_cpu):
        js = np.argwhere(t_cpu == c).ravel()  # indexes of examples of class c
        D_c = D[:, js][js]
        fs = D_c.sum(axis=1)
        # The class medoid is the member with the smallest summed distance.
        y_star_c = js[fs.argmin()]
        y_star[js] = y_star_c

    # Differentiable clustering scores for the found and the label-based
    # assignments.
    f = -F.sum(F.batch_l2_norm_squared(x - x[y_pam]))
    f_tilde = -F.sum(F.batch_l2_norm_squared(x - x[y_star]))
    loss = F.relu(f + gamma * delta - f_tilde)
    return loss
def loss_gen(self, gen, y_fake):
    """Return the softplus generator loss and report it for ``gen``.

    The loss is ``sum(softplus(-y_fake)) / len(y_fake)``; it is small when
    the discriminator scores generated samples as real.
    """
    n_samples = len(y_fake)
    # G(D(z), z)->1
    total = F.sum(F.softplus(-y_fake))
    loss = total / n_samples
    chainer.report({'loss': loss}, gen)
    return loss
def loss_gen(self, gen, y_fake):
    """Return the softplus generator loss and report it for ``gen``.

    The loss is ``sum(softplus(-y_fake))`` divided by the leading dimension
    of ``y_fake``.
    """
    n_samples = y_fake.data.shape[0]
    total = F.sum(F.softplus(-y_fake))
    loss = total / n_samples
    chainer.report({'loss': loss}, gen)
    return loss
def compute_kld(self, p, q):
    """Return the row-wise KL-divergence ``KL(p || q)`` as a column.

    ``1e-16`` is added inside both logarithms to avoid ``log(0)``. The
    result is reshaped to ``(-1, 1)``.
    """
    assert p.shape[0] == q.shape[0]

    log_p = functions.log(p + 1e-16)
    log_q = functions.log(q + 1e-16)
    per_row = functions.sum(p * (log_p - log_q), axis=1)
    return functions.reshape(per_row, (-1, 1))
def __call__(self, mu, sigma_2, log_sigma_2):
    """Return the batch-mean KL-divergence to the standard Gaussian.

    Uses the explicit form
    ``-0.5 * sum(1 + log_sigma_2 - mu^2 - sigma_2)`` divided by the leading
    dimension of ``mu``.
    """
    bs = mu.shape[0]
    # Explicit KL form
    negative_kl = F.sum(1 + log_sigma_2 - mu**2 - sigma_2) / 2 / bs
    # maximize kl means to minimize -kl
    return -negative_kl
def main(id):
    """Compute noise-averaged gradient (sensitivity) maps for one worker.

    Every test image whose index satisfies ``i % 8 == id`` is perturbed with
    Gaussian noise ``resample_size`` times (in batches of ``batch_size``).
    The gradient of one feature of ``model.extract`` with respect to the
    input is averaged over all noisy copies and saved with ``xp.save``.

    Args:
        id: Worker index in ``range(8)`` selecting the subset of images.
    """
    model_path = "/efs/fMRI_AE/SimpleFCAE_E32D32/model/model_iter_108858"

    gpu = 0
    get_device_from_id(gpu).use()

    # NibDataset.__init__(self, directory: str, crop: list)
    crop = [[9, 81], [11, 99], [0, 80]]
    test_dataset = NibDataset("/data/test", crop=crop)

    mask = load_mask_nib("/data/mask/average_optthr.nii", crop)

    # SimpleFCAE_E32D32.__init__(self, mask, r: int, in_mask: str, out_mask: str)
    model = Model(mask, 2, "mask", "mask")
    load_npz(model_path, model)
    model.to_gpu()

    # feature_idx = 0
    # feature_idx = (0, 4, 5, 5)  # == [0, 9/2, 11/2, 10/2]
    # feature_idx = (0, 1, 1, 1)
    feature_idx = (0, 2, 7, 4)

    resample_size = 100
    batch_size = 10
    noise_level = 0.2

    for i in range(len(test_dataset)):
        if i % 8 != id:
            continue
        print("{:4}/{:4}".format(i, len(test_dataset)))
        subject = test_dataset.get_subject(i)
        frame = test_dataset.get_frame(i)
        test_img = xp.asarray(test_dataset[i])

        # NOTE(review): the noise scale divides by the intensity range;
        # SmoothGrad-style schemes usually multiply by it -- confirm.
        sigma = noise_level / (xp.max(test_img) - xp.min(test_img))

        resample_remain = resample_size
        resample_processed = 0
        ret = xp.zeros(test_img.shape)
        while resample_remain > 0:
            batch_size_this_loop = min(batch_size, resample_remain)
            resample_remain -= batch_size_this_loop

            # Build the noisy batch as a fresh array. Adding in place to a
            # broadcast view either fails (read-only view) or writes through
            # to test_img and makes every sample share the same memory.
            noise = sigma * xp.random.randn(
                batch_size_this_loop, *test_img.shape)
            batch = (test_img[None] + noise).astype(test_img.dtype)

            x = Variable(batch)

            feature = model.extract(x)
            # The last batch may be smaller than batch_size.
            assert feature.shape == (batch_size_this_loop, 1, 9, 11, 10)
            feature = F.sum(feature, axis=0)
            assert feature.shape == (1, 9, 11, 10)
            feature = F.get_item(feature, feature_idx)
            feature.backward()
            grad = xp.mean(x.grad, axis=0)

            # Running mean over all samples processed so far.
            ret = (ret * resample_processed + grad * batch_size_this_loop) / (
                resample_processed + batch_size_this_loop)
            resample_processed += batch_size_this_loop
            model.cleargrads()

        xp.save(
            "/efs/fMRI_AE/SimpleFCAE_E32D32/grad/sensitivity_map_feature_{}_{}_{}_subject{:03d}_frame{:03d}"
            .format(feature_idx[1], feature_idx[2], feature_idx[3], subject,
                    frame), ret)
def __call__(self, enc_hs, dec_z, att_prev, scaling=2.0):
    """Compute AttLoc forward layer.

    Args:
        enc_hs (chainer.Variable | N-dimensional array):
            Input variable from encoders.
        dec_z (chainer.Variable | N-dimensional array): Input variable of
            decoder.
        att_prev (chainer.Variable | None): Attention weight.
        scaling (float): Scaling weight to make attention sharp.

    Returns:
        chainer.Variable: Weighted sum over frames.
        chainer.Variable: Attention weight.

    """
    n_utts = len(enc_hs)

    # The encoder-side projection does not depend on the decoder step, so it
    # is computed on the first call and kept on the instance.
    if self.pre_compute_enc_h is None:
        self.enc_h = F.pad_sequence(enc_hs)  # utt x frame x hdim
        self.h_length = self.enc_h.shape[1]
        # utt x frame x att_dim
        self.pre_compute_enc_h = self.mlp_enc(self.enc_h, n_batch_axes=2)

    if dec_z is not None:
        dec_z = dec_z.reshape(n_utts, self.dunits)
    else:
        dec_z = chainer.Variable(
            self.xp.zeros((n_utts, self.dunits), dtype=np.float32)
        )

    # Start from a uniform distribution over the frames of each utterance.
    if att_prev is None:
        uniform = [
            self.xp.full(hh.shape[0], 1.0 / hh.shape[0], dtype=np.float32)
            for hh in enc_hs
        ]
        uniform = [chainer.Variable(weights) for weights in uniform]
        att_prev = F.pad_sequence(uniform)

    # utt x frame -> utt x 1 x 1 x frame -> utt x att_conv_chans x 1 x frame
    loc_feat = self.loc_conv(att_prev.reshape(n_utts, 1, 1, self.h_length))
    # utt x att_conv_chans x 1 x frame -> utt x frame x att_conv_chans
    loc_feat = F.swapaxes(F.squeeze(loc_feat, axis=2), 1, 2)
    # utt x frame x att_conv_chans -> utt x frame x att_dim
    loc_feat = self.mlp_att(loc_feat, n_batch_axes=2)

    # utt x frame x att_dim
    dec_z_tiled = F.broadcast_to(
        F.expand_dims(self.mlp_dec(dec_z), 1), self.pre_compute_enc_h.shape
    )

    # dot with gvec
    # utt x frame x att_dim -> utt x frame
    # TODO(watanabe) use batch_matmul
    hidden = F.tanh(loc_feat + self.pre_compute_enc_h + dec_z_tiled)
    energy = F.squeeze(self.gvec(hidden, n_batch_axes=2), axis=2)

    # Masking the padded area with a large negative number degraded the
    # performance, so no mask is applied. The energies are only scaled to
    # sharpen the attention.
    att_weight = F.softmax(scaling * energy)

    # weighted sum over frames
    # utt x hdim
    tiled_weight = F.broadcast_to(
        F.expand_dims(att_weight, 2), self.enc_h.shape)
    context = F.sum(self.enc_h * tiled_weight, axis=1)

    return context, att_weight
def forward(self, xs):
    """Apply ``self.l`` then ``self.half`` and return the sum of the result."""
    halved = self.half(self.l(xs))
    return F.sum(chainer.as_variable(halved))
def loss_func_dcgan_dis_fake(h):
    """Return the element-mean of ``softplus(h)`` for fake samples."""
    n_elements = np.prod(h.data.shape)
    return F.sum(F.softplus(h)) / n_elements
def adv_loss(y, alpha=1.0):
    """Return the hinge on ``d(a, n) - d(a, p) - alpha``, averaged and halved.

    ``y`` is split into three equal parts along axis 0 (``a``, ``p``, ``n``)
    and ``d`` is the squared Euclidean distance along axis 1.
    """
    a, p, n = F.split_axis(y, 3, axis=0)
    dist_ap = F.sum((a - p)**2.0, axis=1)
    dist_an = F.sum((a - n)**2.0, axis=1)
    distance = -dist_ap + dist_an - alpha
    return F.average(F.relu(distance)) / 2
def __call__(self, obs):
    """Return the policy, the action values and the implied state value.

    The state value is the sum over axis 1 of the action probabilities
    multiplied by the corresponding Q-values.
    """
    action_distrib = self.pi(obs)
    action_value = self.q(obs)
    weighted_q = action_distrib.all_prob * action_value.q_values
    v = F.sum(weighted_q, axis=1)
    return action_distrib, action_value, v
def mse_gen(self, x, m, c):
    """Return the masked squared error between ``c`` and ``x`` per sample.

    ``m`` is broadcast to the shape of ``x`` and multiplies the difference
    ``c - x``. The per-sample squared L2 norms are summed and divided by
    ``len(x)``.
    """
    mask = F.broadcast_to(m, x.shape)
    masked_diff = mask * (c - x)
    per_sample = F.batch_l2_norm_squared(masked_diff)
    return F.sum(per_sample) / len(x)
def __call__(self, input_x, t, ignore_t):
    """Run the predictor and compute the six detection loss terms.

    Args:
        input_x: Input batch (a ``chainer.Variable`` or a raw array).
        t: Per-image lists of ground-truth boxes; each box is indexed with
            the keys ``"x"``, ``"y"``, ``"w"``, ``"h"`` and ``"label"``.
        ignore_t: Per-image lists of boxes (keys ``"x"``, ``"y"``, ``"w"``,
            ``"h"``) whose overlapping predictions get no confidence loss.

    Returns:
        tuple: ``(x_loss, y_loss, w_loss, h_loss, c_loss, p_loss)``, each a
        halved sum of (weighted) squared errors.
    """
    if isinstance(input_x, chainer.Variable):
        device = cuda.get_device(input_x.data)
    else:
        device = cuda.get_device(input_x)
    xp = self.predictor.xp
    # NOTE(review): the source was whitespace-collapsed; the whole remainder
    # is assumed to run inside this device context -- confirm.
    with device:
        output = self.predictor(input_x)
        batch_size, _, grid_h, grid_w = output.shape
        self.seen += batch_size
        # Split the per-box channels into x, y, w, h, confidence and classes.
        x, y, w, h, conf, prob = F.split_axis(F.reshape(
            output, (batch_size, self.predictor.n_boxes,
                     self.predictor.n_classes + 5, grid_h, grid_w)),
            (1, 2, 3, 4, 5), axis=2)
        x = F.sigmoid(x)
        y = F.sigmoid(y)
        conf = F.sigmoid(conf)
        prob = F.transpose(prob, (0, 2, 1, 3, 4))
        prob = F.softmax(prob)

        # training labels
        tw = np.zeros(w.shape, dtype=np.float32)
        th = np.zeros(h.shape, dtype=np.float32)
        tx = np.tile(0.5, x.shape).astype(np.float32)
        ty = np.tile(0.5, y.shape).astype(np.float32)

        # set low learning rate for bounding boxes that have no object
        if self.seen < self.unstable_seen:
            box_learning_scale = np.tile(0.1, x.shape).astype(np.float32)
        else:
            box_learning_scale = np.tile(0, x.shape).astype(np.float32)

        tconf = np.zeros(conf.shape, dtype=np.float32)
        conf_learning_scale = np.tile(0.1, conf.shape).astype(np.float32)

        # Class targets default to the current prediction (zero error).
        tprob = prob.data.copy()

        # Per-cell offsets and per-anchor sizes used to decode the boxes.
        x_shift = np.broadcast_to(np.arange(grid_w, dtype=np.float32),
                                  x.shape[1:])
        y_shift = np.broadcast_to(
            np.arange(grid_h, dtype=np.float32).reshape(grid_h, 1),
            y.shape[1:])
        w_anchor = np.broadcast_to(
            np.reshape(
                np.array(self.anchors, dtype=np.float32)[:, 0],
                (self.predictor.n_boxes, 1, 1, 1)), w.shape[1:])
        h_anchor = np.broadcast_to(
            np.reshape(
                np.array(self.anchors, dtype=np.float32)[:, 1],
                (self.predictor.n_boxes, 1, 1, 1)), h.shape[1:])
        x_data = cuda.to_cpu(x.data)
        y_data = cuda.to_cpu(y.data)
        w_data = cuda.to_cpu(w.data)
        h_data = cuda.to_cpu(h.data)

        # Best IoU of every predicted box with any ground-truth box.
        best_ious = []
        for batch in range(batch_size):
            n_truth_boxes = len(t[batch])
            box_x = (x_data[batch] + x_shift) / grid_w
            box_y = (y_data[batch] + y_shift) / grid_h
            box_w = np.exp(w_data[batch]) * w_anchor / grid_w
            box_h = np.exp(h_data[batch]) * h_anchor / grid_h

            ious = []
            for truth_index in range(n_truth_boxes):
                truth_box_x = np.broadcast_to(
                    np.array(t[batch][truth_index]["x"], dtype=np.float32),
                    box_x.shape)
                truth_box_y = np.broadcast_to(
                    np.array(t[batch][truth_index]["y"], dtype=np.float32),
                    box_y.shape)
                truth_box_w = np.broadcast_to(
                    np.array(t[batch][truth_index]["w"], dtype=np.float32),
                    box_w.shape)
                truth_box_h = np.broadcast_to(
                    np.array(t[batch][truth_index]["h"], dtype=np.float32),
                    box_h.shape)
                ious.append(
                    multi_box_iou(
                        Box(box_x, box_y, box_w, box_h),
                        Box(truth_box_x, truth_box_y, truth_box_w,
                            truth_box_h)))
            if len(ious) > 0:
                ious = np.asarray(ious)
                best_ious.append(np.max(ious, axis=0))
            else:
                best_ious.append(np.zeros_like(x_data[0]))
        best_ious = np.array(best_ious)

        # keep confidence of anchor that has more confidence than threshold
        tconf[best_ious > self.thresh] = conf.data.get()[
            best_ious > self.thresh]
        conf_learning_scale[best_ious > self.thresh] = 0

        # ignored regions are not considered either positive or negative
        best_ious = []
        for batch in range(batch_size):
            n_truth_boxes = len(ignore_t[batch])
            box_x = (x_data[batch] + x_shift) / grid_w
            box_y = (y_data[batch] + y_shift) / grid_h
            box_w = np.exp(w_data[batch]) * w_anchor / grid_w
            box_h = np.exp(h_data[batch]) * h_anchor / grid_h

            ious = []
            for truth_index in range(n_truth_boxes):
                truth_box_x = np.broadcast_to(
                    np.array(ignore_t[batch][truth_index]["x"],
                             dtype=np.float32), box_x.shape)
                truth_box_y = np.broadcast_to(
                    np.array(ignore_t[batch][truth_index]["y"],
                             dtype=np.float32), box_y.shape)
                truth_box_w = np.broadcast_to(
                    np.array(ignore_t[batch][truth_index]["w"],
                             dtype=np.float32), box_w.shape)
                truth_box_h = np.broadcast_to(
                    np.array(ignore_t[batch][truth_index]["h"],
                             dtype=np.float32), box_h.shape)
                ious.append(
                    multi_box_iou(
                        Box(box_x, box_y, box_w, box_h),
                        Box(truth_box_x, truth_box_y, truth_box_w,
                            truth_box_h)))
            if len(ious) > 0:
                ious = np.asarray(ious)
                best_ious.append(np.max(ious, axis=0))
            else:
                best_ious.append(np.zeros_like(x_data[0]))
        best_ious = np.array(best_ious)

        # do not update confidence for ignored regions
        tconf[best_ious > self.ignore_thresh] = conf.data.get()[
            best_ious > self.ignore_thresh]
        conf_learning_scale[best_ious > self.ignore_thresh] = 0

        # adjust x, y, w, h, conf, prob of anchor boxes that have objects
        abs_anchors = self.anchors / np.array([grid_w, grid_h])
        for batch in range(batch_size):
            for truth_box in t[batch]:
                # Grid cell containing the centre of the ground-truth box.
                truth_w = int(float(truth_box["x"]) * grid_w)
                truth_h = int(float(truth_box["y"]) * grid_h)
                # Pick the anchor whose shape overlaps the box the most.
                truth_n = 0
                best_iou = 0.0
                for anchor_index, abs_anchor in enumerate(abs_anchors):
                    iou = box_iou(
                        Box(0, 0, float(truth_box["w"]),
                            float(truth_box["h"])),
                        Box(0, 0, abs_anchor[0], abs_anchor[1]))
                    if best_iou < iou:
                        best_iou = iou
                        truth_n = anchor_index

                box_learning_scale[batch, truth_n, :, truth_h, truth_w] = 1.0
                tx[batch, truth_n, :, truth_h,
                   truth_w] = float(truth_box["x"]) * grid_w - truth_w
                ty[batch, truth_n, :, truth_h,
                   truth_w] = float(truth_box["y"]) * grid_h - truth_h
                tw[batch, truth_n, :, truth_h, truth_w] = np.log(
                    float(truth_box["w"]) / abs_anchors[truth_n][0])
                th[batch, truth_n, :, truth_h, truth_w] = np.log(
                    float(truth_box["h"]) / abs_anchors[truth_n][1])
                tprob[batch, :, truth_n, truth_h, truth_w] = 0
                tprob[batch, int(truth_box["label"]), truth_n, truth_h,
                      truth_w] = 1

                # The confidence target is the IoU between the ground truth
                # and the box currently predicted at that cell and anchor.
                full_truth_box = Box(float(truth_box["x"]),
                                     float(truth_box["y"]),
                                     float(truth_box["w"]),
                                     float(truth_box["h"]))
                predicted_box = Box(
                    (x[batch][truth_n][0][truth_h][truth_w].data.get() +
                     truth_w) / grid_w,
                    (y[batch][truth_n][0][truth_h][truth_w].data.get() +
                     truth_h) / grid_h,
                    np.exp(
                        w[batch][truth_n][0][truth_h][truth_w].data.get()) *
                    abs_anchors[truth_n][0],
                    np.exp(
                        h[batch][truth_n][0][truth_h][truth_w].data.get()) *
                    abs_anchors[truth_n][1])
                predicted_iou = box_iou(full_truth_box, predicted_box)
                tconf[batch, truth_n, :, truth_h, truth_w] = predicted_iou
                conf_learning_scale[batch, truth_n, :, truth_h,
                                    truth_w] = 10.0

        # Move the targets and weights to the GPU for the loss computation.
        tx = cuda.to_gpu(tx)
        ty = cuda.to_gpu(ty)
        tw = cuda.to_gpu(tw)
        th = cuda.to_gpu(th)
        tconf = cuda.to_gpu(tconf)
        tprob = cuda.to_gpu(tprob)

        box_learning_scale = cuda.to_gpu(box_learning_scale)
        conf_learning_scale = cuda.to_gpu(conf_learning_scale)

        x_loss = F.sum((tx - x)**2 * box_learning_scale) / 2
        y_loss = F.sum((ty - y)**2 * box_learning_scale) / 2
        w_loss = F.sum((tw - w)**2 * box_learning_scale) / 2
        h_loss = F.sum((th - h)**2 * box_learning_scale) / 2
        c_loss = F.sum((tconf - conf)**2 * conf_learning_scale) / 2
        p_loss = F.sum((tprob - prob)**2) / 2
        return x_loss, y_loss, w_loss, h_loss, c_loss, p_loss
def _readout_sum(x):
    """Reduce ``x`` by summing along axis 1 (the node axis)."""
    return functions.sum(x, axis=1)
def handle_gpu_batch(self, epoch, batches_passed, batch_start_time, k, g, \
                     att_images, att_images_mix, \
                     att_images_multi, att_images_multi_mix, \
                     att_images_real_multi, \
                     joints, batch_one_hot, \
                     objects, objs_one_hot, sentence, descriptions, descriptions_one_hot):
    """Compute all losses and gradients for the ``k``-th slice of a batch on GPU ``g``.

    The method slices every input to the ``k``-th chunk of
    ``self.batch_size // GPU.num_gpus`` samples, moves the data to device
    ``g``, runs the ``k``-th copy of each model, and calls ``backward`` on
    three losses (network, attention reconstruction, discriminator).
    No optimizer update happens in this method; it only fills gradients.
    A one-line progress report is written to stdout at the end.

    Args:
        epoch: Not used in this method.
        batches_passed: Counter shown in the progress line.
        batch_start_time: ``time.time()`` value used to print elapsed time.
        k: Index of the model replica / batch slice to process.
        g: CUDA device id the data is sent to.
        att_images, att_images_mix, att_images_multi, att_images_multi_mix,
        att_images_real_multi: Image arrays; each is sliced on axis 0 and
            transposed with ``(1, 0, 2, 3, 4)``, so they are 5-D
            (presumably batch x sequence x channel x height x width --
            TODO confirm against the caller).
        joints: Array transposed with ``(1, 0, 2)`` and then sliced on
            axis 1; used as the MDN target.
        batch_one_hot: Per-sample encoding reshaped to width 4.
        objects, descriptions: Integer labels per sample.
        objs_one_hot, descriptions_one_hot: One-hot encodings of the labels.
        sentence: Not used in this method.

    Returns:
        None.
    """
    xp = cuda.cupy
    cuda.get_device(g).use()

    # Start from clean gradients on every model replica used below.
    self.enc_models[k].cleargrads()
    self.att_enc_models[k].cleargrads()
    self.att_gen_models[k].cleargrads()
    self.dis_models[k].cleargrads()
    self.mdn_models[k].cleargrads()

    self.reset_all([self.mdn_models[k]])

    # Take the k-th contiguous chunk of the batch for this GPU.
    gpu_batch_size = self.batch_size // GPU.num_gpus
    att_images = att_images[k * gpu_batch_size:(k + 1) * gpu_batch_size]
    att_images_mix = att_images_mix[k * gpu_batch_size:(k + 1) * gpu_batch_size]
    att_images_multi = att_images_multi[k * gpu_batch_size:(k + 1) * gpu_batch_size]
    att_images_multi_mix = att_images_multi_mix[k * gpu_batch_size:(k + 1) * gpu_batch_size]
    att_images_real_multi = att_images_real_multi[k * gpu_batch_size:(k + 1) * gpu_batch_size]

    # Labels are given once per sample; repeat them for every sequence step.
    objects = np.asarray(objects[k * gpu_batch_size:(k + 1) * gpu_batch_size], dtype=np.int32)
    objects = np.repeat(objects[:, np.newaxis], self.sequence_size, axis=1)
    objs_one_hot = np.asarray(objs_one_hot[k * gpu_batch_size:(k + 1) * gpu_batch_size], dtype=np.float32)
    objs_one_hot = np.repeat(objs_one_hot[:, np.newaxis], self.sequence_size, axis=1)
    descriptions = np.asarray(descriptions[k * gpu_batch_size:(k + 1) * gpu_batch_size], dtype=np.int32)
    descriptions = np.repeat(descriptions[:, np.newaxis], self.sequence_size, axis=1)
    descriptions_one_hot = np.asarray(
        descriptions_one_hot[k * gpu_batch_size:(k + 1) * gpu_batch_size], dtype=np.float32)
    descriptions_one_hot = np.repeat(descriptions_one_hot[:, np.newaxis], self.sequence_size, axis=1)

    # Swap the first two axes so that the sequence axis comes first.
    att_images = att_images.transpose(1, 0, 2, 3, 4)
    att_images_mix = att_images_mix.transpose(1, 0, 2, 3, 4)
    att_images_multi = att_images_multi.transpose(1, 0, 2, 3, 4)
    att_images_multi_mix = att_images_multi_mix.transpose(1, 0, 2, 3, 4)
    att_images_real_multi = att_images_real_multi.transpose(1, 0, 2, 3, 4)
    objects = objects.transpose(1, 0)
    objs_one_hot = objs_one_hot.transpose(1, 0, 2)
    descriptions = descriptions.transpose(1, 0)
    descriptions_one_hot = descriptions_one_hot.transpose(1, 0, 2)

    # Flatten (sequence, batch) into a single leading axis.
    objects = np.squeeze(
        np.reshape(objects, (self.sequence_size * gpu_batch_size, -1)))
    objs_one_hot = np.squeeze(
        np.reshape(objs_one_hot, (self.sequence_size * gpu_batch_size, -1)))
    descriptions = np.squeeze(
        np.reshape(descriptions, (self.sequence_size * gpu_batch_size, -1)))
    descriptions_one_hot = np.squeeze(
        np.reshape(descriptions_one_hot, (self.sequence_size * gpu_batch_size, -1)))

    # joints is transposed before slicing, so the batch slice is on axis 1.
    joints = joints.transpose(1, 0, 2)
    joints = np.asarray(joints[:, k * gpu_batch_size:(k + 1) * gpu_batch_size], dtype=np.float32)
    joints = Variable(cuda.to_gpu(joints, g))

    batch_one_hot = np.asarray(batch_one_hot[k * gpu_batch_size:(k + 1) * gpu_batch_size], dtype=np.float32)
    batch_one_hot = np.repeat(batch_one_hot[np.newaxis], self.sequence_size, axis=0)
    batch_one_hot = np.reshape(batch_one_hot, (self.sequence_size * gpu_batch_size, 4))
    batch_one_hot = Variable(cuda.to_gpu(batch_one_hot, g))

    # Flatten every image stack to 4-D and send it to the GPU.
    att_images = np.reshape(
        att_images, (-1, self.num_channels, self.image_size, self.image_size))
    x_in_att = Variable(
        cuda.to_gpu(np.asarray(att_images, dtype=np.float32), g))
    att_images_mix = np.reshape(
        att_images_mix, (-1, self.num_channels, self.image_size, self.image_size))
    x_in_att_mix = Variable(
        cuda.to_gpu(np.asarray(att_images_mix, dtype=np.float32), g))
    att_images_multi = np.reshape(
        att_images_multi, (-1, self.num_channels, self.image_size, self.image_size))
    x_in_att_multi = Variable(
        cuda.to_gpu(np.asarray(att_images_multi, dtype=np.float32), g))
    att_images_multi_mix = np.reshape(
        att_images_multi_mix, (-1, self.num_channels, self.image_size, self.image_size))
    x_in_att_multi_mix = Variable(
        cuda.to_gpu(np.asarray(att_images_multi_mix, dtype=np.float32), g))
    att_images_real_multi = np.reshape(
        att_images_real_multi, (-1, self.num_channels, self.image_size, self.image_size))
    x_in_att_real_multi = Variable(
        cuda.to_gpu(np.asarray(att_images_real_multi, dtype=np.float32), g))

    objects_var = Variable(cuda.to_gpu(objects, g))
    desc_var = Variable(cuda.to_gpu(descriptions, g))
    objects_hot_var = Variable(cuda.to_gpu(objs_one_hot, g))
    desc_hot_var = Variable(cuda.to_gpu(descriptions_one_hot, g))

    # Run the encoder on each of the five image sets. Each call returns an
    # attention map plus two sets of logits (scored against objects_var and
    # desc_var below).
    att0, s0, c0 = self.enc_models[k](x_in_att, objects_hot_var, desc_hot_var, train=True)
    m_att0, m_s0, m_c0 = self.enc_models[k](x_in_att_mix, objects_hot_var, desc_hot_var, train=True)
    att00, s00, c00 = self.enc_models[k](x_in_att_multi, objects_hot_var, desc_hot_var, train=True)
    m_att00, m_s00, m_c00 = self.enc_models[k](x_in_att_multi_mix, objects_hot_var, desc_hot_var, train=True)
    real_att0, real_s0, real_c0 = self.enc_models[k](x_in_att_real_multi, objects_hot_var, desc_hot_var, train=True)

    # Mean attention value over the five maps. This is a plain sum, so it is
    # an L1 norm only if the maps are non-negative -- TODO confirm.
    l1_norm_att = F.sum(att0)
    l1_norm_att += F.sum(m_att0)
    l1_norm_att += F.sum(att00)
    l1_norm_att += F.sum(m_att00)
    l1_norm_att += F.sum(real_att0)
    l1_norm_att /= 5 * gpu_batch_size * self.sequence_size * self.att_size * self.att_size

    # Reshape each attention map to one channel and upsample to image size.
    # att0 = F.normalize(att0, axis=1)
    att0 = F.reshape(att0, (-1, 1, self.att_size, self.att_size))
    att0 = F.resize_images(att0, (self.image_size, self.image_size))

    # m_att0 = F.normalize(m_att0, axis=1)
    m_att0 = F.reshape(m_att0, (-1, 1, self.att_size, self.att_size))
    m_att0 = F.resize_images(m_att0, (self.image_size, self.image_size))

    # att00 = F.normalize(att00, axis=1)
    att00 = F.reshape(att00, (-1, 1, self.att_size, self.att_size))
    att00 = F.resize_images(att00, (self.image_size, self.image_size))

    # m_att00 = F.normalize(m_att00, axis=1)
    m_att00 = F.reshape(m_att00, (-1, 1, self.att_size, self.att_size))
    m_att00 = F.resize_images(m_att00, (self.image_size, self.image_size))

    # real_att0 = F.normalize(real_att0, axis=1)
    real_att0 = F.reshape(real_att0, (-1, 1, self.att_size, self.att_size))
    real_att0 = F.resize_images(real_att0, (self.image_size, self.image_size))

    # Classification loss of the encoder heads: ten cross-entropy terms averaged.
    att_classification = F.softmax_cross_entropy(
        s0, objects_var) + F.softmax_cross_entropy(c0, desc_var)
    att_classification += F.softmax_cross_entropy(
        m_s0, objects_var) + F.softmax_cross_entropy(m_c0, desc_var)
    att_classification += F.softmax_cross_entropy(
        s00, objects_var) + F.softmax_cross_entropy(c00, desc_var)
    att_classification += F.softmax_cross_entropy(
        m_s00, objects_var) + F.softmax_cross_entropy(m_c00, desc_var)
    att_classification += F.softmax_cross_entropy(
        real_s0, objects_var) + F.softmax_cross_entropy(real_c0, desc_var)
    att_classification /= 10

    # Attention-masked images.
    g1 = x_in_att * att0
    g2 = x_in_att_mix * m_att0
    g3 = x_in_att_multi * att00
    g4 = x_in_att_multi_mix * m_att00
    g5 = x_in_att_real_multi * real_att0

    # The masked image and its "mix" counterpart are pushed to be equal.
    att_similarity = F.mean_squared_error(g1, g2)
    att_similarity += F.mean_squared_error(g3, g4)

    # Encode the masked images into latent samples and distribution parameters.
    cir_z, cir_mean, cir_var, _ = self.att_enc_models[k](g1, train=True)
    cir_z_m, cir_mean_m, cir_var_m, _ = self.att_enc_models[k](g2, train=True)
    cir_z0, cir_mean0, cir_var0, _ = self.att_enc_models[k](g3, train=True)
    cir_z0_m, cir_mean0_m, cir_var0_m, _ = self.att_enc_models[k](
        g4, train=True)
    cir_z_real, cir_mean_real, cir_var_real, _ = self.att_enc_models[k](
        g5, train=True)

    # KL prior term. Each term is already divided by 5 * self.normer and the
    # sum is divided by 5 once more.
    # NOTE(review): F.gaussian_kl_divergence expects log-variance as its second
    # argument -- confirm that cir_var* holds ln(var).
    l_prior = F.gaussian_kl_divergence(cir_mean, cir_var) / (5 * self.normer)
    l_prior += F.gaussian_kl_divergence(cir_mean_m, cir_var_m) / (5 * self.normer)
    l_prior += F.gaussian_kl_divergence(cir_mean0, cir_var0) / (5 * self.normer)
    l_prior += F.gaussian_kl_divergence(cir_mean0_m, cir_var0_m) / (5 * self.normer)
    l_prior += F.gaussian_kl_divergence(cir_mean_real, cir_var_real) / (5 * self.normer)
    l_prior /= 5

    # Decode the latents. Channels [:3] are compared with the input image and
    # channels [3:] with the masked image below.
    cir_x0 = self.att_gen_models[k](cir_z, train=True)
    cir_m_x0 = self.att_gen_models[k](cir_z_m, train=True)
    cir_x00 = self.att_gen_models[k](cir_z0, train=True)
    cir_m_x00 = self.att_gen_models[k](cir_z0_m, train=True)
    cir_real_x0 = self.att_gen_models[k](cir_z_real, train=True)

    # Both the plain and the "mix" reconstruction are compared with the
    # non-mix input image.
    reconstruction_loss = F.mean_squared_error(
        x_in_att, cir_x0[:, :3]) + F.mean_squared_error(
            x_in_att, cir_m_x0[:, :3])
    reconstruction_loss += F.mean_squared_error(
        x_in_att_multi, cir_x00[:, :3]) + F.mean_squared_error(
            x_in_att_multi, cir_m_x00[:, :3])

    reconstruction_loss_att = F.mean_squared_error(
        g1, cir_x0[:, 3:]) + F.mean_squared_error(g2, cir_m_x0[:, 3:])
    reconstruction_loss_att += F.mean_squared_error(
        g3, cir_x00[:, 3:]) + F.mean_squared_error(g4, cir_m_x00[:, 3:])

    reconstruction_loss /= 4
    reconstruction_loss_att /= 4
    reconstruction_loss_att *= 100

    # Discriminator on the reconstructed images; the third output is used as a
    # feature vector in l_feature_similarity below.
    s3, c3, l3 = self.dis_models[k](cir_x0[:, :3], train=True)
    m_s3, m_c3, m_l3 = self.dis_models[k](cir_m_x0[:, :3], train=True)
    s30, c30, l30 = self.dis_models[k](cir_x00[:, :3], train=True)
    m_s30, m_c30, m_l30 = self.dis_models[k](cir_m_x00[:, :3], train=True)
    m_s30_real, m_c30_real, m_l30_real = self.dis_models[k](
        cir_real_x0[:, :3], train=True)

    # Reconstructed images are scored against an all-zero label vector on both
    # heads (presumably class 0 is the "fake" class -- TODO confirm).
    l_dis_rec_3 = F.softmax_cross_entropy(
        s3,
        Variable(
            cuda.to_gpu(
                xp.zeros(gpu_batch_size * self.sequence_size).astype(
                    np.int32), g)))
    m_l_dis_rec_3 = F.softmax_cross_entropy(
        m_s3,
        Variable(
            cuda.to_gpu(
                xp.zeros(gpu_batch_size * self.sequence_size).astype(
                    np.int32), g)))
    l_dis_rec3 = F.softmax_cross_entropy(
        s30,
        Variable(
            cuda.to_gpu(
                xp.zeros(gpu_batch_size * self.sequence_size).astype(
                    np.int32), g)))
    m_l_dis_rec3 = F.softmax_cross_entropy(
        m_s30,
        Variable(
            cuda.to_gpu(
                xp.zeros(gpu_batch_size * self.sequence_size).astype(
                    np.int32), g)))
    real_l_dis_rec3 = F.softmax_cross_entropy(
        m_s30_real,
        Variable(
            cuda.to_gpu(
                xp.zeros(gpu_batch_size * self.sequence_size).astype(
                    np.int32), g)))

    l_dis_rec_3 += F.softmax_cross_entropy(
        c3,
        Variable(
            cuda.to_gpu(
                xp.zeros(gpu_batch_size * self.sequence_size).astype(
                    np.int32), g)))
    m_l_dis_rec_3 += F.softmax_cross_entropy(
        m_c3,
        Variable(
            cuda.to_gpu(
                xp.zeros(gpu_batch_size * self.sequence_size).astype(
                    np.int32), g)))
    l_dis_rec3 += F.softmax_cross_entropy(
        c30,
        Variable(
            cuda.to_gpu(
                xp.zeros(gpu_batch_size * self.sequence_size).astype(
                    np.int32), g)))
    m_l_dis_rec3 += F.softmax_cross_entropy(
        m_c30,
        Variable(
            cuda.to_gpu(
                xp.zeros(gpu_batch_size * self.sequence_size).astype(
                    np.int32), g)))
    real_l_dis_rec3 += F.softmax_cross_entropy(
        m_c30_real,
        Variable(
            cuda.to_gpu(
                xp.zeros(gpu_batch_size * self.sequence_size).astype(
                    np.int32), g)))

    l_dis_fake = (l_dis_rec_3 + m_l_dis_rec_3 + l_dis_rec3 + m_l_dis_rec3 +
                  real_l_dis_rec3) / 10

    # Discriminator on the input images, scored against the true labels.
    s2, c2, l2 = self.dis_models[k](x_in_att, train=True)
    s22, c22, l22 = self.dis_models[k](x_in_att_multi, train=True)

    l_dis_real = F.softmax_cross_entropy(s2, objects_var)
    l_dis_real += F.softmax_cross_entropy(s22, objects_var)
    l_dis_real += F.softmax_cross_entropy(c2, desc_var)
    l_dis_real += F.softmax_cross_entropy(c22, desc_var)
    l_dis_real /= 4

    # Match discriminator features of reconstructions to those of the inputs.
    # Four terms are summed but the divisor is 8.
    l_feature_similarity = F.mean_squared_error(
        l3, l2) + F.mean_squared_error(m_l3, l2)
    l_feature_similarity += F.mean_squared_error(
        l30, l22) + F.mean_squared_error(m_l30, l22)
    l_feature_similarity /= 8

    # Restore the (sequence, batch, feature) layout for the MDN model.
    text_encoding = F.concat(
        (batch_one_hot, objects_hot_var, desc_hot_var), axis=-1)
    text_encoding = F.reshape(text_encoding,
                              (self.sequence_size, gpu_batch_size, -1))
    z_seq = F.reshape(
        cir_z, (self.sequence_size, gpu_batch_size, self.latent_size))
    z_seq_mix = F.reshape(
        cir_z_m, (self.sequence_size, gpu_batch_size, self.latent_size))

    # Latents of steps [:-1] are paired with joints of steps [1:]; only the
    # first step of text_encoding is passed as the task encoding.
    mdn_loss, _ = self.mdn_models[k](task_encoding=text_encoding[0],
                                     image_encoding=z_seq[:-1],
                                     data_out=joints[1:],
                                     return_sample=False)
    mdn_loss_mix, _ = self.mdn_models[k](task_encoding=text_encoding[0],
                                         image_encoding=z_seq_mix[:-1],
                                         data_out=joints[1:],
                                         return_sample=False)

    robot_loss = (mdn_loss + mdn_loss_mix) / 2

    dis_loss = (l_dis_fake + 10 * l_dis_real) / (gpu_batch_size * self.sequence_size)

    loss_classifier = att_classification
    loss_enc = 10 * l_prior + 10 * l_feature_similarity + 10 * att_similarity + 2 * l1_norm_att
    # The generator is rewarded for increasing the discriminator loss.
    loss_gen = 2 * l_feature_similarity + 200 * reconstruction_loss - dis_loss
    loss_dis = dis_loss

    self.enc_models[k].cleargrads()
    self.att_enc_models[k].cleargrads()
    self.att_gen_models[k].cleargrads()
    self.mdn_models[k].cleargrads()

    # 1) Backprop the combined network loss through the whole graph.
    loss_net = loss_enc + loss_gen + loss_classifier + robot_loss / 5
    loss_net.backward()

    # 2) Cut the graph at the masked images so that the attention
    #    reconstruction loss does not propagate past g1..g5.
    g1.unchain_backward()
    g2.unchain_backward()
    g3.unchain_backward()
    g4.unchain_backward()
    g5.unchain_backward()

    reconstruction_loss_att.backward()

    # 3) Cut the graph at the generated images, drop the discriminator
    #    gradients accumulated by step 1, then backprop the discriminator loss
    #    on its own.
    cir_x0.unchain_backward()
    cir_m_x0.unchain_backward()
    cir_x00.unchain_backward()
    cir_m_x00.unchain_backward()
    cir_real_x0.unchain_backward()

    self.dis_models[k].cleargrads()
    loss_dis.backward()

    # '\r' rewrites the same console line; the total of 1000 is hard-coded.
    sys.stdout.write(
        '\r' + str(batches_passed) + '/' + str(1000) +
        ' time: {0:0.2f}, enc:{1:0.4f}, gen:{2:0.4f}, dis:{3:0.4f}, l_prior:{4:0.4f}, fea:{5:0.4f}, att_sim:{6:0.4f}, rec:{7:0.4f}, att_rec:{8:0.4f}, att_class:{9:0.4f}, norm:{10:0.4f}, mdn_loss:{11:0.4f}'
        .format(time.time() - batch_start_time, float(loss_enc.data),
                float(loss_gen.data), float(loss_dis.data),
                float(l_prior.data), float(l_feature_similarity.data),
                float(att_similarity.data), float(
                    reconstruction_loss.data),
                float(reconstruction_loss_att.data),
                float(att_classification.data), float(l1_norm_att.data),
                float(robot_loss.data)))
    sys.stdout.flush()  # important
def original(self, hs, ys):
    '''Decoder forward

    Runs the attention decoder over the target sequences and computes the
    loss and the accuracy. With probability ``self.sampling_probability`` a
    step (after the first) is fed the decoder's own previous prediction
    instead of the reference token. If ``self.labeldist`` is set, a
    label-smoothing term weighted by ``self.lsm_weight`` is mixed into the
    loss.

    :param Variable hs: encoder outputs, one entry per utterance (passed to
        ``self.att``; ``h.shape[0]`` is logged as the input length)
    :param Variable ys: target ID sequences, one entry per utterance
    :return: tuple ``(self.loss, acc)``
    '''
    self.loss = None
    # prepare input and output word sequences with sos/eos IDs
    eos = self.xp.array([self.eos], 'i')
    sos = self.xp.array([self.sos], 'i')
    ys_in = [F.concat([sos, y], axis=0) for y in ys]
    ys_out = [F.concat([y, eos], axis=0) for y in ys]

    # pad decoder inputs with eos and targets with -1 (the ignored label)
    # pys: utt x olen
    pad_ys_in = F.pad_sequence(ys_in, padding=self.eos)
    pad_ys_out = F.pad_sequence(ys_out, padding=-1)

    # get dim, length info
    batch = pad_ys_out.shape[0]
    olength = pad_ys_out.shape[1]
    logging.info(self.__class__.__name__ + ' input lengths:  ' + str(self.xp.array([h.shape[0] for h in hs])))
    logging.info(self.__class__.__name__ + ' output lengths: ' + str(self.xp.array([y.shape[0] for y in ys_out])))

    # initialization
    c_list = [None]  # list of cell state of each layer
    z_list = [None]  # list of hidden state of each layer
    for l in six.moves.range(1, self.dlayers):
        c_list.append(None)
        z_list.append(None)
    att_w = None
    z_all = []
    self.att.reset()  # reset pre-computation of h

    # pre-computation of embedding
    eys = self.embed(pad_ys_in)  # utt x olen x zdim
    eys = F.separate(eys, axis=1)

    # loop for an output sequence
    for i in six.moves.range(olength):
        att_c, att_w = self.att(hs, z_list[0], att_w)
        if i > 0 and random.random() < self.sampling_probability:
            # scheduled sampling: feed back the argmax of the previous output
            logging.info(' scheduled sampling ')
            z_out = self.output(z_all[-1])
            z_out = F.argmax(F.log_softmax(z_out), axis=1)
            z_out = self.embed(z_out)
            ey = F.hstack((z_out, att_c))  # utt x (zdim + hdim)
        else:
            ey = F.hstack((eys[i], att_c))  # utt x (zdim + hdim)
        c_list[0], z_list[0] = self.lstm0(c_list[0], z_list[0], ey)
        # each upper layer takes the hidden state of the layer below as input
        for l in six.moves.range(1, self.dlayers):
            c_list[l], z_list[l] = self['lstm%d' % l](c_list[l], z_list[l], z_list[l - 1])
        z_all.append(z_list[-1])

    z_all = F.reshape(F.stack(z_all, axis=1), (batch * olength, self.dunits))
    # compute loss
    y_all = self.output(z_all)
    self.loss = F.softmax_cross_entropy(y_all, F.flatten(pad_ys_out))
    # scale the loss by the mean input length minus one
    # -1: eos, which is removed in the loss computation
    self.loss *= (np.mean([len(x) for x in ys_in]) - 1)
    acc = F.accuracy(y_all, F.flatten(pad_ys_out), ignore_label=-1)
    logging.info('att loss:' + str(self.loss.data))

    # show predicted character sequence for debug
    if self.verbose > 0 and self.char_list is not None:
        y_hat = F.reshape(y_all, (batch, olength, -1))
        y_true = pad_ys_out
        for (i, y_hat_), y_true_ in zip(enumerate(y_hat.data), y_true.data):
            if i == MAX_DECODER_OUTPUT:
                break
            # drop padded positions (label -1) before decoding to characters
            idx_hat = self.xp.argmax(y_hat_[y_true_ != -1], axis=1)
            idx_true = y_true_[y_true_ != -1]
            seq_hat = [self.char_list[int(idx)] for idx in idx_hat]
            seq_true = [self.char_list[int(idx)] for idx in idx_true]
            seq_hat = "".join(seq_hat).replace('<space>', ' ')
            seq_true = "".join(seq_true).replace('<space>', ' ')
            logging.info("groundtruth[%d]: " % i + seq_true)
            logging.info("prediction [%d]: " % i + seq_hat)

    if self.labeldist is not None:
        # build the label-distribution Variable once and cache it on self
        if self.vlabeldist is None:
            self.vlabeldist = chainer.Variable(self.xp.asarray(self.labeldist))
        loss_reg = - F.sum(F.scale(F.log_softmax(y_all), self.vlabeldist, axis=1)) / len(ys_in)
        self.loss = (1. - self.lsm_weight) * self.loss + self.lsm_weight * loss_reg

    return self.loss, acc
def compute_entropy(self, p):
    """Return the entropy ``-sum(p * log(p + 1e-16))`` of ``p``.

    A 2-D ``p`` is reduced over axis 1 (one value per row); any other rank
    is reduced over all elements.
    """
    # Pick the reduction before building the expression, as the original did.
    reduce_kwargs = {'axis': 1} if p.ndim == 2 else {}
    return -functions.sum(p * functions.log(p + 1e-16), **reduce_kwargs)
def __call__(self, enc_hs, dec_z, att_prev, scaling=2.0):
    '''AttLoc forward

    Computes one step of location-aware attention. On the first call after
    a reset (``self.pre_compute_enc_h is None``) the padded encoder states
    and their projection are cached on ``self`` and reused by later calls.

    :param enc_hs: sequence of encoder states, one entry per utterance
        (``hh.shape[0]`` is used as the number of frames)
    :param dec_z: decoder hidden state, or None to use zeros
    :param att_prev: previous attention weights, or None to start from a
        uniform distribution over each utterance's frames
    :param scaling: factor applied to the energies before the softmax
    :return: tuple ``(c, w)`` of the context vector (utt x hdim) and the
        attention weights (utt x frame)
    '''
    batch = len(enc_hs)
    # pre-compute all h outside the decoder loop
    if self.pre_compute_enc_h is None:
        self.enc_h = F.pad_sequence(enc_hs)  # utt x frame x hdim
        self.h_length = self.enc_h.shape[1]
        # utt x frame x att_dim
        self.pre_compute_enc_h = linear_tensor(self.mlp_enc, self.enc_h)

    if dec_z is None:
        dec_z = chainer.Variable(
            self.xp.zeros((batch, self.dunits), dtype=np.float32))
    else:
        dec_z = F.reshape(dec_z, (batch, self.dunits))

    # initialize attention weight with uniform dist.
    if att_prev is None:
        att_prev = [
            self.xp.full(hh.shape[0], 1.0 / hh.shape[0], dtype=np.float32)
            for hh in enc_hs
        ]
        att_prev = [chainer.Variable(att) for att in att_prev]
        att_prev = F.pad_sequence(att_prev)

    # TODO(watanabe) use <chainer variable>.reshape(), instead of F.reshape()
    # att_prev: utt x frame -> utt x 1 x 1 x frame -> utt x att_conv_chans x 1 x frame
    att_conv = self.loc_conv(
        F.reshape(att_prev, (batch, 1, 1, self.h_length)))
    # att_conv: utt x att_conv_chans x 1 x frame -> utt x frame x att_conv_chans
    att_conv = F.swapaxes(F.squeeze(att_conv, axis=2), 1, 2)
    # att_conv: utt x frame x att_conv_chans -> utt x frame x att_dim
    att_conv = linear_tensor(self.mlp_att, att_conv)

    # dec_z_tiled: utt x frame x att_dim
    dec_z_tiled = F.broadcast_to(F.expand_dims(self.mlp_dec(dec_z), 1),
                                 self.pre_compute_enc_h.shape)

    # dot with gvec
    # utt x frame x att_dim -> utt x frame
    # TODO(watanabe) use batch_matmul
    e = F.squeeze(linear_tensor(
        self.gvec,
        F.tanh(att_conv + self.pre_compute_enc_h + dec_z_tiled)),
                  axis=2)
    # Applying a minus-large-number filter to make a probability value zero for a padded area
    # simply degrades the performance, and I gave up this implementation
    # Apply a scaling to make an attention sharp
    w = F.softmax(scaling * e)

    # weighted sum over frames
    # utt x hdim
    c = F.sum(self.enc_h *
              F.broadcast_to(F.expand_dims(w, 2), self.enc_h.shape),
              axis=1)

    return c, w
def loss_enc(self, enc, y_real):
    """Return the encoder loss: summed softplus of ``y_real`` per sample.

    The value is also reported under the key ``'loss'`` for ``enc``.
    """
    # original note: G(x, E(x))->1
    n_samples = len(y_real)
    enc_loss = F.sum(F.softplus(y_real)) / n_samples
    chainer.report({'loss': enc_loss}, enc)
    return enc_loss
def sigmoid_cross_entropy(x, z):
    """Return the sum over all elements of ``relu(x) - x*z + log(1 + exp(-|x|))``.

    ``x`` holds the logits and ``z`` the targets; the exponent is never
    positive, so ``exp`` cannot overflow.
    """
    positive_part = F.relu(x)
    log_term = F.log(1 + F.exp(-abs(x)))
    return F.sum(positive_part - x * z + log_term)
def __call__(self, x, test=False):
    """Encode ``x``, average the encoding over axis 0 and squeeze the result."""
    encoded = self.encode(x, test)
    averaged = F.sum(encoded, axis=0) / encoded.shape[0]
    return F.squeeze(averaged)
dis_total_loss += dis_loss1.data dis_loss1.backward() optimizer_dis.update() optimizer_cla.update() #-------------------- # generated data #-------------------- cla.cleargrads() # classifier dis.cleargrads() # discriminator gen.cleargrads() # generator rcls = np.random.choice([0, 1], bs2) x3 = gen(rcls) yy3 = F.softmax(cla.fwd(x3)) cla_loss6 = F.sum(F.matmul(yy3, F.transpose(yy3))) cls_total_loss += cla_loss6.data cla_loss6.backward() ## CLS loss 1 optimizer_cla.update() optimizer_gen.update() dis.cleargrads() # discriminator gen.cleargrads() # generator x3 = gen(rcls) yg0 = rcls.reshape(len(rcls), 1).astype(dtype='float32') yx3 = F.hstack([yg0, x3]) one3 = Variable(np.ones(len(yx3)).astype(dtype='int32')) # dis_loss3 = dis(yx3,one3,train=False) ## DIS loss 3
def main():
    """Train the model on this MPI rank's share of the dataset.

    All settings are read from the module-level ``args``. The function:

    1. creates the snapshot directory (best effort),
    2. creates a ChainerMN communicator and selects the GPU given by the
       rank's ``intra_rank``,
    3. loads a rank-specific subset of the files in ``args.dataset_path``
       with ``np.load``, scales them by 1/256 and keeps the first 90% as
       training images,
    4. builds the hyperparameters, the GRU or LSTM model and the optimizer,
    5. runs ``args.training_steps`` passes over the data, minimising
       ``loss_beta * nll + kld + loss_alpha * sse``.

    Rank 0 additionally saves the hyperparameters, serialises the model
    (whenever ``batch_index`` is a positive multiple of 100 and after every
    pass) and prints a summary line per pass.
    """
    try:
        os.mkdir(args.snapshot_directory)
    except OSError:
        # Best effort: the directory usually exists already (or another rank
        # created it first). Only OSError is ignored so that
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        pass

    comm = chainermn.create_communicator()
    device = comm.intra_rank
    cuda.get_device(device).use()

    # Give every rank a window of ceil(len(files) / size) files by rotating
    # the sorted file list; the last windows wrap around to the first files.
    images = []
    files = os.listdir(args.dataset_path)
    files.sort()
    subset_size = int(math.ceil(len(files) / comm.size))
    files = deque(files)
    files.rotate(-subset_size * comm.rank)
    files = list(files)[:subset_size]
    for filename in files:
        image = np.load(os.path.join(args.dataset_path, filename))
        image = image / 256
        images.append(image)

    print(comm.rank, files)

    # Stack along axis 0 and move the last axis to position 1
    # (presumably NHWC -> NCHW -- TODO confirm against the dataset files).
    images = np.vstack(images)
    images = images.transpose((0, 3, 1, 2)).astype(np.float32)
    train_dev_split = 0.9
    num_images = images.shape[0]
    num_train_images = int(num_images * train_dev_split)
    images_train = images[:num_train_images]

    # To avoid OpenMPI bug
    # multiprocessing.set_start_method("forkserver")
    # p = multiprocessing.Process(target=print, args=("", ))
    # p.start()
    # p.join()

    hyperparams = HyperParameters()
    hyperparams.chz_channels = args.chz_channels
    hyperparams.generator_generation_steps = args.generation_steps
    hyperparams.generator_share_core = args.generator_share_core
    hyperparams.generator_share_prior = args.generator_share_prior
    hyperparams.generator_share_upsampler = args.generator_share_upsampler
    hyperparams.generator_downsampler_channels = args.generator_downsampler_channels
    hyperparams.inference_share_core = args.inference_share_core
    hyperparams.inference_share_posterior = args.inference_share_posterior
    hyperparams.inference_downsampler_channels = args.inference_downsampler_channels
    hyperparams.batch_normalization_enabled = args.enable_batch_normalization
    hyperparams.use_gru = args.use_gru
    hyperparams.no_backprop_diff_xr = args.no_backprop_diff_xr

    if comm.rank == 0:
        hyperparams.save(args.snapshot_directory)
        hyperparams.print()

    if args.use_gru:
        model = GRUModel(hyperparams,
                         snapshot_directory=args.snapshot_directory)
    else:
        model = LSTMModel(hyperparams,
                          snapshot_directory=args.snapshot_directory)
    model.to_gpu()

    optimizer = AdamOptimizer(model.parameters,
                              lr_i=args.initial_lr,
                              lr_f=args.final_lr,
                              beta_1=args.adam_beta1,
                              communicator=comm)
    if comm.rank == 0:
        optimizer.print()

    num_pixels = images.shape[1] * images.shape[2] * images.shape[3]

    dataset = draw.data.Dataset(images_train)
    iterator = draw.data.Iterator(dataset, batch_size=args.batch_size)

    num_updates = 0

    for iteration in range(args.training_steps):
        mean_kld = 0
        mean_nll = 0
        mean_mse = 0
        start_time = time.time()

        for batch_index, data_indices in enumerate(iterator):
            x = dataset[data_indices]
            # Add uniform noise of one quantisation step (1/256).
            x += np.random.uniform(0, 1 / 256, size=x.shape)
            x = to_gpu(x)

            z_t_param_array, x_param, r_t_array = model.sample_z_and_x_params_from_posterior(
                x)

            # KL term summed over all entries of z_t_param_array.
            loss_kld = 0
            for params in z_t_param_array:
                mean_z_q, ln_var_z_q, mean_z_p, ln_var_z_p = params
                kld = draw.nn.functions.gaussian_kl_divergence(
                    mean_z_q, ln_var_z_q, mean_z_p, ln_var_z_p)
                loss_kld += cf.sum(kld)

            # Squared error between every entry of r_t_array and the input.
            loss_sse = 0
            for r_t in r_t_array:
                loss_sse += cf.sum(cf.squared_error(r_t, x))

            mu_x, ln_var_x = x_param

            loss_nll = cf.gaussian_nll(x, mu_x, ln_var_x)

            loss_nll /= args.batch_size
            loss_kld /= args.batch_size
            loss_sse /= args.batch_size
            loss = args.loss_beta * loss_nll + loss_kld + args.loss_alpha * loss_sse

            model.cleargrads()
            loss.backward(loss_scale=optimizer.loss_scale())
            optimizer.update(num_updates, loss_value=float(loss.array))

            num_updates += 1
            mean_kld += float(loss_kld.data)
            mean_nll += float(loss_nll.data)
            mean_mse += float(loss_sse.data) / num_pixels / (
                hyperparams.generator_generation_steps - 1)

            # log(256) undoes the 1/256 input scaling in the per-pixel NLL.
            printr(
                "Iteration {}: Batch {} / {} - loss: nll_per_pixel: {:.6f} - mse: {:.6f} - kld: {:.6f} - lr: {:.4e}"
                .format(
                    iteration + 1, batch_index + 1, len(iterator),
                    float(loss_nll.data) / num_pixels + math.log(256.0),
                    float(loss_sse.data) / num_pixels /
                    (hyperparams.generator_generation_steps - 1),
                    float(loss_kld.data), optimizer.learning_rate))

            # Periodic checkpoint from rank 0 only.
            if comm.rank == 0 and batch_index > 0 and batch_index % 100 == 0:
                model.serialize(args.snapshot_directory)

        if comm.rank == 0:
            model.serialize(args.snapshot_directory)

        if comm.rank == 0:
            elapsed_time = time.time() - start_time
            print(
                "\r\033[2KIteration {} - loss: nll_per_pixel: {:.6f} - mse: {:.6f} - kld: {:.6f} - lr: {:.4e} - elapsed_time: {:.3f} min"
                .format(
                    iteration + 1,
                    mean_nll / len(iterator) / num_pixels + math.log(256.0),
                    mean_mse / len(iterator), mean_kld / len(iterator),
                    optimizer.learning_rate, elapsed_time / 60))
def square_norm(x, y):
    """Return the summed squared difference of ``log(x)`` and ``log(y)``.

    The sum is divided by ``batchsize``, a name defined outside this function.
    """
    log_gap = F.log(x) - F.log(y)
    return F.sum(log_gap ** 2) / batchsize
def __call__(self, x):
    """Evaluate the training objective for the batch ``x``.

    The input is encoded into q(a|x), one sample ``a`` is drawn, and the
    parameters of q(z|x, a) are computed. ``self.num_zsamples`` samples of
    ``z`` are then drawn and the log-density terms are averaged over them.
    The two ``z`` terms (``logp_z`` and ``logq_z_ax``) are scaled by a
    temperature that is capped at 1.0 and incremented on every call.

    Side effects: sets ``self.obj_batch``, ``self.kl``, ``self.logp``,
    ``self.timing_info`` (encoding time, mean decoding time) and
    ``self.obj``; advances ``self.temperature['value']``.

    Args:
        x: Input batch passed to the encoders and scored by the decoder.

    Returns:
        ``self.obj``: the negated sum of ``self.obj_batch`` divided by the
        batch size (``self.obj_batch.shape[0]``).
    """
    # Compute parameters for q(a|x) and time the call.
    encoding_time_1 = time.time()
    qmu_a, qln_var_a = self.encode_a(x)
    encoding_time_1 = float(time.time() - encoding_time_1)

    # a ~ q(a|x); sampling is excluded from the timing.
    a_enc = F.gaussian(qmu_a, qln_var_a)

    # Compute parameters for q(z|x, a)
    encoding_time_2 = time.time()
    qmu_z, qln_var_z = self.encode_z(x, a_enc)
    encoding_time_2 = float(time.time() - encoding_time_2)

    encoding_time = encoding_time_1 + encoding_time_2
    decoding_time_average = 0.

    self.kl = 0
    self.logp = 0

    logp_a_xz = 0
    logp_x_z = 0
    logp_z = 0
    logq_a_x = 0
    logq_z_ax = 0

    # Read the temperature for this call first, then advance it.
    current_temperature = min(self.temperature['value'], 1.0)
    self.temperature['value'] += self.temperature['increment']

    for _ in range(self.num_zsamples):
        # z ~ q(z|x, a)
        # Sample from the parameters just returned by encode_z, the same ones
        # logq_z_ax is evaluated with below. The original read
        # self.qmu_z / self.qln_var_z, which this method never assigns.
        z = F.gaussian(qmu_z, qln_var_z)

        # Compute p(x|z)
        decoding_time = time.time()
        pmu_a, pln_var_a = self.decode_a(z, x)
        p_ber_prob_logit = self.decode(z)
        decoding_time = time.time() - decoding_time
        decoding_time_average += decoding_time

        logp_a_xz += gaussian_logp(a_enc, pmu_a, pln_var_a)
        logp_x_z += bernoulli_logp(x, p_ber_prob_logit)
        logp_z += current_temperature * gaussian_logp0(z)
        logq_a_x += gaussian_logp(a_enc, qmu_a, qln_var_a)
        logq_z_ax += current_temperature * gaussian_logp(
            z, qmu_z, qln_var_z)

    # Average the accumulated terms over the z samples.
    logp_a_xz /= self.num_zsamples
    logp_x_z /= self.num_zsamples
    logp_z /= self.num_zsamples
    logq_a_x /= self.num_zsamples
    logq_z_ax /= self.num_zsamples

    decoding_time_average /= self.num_zsamples

    self.obj_batch = logp_a_xz + logp_x_z + logp_z - logq_a_x - logq_z_ax
    self.kl = logq_z_ax - logp_z
    self.logp = logp_x_z

    self.timing_info = np.array([encoding_time, decoding_time_average])

    batch_size = self.obj_batch.shape[0]

    self.obj = -F.sum(self.obj_batch) / batch_size

    return self.obj
channel_observed = get_normalized_image_variable(t + dt, w) if channel_observed is None: no_image = True continue channel_observeds.append(channel_observed) if no_image: continue img_input = F.concat(channel_inputs) img_observed = F.concat(channel_observeds) img_predicted = predictor(img_input) loss = F.sum(abs(img_predicted - img_observed)) predictor.cleargrads() loss.backward() optimizer_p.update() """ Train the generator and discriminator """ t2 = t no_missing_image = True img_forecast = img_input if epoch >= start_dcgan_at_epoch: for i in range(1, 7): t2 = t + i * dt img_forecast = predictor(img_forecast) channel_futures = []
def loss_gen(self, gen, y_fake):
    """Return the generator loss: the sum of ``-y_fake`` per sample.

    The value is also reported under the key ``'loss'`` for ``gen``.
    """
    n_samples = len(y_fake)
    gen_loss = F.sum(-y_fake) / n_samples
    chainer.reporter.report({'loss': gen_loss}, gen)
    return gen_loss
def forward(self, xs, ys):
    """Compute the training loss for source sequences ``xs`` and target trees ``ys``.

    Each source sequence is reversed, embedded and encoded. Each target is
    a nested node ``(ntype, nchoice, children)`` that is decoded
    recursively: every node runs one decoder step from its parent's state.
    Outputs are collected only for node types that have more than one
    entry in ``self.trans_data``, then passed through attention and
    ``self.Wc`` and scored with softmax cross-entropy.

    Args:
        xs: Source sequences, one per sample.
        ys: Target trees, one per sample; the root type must be
            ``self.type_size - 1``.

    Returns:
        The summed cross-entropy divided by the number of samples. It is
        also reported under the key ``'loss'``.
    """
    batch = len(xs)
    # reverse each source sequence before embedding
    xs = [xp.array(x[::-1]) for x in xs]
    exs = sequence_embed(self.embed_x, xs)
    # None represents a zero vector in an encoder.
    hx, cx, xs_states = self.encoder(None, None, exs)
    # Put the batch axis first so that hx[i] / cx[i] are per-sample states.
    # The last axis is n_units * 2 (presumably a bidirectional encoder --
    # TODO confirm).
    hx = F.reshape(F.transpose(hx, (1, 0, 2)),
                   (batch, self.n_layers, self.n_units * 2))
    cx = F.reshape(F.transpose(cx, (1, 0, 2)),
                   (batch, self.n_layers, self.n_units * 2))
    hx = [d for d in hx]
    cx = [d for d in cx]
    # embed every decoder input index once, up front
    evs = [self.embed_y(xp.array([i])) for i in range(self.embed_y_size)]
    concat_ys_outs = [[] for _ in ys]
    #print(len(ys))
    att_os = []
    for i, (y, hxs) in enumerate(zip(ys, xs_states)):
        # decoder outputs collected for sample i, in depth-first order
        concat_oss = []

        def rec_LSTM(node, eidx, nhxncx):
            # Decode `node` from the parent state `nhxncx`, then recurse into
            # its children with the state produced by this step.
            nonlocal i
            (nhx, ncx) = nhxncx
            ntype, nchoice, children = node
            #eidx = self.embed_idx[ntype][ppos]
            ev = evs[eidx]
            #print(ev.shape)
            #print(nhx.shape,ncx.shape)
            thx, tcx, nos = self.decoder(nhx, ncx, [ev])
            nos = nos[0]
            #wnos = self.W[ntype](nos)
            if len(self.trans_data[ntype]) > 1:
                concat_oss.append(nos)
                concat_ys_outs[i].append(self.choice_idx[ntype][nchoice])
            # otherwise, we don't have to train.
            for j, ch in enumerate(children):
                #print(ntype,nchoice,i)
                teidx = self.embed_idx[ntype][nchoice][j]
                rec_LSTM(ch, teidx, (thx, tcx))

        nhx, ncx = hx[i], cx[i]
        # insert a batch axis of size 1: (layers, units) -> (layers, 1, units)
        ncx = F.reshape(ncx, (ncx.shape[0], 1, ncx.shape[1]))
        nhx = F.reshape(nhx, (nhx.shape[0], 1, nhx.shape[1]))
        assert y[0] == self.type_size - 1
        ridx = self.embed_root_idx
        rec_LSTM(y, ridx, (nhx, ncx))
        #print(concat_oss[0].shape,len(concat_oss))
        yh = F.concat(concat_oss, axis=0)
        #print(yh.shape)
        ch = self.att(xs_states[i], yh)
        att_os.append(F.tanh(self.Wc(F.concat([ch, yh], axis=1))))
    concat_os = F.concat(att_os, axis=0)
    concat_ys_out = list(map(lambda d: xp.array(d), concat_ys_outs))
    concat_ys_out = F.concat(concat_ys_out, axis=0)
    # reduce='no' keeps per-element losses; they are summed and divided by the
    # number of samples, not by the number of decoded nodes
    loss = F.sum(
        F.softmax_cross_entropy(
            self.Ws(concat_os), concat_ys_out, reduce='no')) / batch
    chainer.report({'loss': loss}, self)
    #print(loss)
    return loss