def _length_aware_softmax(e, l0, l1, xp):
    # e: (B, T0, T1)
    bs, t0, t1 = e.shape
    l0 = l0.reshape((bs, 1, 1))
    l1 = l1.reshape((bs, 1, 1))
    # binary masks marking the valid positions along T0 and T1
    mask0 = (xp.tile(xp.arange(t0).reshape(1, t0, 1), (bs, 1, 1)) < l0).astype(e.dtype)
    mask1 = (xp.tile(xp.arange(t1).reshape(1, t1, 1), (bs, 1, 1)) < l1).astype(e.dtype)
    mask = (xp.matmul(mask0, mask1.swapaxes(1, 2))).astype(bool)
    # mask: (B, T0, T1)
    mask = chainer.Variable(mask)
    padding = chainer.Variable(xp.zeros(e.shape, dtype=e.dtype))

    e_max = F.max(e, keepdims=True)
    e_masked = F.where(mask, e, padding)
    e_masked = e_masked - F.broadcast_to(e_max, e.shape)

    e_sum0 = F.reshape(F.logsumexp(e_masked, axis=1), (bs, 1, t1))
    e_sum1 = F.reshape(F.logsumexp(e_masked, axis=2), (bs, t0, 1))

    s1 = F.exp(e_masked - F.broadcast_to(e_sum0, e.shape))
    s2 = F.exp(e_masked - F.broadcast_to(e_sum1, e.shape))
    s1 = F.where(mask, s1, padding)
    s2 = F.where(mask, s2, padding)

    return s1, s2
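# A minimal usage sketch (not from the original project), assuming numpy is passed
# as xp and the scores come from a toy batched attention matrix with per-example
# valid lengths. Entries outside the valid (l0[i], l1[i]) region come back as zeros.
import numpy as np
import chainer
import chainer.functions as F

e = np.random.randn(2, 4, 5).astype(np.float32)   # (B, T0, T1) attention scores
l0 = np.array([3, 4], dtype=np.int32)             # valid lengths along T0
l1 = np.array([5, 2], dtype=np.int32)             # valid lengths along T1
s1, s2 = _length_aware_softmax(e, l0, l1, np)
print(s1.shape, s2.shape)                         # both (2, 4, 5)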
def compute_discriminator_loss(self, image_real, image_fake, image_labeled, label):
    # predict
    prediction_real = self.discriminator(image_real)
    prediction_fake = self.discriminator(image_fake)
    prediction_labeled = self.discriminator(image_labeled)

    # discriminator loss
    prediction_real_lse = cf.logsumexp(prediction_real, axis=1)
    prediction_fake_lse = cf.logsumexp(prediction_fake, axis=1)
    loss_discriminator = (
        0.5 * cf.sum(cf.softplus(prediction_real_lse)) / prediction_real_lse.size +
        0.5 * cf.sum(-prediction_real_lse) / prediction_real_lse.size +
        0.5 * cf.sum(cf.softplus(prediction_fake_lse)) / prediction_fake_lse.size)

    # classifier loss
    loss_classifier = cf.softmax_cross_entropy(prediction_labeled, label)

    loss = loss_discriminator + loss_classifier

    chainer.reporter.report(
        {
            'loss_discriminator': loss_discriminator,
            'loss_classifier': loss_classifier
        }, self)
    return loss
def update_core(self):
    gen_optimizer = self.get_optimizer('gen')
    dis_optimizer = self.get_optimizer('dis')

    label_batch = self.get_iterator('labeled').next()
    x_labeled, true_label = zip(*label_batch)
    x_labeled = self.xp.asarray(x_labeled).astype("f")
    true_label = self.xp.asarray(true_label).astype("i")

    unlabel_batch = self.get_iterator("unlabeled").next()
    batchsize = len(unlabel_batch)
    x_unlabeled = self.xp.asarray(unlabel_batch).astype("f")

    y_with_label = self.dis(x_labeled)
    y_with_unlabel = self.dis(x_unlabeled)
    # real_feature = self.dis.feature.data

    z = self.gen.make_hidden(batchsize)
    x_fake = self.gen(z)
    y_fake = self.dis(x_fake)
    # fake_feature = self.dis.feature

    log_sum_y_fake = F.logsumexp(y_fake, axis=1)
    # loss_gen = F.mean(F.softplus(log_sum_y_fake))
    loss_gen_softplus = F.softplus(log_sum_y_fake)
    loss_gen = -F.mean(log_sum_y_fake - loss_gen_softplus)
    # loss_feature = F.mean_squared_error(fake_feature, real_feature)

    self.gen.cleargrads()
    loss_gen.backward()
    # loss_feature.backward()
    gen_optimizer.update()
    chainer.reporter.report({'gen/loss': loss_gen})
    # chainer.reporter.report({'gen/loss_f': loss_feature})

    log_sum_y_real = F.logsumexp(y_with_unlabel, axis=1)
    loss_classify = self.loss_label(y_with_label, true_label)
    loss_unlabel = log_sum_y_real - F.softplus(log_sum_y_real)

    z = self.gen.make_hidden(batchsize)
    x_fake = self.gen(z)
    y_fake = self.dis(x_fake.data)
    loss_from_gen = F.logsumexp(y_fake, axis=1)
    loss_from_gen = F.softplus(loss_from_gen)
    loss_dis = F.mean(-loss_unlabel + loss_from_gen)

    self.dis.cleargrads()
    loss_dis.backward()
    loss_classify.backward()
    dis_optimizer.update()
    chainer.reporter.report({'dis/loss': loss_dis})
def __call__(self, x):
    batchsize = x.shape[0]

    iwae = IWAEObjective(self.encode, self.decode, self.num_zsamples)
    logw = iwae.compute_logw(x)  # (num_zsamples, batchsize)
    obj_elbo = -iwae.compute_elbo(logw)

    M = self.logB.shape[1]  # number of subsets
    n = self.num_zsamples

    # (n, M, batchsize)
    logw = F.broadcast_to(F.reshape(logw, (n, 1, batchsize)), (n, M, batchsize))
    logB = F.broadcast_to(F.reshape(self.logB, (n, M, 1)), (n, M, batchsize))
    R = F.logsumexp(logw + logB, axis=0)  # (M, batchsize)
    logp = F.matmul(self.A, R, transa=True)  # (batchsize,)

    obj_c = logp - F.broadcast_to(F.mean(logp), logp.shape)
    obj_var = F.sum(obj_c * obj_c) / (batchsize - 1)

    obj = -F.mean(logp)
    reporter.report({
        'obj': obj,
        'obj_var': obj_var,
        'obj_elbo': obj_elbo
    }, self)

    return obj
def angular_mc_loss(f, f_p, alpha=45, in_degree=True):
    '''
    Args:
        f (chainer.Variable or xp.ndarray): Anchor vectors. Each vector in f
            must be l2 normalized.
        f_p (chainer.Variable or xp.ndarray): Positive vectors. Each vector in
            f_p must be l2 normalized.
    '''
    xp = cuda.get_array_module(f)

    if in_degree:
        alpha = np.deg2rad(alpha)
    sq_tan_alpha = np.tan(alpha) ** 2
    n_pairs = len(f)

    # first and second term of f_{a,p,n}
    term1 = 4 * sq_tan_alpha * matmul(f + f_p, transpose(f_p))
    term2 = 2 * (1 + sq_tan_alpha) * F.sum(f * f_p, axis=1, keepdims=True)
    # term2 = 2 * (1 + sq_tan_alpha) * F.batch_matmul(f, f_p, transa=True).reshape(n_pairs, 1)

    f_apn = term1 - F.broadcast_to(term2, (n_pairs, n_pairs))
    # multiply zero to diagonal components of f_apn
    mask = xp.ones_like(f_apn.data) - xp.eye(n_pairs, dtype=f.dtype)
    f_apn = f_apn * mask

    return F.average(F.logsumexp(f_apn, axis=1))
def check_forward(self, x_data, axis=None):
    x = chainer.Variable(x_data)
    y = functions.logsumexp(x, axis=axis)
    self.assertEqual(y.data.dtype, self.dtype)
    y_expect = numpy.log(numpy.exp(self.x).sum(axis=axis))
    gradient_check.assert_allclose(
        y_expect, y.data, **self.check_forward_option)
def check_forward(self, x_data, axis=None):
    x = chainer.Variable(x_data)
    y = functions.logsumexp(x, axis=axis)
    self.assertEqual(y.data.dtype, self.dtype)
    y_expect = numpy.log(numpy.exp(self.x).sum(axis=axis))
    testing.assert_allclose(
        y_expect, y.data, **self.check_forward_option)
def proxy_nca_loss(x, proxy, labels):
    """Proxy-NCA loss function.

    Args:
        x (:class:`~chainer.Variable`):
            L2 normalized anchor points whose shape is (B, D), where B is the
            batch size and D is the number of dimensions of feature vector.
        proxy (:class:`~chainer.Variable` or :class:`~chainer.Parameter`):
            Proxies whose shape is (K, D), where K is the number of classes
            in the dataset.
        labels (:class:`numpy.ndarray`):
            Class labels associated to x. The shape is (B,) and dtype is int.
            Note that the class IDs must be 0, 1, ..., K-1.

    Returns:
        :class:`~chainer.Variable`: Loss value.

    See: `No Fuss Distance Metric Learning using Proxies \
        <http://openaccess.thecvf.com/content_ICCV_2017/papers/\
        Movshovitz-Attias_No_Fuss_Distance_ICCV_2017_paper.pdf>`_
    """
    proxy = F.normalize(proxy)
    distance = squared_distance_matrix(x, proxy)
    d_posi = distance[np.arange(len(x)), labels]

    # For each row, remove one element corresponding to the positive distance
    B, K = distance.shape  # batch size and the number of classes
    mask = np.tile(np.arange(K), (B, 1)) != labels[:, None]
    d_nega = distance[mask].reshape(B, K - 1)

    log_denominator = F.logsumexp(-d_nega, axis=1)
    loss = d_posi + log_denominator
    return F.average(loss)
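# A toy usage sketch, not from the original repository. squared_distance_matrix is
# not defined in the snippet above, so a hypothetical stand-in is given here; it is
# only assumed to return the (B, K) matrix of squared Euclidean distances between
# rows of x and rows of proxy.
import numpy as np
import chainer.functions as F

def squared_distance_matrix(x, proxy):
    B, D = x.shape
    K = proxy.shape[0]
    x_ = F.broadcast_to(F.reshape(x, (B, 1, D)), (B, K, D))
    p_ = F.broadcast_to(F.reshape(proxy, (1, K, D)), (B, K, D))
    return F.sum((x_ - p_) ** 2, axis=2)

# Random, L2-normalized anchors against random proxies.
B, K, D = 8, 5, 16
x = F.normalize(np.random.randn(B, D).astype(np.float32))
proxy = np.random.randn(K, D).astype(np.float32)
labels = np.random.randint(0, K, size=B)
print(proxy_nca_loss(x, proxy, labels).data)  # scalar loss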
def calculate_gaussian_loss(self, y, t):
    xp = chainer.cuda.get_array_module(t)
    if xp != numpy:
        xp.cuda.Device(t.device).use()
    nr_mix = y.shape[1] // 3

    logits = y[:, :nr_mix]
    means = y[:, nr_mix:2 * nr_mix]
    log_scales = y[:, 2 * nr_mix:3 * nr_mix]
    log_scales = F.maximum(
        log_scales, self.scalar_to_tensor(log_scales, self.log_scale_min))

    t = F.broadcast_to(t, means.shape)

    distribution = chainer.distributions.Normal(
        means, log_scale=log_scales)
    cdf_plus = distribution.cdf(t + 1 / (self.quantize - 1))
    cdf_min = distribution.cdf(t - 1 / (self.quantize - 1))
    probs = cdf_plus - cdf_min
    probs = F.maximum(probs, self.scalar_to_tensor(probs, 1e-12))

    if nr_mix == 1:
        loss = -F.mean(F.log(probs))
    else:
        log_probs = F.log_softmax(logits) + F.log(probs)
        loss = -F.mean(F.logsumexp(log_probs, axis=1))
    return loss
def _elementwise_softmax_cross_entropy(x, t):
    assert x.shape[:-1] == t.shape
    p = F.reshape(
        F.select_item(F.reshape(x, (-1, x.shape[-1])), F.flatten(t)),
        t.shape)
    return F.logsumexp(x, axis=-1) - p
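# A small numpy check with made-up logits (not part of the model code): the value
# returned above, logsumexp(x) - x[t], is exactly the elementwise softmax cross
# entropy -log softmax(x)[t].
import numpy as np

x = np.random.randn(3, 7).astype(np.float32)          # (..., n_class) logits
t = np.array([2, 0, 6])                                # target class per element
lse = np.log(np.exp(x).sum(axis=-1))
picked = x[np.arange(len(t)), t]
softmax = np.exp(x) / np.exp(x).sum(axis=-1, keepdims=True)
reference = -np.log(softmax[np.arange(len(t)), t])
print(np.allclose(lse - picked, reference))            # True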
def check_double_backward(self, x_data, y_grad, x_grad_grad, axis=None):
    gradient_check.check_double_backward(
        lambda x: functions.logsumexp(x, axis), x_data, y_grad, x_grad_grad,
        dtype=numpy.float64, **self.check_double_backward_option)
def compute(self, observations):
    q_values = self.q_model(observations)
    values_out = F.expand_dims(
        self._tau * F.logsumexp(q_values / self._tau, axis=1), axis=1)
    values = F.broadcast_to(values_out, q_values.shape)
    pi_out = (q_values - values) / self._tau
    return pi_out, values_out
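# A toy numpy illustration with made-up Q-values (independent of q_model): the soft
# value tau * logsumexp(Q / tau) used above upper-bounds max(Q) and approaches it as
# tau shrinks, while (Q - V) / tau are the log-probabilities of the Boltzmann policy.
import numpy as np

q = np.array([[1.0, 2.0, 3.0]])
for tau in (1.0, 0.1, 0.01):
    z = q / tau
    m = z.max(axis=1, keepdims=True)
    v = tau * (m[:, 0] + np.log(np.exp(z - m).sum(axis=1)))   # stable logsumexp
    print(tau, v, q.max(axis=1))                              # v -> max(q) as tau -> 0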
def __call__(self, x):
    # Obtain parameters for q(z|x)
    encoding_time = time.time()
    self.encode(x)
    encoding_time = float(time.time() - encoding_time)

    decoding_time_average = 0.

    xp = cuda.cupy
    self.importance_weights = 0
    self.w_holder = []
    self.kl = 0
    self.logp = 0

    for j in xrange(self.num_zsamples):
        # Sample z ~ q(z|x)
        z = F.gaussian(self.qmu, self.qln_var)

        # Compute log q(z|x)
        encoder_log = gaussian_logp(z, self.qmu, self.qln_var)

        # Obtain parameters for p(x|z)
        decoding_time = time.time()
        self.decode(z)
        decoding_time = time.time() - decoding_time
        decoding_time_average += decoding_time

        # Compute log p(x|z)
        decoder_log = gaussian_logp(x, self.pmu, self.pln_var)

        # Compute log p(z). The odd notation being used is to supply a mean of 0
        # and covariance of 1.
        prior_log = gaussian_logp(z, self.qmu * 0, self.qln_var / self.qln_var)

        # Store the latest log weight
        current_temperature = min(self.temperature['value'], 1.0)
        self.w_holder.append(decoder_log + current_temperature * (prior_log - encoder_log))

        # Store the KL and logp equivalents. They are not used for computation
        # but for recording and reporting.
        self.kl += (encoder_log - prior_log)
        self.logp += (decoder_log)

    self.temperature['value'] += self.temperature['increment']

    # Compute w' for this sample (batch)
    logps = F.stack(self.w_holder)
    self.obj_batch = F.logsumexp(logps, axis=0) - np.log(self.num_zsamples)
    self.kl /= self.num_zsamples
    self.logp /= self.num_zsamples
    decoding_time_average /= self.num_zsamples

    batch_size = self.obj_batch.shape[0]
    self.obj = -F.sum(self.obj_batch) / batch_size
    self.timing_info = np.array([encoding_time, decoding_time_average])

    return self.obj
def process_batch(self, x, y, one_hot, last_joint, return_sample=False, return_mean=True):
    xp = cuda.cupy
    x = F.dropout(x, ratio=0.5)
    x = F.concat((one_hot, x), axis=-1)
    x = self.ln1_(x)
    h0 = self.l1_(x)

    h = F.dropout(h0, ratio=0.5)
    h1 = self.ln2_(F.concat((x, h), axis=1))
    h1 = self.l2_(h1)

    h = F.dropout(h1, ratio=0.5)
    h2 = self.ln3_(F.concat((x, h), axis=1))
    h2 = self.l3_(h2)

    final_h = F.concat((h0, h1, h2), axis=-1)

    mu = self.mu_(final_h)
    mu = F.reshape(mu, (-1, self.future_out_dim, self.NUM_MIXTURE))
    sigma_orig = self.sigma_(final_h)
    mixing_orig = self.mixing_(final_h)
    sigma = F.softplus(sigma_orig)
    mixing = F.softmax(mixing_orig)

    y = F.expand_dims(y, axis=2)
    y_broad = F.broadcast_to(y, mu.shape)
    normalizer = 2 * np.pi * sigma
    exponent = -0.5 * (1. / F.square(sigma)) * F.sum(
        (y_broad - mu) ** 2, axis=1) + F.log(
            mixing) - (self.future_out_dim * 0.5) * F.log(normalizer)
    cost = -F.logsumexp(exponent, axis=1)
    cost = F.mean(cost)

    # sampling
    if return_sample:
        mixing = mixing_orig * (1 + self.sampling_bias)
        sigma = F.softplus(sigma_orig - self.sampling_bias)
        mixing = F.softmax(mixing)
        argmax_mixing = F.argmax(mixing, axis=1)
        mixing_one_hot = xp.zeros(mixing.shape, dtype=xp.float32)
        mixing_one_hot[xp.arange(mixing.shape[0]), argmax_mixing.data] = 1
        component_expanded = F.broadcast_to(
            F.expand_dims(mixing_one_hot, axis=1), mu.shape)
        component_mean = F.sum(mu * component_expanded, axis=2)

        if return_mean:
            return cost, component_mean

        # std of the selected mixture component
        component_std = F.sum(sigma * mixing_one_hot, axis=1, keepdims=True)
        component_std = F.broadcast_to(component_std, component_mean.shape)
        sample = xp.random.normal(component_mean.data, component_std.data)
        return cost, sample

    return cost, None
def log_Z(self, x, y):
    A0 = self.unary_potential(
        x, Variable(-np.ones_like(x.data, dtype=np.float32)))
    I0 = self.site_I(x, y, -1)
    A1 = self.unary_potential(
        x, Variable(np.ones_like(x.data, dtype=np.float32)))
    I1 = self.site_I(x, y, 1)
    return F.logsumexp(F.concat((A0 + I0, A1 + I1), axis=0), axis=0)
def mixture_of_discretized_logistics_nll(x, y):
    """
    Args:
        x: (b, c, n, n)
        y: (b, 10*n_mix, n, n)
    """
    xp = get_array_module(x)
    n_mix = y.shape[1] // 10
    logit_prob = y[:, :n_mix, :, :]
    y = F.reshape(y[:, n_mix:, :, :], x.shape + (n_mix * 3, ))
    mean = y[:, :, :, :, 0:n_mix]
    log_scale = y[:, :, :, :, n_mix:2 * n_mix]
    log_scale = F.maximum(log_scale, -7 * xp.ones(log_scale.shape, dtype='f'))
    coeff = F.tanh(y[:, :, :, :, 2 * n_mix:3 * n_mix])

    x = xp.repeat(xp.expand_dims(x, 4), n_mix, 4)
    m1 = F.expand_dims(mean[:, 0, :, :, :], 1)
    m2 = F.expand_dims(
        mean[:, 1, :, :, :] + coeff[:, 0, :, :, :] * x[:, 0, :, :, :], 1)
    m3 = F.expand_dims(
        (mean[:, 2, :, :, :] + coeff[:, 1, :, :, :] * x[:, 0, :, :, :] +
         coeff[:, 2, :, :, :] * x[:, 1, :, :, :]), 1)
    mean = F.concat([m1, m2, m3])

    centered_x = x - mean
    inv_std = F.exp(-log_scale)
    max_in = inv_std * (centered_x + 1. / 255.)
    cdf_max = F.sigmoid(max_in)
    min_in = inv_std * (centered_x - 1. / 255.)
    cdf_min = F.sigmoid(min_in)
    log_cdf_max = max_in - F.softplus(max_in)  # 0
    log_one_minus_cdf_min = -F.softplus(min_in)  # 255
    cdf_delta = cdf_max - cdf_min  # 0 ~ 255
    mid_in = inv_std * centered_x
    log_pdf_mid = mid_in - log_scale - 2. * F.softplus(mid_in)  # mid

    log_prob = F.where(
        x < -0.999, log_cdf_max,
        F.where(
            x > 0.999, log_one_minus_cdf_min,
            F.where(
                cdf_delta.array > 1e-5,
                F.log(
                    F.maximum(cdf_delta,
                              xp.ones(cdf_delta.shape, dtype='f') * 1e-12)),
                log_pdf_mid - xp.log(127.5))))

    log_prob = F.transpose(F.sum(log_prob, 1), (0, 3, 1, 2))
    log_prob = log_prob + log_prob_from_logit(logit_prob)

    loss = F.logsumexp(log_prob, 1)
    loss = F.sum(loss, axis=(1, 2))
    return -F.mean(loss)
def _instantaneousLoss(self, proj_array, input_labels, ind):
    """
    Instantaneous MI estimate between projected features and labels based on
    non-parametric density estimates

    INPUT:
        proj_array   - Variable of projected features array [num_trials x output_dim]
        input_labels - input feature labels array [num_trials x 1]
        ind          - index of the instantaneous feature for mutual information estimation

    OUTPUT:
        Variable of negated instantaneous mutual information
    """
    # Empirical class prior estimates
    num_classes = np.max(input_labels) + 1
    num_samples = len(input_labels) - 1
    obs_labels = [np.where(np.delete(input_labels, ind) == c)[0]
                  for c in range(num_classes)]
    priors = [len(obs_labels[c]) / num_samples for c in range(num_classes)]

    # Class conditional kernel density estimate value components
    constants, energies, lse_energies = [], [], []
    for c in range(num_classes):
        const, energy = self._kdeparts(proj_array[obs_labels[c]], proj_array[ind])
        constants.append(const)
        energies.append(energy)
        lse_energies.append(F.logsumexp(energy).data)

    # Use the maximum logsumexp(energy) across classes for the exp-normalize trick
    max_index = lse_energies.index(max(lse_energies))
    joint_prob = [priors[c] * constants[c] *
                  F.exp(F.logsumexp(energies[c]) - F.logsumexp(energies[max_index]))
                  for c in range(num_classes)]

    # Calculate entropy and conditional entropy for the stochastic MI estimate
    conditional_entropy_parts = []
    entropy_parts = []
    for c in range(num_classes):
        c_given_y = joint_prob[c] / sum(joint_prob)
        conditional_entropy_parts.append(
            c_given_y * (F.log(constants[c]) + F.logsumexp(energies[c])))
        entropy_parts.append(priors[c] * constants[c] * F.exp(F.logsumexp(energies[c])))
    conditional_entropy = sum(conditional_entropy_parts)
    entropy = F.log(sum(entropy_parts))

    return entropy - conditional_entropy
def __call__(self, x, y):
    h = self.l1_(x)
    # h2 = self.l2_(h1)
    # return F.mean_squared_error(h2, y)
    sigma = F.softplus(self.sigma_(h))
    mixing = F.softmax(self.mixing_(h))
    mu = F.reshape(self.mu_(h), (-1, self.OUT_DIM, self.NUM_MIXTURE))
    mu, y = F.broadcast(mu, F.reshape(y, (-1, self.OUT_DIM, 1)))

    exponent = -0.5 * (1. / sigma) * F.sum((y - mu) ** 2, axis=1)
    normalizer = 2 * np.pi * sigma
    exponent = exponent + F.log(mixing) - (self.OUT_DIM * .5) * F.log(normalizer)
    cost = -F.logsumexp(exponent)
    return cost
def get_elbo(self, x, k=None, with_ll=False):
    if not k:
        k = self.k
    q_z = self.encoder(x)
    z = q_z.sample(k)
    p_x = self.decoder(z, n_batch_axes=2)
    p_z = self.prior()

    reconstr = p_x.log_prob(F.broadcast_to(x[None, :], (k, ) + x.shape))
    kl_penalty = q_z.log_prob(z) - p_z.log_prob(z)

    elbo_k = reconstr - kl_penalty
    elbo = F.mean(elbo_k)
    if with_ll:
        log_likelihood = F.mean(F.logsumexp(elbo_k, axis=0) - numpy.log(k))
        return elbo, log_likelihood
    else:
        return elbo
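# A toy numpy illustration with made-up per-sample bounds (not tied to the encoder
# or decoder above): the importance-weighted estimate logsumexp(elbo_k) - log(k) is
# never below the plain ELBO mean(elbo_k), by Jensen's inequality.
import numpy as np

elbo_k = np.array([-90.0, -95.0, -100.0])
k = len(elbo_k)
m = elbo_k.max()
log_likelihood = m + np.log(np.exp(elbo_k - m).sum()) - np.log(k)
print(log_likelihood, elbo_k.mean(), log_likelihood >= elbo_k.mean())  # ..., True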
def __call__(self, x):
    batchsize = x.shape[0]

    iwae = IWAEObjective(self.encode, self.decode, self.num_zsamples)
    logw = iwae.compute_logw(x)  # (num_zsamples, batchsize)
    obj_elbo = -iwae.compute_elbo(logw)

    # Jackknife bias corrected logp estimate
    A = F.logsumexp(logw, axis=0)
    logp_iwae = A - math.log(self.num_zsamples)
    logp_iwae = F.sum(logp_iwae) / batchsize

    k = float(self.num_zsamples)
    wnorm = F.exp(logw - F.broadcast_to(A, logw.shape))
    # wmax = F.max(wnorm)
    # print wmax
    # ess = F.sum(1.0 / F.sum(wnorm*wnorm, axis=0)) / batchsize

    # B = F.sum(F.log1p(-F.exp(logw - F.broadcast_to(A, logw.shape))), axis=0)
    # print logw
    B = F.sum(numfun.log1mexp(logw - F.broadcast_to(A, logw.shape) - 1.0e-6), axis=0)
    # print B
    logp_jk = A - ((k - 1) / k) * B - k * math.log(k) + (k - 1) * math.log(k - 1)
    logp_jk_mean = F.sum(logp_jk) / batchsize

    obj = -logp_jk_mean
    correction = logp_jk_mean - logp_iwae

    # Variance computation
    obj_c = logp_jk - F.broadcast_to(logp_jk_mean, logp_jk.shape)
    obj_var = F.sum(obj_c * obj_c) / (batchsize - 1)

    reporter.report(
        {
            'obj': obj,
            'obj_var': obj_var,
            'obj_elbo': obj_elbo,
            'corr': correction
        }, self)

    return obj
def ais(decoder, X, M=32, T=100, steps=10, stepsize=0.1, sigma=1.0, encoder=None):
    xp = cupy.get_array_module(X)
    batchsize = X.shape[0]  # number of samples in X
    nz = decoder.nz  # number of latent dimensions

    # Sample initial z and initialize log weights
    if encoder == None:
        print "Using p(z)"
        zprior = ZPrior(nz)
    else:
        print "Using q(z|x)"
        zprior = ZEncoder(encoder, X)

    Z = zprior.sample(X, M)
    # logw = xp.zeros((M*batchsize,))
    logw = zprior.initial_logw(X, Z)

    for t in xrange(2, T + 1):
        efun_cur = EnergyFunction(zprior, decoder, X, ais_beta_sigmoid(t, T))
        efun_prev = EnergyFunction(zprior, decoder, X, ais_beta_sigmoid(t - 1, T))
        accept_rate = leapfrog(efun_cur, Z, n=steps, leapfrog_eps=stepsize,
                               moment_sigma=sigma)
        if t % 100 == 0:
            print "AIS t=%d accept rate %.3f" % (t, accept_rate)

        logw += efun_prev(Z).data - efun_cur(Z).data

    logw = F.reshape(logw, (M, batchsize))
    logZ = F.logsumexp(logw, axis=0) - math.log(M)

    return logZ
def mellowmax(values, omega=1., axis=1):
    """Mellowmax function.

    This is a kind of softmax function that is, unlike the Boltzmann softmax,
    non-expansion.

    See: http://arxiv.org/abs/1612.05628

    Args:
        values (Variable or ndarray): Input values. Mellowmax is taken along
            the second axis.
        omega (float): Parameter of mellowmax.
        axis (int): Axis along which mellowmax is taken.

    Returns:
        outputs (Variable)
    """
    n = values.shape[axis]
    return (F.logsumexp(omega * values, axis=axis) - np.log(n)) / omega
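# A minimal usage sketch with made-up values (it assumes numpy is imported as np in
# mellowmax's module, as the function body requires): for small omega the result is
# close to the row mean, and for large omega it approaches the row max.
import numpy as np

values = np.array([[1.0, 2.0, 3.0],
                   [0.0, 0.0, 10.0]], dtype=np.float32)
for omega in (0.1, 1.0, 100.0):
    print(omega, mellowmax(values, omega=omega).data)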
def loss(self, x_loc, x_conf, t_loc, t_conf):
    xp = cuda.get_array_module(x_loc.data)
    pos = (t_conf.data > 0).flatten()
    if xp.logical_not(pos).all():
        return 0, 0

    x_loc = F.reshape(x_loc, (-1, 4))
    t_loc = F.reshape(t_loc, (-1, 4))
    loss_loc = F.huber_loss(x_loc, t_loc, 1)
    loss_loc = F.where(pos, loss_loc, xp.zeros_like(loss_loc.data))
    loss_loc = F.sum(loss_loc) / pos.sum()

    hard_neg = self.mine_hard_negative(x_conf, t_conf).flatten()
    x_conf = F.reshape(x_conf, (-1, self.n_class + 1))
    t_conf = F.flatten(t_conf)
    loss_conf = F.logsumexp(x_conf, axis=1) - F.select_item(x_conf, t_conf)
    loss_conf = F.where(
        xp.logical_or(pos, hard_neg), loss_conf,
        xp.zeros_like(loss_conf.data))
    loss_conf = F.sum(loss_conf) / pos.sum()

    return loss_loc, loss_conf
def get_loss(self, x_t, y_t):
    x_t = Variable(x_t)
    y_t = Variable(y_t)
    y_t = F.reshape(y_t, (1, y_t.shape[0]))

    # normalize output score to avoid divergence
    y_t = F.normalize(y_t)
    self.model.zerograds()
    pred = self.model(x_t)

    # ---- start loss calculation ----
    pred = F.reshape(pred, (pred.shape[1], pred.shape[0]))
    p_true = F.softmax(F.reshape(y_t, (y_t.shape[0], y_t.shape[1])))
    xm = F.max(pred, axis=1, keepdims=True)
    logsumexp = F.logsumexp(pred, axis=1)
    # xm_broadcast = F.broadcast_to(xm, (xm.shape[0], pred.shape[1]))
    # logsumexp = F.reshape(xm, (xm.shape[0],)) + F.log(F.sum(F.exp(pred - xm_broadcast), axis=1))
    logsumexp = F.broadcast_to(logsumexp, (xm.shape[0], pred.shape[1]))
    loss = -1 * F.sum(p_true * (pred - logsumexp))

    trainres = ndcg(y_t.data, pred.data, self.n_thres_cand)  # ,nthres)
    if np.isnan(trainres):
        print y_t.data.max(), y_t.data.min()

    return loss, trainres
def __call__(self, x):
    # Compute q(z|x)
    qmu, qln_var = self.encode(x)
    batchsize = qmu.data.shape[0]

    # Perform unnormalized importance sampling
    logw = list()
    logpxz = list()
    for i in xrange(self.num_zsamples):
        # z ~ q(z|x)
        z = F.gaussian(qmu, qln_var)
        logqz = gaussian_logp_inst(z, qmu, qln_var)
        logpz = gaussian_logp01_inst(z)

        # Compute p(x|z)
        pxz = self.decode(z)
        logpxz_i = pxz(x)
        logpxz.append(logpxz_i)

        logw_i = logpz + logpxz_i - logqz
        logw.append(logw_i)

    # Self-normalize importance weights
    logw = F.stack(logw)  # (num_zsamples, batchsize)
    lse = F.logsumexp(logw, axis=0)
    logw -= F.broadcast_to(lse, logw.shape)
    w = F.exp(logw)

    # Compute effective sample size
    ess = F.sum(1.0 / F.sum(w * w, axis=0)) / batchsize

    logpxz = F.stack(logpxz)  # (num_zsamples, batchsize)

    # XXX: break dependency in computational graph
    w = chainer.Variable(w.data)
    obj = -F.sum(w * logpxz) / batchsize

    reporter.report({'obj': obj, 'ess': ess}, self)

    return obj
def __call__(self, x):
    batchsize = x.shape[0]
    logw = self.compute_logw(x)

    # IWAE = log (1/k) sum_i w_i
    logp = F.logsumexp(logw, axis=0) - math.log(self.num_zsamples)
    logp_mean = F.sum(logp) / batchsize
    obj = -logp_mean

    # Variance computation
    obj_c = logp - F.broadcast_to(logp_mean, logp.shape)
    obj_var = F.sum(obj_c * obj_c) / (batchsize - 1)

    obj_elbo = -self.compute_elbo(logw)

    reporter.report({
        'obj': obj,
        'obj_var': obj_var,
        'obj_elbo': obj_elbo
    }, self)

    return obj
def forward(self, inputs, device):
    x, = inputs
    return functions.logsumexp(x, axis=self.axis),
def test_invalid_axis_type(self):
    with self.assertRaises(TypeError):
        functions.logsumexp(self.x, [0])
def check_backward(self, x_data, y_grad, axis=None):
    gradient_check.check_backward(
        lambda x: functions.logsumexp(x, axis), x_data, y_grad,
        **self.check_backward_option)
def test_invalid_axis_type_in_tuple(self):
    with self.assertRaises(TypeError):
        functions.logsumexp(self.x, (1, 'x'))
def train(self, states, actions, action_logprobs, next_states, targets, xp, gamma=1):
    """
    Return the loss function of the discriminator to be optimized.
    As with any discriminator, we only want to discriminate the expert from the
    learner, so this is a binary classification problem. Unlike the discriminator
    used for GAIL, the discriminator in this class takes a specific form:

                        exp{f(s, a)}
        D(s, a) = -------------------------
                   exp{f(s, a)} + \pi(a|s)
    """
    # Create state-action pairs. Remember that both the agent's and the expert's
    # s-a pairs are in the same list.
    state_action = []
    for state, action in zip(states, actions):
        action = np.array([1, 0]) if action == 0 else np.array([0, 1])
        array = np.append(state, action)
        state_action.append(array.reshape((-1, 1)))

    # Get rewards for all s-a pairs
    rewards = self.reward_net(
        xp.asarray([s_a.T.astype('float32') for s_a in state_action])).data
    # Get values for current states
    current_values = self.value_net(
        xp.asarray([s.T.astype('float32') for s in states])).data
    # Get values for next states
    next_values = self.value_net(
        xp.asarray([s.T.astype('float32') for s in next_states])).data

    # Define log p_tau(a|s) = r + gamma * V(s') - V(s)
    log_p_tau = rewards + gamma * next_values - current_values

    # log_q_tau = logprobs(pi(s)) = logprobs(a) calculated by the policy net given
    # a state; action_logprobs contains probs for both expert and agent
    log_q_tau = action_logprobs.reshape((-1, 1))

    # Concatenate the rewards from the discriminator and the action probs from the
    # policy net to compute the sum. After concatenation, log_pq has size N * 2.
    log_pq = np.concatenate((log_p_tau, log_q_tau), axis=1)

    # logsumexp: for each row we take log(sum(exp(val))), so that we add together
    # probabilities and then go back to logprobs
    log_pq = F.logsumexp(log_pq, axis=1).data.reshape((-1, 1))

    # Calculate D
    discrim_output = F.exp(log_p_tau - log_pq)

    # Calculate cross entropy loss
    loss = F.sigmoid_cross_entropy(log_p_tau - log_pq,
                                   np.ones_like(log_p_tau).astype(np.int32))
    loss += F.sigmoid_cross_entropy(log_q_tau - log_pq,
                                    np.zeros_like(log_q_tau).astype(np.int32))
    # loss = targets*(log_p_tau-log_pq) + (1-targets)*(log_q_tau-log_pq)  # long run ozzy 3/3
    # loss = -F.mean(loss)

    self.reward_net.cleargrads()
    self.value_net.cleargrads()
    loss.backward()
    self.reward_optimizer.update()
    self.value_optimizer.update()

    return loss
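# A small numpy sanity check with made-up numbers (not part of the training code):
# forming D through the logsumexp of [f, log pi] as above is just the numerically
# stable way of evaluating D = exp(f) / (exp(f) + pi(a|s)).
import numpy as np

f = np.array([-2.0, 0.5, 3.0])            # log_p_tau = r + gamma * V(s') - V(s)
log_pi = np.array([-1.0, -0.3, -2.0])     # log_q_tau = log pi(a|s)
log_pq = np.logaddexp(f, log_pi)          # logsumexp over each (f, log pi) pair
d_stable = np.exp(f - log_pq)
d_naive = np.exp(f) / (np.exp(f) + np.exp(log_pi))
print(np.allclose(d_stable, d_naive))     # True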
def main():
    images = load_rgb_images(args.image_dir)

    # config
    discriminator_config = gan.config_discriminator
    generator_config = gan.config_generator

    # settings
    max_epoch = 1000
    num_updates_per_epoch = 500
    batchsize_true = 128
    batchsize_fake = 128
    plot_interval = 5

    # seed
    np.random.seed(args.seed)
    if args.gpu_device != -1:
        cuda.cupy.random.seed(args.seed)

    # init weightnorm layers
    if discriminator_config.use_weightnorm:
        print "initializing weight normalization layers of the discriminator ..."
        x_true = sample_from_data(images, batchsize_true)
        gan.discriminate(x_true)
    if generator_config.use_weightnorm:
        print "initializing weight normalization layers of the generator ..."
        gan.generate_x(batchsize_fake)

    # training
    progress = Progress()
    for epoch in xrange(1, max_epoch + 1):
        progress.start_epoch(epoch, max_epoch)
        sum_loss_unsupervised = 0
        sum_loss_adversarial = 0
        sum_dx_unlabeled = 0
        sum_dx_generated = 0

        for t in xrange(num_updates_per_epoch):
            # sample data
            x_true = sample_from_data(images, batchsize_true)
            x_fake = gan.generate_x(batchsize_fake)
            x_fake.unchain_backward()

            # unsupervised loss
            # D(x) = Z(x) / {Z(x) + 1}, where Z(x) = \sum_{k=1}^K exp(l_k(x))
            # softplus(x) := log(1 + exp(x))
            # logD(x) = logZ(x) - log(Z(x) + 1)
            #         = logZ(x) - log(exp(log(Z(x))) + 1)
            #         = logZ(x) - softplus(logZ(x))
            # 1 - D(x) = 1 / {Z(x) + 1}
            # log{1 - D(x)} = log1 - log(Z(x) + 1)
            #               = -log(exp(log(Z(x))) + 1)
            #               = -softplus(logZ(x))
            log_zx_u, activations_u = gan.discriminate(x_true, apply_softmax=False)
            log_dx_u = log_zx_u - F.softplus(log_zx_u)
            dx_u = F.sum(F.exp(log_dx_u)) / batchsize_true
            loss_unsupervised = -F.sum(log_dx_u) / batchsize_true  # minimize negative logD(x)

            py_x_g, _ = gan.discriminate(x_fake, apply_softmax=False)
            log_zx_g = F.logsumexp(py_x_g, axis=1)
            loss_unsupervised += F.sum(F.softplus(log_zx_g)) / batchsize_true  # minimize negative log{1 - D(x)}

            # update discriminator
            gan.backprop_discriminator(loss_unsupervised)

            sum_loss_unsupervised += float(loss_unsupervised.data)
            sum_dx_unlabeled += float(dx_u.data)

            # generator loss
            x_fake = gan.generate_x(batchsize_fake)
            log_zx_g, activations_g = gan.discriminate(x_fake, apply_softmax=False)
            log_dx_g = log_zx_g - F.softplus(log_zx_g)
            dx_g = F.sum(F.exp(log_dx_g)) / batchsize_fake
            loss_generator = -F.sum(log_dx_g) / batchsize_true  # minimize negative logD(x)

            # feature matching
            if discriminator_config.use_feature_matching:
                features_true = activations_u[-1]
                features_true.unchain_backward()
                if batchsize_true != batchsize_fake:
                    x_fake = gan.generate_x(batchsize_true)
                    _, activations_g = gan.discriminate(x_fake, apply_softmax=False)
                features_fake = activations_g[-1]
                loss_generator += F.mean_squared_error(features_true, features_fake)

            # update generator
            gan.backprop_generator(loss_generator)

            sum_loss_adversarial += float(loss_generator.data)
            sum_dx_generated += float(dx_g.data)
            if t % 10 == 0:
                progress.show(t, num_updates_per_epoch, {})

        gan.save(args.model_dir)

        progress.show(
            num_updates_per_epoch, num_updates_per_epoch, {
                "loss_u": sum_loss_unsupervised / num_updates_per_epoch,
                "loss_g": sum_loss_adversarial / num_updates_per_epoch,
                "dx_u": sum_dx_unlabeled / num_updates_per_epoch,
                "dx_g": sum_dx_generated / num_updates_per_epoch,
            })

        if epoch % plot_interval == 0 or epoch == 1:
            plot(filename="epoch_{}_time_{}min".format(
                epoch, progress.get_total_time()))
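# A quick numpy check of the identity used in the comment block above, on made-up
# logits: with Z(x) = sum_k exp(l_k(x)) and D(x) = Z(x) / (Z(x) + 1),
# log D(x) = logZ(x) - softplus(logZ(x)) and log(1 - D(x)) = -softplus(logZ(x)).
import numpy as np

def softplus(a):
    return np.log1p(np.exp(a))

logits = np.array([[0.3, -1.2, 2.0],
                   [1.5, 0.0, -0.7]])
log_z = np.log(np.exp(logits).sum(axis=1))
d = np.exp(log_z) / (np.exp(log_z) + 1.0)
print(np.allclose(np.log(d), log_z - softplus(log_z)))    # True
print(np.allclose(np.log(1.0 - d), -softplus(log_z)))     # True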
def test_duplicate_axis(self):
    with self.assertRaises(ValueError):
        functions.logsumexp(self.x, (0, 0))
def main():
    # load MNIST images
    images, labels = dataset.load_train_images()

    # config
    discriminator_config = gan.config_discriminator
    generator_config = gan.config_generator

    # settings
    # _l -> labeled
    # _u -> unlabeled
    # _g -> generated
    max_epoch = 1000
    num_trains_per_epoch = 500
    plot_interval = 5
    batchsize_l = 100
    batchsize_u = 100
    batchsize_g = batchsize_u

    # seed
    np.random.seed(args.seed)
    if args.gpu_device != -1:
        cuda.cupy.random.seed(args.seed)

    # save validation accuracy per epoch
    csv_results = []

    # create semi-supervised split
    num_validation_data = 10000
    num_labeled_data = args.num_labeled
    if batchsize_l > num_labeled_data:
        batchsize_l = num_labeled_data

    training_images_l, training_labels_l, training_images_u, validation_images, validation_labels = dataset.create_semisupervised(
        images,
        labels,
        num_validation_data,
        num_labeled_data,
        discriminator_config.ndim_output,
        seed=args.seed)
    print training_labels_l

    # training
    progress = Progress()
    for epoch in xrange(1, max_epoch):
        progress.start_epoch(epoch, max_epoch)
        sum_loss_supervised = 0
        sum_loss_unsupervised = 0
        sum_loss_adversarial = 0
        sum_dx_labeled = 0
        sum_dx_unlabeled = 0
        sum_dx_generated = 0

        gan.update_learning_rate(get_learning_rate_for_epoch(epoch))

        for t in xrange(num_trains_per_epoch):
            # sample from data distribution
            images_l, label_onehot_l, label_ids_l = dataset.sample_labeled_data(
                training_images_l,
                training_labels_l,
                batchsize_l,
                discriminator_config.ndim_input,
                discriminator_config.ndim_output,
                binarize=False)
            images_u = dataset.sample_unlabeled_data(
                training_images_u,
                batchsize_u,
                discriminator_config.ndim_input,
                binarize=False)
            images_g = gan.generate_x(batchsize_g)
            images_g.unchain_backward()

            # supervised loss
            py_x_l, activations_l = gan.discriminate(images_l, apply_softmax=False)
            loss_supervised = F.softmax_cross_entropy(py_x_l, gan.to_variable(label_ids_l))

            log_zx_l = F.logsumexp(py_x_l, axis=1)
            log_dx_l = log_zx_l - F.softplus(log_zx_l)
            dx_l = F.sum(F.exp(log_dx_l)) / batchsize_l

            # unsupervised loss
            # D(x) = Z(x) / {Z(x) + 1}, where Z(x) = \sum_{k=1}^K exp(l_k(x))
            # softplus(x) := log(1 + exp(x))
            # logD(x) = logZ(x) - log(Z(x) + 1)
            #         = logZ(x) - log(exp(log(Z(x))) + 1)
            #         = logZ(x) - softplus(logZ(x))
            # 1 - D(x) = 1 / {Z(x) + 1}
            # log{1 - D(x)} = log1 - log(Z(x) + 1)
            #               = -log(exp(log(Z(x))) + 1)
            #               = -softplus(logZ(x))
            py_x_u, _ = gan.discriminate(images_u, apply_softmax=False)
            log_zx_u = F.logsumexp(py_x_u, axis=1)
            log_dx_u = log_zx_u - F.softplus(log_zx_u)
            dx_u = F.sum(F.exp(log_dx_u)) / batchsize_u
            loss_unsupervised = -F.sum(log_dx_u) / batchsize_u  # minimize negative logD(x)

            py_x_g, _ = gan.discriminate(images_g, apply_softmax=False)
            log_zx_g = F.logsumexp(py_x_g, axis=1)
            loss_unsupervised += F.sum(F.softplus(log_zx_g)) / batchsize_u  # minimize negative log{1 - D(x)}

            # update discriminator
            gan.backprop_discriminator(loss_supervised + loss_unsupervised)

            # adversarial loss
            images_g = gan.generate_x(batchsize_g)
            py_x_g, activations_g = gan.discriminate(images_g, apply_softmax=False)
            log_zx_g = F.logsumexp(py_x_g, axis=1)
            log_dx_g = log_zx_g - F.softplus(log_zx_g)
            dx_g = F.sum(F.exp(log_dx_g)) / batchsize_g
            loss_adversarial = -F.sum(log_dx_g) / batchsize_u  # minimize negative logD(x)

            # feature matching
            if discriminator_config.use_feature_matching:
                features_true = activations_l[-1]
                features_true.unchain_backward()
                if batchsize_l != batchsize_g:
                    images_g = gan.generate_x(batchsize_l)
                    _, activations_g = gan.discriminate(images_g, apply_softmax=False)
                features_fake = activations_g[-1]
                loss_adversarial += F.mean_squared_error(features_true, features_fake)

            # update generator
            gan.backprop_generator(loss_adversarial)

            sum_loss_supervised += float(loss_supervised.data)
            sum_loss_unsupervised += float(loss_unsupervised.data)
            sum_loss_adversarial += float(loss_adversarial.data)
            sum_dx_labeled += float(dx_l.data)
            sum_dx_unlabeled += float(dx_u.data)
            sum_dx_generated += float(dx_g.data)
            if t % 10 == 0:
                progress.show(t, num_trains_per_epoch, {})

        gan.save(args.model_dir)

        # validation
        images_l, _, label_ids_l = dataset.sample_labeled_data(
            validation_images,
            validation_labels,
            num_validation_data,
            discriminator_config.ndim_input,
            discriminator_config.ndim_output,
            binarize=False)
        images_l_segments = np.split(images_l, num_validation_data // 500)
        label_ids_l_segments = np.split(label_ids_l, num_validation_data // 500)
        sum_accuracy = 0
        for images_l, label_ids_l in zip(images_l_segments, label_ids_l_segments):
            y_distribution, _ = gan.discriminate(images_l, apply_softmax=True, test=True)
            accuracy = F.accuracy(y_distribution, gan.to_variable(label_ids_l))
            sum_accuracy += float(accuracy.data)
        validation_accuracy = sum_accuracy / len(images_l_segments)

        progress.show(
            num_trains_per_epoch, num_trains_per_epoch, {
                "loss_l": sum_loss_supervised / num_trains_per_epoch,
                "loss_u": sum_loss_unsupervised / num_trains_per_epoch,
                "loss_g": sum_loss_adversarial / num_trains_per_epoch,
                "dx_l": sum_dx_labeled / num_trains_per_epoch,
                "dx_u": sum_dx_unlabeled / num_trains_per_epoch,
                "dx_g": sum_dx_generated / num_trains_per_epoch,
                "accuracy": validation_accuracy,
            })

        # write accuracy to csv
        csv_results.append([epoch, validation_accuracy, progress.get_total_time()])
        data = pd.DataFrame(csv_results)
        data.columns = ["epoch", "accuracy", "min"]
        data.to_csv("{}/result.csv".format(args.model_dir))

        if epoch % plot_interval == 0 or epoch == 1:
            plot(filename="epoch_{}_time_{}min".format(
                epoch, progress.get_total_time()))
def test_pos_neg_duplicate_axis(self):
    with self.assertRaises(ValueError):
        functions.logsumexp(self.x, (1, -2))
def check_backward(self, x_data, y_grad, axis=None):
    gradient_check.check_backward(
        lambda x: functions.logsumexp(x, axis), x_data, y_grad,
        **self.check_backward_option)
def calculate_logistic_loss(self, y, t):
    xp = chainer.cuda.get_array_module(t)
    if xp != numpy:
        xp.cuda.Device(t.device).use()
    nr_mix = y.shape[1] // 3

    logit_probs = y[:, :nr_mix]
    means = y[:, nr_mix:2 * nr_mix]
    log_scales = y[:, 2 * nr_mix:3 * nr_mix]
    log_scales = F.maximum(
        log_scales, self.scalar_to_tensor(log_scales, self.log_scale_min))

    t = F.broadcast_to(t, means.shape)

    centered_t = t - means
    inv_std = F.exp(-log_scales)
    plus_in = inv_std * (centered_t + 1 / (self.quantize - 1))
    cdf_plus = F.sigmoid(plus_in)
    min_in = inv_std * (centered_t - 1 / (self.quantize - 1))
    cdf_min = F.sigmoid(min_in)

    log_cdf_plus = plus_in - F.softplus(plus_in)
    log_one_minus_cdf_min = -F.softplus(min_in)

    cdf_delta = cdf_plus - cdf_min

    # mid_in = inv_std * centered_t
    # log_pdf_mid = mid_in - log_scales - 2 * F.softplus(mid_in)

    log_probs = F.where(
        # condition
        t.array < self.scalar_to_tensor(t, -0.999),
        # true
        log_cdf_plus,
        # false
        F.where(
            # condition
            t.array > self.scalar_to_tensor(t, 0.999),
            # true
            log_one_minus_cdf_min,
            # false
            F.log(F.maximum(
                cdf_delta, self.scalar_to_tensor(cdf_delta, 1e-12)))
            # F.where(
            #     # condition
            #     cdf_delta.array > self.scalar_to_tensor(cdf_delta, 1e-5),
            #     # true
            #     F.log(F.maximum(
            #         cdf_delta, self.scalar_to_tensor(cdf_delta, 1e-12))),
            #     # false
            #     log_pdf_mid - self.xp.log((self.quantize - 1) / 2))
        ))

    log_probs = log_probs + F.log_softmax(logit_probs)
    loss = -F.mean(F.logsumexp(log_probs, axis=1))
    return loss