def __call__(self, x):
    # Chainer requires an explicit broadcast to avoid latent bugs
    u = F.mean(x, -1, keepdims=True)
    u = F.broadcast_to(u, x.shape)
    s = F.mean((x - u) ** 2, -1, keepdims=True)
    s = F.broadcast_to(s, x.shape)
    x = (x - u) / F.sqrt(s + self.e)
    return F.bias(F.scale(x, self.g, axis=2), self.b, axis=2)
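A minimal standalone sketch of the same layer-normalization computation, assuming a (batch, length, channel) float input; the function name and the plain-array gain/bias g and b are illustrative stand-ins for the link's parameters.

import numpy as np
import chainer.functions as F

def layer_norm_sketch(x, g, b, eps=1e-6):
    # x: (B, T, C); g, b: (C,) gain and bias
    u = F.broadcast_to(F.mean(x, -1, keepdims=True), x.shape)
    s = F.broadcast_to(F.mean((x - u) ** 2, -1, keepdims=True), x.shape)
    x_hat = (x - u) / F.sqrt(s + eps)
    # scale and shift along the channel axis (axis=2), as in the method above
    return F.bias(F.scale(x_hat, g, axis=2), b, axis=2)

x = np.random.randn(2, 5, 8).astype(np.float32)
y = layer_norm_sketch(x, np.ones(8, dtype=np.float32),
                      np.zeros(8, dtype=np.float32))
assert y.shape == x.shape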
def forward(self, inputs, device):
    x, y = inputs
    mean = functions.mean(x, axis=1)
    d = x - mean[:, None]
    var = functions.mean(d * d, axis=1)
    inv_std = functions.rsqrt(var + self.eps)
    dummy_gamma = self.backend_config.xp.ones(
        self.shape[0], dtype=self.dtype)
    return gn_module._MulInvStd(
        self.eps, mean.array, inv_std.array, dummy_gamma).apply((x, y))
def __call__(self, x):
    q_z = self.encoder(x)
    z = q_z.sample(self.k)
    p_x = self.decoder(z)
    p_z = self.prior()

    reconstr = F.mean(p_x.log_prob(
        F.broadcast_to(x[None, :], (self.k,) + x.shape)))
    kl_penalty = F.mean(chainer.kl_divergence(q_z, p_z))
    loss = - (reconstr - self.beta * kl_penalty)
    reporter.report({'loss': loss}, self)
    reporter.report({'reconstr': reconstr}, self)
    reporter.report({'kl_penalty': kl_penalty}, self)
    return loss
def update_policy():
    # Maximize Q(s, policy(s))
    q = Q(obs, policy(obs))
    q = q[:]  # Avoid https://github.com/chainer/chainer/issues/2744
    loss = - F.mean(q)
    policy.cleargrads()
    loss.backward()
    opt_policy.update()
def f_loss_grad(x):
    set_flat_params(self, x)
    self.cleargrads()
    values = self.compute_baselines(obs)
    loss = F.mean(F.square(values - targets))
    loss.backward()
    flat_grad = get_flat_grad(self)
    return loss.data.astype(np.float64), flat_grad.astype(np.float64)
def predict(self, xs):
    # Encoding
    logits, exs = self._encode(xs)
    # Discretization
    D = F.gumbel_softmax(logits, self.tau, axis=2)
    gumbel_output = D.reshape(-1, self.M * self.K)
    with chainer.no_backprop_mode():
        maxp = F.mean(F.max(D, axis=2))
        reporter.report({'maxp': maxp.data}, self)
    # Decoding
    y_hat = self._decode(gumbel_output)
    return y_hat, exs
def test_backward_case2(self):
    """Backward if non-zero gradient is on a face."""

    vertices = [
        [0.8, 0.8, 1.],
        [-0.5, -0.8, 1.],
        [0.8, -0.8, 1.]]
    faces = [[0, 1, 2]]
    pyi = 40
    pxi = 50
    grad_ref = [
        [0.98646867, 1.04628897, 0.],
        [-1.03415668, -0.10403691, 0.],
        [3.00094461, -1.55173182, 0.],
    ]

    renderer = neural_renderer.Renderer()
    renderer.image_size = 64
    renderer.anti_aliasing = False
    renderer.perspective = False
    renderer.light_intensity_ambient = 1.0
    renderer.light_intensity_directional = 0.0

    vertices = cp.array(vertices, 'float32')
    faces = cp.array(faces, 'int32')
    textures = cp.ones((faces.shape[0], 4, 4, 4, 3), 'float32')
    grad_ref = cp.array(grad_ref, 'float32')
    vertices, faces, textures, grad_ref = utils.to_minibatch(
        (vertices, faces, textures, grad_ref))
    vertices = chainer.Variable(vertices)

    images = renderer.render(vertices, faces, textures)
    images = cf.mean(images, axis=1)
    loss = cf.sum(cf.absolute(images[:, pyi, pxi]))
    loss.backward()

    grad_ref = cp.array(grad_ref, 'float32')
    chainer.testing.assert_allclose(vertices.grad, grad_ref, rtol=1e-2)
def test_backward_case1(self):
    """Backward if non-zero gradient is out of a face."""

    vertices = [
        [0.8, 0.8, 1.],
        [0.0, -0.5, 1.],
        [0.2, -0.4, 1.]]
    faces = [[0, 1, 2]]
    pxi = 35
    pyi = 25
    grad_ref = [
        [1.6725862, -0.26021874, 0.],
        [1.41986704, -1.64284933, 0.],
        [0., 0., 0.],
    ]

    renderer = neural_renderer.Renderer()
    renderer.image_size = 64
    renderer.anti_aliasing = False
    renderer.perspective = False
    renderer.light_intensity_ambient = 1.0
    renderer.light_intensity_directional = 0.0

    vertices = cp.array(vertices, 'float32')
    faces = cp.array(faces, 'int32')
    textures = cp.ones((faces.shape[0], 4, 4, 4, 3), 'float32')
    grad_ref = cp.array(grad_ref, 'float32')
    vertices, faces, textures, grad_ref = utils.to_minibatch(
        (vertices, faces, textures, grad_ref))
    vertices = chainer.Variable(vertices)

    images = renderer.render(vertices, faces, textures)
    images = cf.mean(images, axis=1)
    loss = cf.sum(cf.absolute(images[:, pyi, pxi] - 1))
    loss.backward()

    chainer.testing.assert_allclose(vertices.grad, grad_ref, rtol=1e-2)
def _head_to_tail(self, pool5):
    block5 = self.rcnn_top(pool5)       # (B, 1024, 1, 1)
    fc7 = F.mean(F.mean(block5, 3), 2)  # (B, 1024)
    return fc7
def __call__(self, x):
    return x / sqrt(mean(x ** 2, axis=1, keepdims=True) + 1e-8)
def encode_phrase(self, X):
    X = [F.mean(F.embed_id(x, self.w_vec), axis=0, keepdims=True)
         for x in X]
    return F.vstack(X)
def get_posi_from_img(img, threshold=0.2):
    """Convert an image img = [1][3 or 1][imgH][imgW] into point
    positions posi = [N][y, x]."""
    img_p = get_local_max_point(
        F.mean(img, axis=1, keepdims=True).data, threshold=threshold)
    posi = conv_point_to_posi(img_p)
    return posi
def loss_hinge_dis(dis_fake, dis_real):
    loss = F.mean(F.relu(1. - dis_real))
    loss += F.mean(F.relu(1. + dis_fake))
    return loss
def gp_loss(self, x, z):
    h = F.mean(x) / x.shape[0]
    grad, = chainer.grad([h], [z], enable_double_backprop=True)
    return F.mean(F.batch_l2_norm_squared(grad))
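A self-contained sketch of the same gradient-penalty pattern, with a toy differentiable mapping (F.tanh here) standing in for the real network so that chainer.grad with enable_double_backprop=True can be shown end to end; shapes are illustrative only.

import numpy as np
import chainer
import chainer.functions as F

z = chainer.Variable(np.random.randn(4, 3).astype(np.float32))
x = F.tanh(z)                  # stand-in for the network output fed to gp_loss
h = F.mean(x) / x.shape[0]     # same reduction as in gp_loss above
grad, = chainer.grad([h], [z], enable_double_backprop=True)
penalty = F.mean(F.batch_l2_norm_squared(grad))
penalty.backward()             # double backprop propagates through z's graph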
def feature_vector_normalize(x):
    alpha = 1.0 / F.sqrt(F.mean(x * x, axis=1, keepdims=True) + 1e-8)
    y = F.broadcast_to(alpha, x.data.shape) * x
    return y
def train(width, height, depth, start_alpha=0):
    g = generator(512, 512, 100)
    try:
        serializers.load_npz("generator.model", g)
        print("generator loaded")
    except:
        pass
    d = discriminator()
    try:
        serializers.load_npz("discriminator.model", d)
        print("discriminator loaded")
    except:
        pass

    g_opt = chainer.optimizers.Adam(alpha=0.001, beta1=0.0, beta2=0.99)
    g_opt.setup(g)
    g_opt.add_hook(chainer.optimizer.WeightDecay(0.0005))
    d_opt = chainer.optimizers.Adam(alpha=0.001, beta1=0.0, beta2=0.99)
    d_opt.setup(d)
    d_opt.add_hook(chainer.optimizer.WeightDecay(0.0005))

    X_train, tags = data_import(16 * (2 ** depth), 16 * (2 ** depth))
    '''
    X_train = (X_train.astype(np.float32) - 127.5)/127.5
    X_train = X_train.transpose(0,3,1,2)
    '''
    print(X_train.shape)
    tags = tags.astype(np.float32)
    num_batches = int(X_train.shape[0] / BATCH_SIZE)
    alpha = start_alpha
    for epoch in range(NUM_EPOCH):
        for index in range(num_batches):
            if alpha < 1.0:
                alpha = alpha + 5e-4
            '''
            x = xs[(j * bm):((j + 1) * bm)]
            t = ts[(j * bm):((j + 1) * bm)]
            '''
            image_batch = X_train[index * BATCH_SIZE:(index + 1) * BATCH_SIZE]
            image_batch = (image_batch.astype(np.float32) - 127.5) / 127.5
            image_batch = image_batch.transpose(0, 3, 1, 2)
            tag_batch = tags[index * BATCH_SIZE:(index + 1) * BATCH_SIZE]
            noise = np.random.normal(0, 0.5, [len(image_batch), 100])
            z = Variable(noise.astype(np.float32))
            x = g(z, tag_batch, depth, alpha)
            if index % 10 == 0:
                generated_images = x.data * 127.5 + 127.5
                generated_images = generated_images.transpose(0, 2, 3, 1)
                save_generated_image(generated_images,
                                     "%04d_%04d.png" % (epoch, index))
            yl = d(x, tag_batch, depth, alpha)
            # print(yl)
            # g_loss=F.mean_squared_error(yl, Variable(np.ones((len(image_batch),1), dtype=np.float32)))
            # d_loss=F.mean_squared_error(yl, Variable(np.zeros((len(image_batch),1), dtype=np.float32)))
            yl2 = d(image_batch, tag_batch, depth, alpha)
            # print(yl2)
            # d_loss+=F.mean_squared_error(yl2, Variable(np.ones((len(image_batch),1), dtype=np.float32)))
            d_loss = -F.sum(yl2 - yl) / len(image_batch)
            d_loss += F.mean(0.001 * yl * yl)
            g_loss = -F.sum(yl) / len(image_batch)
            '''
            mean=F.mean(x,axis=0)
            dev=x-F.broadcast_to(mean, x.shape)
            devdev=dev*dev
            var=F.mean(devdev)
            g_loss-= var
            '''
            g.cleargrads()
            g_loss.backward()
            g_opt.update()
            d.cleargrads()
            d_loss.backward()
            d_opt.update()
            print(
                "epoch %d, batch: %d, g_loss: %f, d_loss: %f, alpha: %f, depth: %d"
                % (epoch, index, g_loss.data, d_loss.data, alpha, depth))
        serializers.save_npz('generator.model', g)
        serializers.save_npz('discriminator.model', d)
def loss_func_adv_gen(self, y_fake):
    return F.mean(y_fake)
def loss_func_adv_dis_real(self, y_real):
    return F.mean(y_real)
def loss_func_adv_dis_fake(self, y_fake):
    return F.mean(y_fake)
fake_2 = F.average_pooling_2d(fake, 3, 2, 1)
fake_4 = F.average_pooling_2d(fake_2, 3, 2, 1)

dis_fake, _ = discriminator(F.concat([fake, line]))
dis2_fake, _ = discriminator_2(F.concat([fake_2, line_2]))
dis4_fake, _ = discriminator_4(F.concat([fake_4, line_4]))
dis_color, _ = discriminator(F.concat([color, line]))
dis2_color, _ = discriminator_2(F.concat([color_2, line_2]))
dis4_color, _ = discriminator_4(F.concat([color_4, line_4]))

fake.unchain_backward()
fake_2.unchain_backward()
fake_4.unchain_backward()

adver_loss = F.mean(F.softplus(-dis_color)) + F.mean(F.softplus(dis_fake))
adver_loss += F.mean(F.softplus(-dis2_color)) + F.mean(F.softplus(dis2_fake))
adver_loss += F.mean(F.softplus(-dis4_color)) + F.mean(F.softplus(dis4_fake))

discriminator.cleargrads()
discriminator_2.cleargrads()
discriminator_4.cleargrads()
discriminator.to_gpu()
adver_loss.backward()
dis_opt.update()
dis2_opt.update()
dis4_opt.update()
adver_loss.unchain_backward()
def normalize(self, z):
    return z / sqrt(mean(z ** 2, axis=1, keepdims=True) + 1e-8)
def make_input_x(self, x, mask, xp):
    x_fill = F.mean(x, axis=(2, 3))[:, :, xp.newaxis, xp.newaxis]
    x_shape = x.shape
    return x * F.broadcast_to((1 - mask), x_shape) + \
        F.broadcast_to(x_fill, x_shape) * F.broadcast_to(mask, x_shape)
def batch_pit_n_speaker_loss(ys, ts, n_speakers_list):
    """PIT loss over mini-batch.

    Args:
        ys: B-length list of predictions (pre-activations)
        ts: B-length list of labels
        n_speakers_list: list of n_speakers in batch

    Returns:
        loss: (1,)-shape mean cross entropy over mini-batch
        labels: B-length list of permuted labels
    """
    max_n_speakers = ts[0].shape[1]
    xp = chainer.backend.get_array_module(ys[0])
    # (B, T, C)
    ys = F.pad_sequence(ys, padding=-1)

    losses = []
    for shift in range(max_n_speakers):
        # rolled along with speaker-axis
        ts_roll = [xp.roll(t, -shift, axis=1) for t in ts]
        ts_roll = F.pad_sequence(ts_roll, padding=-1)
        # loss: (B, T, C)
        loss = F.sigmoid_cross_entropy(ys, ts_roll, reduce='no')
        # sum over time: (B, C)
        loss = F.sum(loss, axis=1)
        losses.append(loss)
    # losses: (B, C, C)
    losses = F.stack(losses, axis=2)
    # losses[b, i, j] is a loss between
    # `i`-th speaker in y and `(i+j)%C`-th speaker in t

    perms = xp.array(
        list(permutations(range(max_n_speakers))),
        dtype='i',
    )
    # y_ind: [0,1,2,3]
    y_ind = xp.arange(max_n_speakers, dtype='i')
    #  perms  -> relation to t_inds      -> t_inds
    # 0,1,2,3 -> 0+j=0,1+j=1,2+j=2,3+j=3 -> 0,0,0,0
    # 0,1,3,2 -> 0+j=0,1+j=1,2+j=3,3+j=2 -> 0,0,1,3
    t_inds = xp.mod(perms - y_ind, max_n_speakers)

    losses_perm = []
    for t_ind in t_inds:
        losses_perm.append(F.mean(losses[:, y_ind, t_ind], axis=1))
    # losses_perm: (B, Perm)
    losses_perm = F.stack(losses_perm, axis=1)

    # masks: (B, Perms)
    def select_perm_indices(num, max_num):
        perms = list(permutations(range(max_num)))
        sub_perms = list(permutations(range(num)))
        return [
            [x[:num] for x in perms].index(perm)
            for perm in sub_perms]

    masks = xp.full_like(losses_perm.array, xp.inf)
    for i, t in enumerate(ts):
        n_speakers = n_speakers_list[i]
        indices = select_perm_indices(n_speakers, max_n_speakers)
        masks[i, indices] = 0
    losses_perm += masks

    min_loss = F.sum(F.min(losses_perm, axis=1))
    n_frames = np.sum([t.shape[0] for t in ts])
    min_loss = min_loss / n_frames

    min_indices = xp.argmin(losses_perm.array, axis=1)
    labels_perm = [t[:, perms[idx]] for t, idx in zip(ts, min_indices)]
    labels_perm = [
        t[:, :n_speakers]
        for t, n_speakers in zip(labels_perm, n_speakers_list)]

    return min_loss, labels_perm
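A hedged usage sketch for batch_pit_n_speaker_loss, assuming random frame-level logits and 0/1 speaker-activity labels; the lengths, the three speaker slots, and the per-utterance speaker counts are illustrative only.

import numpy as np
import chainer

# two utterances of different length, 3 speaker slots each
ys = [chainer.Variable(np.random.randn(100, 3).astype(np.float32)),
      chainer.Variable(np.random.randn(80, 3).astype(np.float32))]
ts = [np.random.randint(0, 2, size=(100, 3)).astype(np.int32),
      np.random.randint(0, 2, size=(80, 3)).astype(np.int32)]
# first utterance has 3 active speakers, second only 2
loss, labels = batch_pit_n_speaker_loss(ys, ts, n_speakers_list=[3, 2])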
def bs_reg(self):
    bs_re = F.mean(F.square(self.linear.W))
    return bs_re
def get_loss(self, batch_data):
    config = self.config
    batch_pos = batch_data / 127.5 - 1
    bbox = random_bbox(config)
    mask = bbox2mask(bbox, batch_data.shape[0], config, self.xp)

    batch_incomplete = batch_pos * (1 - mask)
    x1, x2, offset_flow = self.inpaintnet(batch_incomplete, mask, config)
    if config.PRETRAIN_COARSE_NETWORK:
        batch_predicted = x1
    else:
        batch_predicted = x2

    losses = {}
    # apply mask and complete image
    batch_complete = batch_predicted * mask + batch_incomplete * (1 - mask)
    # local patches
    local_patch_batch_pos = local_patch(batch_pos, bbox)
    local_patch_x1 = local_patch(x1, bbox)
    local_patch_x2 = local_patch(x2, bbox)
    local_patch_batch_complete = local_patch(batch_complete, bbox)
    local_patch_mask = local_patch(mask, bbox)

    l1_alpha = config.COARSE_L1_ALPHA
    losses['l1_loss'] = l1_alpha * F.mean(
        F.absolute(local_patch_batch_pos - local_patch_x1) *
        spatial_discounting_mask(config, self.xp))
    if not config.PRETRAIN_COARSE_NETWORK:
        losses['l1_loss'] += F.mean(
            F.absolute(local_patch_batch_pos - local_patch_x2) *
            spatial_discounting_mask(config, self.xp))
    losses['ae_loss'] = l1_alpha * F.mean(
        F.absolute(batch_pos - x1) * (1. - mask))
    if not config.PRETRAIN_COARSE_NETWORK:
        losses['ae_loss'] += F.mean(
            F.absolute(batch_pos - x2) * (1. - mask))
    losses['ae_loss'] /= F.mean(1. - mask)

    # gan
    batch_pos_neg = F.concat([batch_pos, batch_complete], axis=0)
    # local deterministic patch
    local_patch_batch_pos_neg = F.concat(
        [local_patch_batch_pos, local_patch_batch_complete], 0)
    if config.GAN_WITH_MASK:
        batch_pos_neg = F.concat([batch_pos_neg, mask], axis=1)

    # wgan with gradient penalty
    if config.GAN == 'wgan_gp':
        # separate gan
        pos_neg_local, pos_neg_global = self.discriminator(
            local_patch_batch_pos_neg, batch_pos_neg)
        pos_local, neg_local = F.split_axis(pos_neg_local, 2, axis=0)
        pos_global, neg_global = F.split_axis(pos_neg_global, 2, axis=0)
        # wgan loss
        g_loss_local, d_loss_local = gan_wgan_loss(pos_local, neg_local)
        g_loss_global, d_loss_global = gan_wgan_loss(pos_global, neg_global)
        losses['g_loss'] = (config.GLOBAL_WGAN_LOSS_ALPHA * g_loss_global +
                            g_loss_local)
        losses['d_loss'] = d_loss_global + d_loss_local
        # gp
        interpolates_local = random_interpolates(
            local_patch_batch_pos, local_patch_batch_complete)
        interpolates_global = random_interpolates(batch_pos, batch_complete)
        dout_local, dout_global = self.discriminator(
            interpolates_local, interpolates_global)
        # apply penalty
        penalty_local = gradients_penalty(
            interpolates_local, dout_local, mask=local_patch_mask)
        penalty_global = gradients_penalty(
            interpolates_global, dout_global, mask=mask)
        losses['gp_loss'] = config.WGAN_GP_LAMBDA * (
            penalty_local + penalty_global)
        losses['d_loss'] = losses['d_loss'] + losses['gp_loss']

    if config.PRETRAIN_COARSE_NETWORK:
        losses['g_loss'] = 0
    else:
        losses['g_loss'] = config.GAN_LOSS_ALPHA * losses['g_loss']
    losses['g_loss'] += config.L1_LOSS_ALPHA * losses['l1_loss']
    if config.AE_LOSS:
        losses['g_loss'] += config.AE_LOSS_ALPHA * losses['ae_loss']
    return losses
def loss_hinge_gen(dis_fake):
    loss = -F.mean(dis_fake)
    return loss
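A small sketch tying loss_hinge_dis (defined earlier) and loss_hinge_gen together; the arrays are random stand-ins for discriminator outputs on real and generated samples.

import numpy as np

dis_real = np.random.randn(16, 1).astype(np.float32)  # D(x_real)
dis_fake = np.random.randn(16, 1).astype(np.float32)  # D(G(z))
d_loss = loss_hinge_dis(dis_fake, dis_real)  # mean relu(1 - real) + relu(1 + fake)
g_loss = loss_hinge_gen(dis_fake)            # -mean(fake)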
def gan_sngan_loss(pos, neg, d_loss_only=False):
    # SN-PatchGAN loss with hinge loss
    d_loss = F.mean(F.relu(1 - pos) + F.relu(1 + neg))
    g_loss = None if d_loss_only else -F.mean(neg)
    return g_loss, d_loss
def calculate_logistic_loss(self, y, t):
    xp = chainer.cuda.get_array_module(t)
    if xp != numpy:
        xp.cuda.Device(t.device).use()
    nr_mix = y.shape[1] // 3

    logit_probs = y[:, :nr_mix]
    means = y[:, nr_mix:2 * nr_mix]
    log_scales = y[:, 2 * nr_mix:3 * nr_mix]
    log_scales = F.maximum(
        log_scales, self.scalar_to_tensor(log_scales, self.log_scale_min))

    t = F.broadcast_to(127.5 * t, means.shape)
    centered_t = t - means
    inv_std = F.exp(-log_scales)
    plus_in = inv_std * (centered_t + 127.5 / (self.quantize - 1))
    cdf_plus = F.sigmoid(plus_in)
    min_in = inv_std * (centered_t - 127.5 / (self.quantize - 1))
    cdf_min = F.sigmoid(min_in)

    log_cdf_plus = plus_in - F.softplus(plus_in)
    log_one_minus_cdf_min = -F.softplus(min_in)

    cdf_delta = cdf_plus - cdf_min

    # mid_in = inv_std * centered_t
    # log_pdf_mid = mid_in - log_scales - 2 * F.softplus(mid_in)

    log_probs = F.where(
        # condition
        t.array < self.scalar_to_tensor(t, 127.5 * -0.999),
        # true
        log_cdf_plus,
        # false
        F.where(
            # condition
            t.array > self.scalar_to_tensor(t, 127.5 * 0.999),
            # true
            log_one_minus_cdf_min,
            # false
            F.log(F.maximum(
                cdf_delta, self.scalar_to_tensor(cdf_delta, 1e-12)))
            # F.where(
            #     # condition
            #     cdf_delta.array > self.scalar_to_tensor(cdf_delta, 1e-5),
            #     # true
            #     F.log(F.maximum(
            #         cdf_delta, self.scalar_to_tensor(cdf_delta, 1e-12))),
            #     # false
            #     log_pdf_mid - self.xp.log((self.quantize - 1) / 2))
        ))

    log_probs = log_probs + F.log_softmax(logit_probs)
    loss = -F.mean(F.logsumexp(log_probs, axis=1))
    return loss
def square_loss(ys, ts):
    # return F.mean(F.sqrt((ys - ts) ** 2 + 1e-5), axis=(0, 2))
    return F.mean((ys - ts) ** 2 + 1e-5, axis=(0, 2))
def mean_clipped_loss(y, t):
    return F.mean(F.huber_loss(y, t, delta=1.0, reduce='no'))
def __call__(self, x, ys, yb):
    s = broadcast_to(ys.reshape(ys.shape + (1, 1)), ys.shape + x.shape[2:])
    b = broadcast_to(yb.reshape(yb.shape + (1, 1)), yb.shape + x.shape[2:])
    e = x - broadcast_to(mean(x, axis=1, keepdims=True), x.shape)
    sd = broadcast_to(sqrt(mean(e ** 2, axis=1, keepdims=True) + 1e-8),
                      x.shape)
    return s * e / sd + b
def feature_vector_normalization(x, eps=1e-8):
    # x: (B, C, H, W)
    alpha = 1.0 / F.sqrt(F.mean(x * x, axis=1, keepdims=True) + eps)
    return F.broadcast_to(alpha, x.data.shape) * x
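A quick check of the pixelwise feature-vector normalization above on a random (B, C, H, W) tensor; after the call, the per-pixel mean square over channels should be close to 1. The shapes are arbitrary.

import numpy as np
import chainer
import chainer.functions as F

x = chainer.Variable(np.random.randn(2, 16, 4, 4).astype(np.float32))
y = feature_vector_normalization(x)
ms = F.mean(y * y, axis=1)  # (B, H, W), approximately all ones
assert np.allclose(ms.array, 1.0, atol=1e-3)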
inp = prepare_dataset(inp)
input_box.append(inp)
img = dir_path + "/" + str(index) + ".png"
img = prepare_dataset(img)
frame_box.append(img)

x = chainer.as_variable(xp.array(input_box).astype(xp.float32))
t = chainer.as_variable(xp.array(frame_box).astype(xp.float32))

embed = feature_extractor(t) - feature_extractor(x)
c = feature_embed(embed)
z = F.concat([x, c], axis=1)
y = predictor(z)

y_dis = discriminator_content(y)
t_dis = discriminator_content(t)
dis_loss = F.mean(F.softplus(-t_dis)) + F.mean(F.softplus(y_dis))

c_g = feature_extractor(y) - feature_extractor(make_diff(y))
c_dis = discriminator_sequence(embed)
c_g_dis = discriminator_sequence(c_g)
dis_loss += F.mean(F.softplus(-c_dis)) + F.mean(F.softplus(c_g_dis))

c_g.unchain_backward()

discriminator_content.cleargrads()
discriminator_sequence.cleargrads()
dis_loss.backward()
dis_c_opt.update()
dis_s_opt.update()
dis_loss.unchain_backward()
def loss_softmax_cross_entropy(predict, ground_truth):
    eps = 1e-16
    cross_entropy = -F.mean(F.log(predict + eps) * ground_truth)
    return cross_entropy
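A usage sketch for loss_softmax_cross_entropy above; it assumes predict already holds probabilities (e.g. a softmax output) and ground_truth a one-hot encoding, with made-up batch and class sizes.

import numpy as np
import chainer.functions as F

logits = np.random.randn(8, 10).astype(np.float32)
predict = F.softmax(logits)  # probabilities, not raw logits
ground_truth = np.eye(10, dtype=np.float32)[np.random.randint(0, 10, size=8)]
# note: F.mean averages over every element (batch * classes), not only the batch
loss = loss_softmax_cross_entropy(predict, ground_truth)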
def update_core(self):
    gen_optimizer = self.get_optimizer('opt_gen')
    dis_optimizer = self.get_optimizer('opt_dis')
    xp = self.gen.xp

    for i in range(self.n_dis):
        batch = self.get_iterator('main').next()
        batchsize = len(batch)
        x = []
        for j in range(batchsize):
            x.append(np.asarray(batch[j]).astype("f"))
        x_real = Variable(xp.asarray(x))
        h_real = self.dis(x_real)

        z = Variable(xp.asarray(self.gen.make_hidden(batchsize)))
        x_fake1 = self.gen(z)
        h_fake1 = self.dis(x_fake1)

        z2 = Variable(xp.asarray(self.gen.make_hidden(batchsize)))
        x_fake2 = self.gen(z2)
        h_fake2 = self.dis(x_fake2)

        def l2_distance(a, b):
            return F.sqrt(F.sum((a - b) ** 2, axis=1, keepdims=True))

        def backward_l2_distance(g, a, b):
            out = F.broadcast_to(l2_distance(a, b), a.data.shape)
            g = F.broadcast_to(g, a.data.shape)
            return g * (a - b) / out, g * (b - a) / out

        def energy_distance(r, f1, f2):
            ret = l2_distance(r, f1)
            ret += l2_distance(r, f2)
            ret -= l2_distance(f1, f2)
            return F.mean(ret)

        def critic(a, b):
            return l2_distance(a, b) - l2_distance(a, xp.zeros_like(a.data))

        def backward_critic(g, a, b):
            ga0, gb0 = backward_l2_distance(g, a, b)
            ga1, gb1 = backward_l2_distance(g, a, xp.zeros_like(a.data))
            return ga0 - ga1, gb0 - gb1

        critic_real = critic(h_real, h_fake2)
        critic_fake = critic(h_fake1, h_fake2)
        loss_surrogate = F.mean(critic_real - critic_fake)

        if i == 0:
            loss_gen = energy_distance(h_real, h_fake1, h_fake2)
            self.gen.cleargrads()
            loss_gen.backward()
            gen_optimizer.update()
            chainer.reporter.report({'loss_gen': loss_gen})
        x_fake1.unchain_backward()
        x_fake2.unchain_backward()

        eps = xp.random.uniform(
            0, 1, size=batchsize).astype("f")[:, None, None, None]
        x_mid = eps * x_real + (1.0 - eps) * x_fake1
        h_mid = Variable(self.dis(x_mid).data)
        critic_mid = critic(h_mid, h_fake2.data)

        # calc gradient penalty
        g = Variable(xp.ones_like(critic_mid.data))
        dydh, _ = backward_critic(g, h_mid, h_fake2.data)
        dydx = self.dis.differentiable_backward(dydh)
        dydx_norm = F.sqrt(F.sum(dydx ** 2, axis=(1, 2, 3)))
        loss_gp = self.lam * F.mean_squared_error(
            dydx_norm, xp.ones_like(dydx_norm.data))

        self.dis.cleargrads()
        (-loss_surrogate).backward()
        loss_gp.backward()
        dis_optimizer.update()

        chainer.reporter.report({'loss_dis': loss_surrogate})
        chainer.reporter.report({'loss_gp': loss_gp})
        chainer.reporter.report({'g': F.mean(dydx_norm)})
def run_n_games(optimizer, learner, opponent, num_games):
    states.default_start_position()

    # Create one list of features (aka state tensors) and one of moves for
    # each game being played.
    features1_tensors = [[] for _ in range(num_games)]
    features2_tensors = [[] for _ in range(num_games)]
    labels_tensors = [[] for _ in range(num_games)]
    values_tensors = [[] for _ in range(num_games)]

    # List of booleans indicating whether the 'learner' player won.
    learner_won = [None] * num_games

    # Start all odd games with moves by 'opponent'. Even games will have
    # 'learner' black.
    learner_color = [BLACK if i % 2 == 0 else WHITE for i in range(num_games)]
    odd_features1 = np.empty((num_games, 2 * 14, 9, 9), dtype=np.float32)
    odd_features2 = np.empty(
        (num_games, 2 * MAX_PIECES_IN_HAND_SUM + 1, 9, 9), dtype=np.float32)
    states.make_odd_input_features(odd_features1, odd_features2)
    x1 = Variable(cuda.to_gpu(odd_features1))
    x2 = Variable(cuda.to_gpu(odd_features2))
    with chainer.no_backprop_mode():
        with chainer.using_config('train', False):
            y = opponent(x1, x2)
    y_data = cuda.to_cpu(y.data)
    states.do_odd_moves(y_data)

    current = learner
    other = opponent
    unfinished_states_num = num_games
    move_number_sum = 0
    while unfinished_states_num > 0:
        move_number_sum += unfinished_states_num

        # Get next moves by current player for all unfinished states.
        features1 = np.empty(
            (unfinished_states_num, FEATURES1_NUM, 9, 9), dtype=np.float32)
        features2 = np.empty(
            (unfinished_states_num, FEATURES2_NUM, 9, 9), dtype=np.float32)
        unfinished_list = states.make_unfinished_input_features(
            features1, features2)
        x1 = Variable(cuda.to_gpu(features1))
        x2 = Variable(cuda.to_gpu(features2))
        with chainer.no_backprop_mode():
            with chainer.using_config('train', False):
                y = current(x1, x2)
        y_data = cuda.to_cpu(y.data)

        labels = np.empty((unfinished_states_num), dtype=np.int32)
        values = np.empty((unfinished_states_num), dtype=np.float32)
        unfinished_states_num = states.do_unfinished_moves_and_eval(
            current is learner, y_data, labels, values)

        # Save the features.
        if current is learner:
            for i, idx in enumerate(unfinished_list):
                features1_tensors[idx].append(features1[i])
                features2_tensors[idx].append(features2[i])
                labels_tensors[idx].append(labels[i])
                values_tensors[idx].append(values[i])

        # Swap 'current' and 'other' for next turn.
        current, other = other, current

    learner_won = np.empty(num_games, dtype=np.int32)
    states.get_learner_wons(learner_won)

    # Train on all game's results
    features1_tensor_all = []
    features2_tensor_all = []
    labels_tensor_all = []
    rewards_tensor_all = []
    for features1_tensor, features2_tensor, labels_tensor, values_tensor, won in zip(
            features1_tensors, features2_tensors, labels_tensors,
            values_tensors, learner_won.astype(np.float32)):
        features1_tensor_all.extend(features1_tensor)
        features2_tensor_all.extend(features2_tensor)
        labels_tensor_all.extend(labels_tensor)
        rewards_tensor_all.extend(
            list(won - np.array(values_tensor, dtype=np.float32)))

    x1 = Variable(cuda.to_gpu(np.array(features1_tensor_all, dtype=np.float32)))
    x2 = Variable(cuda.to_gpu(np.array(features2_tensor_all, dtype=np.float32)))
    t = Variable(cuda.to_gpu(np.array(labels_tensor_all, dtype=np.int32)))
    z = Variable(cuda.to_gpu(np.array(rewards_tensor_all, dtype=np.float32)))

    y = learner(x1, x2)
    learner.cleargrads()
    loss = F.mean(F.softmax_cross_entropy(y, t, reduce='no') * z)
    loss.backward()
    optimizer.update()

    # Return the win ratio.
    return (np.average(learner_won),
            float(move_number_sum) / num_games, loss.data)
def __call__(self, h, adj):
    xp = self.xp
    # (minibatch, atom, channel)
    mb, atom, ch = h.shape
    # (minibatch, atom, EDGE_TYPE * heads * out_dim)
    h = self.message_layer(h)
    # (minibatch, atom, EDGE_TYPE, heads, out_dim)
    h = functions.reshape(h, (mb, atom, self.n_edge_types, self.n_heads,
                              self.out_channels))
    # concat all pairs of atom
    # (minibatch, 1, atom, EDGE_TYPE, heads, out_dim)
    h_i = functions.reshape(h, (mb, 1, atom, self.n_edge_types,
                                self.n_heads, self.out_channels))
    # (minibatch, atom, atom, EDGE_TYPE, heads, out_dim)
    h_i = functions.broadcast_to(h_i, (mb, atom, atom, self.n_edge_types,
                                       self.n_heads, self.out_channels))
    # (minibatch, atom, 1, EDGE_TYPE, heads, out_dim)
    h_j = functions.reshape(h, (mb, atom, 1, self.n_edge_types,
                                self.n_heads, self.out_channels))
    # (minibatch, atom, atom, EDGE_TYPE, heads, out_dim)
    h_j = functions.broadcast_to(h_j, (mb, atom, atom, self.n_edge_types,
                                       self.n_heads, self.out_channels))

    # (minibatch, atom, atom, EDGE_TYPE, heads, out_dim * 2)
    e = functions.concat([h_i, h_j], axis=5)
    # (minibatch, EDGE_TYPE, heads, atom, atom, out_dim * 2)
    e = functions.transpose(e, (0, 3, 4, 1, 2, 5))
    # (minibatch * EDGE_TYPE * heads, atom * atom, out_dim * 2)
    e = functions.reshape(e, (mb * self.n_edge_types * self.n_heads,
                              atom * atom, self.out_channels * 2))
    # (minibatch * EDGE_TYPE * heads, atom * atom, 1)
    e = self.attention_layer(e)
    # (minibatch, EDGE_TYPE, heads, atom, atom)
    e = functions.reshape(e, (mb, self.n_edge_types, self.n_heads, atom,
                              atom))
    e = functions.leaky_relu(e, self.negative_slope)

    # (minibatch, EDGE_TYPE, atom, atom)
    if isinstance(adj, chainer.Variable):
        cond = adj.array.astype(xp.bool)
    else:
        cond = adj.astype(xp.bool)
    # (minibatch, EDGE_TYPE, 1, atom, atom)
    cond = xp.reshape(cond, (mb, self.n_edge_types, 1, atom, atom))
    # (minibatch, EDGE_TYPE, heads, atom, atom)
    cond = xp.broadcast_to(cond, e.array.shape)
    # TODO(mottodora): find better way to ignore non connected
    e = functions.where(cond, e,
                        xp.broadcast_to(xp.array(-10000), e.array.shape)
                        .astype(xp.float32))

    # In Relational Graph Attention Networks eq.(7)
    # ARGAT: take the softmax over the logits across node neighborhoods
    # irrespective of relation
    if self.softmax_mode == 'across':
        # (minibatch, heads, atom, EDGE_TYPE, atom)
        e = functions.transpose(e, (0, 2, 3, 1, 4))
        # (minibatch, heads, atom, EDGE_TYPE * atom)
        e = functions.reshape(e, (mb, self.n_heads, atom,
                                  self.n_edge_types * atom))
        # (minibatch, heads, atom, EDGE_TYPE * atom)
        alpha = functions.softmax(e, axis=3)
        if self.dropout_ratio >= 0:
            alpha = functions.dropout(alpha, ratio=self.dropout_ratio)
        # (minibatch, heads, atom, EDGE_TYPE, atom)
        alpha = functions.reshape(alpha, (mb, self.n_heads, atom,
                                          self.n_edge_types, atom))
        # (minibatch, EDGE_TYPE, heads, atom, atom)
        alpha = functions.transpose(alpha, (0, 3, 1, 2, 4))

    # In Relational Graph Attention Networks eq.(6)
    # WIRGAT: take the softmax over the logits independently for each
    # relation
    elif self.softmax_mode == 'within':
        alpha = functions.softmax(e, axis=4)
        if self.dropout_ratio >= 0:
            alpha = functions.dropout(alpha, ratio=self.dropout_ratio)
    else:
        raise ValueError("{} is invalid. Please use 'across' or 'within'"
                         .format(self.softmax_mode))

    # before: (minibatch, atom, EDGE_TYPE, heads, out_dim)
    # after: (minibatch, EDGE_TYPE, heads, atom, out_dim)
    h = functions.transpose(h, (0, 2, 3, 1, 4))
    # (minibatch, EDGE_TYPE, heads, atom, out_dim)
    h_new = functions.matmul(alpha, h)
    # (minibatch, heads, atom, out_dim)
    h_new = functions.sum(h_new, axis=1)
    if self.concat_heads:
        # (heads, minibatch, atom, out_dim)
        h_new = functions.transpose(h_new, (1, 0, 2, 3))
        # (minibatch, atom, heads * out_dim)
        h_new = functions.concat(h_new, axis=2)
    else:
        # (minibatch, atom, out_dim)
        h_new = functions.mean(h_new, axis=1)
    return h_new
def energy_distance(r, f1, f2):
    ret = l2_distance(r, f1)
    ret += l2_distance(r, f2)
    ret -= l2_distance(f1, f2)
    return F.mean(ret)
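energy_distance above relies on an l2_distance helper like the one defined inside update_core earlier; a minimal sketch with that helper supplied in the same namespace, using random arrays as stand-ins for critic features of one real and two fake batches.

import numpy as np
import chainer.functions as F

def l2_distance(a, b):
    # pairwise Euclidean distance per sample, kept as a (B, 1) column
    return F.sqrt(F.sum((a - b) ** 2, axis=1, keepdims=True))

r = np.random.randn(8, 32).astype(np.float32)   # features of real samples
f1 = np.random.randn(8, 32).astype(np.float32)  # features of first fake batch
f2 = np.random.randn(8, 32).astype(np.float32)  # features of second fake batch
ed = energy_distance(r, f1, f2)                 # scalar Variable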
def __call__(self, xs):
    y_hat, input_embeds = self.predict(xs)
    loss = 0.5 * F.sum((y_hat - input_embeds) ** 2, axis=1)
    loss = F.mean(loss)
    reporter.report({'loss': loss.data}, self)
    return loss
def compute_marginal_entropy(self, p_batch):
    return self.compute_entropy(functions.mean(p_batch, axis=0))
# train
itr = 0
sum_loss = 0
eval_interval = 1000
for e in range(args.epoch):
    np.random.shuffle(train_data)

    itr_epoch = 0
    sum_loss_epoch = 0
    for i in range(0, len(train_data) - args.batchsize, args.batchsize):
        x1, x2, t, z = mini_batch(train_data[i:i + args.batchsize])
        y = model(x1, x2)

        model.cleargrads()
        loss = F.mean(F.softmax_cross_entropy(y, t, reduce='no') * z)
        loss.backward()
        optimizer.update()

        itr += 1
        sum_loss += loss.data
        itr_epoch += 1
        sum_loss_epoch += loss.data

        # print train loss
        if optimizer.t % eval_interval == 0:
            logging.info('epoch = {}, iteration = {}, loss = {}'.format(
                optimizer.epoch + 1, optimizer.t, sum_loss / itr))
            itr = 0
            sum_loss = 0
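The reward-weighted cross entropy used in this loop (and in run_n_games above) can be exercised in isolation; y, t, and z below are random stand-ins for policy logits, chosen-move labels, and reward-minus-baseline weights, and the action-space size is illustrative.

import numpy as np
import chainer.functions as F

n_actions = 27 * 81  # illustrative move-label space
y = np.random.randn(4, n_actions).astype(np.float32)          # policy logits
t = np.random.randint(0, n_actions, size=4).astype(np.int32)  # chosen moves
z = np.random.randn(4).astype(np.float32)                     # reward - value baseline
loss = F.mean(F.softmax_cross_entropy(y, t, reduce='no') * z)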
def main():
    parser = argparse.ArgumentParser(description='GradNorm')
    parser.add_argument('--gpu', '-g', type=int, default=-1)
    parser.add_argument('--n-iter', '-it', type=int, default=5000)
    parser.add_argument('--mode', '-m',
                        choices=('grad_norm', 'equal_weight'),
                        default='grad_norm')
    args = parser.parse_args()

    np.random.seed(123)
    sigmas = [1, 10]
    n_task = len(sigmas)
    epsilons = np.random.normal(
        scale=3.5, size=(n_task, 100, 250)).astype(np.float32)
    dataset = RegressionDataset(sigmas, epsilons)
    model = RegressionTrainChain(RegressionChain(n_task))

    if args.gpu >= 0:
        chainer.backends.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()
    optimizer = chainer.optimizers.Adam(alpha=1e-2)
    optimizer.setup(model)

    train_iter = chainer.iterators.SerialIterator(dataset, 200)

    xp = model.xp
    weights = []
    task_losses = []
    loss_ratios = []
    final_layer_names = ['task_{}'.format(i) for i in range(n_task)]
    for t in range(args.n_iter):
        batch = train_iter.next()
        x, ts = chainer.dataset.convert.concat_examples(batch,
                                                        device=args.gpu)

        task_loss = model(x, ts)
        weighted_task_loss = model.weight * task_loss
        if t == 0:
            initial_task_loss = task_loss.data
        loss = F.mean(weighted_task_loss)
        model.cleargrads()
        loss.backward()
        # Ignore a gradient to the coefficient vector, which
        # is computed from the standard loss.
        model.weight.cleargrad()

        if args.mode == 'grad_norm':
            # Use |\nabla_W w_i * L_i | = w_i |\nabla_W L_i|
            gygw_norms = []
            for i, layer_name in enumerate(final_layer_names):
                l = getattr(model.model, layer_name)
                gygw = chainer.grad([task_loss[i]], [l.W])[0].data
                gygw_norms.append(xp.linalg.norm(gygw))
            gygw_norms = xp.stack(gygw_norms)
            norms = model.weight * gygw_norms

            alpha = 0.16
            mean_norm = xp.mean(norms.data)
            loss_ratio = task_loss.data / initial_task_loss
            inverse_train_rate = loss_ratio / xp.mean(loss_ratio)

            diff = norms - (inverse_train_rate ** alpha) * mean_norm
            grad_norm_loss = F.mean(F.absolute(diff))
            grad_norm_loss.backward()

            # For debugging purpose only
            # from chainer import computational_graph
            # import os
            # cg = computational_graph.build_computational_graph(
            #     [grad_norm_loss]).dump()
            # with open('grad_weight_loss_cg', 'w') as f:
            #     f.write(cg)

        optimizer.update()

        # Renormalize
        normalize_coeff = n_task / xp.sum(model.weight.data)
        model.weight.data[:] = model.weight.data * normalize_coeff

        # Record
        task_losses.append(chainer.backends.cuda.to_cpu(task_loss.data))
        loss_ratios.append(np.mean(task_losses[-1] / task_losses[0]))
        weights.append(chainer.backends.cuda.to_cpu(model.weight.data))

        if t % 100 == 0:
            print('{}/{}: loss_ratio={}, weights={} task_loss={}'.format(
                t, args.n_iter, loss_ratios[-1], model.weight.data,
                task_loss.data))

    task_losses = np.array(task_losses)
    weights = np.array(weights)

    fig = plt.figure()
    ax1 = fig.add_subplot(1, 4, 1)
    ax1.set_title('loss (task 0)')
    ax2 = fig.add_subplot(1, 4, 2)
    ax2.set_title('loss (task 1)')
    ax3 = fig.add_subplot(1, 4, 3)
    ax3.set_title('sum of normalized losses')
    ax4 = fig.add_subplot(1, 4, 4)
    ax4.set_title('change of weights over time')
    ax1.plot(task_losses[:, 0])
    ax2.plot(task_losses[:, 1])
    ax3.plot(loss_ratios)
    ax4.plot(weights[:, 0])
    ax4.plot(weights[:, 1])
    plt.show()