def __call__(self, y, hiddens=None, scale=True):
    ne_loss = 0

    # NE for hiddens
    if hiddens is not None:
        for h in hiddens:
            h_normalized = F.softmax(h)
            h_log_softmax = F.log_softmax(h)
            n = h.data.shape[0]
            l = - F.sum(h_normalized * h_log_softmax) / n
            if scale:
                d = np.prod(h.data.shape[1:])
                l = l / d
            ne_loss += l

    # NE for output
    y_normalized = F.softmax(y)
    y_log_softmax = F.log_softmax(y)
    n = y.data.shape[0]
    l = - F.sum(y_normalized * y_log_softmax) / n
    if scale:
        d = np.prod(y.data.shape[1:])
        l = l / d
    ne_loss += l
    return ne_loss
def dirichlet_likelihood(weights, alpha=None):
    """ Calculate the log likelihood of the observed topic proportions.
    A higher log likelihood means the observed topic proportions are more
    probable under the Dirichlet prior.

    Args:
        weights (chainer.Variable): Unnormalized weight vector. The vector
            will be passed through a softmax function that will map the input
            onto a probability simplex.
        alpha (float): The Dirichlet concentration parameter. Alpha
            greater than 1.0 results in very dense topic weights such
            that each document belongs to many topics. Alpha < 1.0 results
            in sparser topic weights. The default is to set alpha to
            1.0 / n_topics, effectively enforcing the prior belief that a
            document belongs to few topics at once.

    Returns:
        ~chainer.Variable: Output loss variable.
    """
    if type(weights) is Variable:
        n_topics = weights.data.shape[1]
    else:
        n_topics = weights.W.data.shape[1]
    if alpha is None:
        alpha = 1.0 / n_topics
    if type(weights) is Variable:
        log_proportions = F.log_softmax(weights)
    else:
        log_proportions = F.log_softmax(weights.W)
    loss = (alpha - 1.0) * log_proportions
    return -F.sum(loss)
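# Minimal usage sketch (not from the original source): evaluating the
# Dirichlet prior term above on a random batch of unnormalized topic weights.
# The shape (n_docs=4, n_topics=10) is an illustrative assumption.
import numpy as np
import chainer.functions as F
from chainer import Variable

weights = Variable(np.random.randn(4, 10).astype(np.float32))
loss_default = dirichlet_likelihood(weights)            # alpha = 1.0 / n_topics
loss_dense = dirichlet_likelihood(weights, alpha=2.0)   # favors dense mixtures
print(loss_default.data, loss_dense.data)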
def beam_search(dec, state, y, data, beam_width, mydict_inv):
    beam_width = beam_width
    xp = cuda.cupy
    batchsize = data.shape[0]
    vocab_size = len(mydict_inv)
    topk = 20
    route = np.zeros((batchsize, beam_width, 50)).astype(np.int32)

    for j in range(50):
        if j == 0:
            y = Variable(xp.array(np.argmax(y.data.get(), axis=1)).astype(xp.int32))
            state, y = dec(y, state, train=False)
            h = state['h1'].data
            c = state['c1'].data
            h = xp.tile(h.reshape(batchsize, 1, -1), (1, beam_width, 1))
            c = xp.tile(c.reshape(batchsize, 1, -1), (1, beam_width, 1))
            ptr = F.log_softmax(y).data.get()
            pred_total_city = np.argsort(ptr)[:, ::-1][:, :beam_width]
            pred_total_score = np.sort(ptr)[:, ::-1][:, :beam_width]
            route[:, :, j] = pred_total_city
            pred_total_city = pred_total_city.reshape(batchsize, beam_width, 1)
        else:
            pred_next_score = np.zeros((batchsize, beam_width, topk))
            pred_next_city = np.zeros((batchsize, beam_width, topk)).astype(np.int32)
            score2idx = np.zeros((batchsize, beam_width, topk)).astype(np.int32)
            for b in range(beam_width):
                state = {'c1': Variable(c[:, b, :]), 'h1': Variable(h[:, b, :])}
                cur_city = xp.array(
                    [pred_total_city[i, b, j - 1] for i in range(batchsize)]).astype(xp.int32)
                state, y = dec(cur_city, state, train=False)
                h[:, b, :] = state['h1'].data
                c[:, b, :] = state['c1'].data
                ptr = F.log_softmax(y).data.get()
                pred_next_score[:, b, :] = np.sort(ptr, axis=1)[:, ::-1][:, :topk]
                pred_next_city[:, b, :] = np.argsort(ptr, axis=1)[:, ::-1][:, :topk]

            h = F.stack([h for i in range(topk)], axis=2).data
            c = F.stack([c for i in range(topk)], axis=2).data

            pred_total_city = np.tile(route[:, :, :j], (1, 1, topk)).reshape(batchsize, beam_width, topk, j)
            pred_next_city = pred_next_city.reshape(batchsize, beam_width, topk, 1)
            pred_total_city = np.concatenate((pred_total_city, pred_next_city), axis=3)

            pred_total_score = np.tile(pred_total_score.reshape(batchsize, beam_width, 1),
                                       (1, 1, topk)).reshape(batchsize, beam_width, topk, 1)
            pred_next_score = pred_next_score.reshape(batchsize, beam_width, topk, 1)
            pred_total_score += pred_next_score

            idx = pred_total_score.reshape(batchsize, beam_width * topk).argsort(axis=1)[:, ::-1][:, :beam_width]

            pred_total_city = pred_total_city[:, idx // topk, np.mod(idx, topk), :][
                np.diag_indices(batchsize, ndim=2)].reshape(batchsize, beam_width, j + 1)
            pred_total_score = pred_total_score[:, idx // topk, np.mod(idx, topk), :][
                np.diag_indices(batchsize, ndim=2)].reshape(batchsize, beam_width, 1)
            h = h[:, idx // topk, np.mod(idx, topk), :][
                np.diag_indices(batchsize, ndim=2)].reshape(batchsize, beam_width, -1)
            c = c[:, idx // topk, np.mod(idx, topk), :][
                np.diag_indices(batchsize, ndim=2)].reshape(batchsize, beam_width, -1)

            route[:, :, :j + 1] = pred_total_city

            if (pred_total_city[:, :, j] == 15).all():
                break

    return route[:, 0, :j + 1].tolist()
def __call__(self, y, t):
    t_normalized = F.softmax(t)
    t_log_softmax = F.log_softmax(t)
    y_log_softmax = F.log_softmax(y)
    n = y.data.shape[0]

    return F.sum((t_normalized * t_log_softmax)
                 - (t_normalized * y_log_softmax)) / n
def kl_div(self, other):
    logli = F.log_softmax(self.logits)
    other_logli = F.log_softmax(other.logits)
    # new_prob_var = new_dist_info_vars["prob"]
    # Assume layout is N * A
    return F.sum(
        F.exp(logli) * (logli - other_logli),
        axis=-1
    )
def __call__(self, y0, y1):
    bs = y0.data.shape[0]
    d = np.prod(y0.data.shape[1:])

    y0_softmax = F.softmax(y0)
    y1_softmax = F.softmax(y1)

    y0_log_softmax = F.log_softmax(y0)
    y1_log_softmax = F.log_softmax(y1)

    kl0 = F.sum(y0_softmax * (y0_log_softmax - y1_log_softmax)) / bs / d
    kl1 = F.sum(y1_softmax * (y1_log_softmax - y0_log_softmax)) / bs / d

    return (kl0 + kl1) / 2
def logli(self, a):
    all_logli = F.log_softmax(self.logits)
    N = len(a)
    return all_logli[
        np.arange(N),
        a.data.astype(np.int32, copy=False)
    ]
def __forward(self, batch_x, batch_t, weight, train=True):
    xp = self.xp
    x = Variable(xp.asarray(batch_x), volatile=not train)
    t = Variable(xp.asarray(batch_t), volatile=not train)
    y = self.net(x, train=train)

    b, c, n = y.data.shape
    mask = Variable(xp.asarray(
        np.broadcast_to(weight.reshape(-1, 1, 1), (b, c, n)) *
        loss_mask(batch_t, self.net.rating_num)), volatile=not train)
    if self.ordinal_weight == 0:
        loss = F.sum(-F.log_softmax(y) * mask) / b
    elif self.ordinal_weight == 1:
        loss = ordinal_loss(y, mask)
    else:
        loss = (1 - self.ordinal_weight) * F.sum(-F.log_softmax(y) * mask) / b \
            + self.ordinal_weight * ordinal_loss(y, mask)

    acc = self.__accuracy(y, t)
    return loss, acc
def __call__(self, y):
    bs = y.data.shape[0]
    d = np.prod(y.data.shape[1])
    if len(y.shape) > 2:
        s = np.prod(y.data.shape[2:])
        y = F.reshape(y, (bs, d, s))
        y = F.transpose(y, (0, 2, 1))
        y_normalized = F.softmax(y, use_cudnn=False)
        y_log_softmax = F.log_softmax(y, use_cudnn=False)
        self.loss = - F.sum(y_normalized * y_log_softmax) / bs / s
    else:
        y_normalized = F.softmax(y)
        y_log_softmax = F.log_softmax(y)
        self.loss = - F.sum(y_normalized * y_log_softmax) / bs / d

    return self.loss
def __call__(self, y):
    bs = y.data.shape[0]
    d = np.prod(y.data.shape[1:])
    y_normalized = F.softmax(y)
    y_log_softmax = F.log_softmax(y)
    self.loss = - F.sum(y_normalized * y_log_softmax) / bs / d
    return self.loss
def check_forward(self, x_data, use_cudnn=True):
    x = chainer.Variable(x_data)
    y = functions.log_softmax(x, use_cudnn)
    self.assertEqual(y.data.dtype, numpy.float32)

    log_z = numpy.ufunc.reduce(
        numpy.logaddexp, self.x, axis=1, keepdims=True)
    y_expect = self.x - log_z

    gradient_check.assert_allclose(y_expect, y.data)
def forward(self, ids, bow):
    bow, ids = utils.move(self.xp, bow, ids)
    proportions = self.proportions(ids)
    ld = dirichlet_likelihood(proportions)
    doc = F.matmul(F.softmax(proportions), self.factors())
    logp = F.dropout(self.embedding(doc))
    # loss = -F.sum(bow * F.log_softmax(logp))
    sources, targets, counts = [], [], []
    lpi = F.sum(bow * F.log_softmax(logp), axis=1)
    loss = -F.sum(lpi)
    return loss, ld
def check_forward(self, x_data, use_cudnn='always'):
    x = chainer.Variable(x_data)
    with chainer.using_config('use_cudnn', use_cudnn):
        y = functions.log_softmax(x)
    self.assertEqual(y.data.dtype, self.dtype)

    log_z = numpy.ufunc.reduce(
        numpy.logaddexp, self.x, axis=1, keepdims=True)
    y_expect = self.x - log_z

    testing.assert_allclose(
        y_expect, y.data, **self.check_forward_options)
def predict(self, state, x):
    """Predict log probabilities for given state and input x using the predictor.

    :param state: the state
    :param x: the input
    :return: a tuple (state, log prob vector)
    :rtype: cupy/numpy array
    """
    if hasattr(self.predictor, 'normalized') and self.predictor.normalized:
        return self.predictor(state, x)
    else:
        state, z = self.predictor(state, x)
        return state, F.log_softmax(z).data
def __call__(self, o):
    log_pi = F.relu(self.h1(o))
    log_pi = F.relu(self.h2(log_pi))
    log_pi = F.log_softmax(self.h3(log_pi))
    probs = F.exp(log_pi)[0]
    # avoid "ValueError: sum(pvals[:-1]) > 1.0" in numpy.multinomial
    diff = sum(probs.data[:-1]) - 1
    if diff > 0:
        probs -= (diff + EPS) / A_DIM
    a = np.random.multinomial(1, probs.data).astype(np.float32)
    return log_pi, a
def check_forward(self, x_data, use_cudnn='always'):
    x = chainer.Variable(x_data)
    with chainer.using_config('use_cudnn', use_cudnn):
        y = functions.log_softmax(x)
    self.assertEqual(y.data.dtype, self.dtype)

    log_z = numpy.ufunc.reduce(numpy.logaddexp, self.x, axis=1, keepdims=True)
    y_expect = self.x - log_z

    testing.assert_allclose(y_expect, y.data, **self.check_forward_options)
def __forward(self, batch_x, batch_t, weight, train=True):
    xp = self.xp
    x = Variable(xp.asarray(batch_x), volatile=not train)
    t = Variable(xp.asarray(batch_t), volatile=not train)
    y = self.net(x, train=train)

    b, c, n = y.data.shape
    mask = Variable(xp.asarray(
        np.broadcast_to(weight.reshape(-1, 1, 1), (b, c, n)) *
        loss_mask(batch_t, self.net.rating_num)), volatile=not train)
    if self.ordinal_weight == 0:
        loss = F.sum(-F.log_softmax(y) * mask) / b
    elif self.ordinal_weight == 1:
        loss = ordinal_loss(y, mask)
    else:
        loss = (1 - self.ordinal_weight) * F.sum(
            -F.log_softmax(y) * mask) / b + self.ordinal_weight * ordinal_loss(y, mask)

    acc = self.__accuracy(y, t)
    return loss, acc
def __call__(self, x, t, qt=None):
    # forward
    z = self.enc(x)
    e = self.vq(z)
    e_ = self.vq(chainer.Variable(z.data))
    scale = t.shape[2] // e.shape[2]
    if self.quantize == 'mulaw':
        y_hat = self.dec(qt, F.unpooling_2d(e, (scale, 1), cover_all=False))
    elif self.quantize == 'mixture':
        y_hat = self.dec(x, F.unpooling_2d(e, (scale, 1), cover_all=False))

    # calculate loss
    if self.quantize == 'mulaw':
        loss1 = F.softmax_cross_entropy(y_hat, t)
    elif self.quantize == 'mixture':
        y_hat = y_hat[:, :30]
        logit_probs, means, log_scales = F.split_axis(y_hat, 3, 1)
        log_scales = F.relu(log_scales + 7) - 7
        y = F.broadcast_to(t, means.shape)
        centered_y = y - means
        inv_stdv = F.exp(-log_scales)
        plus_in = inv_stdv * (centered_y + 1 / (2**16))
        cdf_plus = F.sigmoid(plus_in)
        min_in = inv_stdv * (centered_y - 1 / (2**16))
        cdf_min = F.sigmoid(min_in)
        log_cdf_plus = plus_in - F.softplus(plus_in)
        log_one_minus_cdf_min = -F.softplus(min_in)
        cdf_delta = cdf_plus - cdf_min
        cdf_delta = F.relu(cdf_delta - 1e-12) + 1e-12
        y = F.broadcast_to(t, log_cdf_plus.shape).array
        log_probs = F.where(
            y < -0.999, log_cdf_plus,
            F.where(y > 0.999, log_one_minus_cdf_min, F.log(cdf_delta)))
        log_probs = log_probs + F.log_softmax(logit_probs)
        loss1 = -F.mean(log_probs)
    loss2 = F.mean((chainer.Variable(z.data) - e_)**2)
    loss3 = self.beta * F.mean((z - chainer.Variable(e.data))**2)
    loss = loss1 + loss2 + loss3
    chainer.reporter.report(
        {'loss1': loss1, 'loss2': loss2, 'loss3': loss3, 'loss': loss}, self)
    return loss1, loss2, loss3
def wer_fun(model, testFeat, normalizeBias):
    global args
    # Forward the test data through the network.
    temp = E.KaldiDict()
    print('(testing) Forward network', end=" " * 20 + '\r')
    with chainer.using_config('train', False), chainer.no_backprop_mode():
        for utt in testFeat.keys():
            data = cp.array(testFeat[utt], dtype=cp.float32)
            out1, out2 = model(data)
            out = F.log_softmax(out1, axis=1)
            out.to_cpu()
            temp[utt] = out.array - normalizeBias
    # Transform KaldiDict to KaldiArk format.
    print('(testing) Transform to ark', end=" " * 20 + '\r')
    amp = temp.ark
    # Decode to obtain a lattice.
    hmm = args.TIMITpath + '/exp/dnn4_pretrain-dbn_dnn_ali_test/final.mdl'
    hclg = args.TIMITpath + '/exp/tri3/graph/HCLG.fst'
    lexicon = args.TIMITpath + '/exp/tri3/graph/words.txt'
    print('(testing) Generate Lattice', end=" " * 20 + '\r')
    lattice = E.decode_lattice(amp, hmm, hclg, lexicon, args.minActive,
                               args.maxActive, args.maxMemory, args.beam,
                               args.latBeam, args.acwt)
    # Sweep the language-model weight and get the 1-best words.
    print('(testing) Get 1-best words', end=" " * 20 + '\r')
    outs = lattice.get_1best(lmwt=args.minLmwt, maxLmwt=args.maxLmwt,
                             outFile=args.outDir + '/outRaw.txt')
    # If the reference file does not exist, make it.
    phonemap = args.TIMITpath + '/conf/phones.60-48-39.map'
    outFilter = args.TIMITpath + '/local/timit_norm_trans.pl -i - -m {} -from 48 -to 39'.format(
        phonemap)
    if not os.path.isfile(args.outDir + '/test_filt.txt'):
        refText = args.TIMITpath + '/data/test/text'
        cmd = 'cat {} | {} > {}/test_filt.txt'.format(
            refText, outFilter, args.outDir)
        (_, _) = E.run_shell_cmd(cmd)
    # Score WER and keep the smallest one.
    print('(testing) Score', end=" " * 20 + '\r')
    minWER = None
    for k in range(args.minLmwt, args.maxLmwt + 1, 1):
        cmd = 'cat {} | {} > {}/test_prediction_filt.txt'.format(
            outs[k], outFilter, args.outDir)
        (_, _) = E.run_shell_cmd(cmd)
        os.remove(outs[k])
        score = E.wer('{}/test_filt.txt'.format(args.outDir),
                      "{}/test_prediction_filt.txt".format(args.outDir),
                      mode='all')
        if minWER is None or score['WER'] < minWER:
            minWER = score['WER']
    return minWER
def dirichlet_likelihood(weights, alpha=None):
    """ Calculate the log likelihood of the observed topic proportions.
    A higher log likelihood means the observed topic proportions are more
    probable under the Dirichlet prior.

    Args:
        weights (chainer.Variable): Unnormalized weight vector. The vector
            will be passed through a softmax function that will map the input
            onto a probability simplex.
        alpha (float): The Dirichlet concentration parameter. Alpha
            greater than 1.0 results in very dense topic weights such
            that each document belongs to many topics. Alpha < 1.0 results
            in sparser topic weights. The default is to set alpha to
            1.0 / n_topics, effectively enforcing the prior belief that a
            document belongs to few topics at once.

    Returns:
        ~chainer.Variable: Output loss variable.
    """
    if type(weights) is Variable:
        n_topics = weights.data.shape[1]
    else:
        n_topics = weights.W.data.shape[1]
    # logger.info('dirichlet_likelihood on topics of {}'.format(n_topics))
    if alpha is None:
        alpha = 1.0 / n_topics
    if type(weights) is Variable:
        log_proportions = F.log_softmax(weights)
    else:
        log_proportions = F.log_softmax(weights.W)
    # positive
    loss = (alpha - 1.0) * log_proportions
    # negative
    # return -F.sum(loss)
    return F.sum(loss)
def __call__(self, y):
    bs = y.data.shape[0]
    d = np.prod(y.data.shape[1:])
    y = F.reshape(y, (bs, d))
    y_normalized = F.softmax(y)
    y_log_softmax = F.log_softmax(y)
    self.loss = -F.sum(y_normalized * y_log_softmax) / bs / d
    return self.loss
def decode(self, sample, bow):
    """ Decode latent document vectors back into word counts
    (n_docs, n_vocab).
    """
    logprob = F.log_softmax(self.embedding(sample))
    # This is equivalent to a softmax_cross_entropy where instead of
    # guessing 1 of N words we have repeated observations.
    # Normal softmax for guessing the next word is:
    #   t log softmax(x), where t is 0 or 1
    # Softmax for guessing word counts is simply doing
    # the above more times, so multiply by the count:
    #   count log softmax(x)
    loss = -F.sum(bow * logprob)
    return loss
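# Illustrative check (an assumption, not part of the source): weighting
# log-probabilities by word counts is the same as summing the log-probability
# of each observation repeated `count` times, as the comment above describes.
import numpy as np
import chainer.functions as F

logits = np.random.randn(1, 5).astype(np.float32)
bow = np.array([[2., 0., 1., 0., 3.]], np.float32)   # word counts per document
logprob = F.log_softmax(logits)

loss_counts = -F.sum(logprob * bow)
loss_repeat = -sum(float(logprob.data[0, w]) * c for w, c in enumerate(bow[0]))
assert np.allclose(loss_counts.data, loss_repeat, atol=1e-5)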
def log_softmax(self, hs):
    """Log_softmax of frame activations.

    Args:
        hs (list of chainer.Variable | N-dimension array):
            Input variable from encoder.

    Returns:
        chainer.Variable: A n-dimension float array.
    """
    y_hat = self.ctc_lo(F.pad_sequence(hs), n_batch_axes=2)
    return F.log_softmax(y_hat.reshape(-1, y_hat.shape[-1])).reshape(
        y_hat.shape)
def output_and_loss(self, concat_logit_block, t_block, batch, length):
    # Output (all together at once for efficiency)
    rebatch, _ = concat_logit_block.shape
    # Make target
    concat_t_block = t_block.reshape((rebatch)).data
    ignore_mask = (concat_t_block >= 0)
    n_token = ignore_mask.sum()
    normalizer = n_token if self.normalize_length else batch
    if not self.use_label_smoothing:
        loss = F.softmax_cross_entropy(concat_logit_block, concat_t_block)
        loss = loss * n_token / normalizer
    else:
        p_lsm = self.lsm_weight
        p_loss = 1. - p_lsm
        log_prob = F.log_softmax(concat_logit_block)
        broad_ignore_mask = self.xp.broadcast_to(ignore_mask[:, None],
                                                 concat_logit_block.shape)
        pre_loss = ignore_mask * \
            log_prob[self.xp.arange(rebatch), concat_t_block]
        loss = -F.sum(pre_loss) / normalizer
        label_smoothing = broad_ignore_mask * \
            - 1. / self.n_target_vocab * log_prob
        label_smoothing = F.sum(label_smoothing) / normalizer
        loss = p_loss * loss + p_lsm * label_smoothing
    accuracy = F.accuracy(concat_logit_block, concat_t_block, ignore_label=-1)

    if self.verbose > 0 and self.char_list is not None:
        with chainer.no_backprop_mode():
            rc_block = F.transpose(
                concat_logit_block.reshape((batch, length, -1)), (0, 2, 1))
            rc_block.to_cpu()
            t_block.to_cpu()
            for (i, y_hat_), y_true_ in zip(enumerate(rc_block.data),
                                            t_block.data):
                if i == MAX_DECODER_OUTPUT:
                    break
                idx_hat = np.argmax(y_hat_[:, y_true_ != -1], axis=0)
                idx_true = y_true_[y_true_ != -1]
                eos_true = np.where(y_true_ == self.eos)[0][0]
                seq_hat = [self.char_list[int(idx)] for idx in idx_hat]
                seq_true = [self.char_list[int(idx)]
                            for idx in idx_true[:eos_true]]
                seq_hat = "".join(seq_hat).replace('<space>', ' ')
                seq_true = "".join(seq_true).replace('<space>', ' ')
                logging.info("groundtruth[%d]: " % i + seq_true)
                logging.info("prediction [%d]: " % i + seq_hat)
    return loss, accuracy
def __call__(self, x, t):
    y_list = self.predictor(x)
    _len, _cls = y_list.shape
    if self.sm_fuse:
        _sm = F.reshape(F.log_softmax(y_list),
                        (self.n_kernel, _len // self.n_kernel, _cls))
        ave_y = F.average(_sm, axis=0)
        loss = - F.average(F.select_item(ave_y, t))
    else:
        loss = F.average(
            F.softmax_cross_entropy(y_list, F.tile(t, self.n_kernel)))
    conf = F.average(
        F.reshape(y_list, (self.n_kernel, _len // self.n_kernel, _cls)),
        axis=0)
    chainer.report(
        {'loss': loss, 'accuracy': F.accuracy(conf, t)}, self)
    return loss
def __call__(self, x_recon, x, enc_hiddens, dec_hiddens, scale=True):
    """
    Parameters
    -----------------
    x_recon: Variable to be reconstructed as label
    x: Variable to be reconstructed as label
    enc_hiddens: list of Variable
    dec_hiddens: list of Variable
    """
    kl_recon_loss = 0

    # Lateral Recon Loss
    if self.rc and enc_hiddens is not None:
        for h0, h1 in zip(enc_hiddens[::-1], dec_hiddens):
            n = h0.shape[0]
            d = np.prod(h0.shape[1:])
            p = F.softmax(h0)
            log_p = F.log_softmax(h0)
            log_q = F.log_softmax(h1)
            l = F.sum(p * (log_p - log_q)) / n / d
            kl_recon_loss += l

    self.loss = kl_recon_loss
    return self.loss
def __call__(self, states, plies, res, ply_num, train=True):
    sum_loss = 0
    for i in range(len(states)):
        x = chainer.Variable(self.xp.array(
            [states[i][j] for j in range(ply_num[i])], 'float32'))
        scores = self.predict(x, train)
        log_prob = F.log_softmax(scores)  # (batch_size, vocab_size)
        loss = 0
        for j in range(ply_num[i]):
            loss += log_prob[j, plies[i][j]] * res[i]
        sum_loss += loss / ply_num[i]
    return - sum_loss / len(states)
def __call__(self, y):
    bs = y.data.shape[0]
    d = np.prod(y.data.shape[1:])
    y_normalized = F.softmax(y)
    y_log_softmax = F.log_softmax(y)
    negentropy = F.sum(y_normalized * y_log_softmax, axis=1) / d
    # zeros = to_device(np.zeros(bs).astype(np.float32), 2)
    ones = to_device(-np.ones(bs).astype(np.float32), 2)
    self.loss = F.sum(F.maximum(Variable(ones), - negentropy)) / bs
    return self.loss
def __call__(self, y):
    bs = y.data.shape[0]
    d = np.prod(y.data.shape[1:])
    y_normalized = F.softmax(y)
    y_log_softmax = F.log_softmax(y)
    negentropy = F.sum(y_normalized * y_log_softmax, axis=1) / d
    # zeros = to_device(np.zeros(bs).astype(np.float32), 2)
    ones = to_device(-np.ones(bs).astype(np.float32), 2)
    self.loss = F.sum(F.maximum(Variable(ones), -negentropy)) / bs
    return self.loss
def get_action(self, z, m):
    # assert m.shape == (1, M_DIM * Krp)
    self.m = m
    state = F.concat((z.data, self.h, m))  # Stop gradients wrt z.
    state = F.tanh(self.pi1(state))
    log_pi = F.log_softmax(self.pi2(state))  # log_softmax may be more stable.
    probs = F.exp(log_pi)[0]
    # avoid "ValueError: sum(pvals[:-1]) > 1.0" in numpy.multinomial
    diff = sum(probs.data[:-1]) - 1
    if diff > 0:
        probs -= (diff + np.finfo(np.float32).epsneg) / (A_DIM - 1)
    a = np.random.multinomial(1, probs.data).astype(np.float32)  # onehot
    return log_pi, a
def forward(self, s):
    # s: batch_size x board_x x board_y
    s = F.reshape(s, (-1, 1, self.board_x, self.board_y))  # batch_size x 1 x board_x x board_y
    s = F.relu(self.bn1(self.conv1(s)))   # batch_size x num_channels x board_x x board_y
    s = F.relu(self.bn2(self.conv2(s)))   # batch_size x num_channels x board_x x board_y
    s = F.relu(self.bn3(self.conv3(s)))   # batch_size x num_channels x (board_x-2) x (board_y-2)
    s = F.relu(self.bn4(self.conv4(s)))   # batch_size x num_channels x (board_x-4) x (board_y-4)
    s = F.reshape(s, (-1, self.args.num_channels * (self.board_x - 4) * (self.board_y - 4)))
    s = F.dropout(F.relu(self.fc_bn1(self.fc1(s))), ratio=self.args.dropout)  # batch_size x 1024
    s = F.dropout(F.relu(self.fc_bn2(self.fc2(s))), ratio=self.args.dropout)  # batch_size x 512

    pi = self.fc3(s)  # batch_size x action_size
    v = self.fc4(s)   # batch_size x 1

    return F.log_softmax(pi, axis=1), F.tanh(v)
def predict(self, xs):
    """
    batch: list of split sentences
    """
    batchsize = len(xs)
    xs = [self.extractor.process(x) for x in xs]
    ws, ss, ps = concat_examples(xs, padding=IGNORE)
    cat_ys, dep_ys = self.forward(ws, ss, ps)
    cat_ys = F.transpose(F.stack(cat_ys, 2), (0, 2, 1))
    dep_ys = F.transpose(F.stack(dep_ys, 2), (0, 2, 1))
    cat_ys = [F.squeeze(y, 0).data[1:len(x) + 1]
              for x, y in zip(xs, F.split_axis(cat_ys, batchsize, 0))]
    dep_ys = [F.squeeze(F.log_softmax(y[1:len(x) + 1, :-1]), 0).data
              for x, y in zip(xs, F.split_axis(dep_ys, batchsize, 0))]
    return cat_ys, dep_ys
def metric(self, model, images, labels):
    batchsize = len(images)
    embeddings = model(images)
    embeddings = F.reshape(embeddings, ((batchsize, -1)))
    shape = embeddings.shape

    metric = 0
    for embedding in embeddings:
        eculideans = F.sum(
            (embeddings - F.broadcast_to(embedding, (batchsize, shape[1])))**2,
            axis=1)
        ratios = -F.log_softmax(F.expand_dims(-eculideans, axis=0))[0]
        weights = F.softmax(F.expand_dims(-eculideans, axis=0))[0]
        metric += F.sum(ratios * weights)

    chainer.report({'metric': metric}, model)
    return metric
def metric(self, model, images, labels):
    xp = cupy.get_array_module(images)
    batchsize = len(images)
    embeddings = model(images)
    embeddings = F.reshape(embeddings, ((batchsize, -1)))
    shape = embeddings.shape

    metric = 0
    for embedding, label in zip(embeddings, labels):
        eculideans = F.sum(
            (embeddings - F.broadcast_to(embedding, (batchsize, shape[1])))**2,
            axis=1)
        ratios = -F.log_softmax(F.expand_dims(-eculideans, axis=0))[0]
        metric += F.sum(ratios[xp.where(labels == label)])

    chainer.report({'metric': metric}, model)
    return metric
def loss(self, x, target):
    xp = chainer.cuda.get_array_module(target)
    logit = F.softmax(x)
    logit = F.clip(logit, x_min=self.eps, x_max=1 - self.eps)

    if self.ls == False:
        loss_ce = F.softmax_cross_entropy(x, target)
    else:
        oh_target = xp.eye(self.class_num)[target]
        ls_target = self.label_smoothing(oh_target, epsilon=0.1, xp=xp)
        loss_ce = -F.sum(F.log_softmax(x) * ls_target) / ls_target.shape[0]

    self.pc = self.pc * 0.95 + F.mean(logit, axis=0).data * 0.05
    k = self.h * self.pc + (1 - self.h)
    gamma = F.log(1 - k) / F.log(1 - self.pc) - 1
    loss_focal = loss_ce * self.alpha * (1 - logit) ** gamma
    return F.mean(loss_focal)
def beam(self, xs, ys, maxlen, beamsize, n_cands, ranking):
    with chainer.no_backprop_mode(), chainer.using_config('train', False):
        h, c, oxs = self.encoder.nstep(xs, reverse=self.reverse)

        # Initialization
        ht = [self.xp.zeros(self.units, 'f').reshape(1, self.units)] \
            if self.feeding else None
        que = [(0.0, [BOS_ID], h, c, ht)]

        # Beam search
        for _ in range(maxlen):
            if all(map(lambda s: s[1][-1] == EOS_ID, que)):
                break
            new_que = []
            for score, seq, h, c, ht in que:
                if seq[-1] == EOS_ID:
                    new_que.append((score, seq, h, c, ht))
                else:
                    # decode
                    w = self.xp.array([seq[-1]], self.xp.int32)
                    if self.use_attn:
                        h, c, o, ht = self.decoder.onestep(w, h, c, oxs, ht)
                    else:
                        h, c, o = self.decoder.onestep(w, h, c)
                    o = -F.log_softmax(o)
                    nbest_ids = get_argnbest(o, beamsize)[0]

                    # calculate the log likelihood of each candidate
                    for index in nbest_ids:
                        new_score = score + float(o[0][index].data)
                        new_seq = seq + [index]
                        new_que.append((new_score, new_seq, h, c, ht))

            # sort the new queue by length-normalized likelihood
            new_que.sort(key=lambda x: x[0] / (len(x[1]) - 1))
            que = new_que[:beamsize]

        # Remove EOS and BOS tags
        hyps = [que[i][1][1:-1] if que[i][1][-1] == EOS_ID else que[i][1][1:]
                for i in range(beamsize)]

        # ranking
        if ranking == 'sbleu':
            hyps = self.sbleu_ranking(hyps, ys)
        return hyps[:n_cands]
def test(net, inputs, test_token_len, beam_width=10):
    xp = net.xp
    from_sentences = []
    sentences = []
    for xs in inputs:
        net.reset_state()
        for raw_x in xs.data:
            x = xp.full((beam_width,), raw_x, dtype=np.int32)
            x = chainer.Variable(x, volatile=True)
            net(x, decode=False, train=False)
        candidates = [(None, [begin_id], 0)]
        for i in six.moves.range(test_token_len):
            next_candidates = []
            current_candidates = []
            x = []
            for sub_state, tokens, likelihood in candidates:
                if tokens[-1] == end_id:
                    continue
                if sub_state is not None:
                    net.set_sub_state(len(x), sub_state)
                current_candidates.append((len(x), tokens, likelihood))
                x.append(tokens[-1])
            x = chainer.Variable(xp.asarray(x, dtype=np.int32), volatile=True)
            y = F.log_softmax(net(x, decode=True, train=False))
            for j, tokens, likelihood in current_candidates:
                sub_state = net.get_sub_state(j)
                token_likelihoods = cuda.to_cpu(y.data[0])
                top_tokens = token_likelihoods.argsort()[-beam_width:]
                next_candidates.extend(
                    [(sub_state, tokens + [j], likelihood + token_likelihoods[j])
                     for j in top_tokens])
            candidates = sorted(next_candidates, key=lambda x: -x[2])[:beam_width]
            if all([candidate[1][-1] == end_id for candidate in candidates]):
                break
        sentences.append(candidates[0][1][1:-1])
    return sentences

    # NOTE: the code below was kept from the original snippet but is
    # unreachable (greedy decoding fallback after the return above).
    for xs in inputs:
        while len(tokens) < test_token_len:
            token_id = chainer.Variable(xp.asarray([token_id], dtype=np.int32),
                                        volatile=True)
            y = net(token_id, decode=True, train=False)
            token_id = int(xp.argmax(y.data[0]))
            if token_id == end_id:
                break
            tokens.append(token_id)
        sentences.append(tokens)
    return sentences
def output_and_loss(self, h_block, t_block):
    batch, units, length = h_block.shape

    # Output (all together at once for efficiency)
    concat_logit_block = seq_func(self.output, h_block,
                                  reconstruct_shape=False)
    rebatch, _ = concat_logit_block.shape

    # Make target
    concat_t_block = t_block.reshape((rebatch))
    ignore_mask = (concat_t_block >= 0)
    n_token = ignore_mask.sum()
    normalizer = n_token  # n_token or batch or 1
    # normalizer = 1

    if not self.use_label_smoothing:
        loss = F.softmax_cross_entropy(concat_logit_block, concat_t_block)
        loss = loss * n_token / normalizer
    else:
        log_prob = F.log_softmax(concat_logit_block)
        broad_ignore_mask = self.xp.broadcast_to(ignore_mask[:, None],
                                                 concat_logit_block.shape)
        pre_loss = ignore_mask * \
            log_prob[self.xp.arange(rebatch), concat_t_block]
        loss = -F.sum(pre_loss) / normalizer

    accuracy = F.accuracy(concat_logit_block, concat_t_block, ignore_label=-1)
    perp = self.xp.exp(loss.data * normalizer / n_token)

    # Report the values
    reporter.report({'loss': loss.data * normalizer / n_token,
                     'acc': accuracy.data,
                     'perp': perp}, self)

    if self.use_label_smoothing:
        label_smoothing = broad_ignore_mask * \
            - 1. / self.n_target_vocab * log_prob
        label_smoothing = F.sum(label_smoothing) / normalizer
        loss = 0.9 * loss + 0.1 * label_smoothing
    return loss
def recognize(self, x_block, recog_args, char_list=None, rnnlm=None):
    '''E2E beam search

    :param ndarray x: input acoustic feature (B, T, D) or (T, D)
    :param namespace recog_args: argument namespace containing options
    :param list char_list: list of characters
    :param torch.nn.Module rnnlm: language model module
    :return: N-best decoding results
    :rtype: list
    '''
    xp = self.xp
    with chainer.no_backprop_mode(), chainer.using_config('train', False):
        ilens = [x_block.shape[0]]
        batch = len(ilens)
        xs, x_mask, ilens = self.encoder(x_block[None, :, :], ilens)
        logging.info('Encoder size: ' + str(xs.shape))
        if recog_args.ctc_weight > 0.0:
            raise NotImplementedError(
                'use joint ctc/transformer decoding. WIP')
        if recog_args.beam_size == 1:
            logging.info('Use greedy search implementation')
            ys = xp.full((1, 1), self.sos)
            score = xp.zeros(1)
            maxlen = xs.shape[1] + 1
            for step in range(maxlen):
                yy_mask = self.make_attention_mask(ys, ys)
                yy_mask *= self.make_history_mask(ys)
                xy_mask = self.make_attention_mask(ys, xp.array(x_mask))
                out = self.decoder(ys, yy_mask, xs, xy_mask).reshape(
                    batch, -1, self.odim)
                prob = F.log_softmax(out[:, -1], axis=-1)
                max_prob = prob.array.max(axis=1)
                next_id = F.argmax(prob, axis=1).array.astype(np.int64)
                score += max_prob
                if step == maxlen - 1:
                    next_id[0] = self.eos
                ys = F.concat((ys, next_id[None, :]), axis=1).data
                if next_id[0] == self.eos:
                    break
            nbest_hyps = [{"score": score, "yseq": ys[0].tolist()}]
        else:
            raise NotImplementedError(
                'use beam search implementation. WIP')
    return nbest_hyps
def log_propensity_independent(self, x, action):
    xp = cuda.get_array_module(action)

    pred = self._predict(x)
    final_action = action
    if self.k > 0 and action.shape[1] < pred.shape[1]:
        all_actions = F.broadcast_to(
            xp.arange(0, pred.shape[1], dtype=action.data.dtype), pred.shape)
        inv_items = inverse_select_items_per_row(all_actions, action)
        items = select_items_per_row(all_actions, action)
        final_action = F.concat((items, inv_items), axis=1)

    pred = select_items_per_row(pred, final_action)
    results = F.log_softmax(pred)
    if self.k > 0:
        results = results[:, :self.k]
    return results
def sample(self, vis_feats, temperature=1, stochastic=True):
    xp = cuda.get_array_module(vis_feats)
    batch_size = vis_feats.shape[0]
    self.LSTM_initialize()

    output = xp.zeros((batch_size, self.seq_length), dtype=xp.int32)
    log_probs = []
    mask = xp.ones(batch_size)

    with chainer.using_config('train', False):
        for i in range(self.seq_length):
            if i == 0:
                sos = self.word_emb(Variable(
                    xp.ones(batch_size, dtype=xp.int32) * (self.vocab_size + 1)))
                _, h = self.LSTM(vis=vis_feats, sos=sos)
            else:
                mask_ = xp.where(w != 0, 1, 0)
                mask *= mask_
                if mask.sum() == 0:
                    break
                w = self.word_emb(Variable(w))
                _, h = self.LSTM(vis=vis_feats, word=w)

            h = self.out(h)
            # if input == eos then mask
            logsoft = F.log_softmax(h) * mask.reshape(
                batch_size, 1).repeat(h.data.shape[1], axis=1)

            if stochastic:
                prob_prev = F.exp(logsoft / temperature)
                prob_prev /= F.broadcast_to(
                    F.sum(prob_prev, axis=1, keepdims=True), prob_prev.shape)
                w = softmax_sample(prob_prev)
            else:
                w = xp.argmax(logsoft.data, axis=1)

            output[:, i] = w
            log_probs.append(
                logsoft[np.arange(batch_size), w].reshape(1, batch_size))

    return output, F.concat(log_probs, axis=0)
def Fissher(self, imageset, shape, gpu, num_samples):
    if gpu >= 0:
        xp = cp
    else:
        xp = np

    num_samples = num_samples
    self.F_accum = []
    for v in range(len(self.var_list)):
        self.F_accum.append(xp.zeros(self.var_list[v].data.shape))

    for i in range(num_samples):
        c, w, h = shape
        x = np.ndarray((1, c, w, h), dtype=np.float32)
        y = np.ndarray((1,), dtype=np.int32)

        rnd = np.random.randint(len(imageset))
        path = imageset[rnd][0]
        label = imageset[rnd][1]
        x[0] = np.array(path)
        y[0] = np.array(label)
        if gpu >= 0:
            x = cuda.to_gpu(x)
            y = cuda.to_gpu(y)
        x = chainer.Variable(x)
        y = chainer.Variable(y)

        probs = F.log_softmax(self.predict(x, y))
        class_ind = np.argmax(cuda.to_cpu(probs.data))
        loss = probs[0, class_ind]

        self.cleargrads()
        loss.backward()
        for v in range(len(self.F_accum)):
            self.F_accum[v] += xp.square(self.var_list[v].grad)

    # divide totals by number of samples
    for v in range(len(self.F_accum)):
        self.F_accum[v] /= num_samples

    print("Fii", self.F_accum[0])
def softmax_cross_entropy(self, y, t):
    import numpy as np

    log_softmax = F.log_softmax(y)
    # SelectItem is not supported by onnx-chainer.
    # TODO(hamaji): Support it?
    # log_prob = F.select_item(log_softmax, t)

    # TODO(hamaji): Currently, F.sum with axis=1 cannot be
    # backpropped properly.
    # log_prob = F.sum(log_softmax * t, axis=1)
    # self.batch_size = chainer.Variable(np.array(t.size, np.float32),
    #                                    name='batch_size')
    # return -F.sum(log_prob, axis=0) / self.batch_size
    log_prob = F.sum(log_softmax * t, axis=(0, 1))
    batch_size = chainer.Variable(np.array(t.shape[0], np.float32),
                                  name='batch_size')
    self.extra_inputs = [batch_size]
    loss = -log_prob / batch_size
    loss.name = 'loss'
    return loss
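# Hedged sanity check (illustrative, not from the source): with one-hot float
# targets, the manual formulation used above matches F.softmax_cross_entropy
# computed on the corresponding integer labels.
import numpy as np
import chainer.functions as F

y = np.random.randn(4, 3).astype(np.float32)
labels = np.array([0, 2, 1, 1], np.int32)
t = np.eye(3, dtype=np.float32)[labels]              # one-hot targets

manual = -F.sum(F.log_softmax(y) * t, axis=(0, 1)) / y.shape[0]
builtin = F.softmax_cross_entropy(y, labels)
assert np.allclose(manual.data, builtin.data, atol=1e-5)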
def compute_fisher(self, dataset):
    fisher_accum_list = [
        np.zeros(var[1].shape) for var in self.variable_list]

    for _ in range(self.num_samples):
        x, _ = dataset[np.random.randint(len(dataset))]
        y = self.predictor(np.array([x]))
        prob_list = F.softmax(y)[0].data
        class_index = np.random.choice(len(prob_list), p=prob_list)
        loss = F.log_softmax(y)[0, class_index]
        self.cleargrads()
        loss.backward()
        for i in range(len(self.variable_list)):
            fisher_accum_list[i] += np.square(
                self.variable_list[i][1].grad)

    self.fisher_list = [
        F_accum / self.num_samples for F_accum in fisher_accum_list]
    return self.fisher_list
def forward(self, inputs, device):
    x, = inputs
    return functions.log_softmax(x, axis=self.axis),
def forward(self):
    x = chainer.Variable(self.x)
    return functions.log_softmax(x)
def f(x):
    return functions.log_softmax(x, self.axis)
def kl_loss(xp, p_logit, q_logit):
    p = F.softmax(p_logit)
    _kl = F.sum(p * (F.log_softmax(p_logit) - F.log_softmax(q_logit)), 1)
    return F.sum(_kl) / xp.prod(xp.array(_kl.shape))
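# Quick numerical check (an assumption for illustration): kl_loss should equal
# the batch-mean KL divergence computed directly from the softmax outputs.
import numpy as np
import chainer.functions as F

p_logit = np.random.randn(8, 5).astype(np.float32)
q_logit = np.random.randn(8, 5).astype(np.float32)
kl = kl_loss(np, p_logit, q_logit)

p = F.softmax(p_logit).data
q = F.softmax(q_logit).data
kl_ref = np.mean(np.sum(p * (np.log(p) - np.log(q)), axis=1))
assert np.allclose(kl.data, kl_ref, atol=1e-5)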
def __call__(self, y):
    y_normalized = F.softmax(y)
    y_log_softmax = F.log_softmax(y)
    n = y.data.shape[0]
    return - F.sum(y_normalized * y_log_softmax) / n
def forward(self):
    x = chainer.Variable(self.x)
    return functions.log_softmax(x, use_cudnn=self.use_cudnn)
def __call__(self, y):
    s = F.softmax(y)
    log_s = F.log_softmax(y)
    N = s.data.shape[0]
    # - * - is + due to maximizing entropy
    return F.sum(s * log_s) / N  # over batch
def log_probs(self):
    return F.log_softmax(self.logits)
def __call__(self, x):
    return F.log_softmax(x, self.use_cudnn)
def entropy(self):
    logli = F.log_softmax(self.logits)
    return F.sum(-logli * F.exp(logli), axis=-1)
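# Illustrative check (assumption, not in the source): with uniform logits over
# K categories the entropy computed as above should be log(K).
import numpy as np
import chainer.functions as F

logits = np.zeros((2, 4), np.float32)        # uniform distribution over 4 classes
logli = F.log_softmax(logits)
entropy = F.sum(-logli * F.exp(logli), axis=-1)
print(entropy.data)                          # ~[1.386, 1.386] == log(4)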