def _gumbel_softmax(self, logits, tau=0.67, eps=1e-10):
    u = layers.uniform_random_batch_size_like(
        logits, shape=[-1, self.latent_type_size], min=0.0, max=1.0)
    u.stop_gradient = True
    # Gumbel noise: -log(-log(u)), with eps added for numerical stability
    gumbel = 0.0 - layers.log(eps - layers.log(u + eps))
    y = logits + gumbel
    return layers.softmax(y / tau)
def gumbel_softmax(input, tau=1, eps=1e-10):
    """Basic implementation of Gumbel-Softmax (dygraph version)."""
    U = fluid.dygraph.to_variable(np.random.rand(*input.shape))
    # U = layers.uniform_random(input.shape, dtype=input.dtype, min=0.0, max=1.0)
    # U.stop_gradient = True
    gumbel = 0.0 - layers.log(eps - layers.log(U + eps))
    y = input + gumbel
    return layers.softmax(y / tau)
def gumbel_softmax(logits, tau=0.67, eps=1e-10):
    """Gumbel softmax."""
    u = layers.uniform_random_batch_size_like(
        logits, shape=[-1, logits.shape[1]], min=0.0, max=1.0)
    u.stop_gradient = True
    gumbel = 0.0 - layers.log(eps - layers.log(u + eps))
    y = logits + gumbel
    return layers.softmax(y / tau)
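# Hedged aside (added; not from the original sources): a minimal NumPy sketch of
# the same Gumbel-Softmax math as the three variants above. As tau -> 0 the
# sample approaches a one-hot vector; larger tau gives a smoother distribution.
import numpy as np

def gumbel_softmax_np(logits, tau=0.67, eps=1e-10):
    u = np.random.rand(*logits.shape)
    gumbel = -np.log(eps - np.log(u + eps))  # same stabilized form as above
    y = logits + gumbel
    e = np.exp((y - y.max(axis=-1, keepdims=True)) / tau)
    return e / e.sum(axis=-1, keepdims=True)

sample = gumbel_softmax_np(np.array([[1.0, 2.0, 0.5]]), tau=0.1)  # nearly one-hot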
def create_model(args, config):
    """Create model for given model configuration."""
    logging.info('building model')
    graph_wrapper = GraphWrapper(
        name="graph",
        node_feat=[('atom_type', [None, 1], "int64"),
                   ('chirality_tag', [None, 1], "int64")],
        edge_feat=[('bond_type', [None, 1], "int64"),
                   ('bond_direction', [None, 1], "int64")])

    # NOTE: [num_nodes, num_graphs], bs = num_graphs
    pos_mask = L.data(name='pos_mask', shape=[-1, args.batch_size], dtype='float32')
    neg_mask = L.data(name='neg_mask', shape=[-1, args.batch_size], dtype='float32')

    encoder = GINEncoder(config)
    global_repr, patch_summary = encoder.forward(graph_wrapper)

    global_D = FF(encoder.embedding_dim)
    local_D = FF(encoder.embedding_dim)
    g_enc = global_D.forward(global_repr)
    l_enc = local_D.forward(patch_summary)

    res = L.matmul(l_enc, g_enc, transpose_y=True)
    E_pos = get_positive_expectation(res * pos_mask, config['measure'], average=False)
    E_pos = L.reduce_sum(E_pos) / graph_wrapper.num_nodes
    E_neg = get_negative_expectation(res * neg_mask, config['measure'], average=False)
    E_neg = L.reduce_sum(E_neg) / (graph_wrapper.num_nodes * (graph_wrapper.num_graph - 1))
    local_global_loss = E_neg - E_pos

    if config['prior']:
        prior_D = PriorDiscriminator(encoder.embedding_dim)
        prior = L.uniform_random([args.batch_size, encoder.embedding_dim], min=0.0, max=1.0)
        term_1 = L.reduce_mean(L.log(prior_D.forward(prior)))
        term_2 = L.reduce_mean(L.log(1.0 - prior_D.forward(global_repr)))
        prior_loss = -(term_1 + term_2) * config['gamma']
    else:
        prior_loss = 0

    total_loss = local_global_loss + prior_loss

    keys = ['loss', 'graph_wrapper', 'encoder', 'graph_emb']
    Agent = namedtuple('Agent', keys)
    return Agent(loss=total_loss,
                 graph_wrapper=graph_wrapper,
                 encoder=encoder,
                 graph_emb=global_repr)
def _collect_metrics(self, inputs, outputs):
    """Calculate loss function by using inputs and outputs."""
    metrics = {}

    tgt_len = layers.reduce_sum(
        layers.reduce_sum(inputs["tgt_mask"], dim=1) - 1)
    tgt_len.stop_gradient = True

    label = inputs["tgt_token"][:, 1:]
    if self.label_smooth > 0:
        one_hot_label = layers.one_hot(label, self.num_token_embeddings)
        smooth_label = layers.label_smooth(
            one_hot_label, epsilon=self.label_smooth, dtype=self._dtype)
        nll = layers.cross_entropy(outputs["dec_pred"], smooth_label,
                                   soft_label=True,
                                   ignore_index=self.padding_idx)
    else:
        nll = layers.cross_entropy(outputs["dec_probs"], label,
                                   ignore_index=self.padding_idx)
    nll = layers.reduce_sum(nll, dim=1)
    token_nll = layers.reduce_sum(nll) / tgt_len
    nll = layers.reduce_mean(nll)
    metrics["nll"] = nll
    metrics["token_nll"] = token_nll
    loss = nll

    if self.num_latent > 0 and self.with_bow:
        bow_probs = F.unsqueeze(outputs["bow_probs"], [1])
        bow_probs = layers.expand(bow_probs, [1, label.shape[1], 1])
        if self.label_smooth > 0:
            bow = layers.cross_entropy(bow_probs, smooth_label,
                                       soft_label=True,
                                       ignore_index=self.padding_idx)
        else:
            bow = layers.cross_entropy(bow_probs, label,
                                       ignore_index=self.padding_idx)
        bow = layers.reduce_sum(bow, dim=1)
        token_bow = layers.reduce_sum(bow) / tgt_len
        bow = layers.reduce_mean(bow)
        metrics["bow"] = bow
        metrics["token_bow"] = token_bow
        loss = loss + bow

    if self.num_latent > 0 and self.use_discriminator:
        dis = 0.0 - (layers.log(outputs["pos_probs"]) +
                     layers.log(1.0 - outputs["neg_probs"]))
        dis = layers.reduce_mean(dis)
        metrics["dis"] = dis
        loss = loss + dis * self.dis_ratio

    metrics["loss"] = loss
    metrics["token_num"] = tgt_len
    return metrics
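# Hedged aside (added; not from the original source): a NumPy sketch of how
# layers.label_smooth above transforms a one-hot target when no prior
# distribution is supplied. With epsilon and K classes, the true class gets
# 1 - epsilon + epsilon/K and every other class gets epsilon/K.
import numpy as np

def label_smooth_np(one_hot, epsilon=0.1):
    k = one_hot.shape[-1]
    return one_hot * (1.0 - epsilon) + epsilon / k

print(label_smooth_np(np.array([0., 0., 1., 0.]), epsilon=0.1))
# -> [0.025 0.025 0.925 0.025]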
def focal_loss(y_predict, y, alpha=0.85, gamma=2, epsilon=1e-6):
    '''
    Larger alpha puts more weight on the foreground (positive) class.
    Larger gamma further down-weights confident (easy) examples, so training
    focuses on the hard ones.
    '''
    y = fluid.layers.clip(y, epsilon, 1 - epsilon)
    y_predict = fluid.layers.clip(y_predict, epsilon, 1 - epsilon)
    return -1 * (alpha * fluid.layers.pow((1 - y_predict), gamma) * y
                 * fluid.layers.log(y_predict)
                 + (1 - alpha) * fluid.layers.pow(y_predict, gamma) * (1 - y)
                 * fluid.layers.log(1 - y_predict))
def focal_loss(pred, label, alpha=0.25, gamma=2, epsilon=1e-6):
    '''
    Larger alpha puts more weight on the foreground (positive) class.
    Larger gamma further down-weights confident (easy) examples, so training
    focuses on the hard ones.
    '''
    pred = layers.clip(pred, epsilon, 1 - epsilon)
    label = layers.clip(label, epsilon, 1 - epsilon)
    loss = -1 * (alpha * layers.pow((1 - pred), gamma) * label * layers.log(pred)
                 + (1 - alpha) * layers.pow(pred, gamma) * (1 - label)
                 * layers.log(1 - pred))
    return loss
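# Hedged aside (added; not from the original sources): the focal-loss formula
# used by the two variants above, in NumPy, to make the alpha/gamma roles
# concrete.
import numpy as np

def focal_loss_np(pred, label, alpha=0.25, gamma=2, epsilon=1e-6):
    pred = np.clip(pred, epsilon, 1 - epsilon)
    return -(alpha * (1 - pred) ** gamma * label * np.log(pred)
             + (1 - alpha) * pred ** gamma * (1 - label) * np.log(1 - pred))

# A confident correct positive (pred=0.9) is heavily down-weighted relative to
# a hard positive (pred=0.1): roughly 0.0003 vs 0.47.
print(focal_loss_np(np.array([0.9, 0.1]), np.array([1.0, 1.0])))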
def sigmoid_focal_loss(self, x, label, fg_num, gamma=2.0, alpha=0.25):
    C = x.shape[1]
    eye = paddle.eye(C + 1, dtype='float32')
    one_hot = L.gather(eye, label)
    pos_mask = one_hot[:, 1:]  # positive-sample mask
    p = L.sigmoid(x)  # [batch_size * num_cells, 80], predicted class probabilities
    pos_loss = pos_mask * (0 - L.log(p + 1e-9)) * L.pow(1 - p, gamma) * alpha
    neg_loss = (1.0 - pos_mask) * (0 - L.log(1 - p + 1e-9)) * L.pow(p, gamma) * (1 - alpha)
    focal_loss = pos_loss + neg_loss
    if fg_num > 0.5:  # when there is no GT (fg_num == 0), skip the division
        focal_loss = focal_loss / fg_num
    return focal_loss
def _grammar_step(self, logits, next_cell_states, decode_states, actions, gmr_mask):
    """Perform one decoding step under grammar constraints.

    Args:
        logits (Variable): shape = [batch_size, beam_size, vocab_size]
        next_cell_states (Variable): NULL
        decode_states (StateWrapper): NULL

    Returns: TODO

    Raises: NULL
    """
    # keep only the token logits that are valid under the grammar
    logits, valid_table_mask = self._output_layer(
        logits, actions, gmr_mask, decode_states.valid_table_mask)

    # initialize vocab size
    self._vocab_size = logits.shape[-1]
    self._vocab_size_tensor = layers.fill_constant(
        shape=[1], dtype='int64', value=logits.shape[-1])

    # compute log probs, masking out finished beams
    step_log_probs = layers.log(layers.softmax(logits))
    step_log_probs = self._mask_finished_probs(step_log_probs, decode_states.finished)

    scores = layers.reshape(step_log_probs, [-1, self._beam_size * self._vocab_size])
    topk_scores, topk_indices = layers.topk(input=scores, k=self._beam_size)
    topk_scores = layers.reshape(topk_scores, shape=[-1])
    topk_indices = layers.reshape(topk_indices, shape=[-1])

    # beam each top-k entry comes from
    beam_indices = layers.elementwise_floordiv(topk_indices, self._vocab_size_tensor)
    # token id of each top-k entry
    token_indices = layers.elementwise_mod(topk_indices, self._vocab_size_tensor)

    # regather step_log_probs according to the top-k sources
    next_log_probs = nn_utils.batch_gather(
        layers.reshape(step_log_probs, [-1, self._beam_size * self._vocab_size]),
        topk_indices)

    def _beam_gather(x, beam_indices):
        """reshape x to beam dim, and gather each beam_indices

        Args:
            x (TYPE): NULL

        Returns: Variable
        """
        x = self.split_batch_beams(x)
        return nn_utils.batch_gather(x, beam_indices)

    next_cell_states = layers.utils.map_structure(
        lambda x: _beam_gather(x, beam_indices), next_cell_states)
    next_finished = _beam_gather(decode_states.finished, beam_indices)
    next_lens = _beam_gather(decode_states.lengths, beam_indices)

    next_lens = layers.elementwise_add(
        next_lens, layers.cast(layers.logical_not(next_finished), next_lens.dtype))
    next_finished = layers.logical_or(
        next_finished, layers.equal(token_indices, self._end_token_tensor))

    decode_output = OutputWrapper(topk_scores, token_indices, beam_indices)
    decode_states = StateWrapper(next_cell_states, next_log_probs,
                                 next_finished, next_lens, valid_table_mask)
    return decode_output, decode_states
def get_embedding(self, num_embeddings, embedding_dim, padding_idx=None):
    """Build sinusoidal embeddings.

    This matches the implementation in tensor2tensor, but differs slightly
    from the description in Section 3.5 of "Attention Is All You Need".
    """
    half_dim = embedding_dim // 2
    emb = math.log(10000.0) / (half_dim - 1)  # requires `import math`
    emb = layers.exp(layers.arange(start=0, end=half_dim, dtype='float32') * -emb)
    # [num_embeddings, embedding_dim // 2]
    emb = layers.unsqueeze(layers.arange(-num_embeddings // 2,
                                         num_embeddings // 2,
                                         dtype='float32'), axis=1) * \
        layers.unsqueeze(emb, axis=0)
    # [num_embeddings, embedding_dim]
    emb = layers.concat([layers.sin(emb), layers.cos(emb)], dim=1)
    if embedding_dim % 2 == 1:
        emb = layers.concat([emb, layers.zeros(shape=(num_embeddings, 1))], dim=1)
    if padding_idx is not None:
        emb[padding_idx, :] = 0
    self.origin_shift = num_embeddings // 2
    return emb
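# Hedged aside (added; not from the original source): the same sinusoidal
# construction in NumPy (tensor2tensor-style), for positions centered around
# zero as above.
import numpy as np

def sinusoidal_np(num_embeddings, embedding_dim):
    half_dim = embedding_dim // 2
    freq = np.exp(np.arange(half_dim) * -(np.log(10000.0) / (half_dim - 1)))
    pos = np.arange(-num_embeddings // 2, num_embeddings // 2, dtype=np.float32)
    angles = pos[:, None] * freq[None, :]
    emb = np.concatenate([np.sin(angles), np.cos(angles)], axis=1)
    if embedding_dim % 2 == 1:
        emb = np.concatenate([emb, np.zeros((num_embeddings, 1))], axis=1)
    return emb  # [num_embeddings, embedding_dim]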
def grow_top_k(step_idx, alive_seq, alive_log_prob, parent_idx):
    pre_ids = alive_seq
    dec_step_emb = layers.embedding(
        input=pre_ids,
        size=[self.tar_vocab_size, self.hidden_size],
        dtype='float32',
        is_sparse=False,
        param_attr=fluid.ParamAttr(
            name='target_embedding',
            initializer=fluid.initializer.UniformInitializer(
                low=-self.init_scale, high=self.init_scale)))

    dec_att_out, new_hidden_array, new_cell_array = decoder_step(
        dec_step_emb, pre_feed, pre_hidden_array, pre_cell_array, enc_memory)

    projection = layers.matmul(dec_att_out, softmax_weight)
    logits = layers.softmax(projection)
    current_log = layers.elementwise_add(x=layers.log(logits),
                                         y=alive_log_prob, axis=0)

    # length penalty: ((5 + step + 1) / 6) ** alpha
    base_1 = layers.cast(step_idx, 'float32') + 6.0
    base_1 /= 6.0
    length_penalty = layers.pow(base_1, alpha)
    # equivalent formulation (unused):
    len_pen = layers.pow(((5. + layers.cast(step_idx + 1, 'float32')) / 6.), alpha)

    current_log = layers.reshape(current_log, shape=[1, -1])
    current_log = current_log / length_penalty
    topk_scores, topk_indices = layers.topk(input=current_log, k=beam_size)
    topk_scores = layers.reshape(topk_scores, shape=[-1])
    topk_log_probs = topk_scores * length_penalty

    generate_id = layers.reshape(topk_indices, shape=[-1]) % self.tar_vocab_size
    selected_beam = layers.reshape(topk_indices, shape=[-1]) // self.tar_vocab_size

    topk_finished = layers.equal(generate_id, eos_ids)
    topk_finished = layers.cast(topk_finished, 'float32')

    generate_id = layers.reshape(generate_id, shape=[-1, 1])
    pre_tokens_list = layers.gather(tokens, selected_beam)
    full_tokens_list = layers.concat([pre_tokens_list, generate_id], axis=1)

    return full_tokens_list, topk_log_probs, topk_scores, topk_finished, \
        selected_beam, generate_id, dec_att_out, new_hidden_array, new_cell_array
def create_loss_op(self, predict, label, epsilon=1e-7):
    """Compute loss with tensor.

    Args:
        predict: model output tensor activated by softmax
        label: a non-sparse tensor

    Returns:
        loss: cross-entropy loss
    """
    if self.loss_type == "nl" and self.model_type == "train":
        one_hot_label = fluid.one_hot(label, depth=predict.shape[-1])
        one_hot_label = FL.squeeze(one_hot_label, axes=[-2])
        # negative learning: use log(1 - p) instead of log(p)
        neg_prob = 1 - predict
        log_neg_prob = FL.log(fluid.layers.clip(neg_prob, min=epsilon, max=1.))
        ce_loss = -1 * log_neg_prob * one_hot_label
        cost = FL.reduce_sum(ce_loss, dim=-1, keep_dim=True)
    else:  # PL or evaluation
        cost = FL.cross_entropy(predict, label)
    loss = FL.mean(cost)
    return loss
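# Hedged aside (added; not from the original source): the negative-learning
# ("nl") branch above minimizes -log(1 - p_y), pushing the probability of a
# possibly noisy label DOWN instead of up. A one-sample NumPy illustration
# (variable names are ours):
import numpy as np

probs = np.array([0.7, 0.2, 0.1])  # softmax output
label = 1
nl_loss = -np.log(1.0 - probs[label])  # negative learning
ce_loss = -np.log(probs[label])        # ordinary cross-entropy, for contrast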
def ce_conf_loss(self, pred_allboxes_conf, labels_pos_mask, labels_neg_mask,
                 class_vectors, labels_pos_cid2, gt_area):
    labels_pos_cid2 = P.reshape(labels_pos_cid2, (-1, ))  # [batch_size*num_priors]
    pred_allboxes_conf_r = P.reshape(
        pred_allboxes_conf,
        (-1, P.shape(pred_allboxes_conf)[2]))  # [batch_size*num_priors, num_classes]
    label_prob = P.gather(
        class_vectors, labels_pos_cid2)  # one-hot mask, (batch_size*num_priors, num_classes)

    pred_prob = P.softmax(pred_allboxes_conf_r)
    pred_prob = P.cast(pred_prob, 'float32')
    prob_loss = label_prob * (0 - P.log(pred_prob + 1e-9))  # tiny constant avoids NaN
    prob_loss = P.reduce_sum(prob_loss, dim=1)

    # keep only the losses of positive and negative samples
    labels_pos_mask2 = P.reshape(labels_pos_mask, (-1, ))  # [batch_size*num_priors]
    labels_neg_mask2 = P.reshape(labels_neg_mask, (-1, ))  # [batch_size*num_priors]
    conf_loss_scale = 2.0 - gt_area  # smaller GT area -> larger weight (small objects matter more)
    conf_loss_scale = P.reshape(conf_loss_scale, (-1, ))  # [batch_size*num_priors]
    prob_pos_loss = prob_loss * labels_pos_mask2 * conf_loss_scale
    prob_neg_loss = prob_loss * labels_neg_mask2
    ce_loss = prob_pos_loss + prob_neg_loss
    ce_loss = P.reduce_sum(ce_loss)
    return ce_loss
def grow_topk(i, logits, alive_seq, alive_log_probs, states):
    logits = layers.reshape(logits, [batch_size, beam_size, -1])
    candidate_log_probs = layers.log(layers.softmax(logits, axis=2))
    log_probs = layers.elementwise_add(candidate_log_probs, alive_log_probs, 0)

    # GNMT length penalty: ((5 + len) / 6) ** alpha
    length_penalty = np.power((5.0 + i + 1.0) / 6.0, alpha)
    curr_scores = log_probs / length_penalty
    flat_curr_scores = layers.reshape(curr_scores, [batch_size, -1])

    topk_scores, topk_ids = layers.topk(flat_curr_scores, k=beam_size * 2)
    topk_log_probs = topk_scores * length_penalty

    topk_beam_index = topk_ids // self.trg_vocab_size
    topk_ids = topk_ids % self.trg_vocab_size

    # use gather as gather_nd, TODO: use gather_nd
    topk_seq = gather_2d_by_gather(alive_seq, topk_beam_index, beam_size, batch_size)
    topk_seq = layers.concat(
        [topk_seq, layers.reshape(topk_ids, topk_ids.shape + [1])], axis=2)
    states = update_states(states, topk_beam_index, beam_size)
    eos = layers.fill_constant(shape=topk_ids.shape, dtype="int64", value=eos_id)
    topk_finished = layers.cast(layers.equal(topk_ids, eos), "float32")

    # topk_seq: [batch_size, 2*beam_size, i+1]
    # topk_log_probs, topk_scores, topk_finished: [batch_size, 2*beam_size]
    return topk_seq, topk_log_probs, topk_scores, topk_finished, states
def chunk_softmax(logits, labels, topk=10):
    after_exp = L.exp(logits)
    out, _ = L.argsort(after_exp, axis=-1)
    # normalize by the sum of the top-k largest exponentials only
    denorm = L.reduce_sum(out[:, -topk:], dim=-1, keep_dim=True)
    probs = after_exp / denorm
    one_hot = F.one_hot(labels, depth=probs.shape[-1])
    loss = -L.reduce_sum(one_hot * L.log(probs)) / logits.shape[0]
    return loss
def beam_search_step(state, logits, eos_id, beam_width, is_first_step, length_penalty):
    """logits.shape == [B*W, V]"""
    _, vocab_size = logits.shape
    bsz, beam_width = state.log_probs.shape

    onehot_eos = L.cast(F.one_hot(L.ones([1], 'int64') * eos_id, vocab_size),
                        'int64')  # [1, V]

    probs = L.log(L.softmax(logits))  # [B*W, V]
    probs = mask_prob(probs, onehot_eos, state.finished)  # [B*W, V]
    allprobs = L.reshape(state.log_probs, [-1, 1]) + probs  # [B*W, V]

    not_finished = 1 - L.reshape(state.finished, [-1, 1])  # [B*W, 1]
    not_eos = 1 - onehot_eos
    length_to_add = not_finished * not_eos  # [B*W, V]
    alllen = L.reshape(state.lengths, [-1, 1]) + length_to_add

    allprobs = L.reshape(allprobs, [-1, beam_width * vocab_size])
    alllen = L.reshape(alllen, [-1, beam_width * vocab_size])
    allscore = hyp_score(allprobs, alllen, length_penalty)

    if is_first_step:
        # first step only considers beam 0
        allscore = L.reshape(allscore, [bsz, beam_width, -1])[:, 0, :]

    scores, idx = L.topk(allscore, k=beam_width)  # [B, W]
    next_beam_id = idx // vocab_size  # [B, W]
    next_word_id = idx % vocab_size

    gather_idx = L.concat([L.where(idx != -1)[:, :1], L.reshape(idx, [-1, 1])], 1)
    next_probs = L.reshape(L.gather_nd(allprobs, gather_idx), idx.shape)
    next_len = L.reshape(L.gather_nd(alllen, gather_idx), idx.shape)

    gather_idx = L.concat(
        [L.where(next_beam_id != -1)[:, :1], L.reshape(next_beam_id, [-1, 1])], 1)
    # gather new beam state according to new beam id
    next_finished = L.reshape(L.gather_nd(state.finished, gather_idx),
                              state.finished.shape)
    #log.debug(gather_idx.numpy())
    #log.debug(state.finished.numpy())
    #log.debug(next_finished.numpy())

    next_finished += L.cast(next_word_id == eos_id, 'int64')
    next_finished = L.cast(next_finished > 0, 'int64')
    #log.debug(next_word_id.numpy())
    #log.debug(next_beam_id.numpy())

    next_state = BeamSearchState(log_probs=next_probs,
                                 lengths=next_len,
                                 finished=next_finished)
    output = BeamSearchOutput(scores=scores,
                              predicted_ids=next_word_id,
                              beam_parent_ids=next_beam_id)
    return output, next_state
def beam_search_step(state, logits, eos_id, beam_width, is_first_step, length_penalty):
    """logits.shape == [B*W, V]"""
    # batch size is 1 in this hub module, so the first dim (bsz * beam_size)
    # equals beam_size
    beam_size, vocab_size = logits.shape
    logits_np = logits.numpy()
    for i in range(beam_size):
        logits_np[i][17963] = 0  # make [UNK] prob = 0
    logits = D.to_variable(logits_np)

    bsz, beam_width = state.log_probs.shape
    onehot_eos = L.cast(F.one_hot(L.ones([1], 'int64') * eos_id, vocab_size),
                        'int64')  # [1, V]

    probs = L.log(L.softmax(logits))  # [B*W, V]
    probs = mask_prob(probs, onehot_eos, state.finished)  # [B*W, V]
    allprobs = L.reshape(state.log_probs, [-1, 1]) + probs  # [B*W, V]

    not_finished = 1 - L.reshape(state.finished, [-1, 1])  # [B*W, 1]
    not_eos = 1 - onehot_eos
    length_to_add = not_finished * not_eos  # [B*W, V]
    alllen = L.reshape(state.lengths, [-1, 1]) + length_to_add

    allprobs = L.reshape(allprobs, [-1, beam_width * vocab_size])
    alllen = L.reshape(alllen, [-1, beam_width * vocab_size])
    allscore = hyp_score(allprobs, alllen, length_penalty)

    if is_first_step:
        # first step only considers beam 0
        allscore = L.reshape(allscore, [bsz, beam_width, -1])[:, 0, :]

    scores, idx = L.topk(allscore, k=beam_width)  # [B, W]
    next_beam_id = idx // vocab_size  # [B, W]
    next_word_id = idx % vocab_size

    gather_idx = L.concat([L.where(idx != -1)[:, :1], L.reshape(idx, [-1, 1])], 1)
    next_probs = L.reshape(L.gather_nd(allprobs, gather_idx), idx.shape)
    next_len = L.reshape(L.gather_nd(alllen, gather_idx), idx.shape)

    gather_idx = L.concat(
        [L.where(next_beam_id != -1)[:, :1], L.reshape(next_beam_id, [-1, 1])], 1)
    # gather new beam state according to new beam id
    next_finished = L.reshape(L.gather_nd(state.finished, gather_idx),
                              state.finished.shape)

    next_finished += L.cast(next_word_id == eos_id, 'int64')
    next_finished = L.cast(next_finished > 0, 'int64')

    next_state = BeamSearchState(log_probs=next_probs,
                                 lengths=next_len,
                                 finished=next_finished)
    output = BeamSearchOutput(scores=scores,
                              predicted_ids=next_word_id,
                              beam_parent_ids=next_beam_id)
    return output, next_state
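# Hedged aside (added; not from the original sources): both step functions
# above flatten [beam, vocab] scores and decompose the flat top-k index back
# into (beam id, word id) via // and %. A tiny NumPy illustration:
import numpy as np

vocab_size = 5
scores = np.array([[0.1, 0.2, 0.9, 0.0, 0.3],   # beam 0
                   [0.8, 0.1, 0.0, 0.7, 0.2]])  # beam 1
flat = scores.reshape(-1)
idx = flat.argsort()[::-1][:2]  # top-2 over beams x vocab
beam_id, word_id = idx // vocab_size, idx % vocab_size
# idx = [2, 5] -> beam_id = [0, 1], word_id = [2, 0]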
def body_func(step_idx, pre_ids, pre_scores, gather_idx, caches, trg_src_attn_bias):
    # gather cell states corresponding to selected parent
    pre_caches = map_structure(
        lambda x: layers.gather(x, index=gather_idx), caches)
    pre_src_attn_bias = layers.gather(trg_src_attn_bias, index=gather_idx)
    pre_pos = layers.elementwise_mul(
        x=layers.fill_constant_batch_size_like(
            input=pre_src_attn_bias,  # can't use lod tensor here
            value=1,
            shape=[-1, 1],
            dtype=pre_ids.dtype),
        y=step_idx,
        axis=0)
    logits = wrap_decoder((pre_ids, pre_pos, None, pre_src_attn_bias),
                          trg_vocab_size, max_in_len, n_layer, n_head,
                          d_key, d_value, d_model, d_inner_hid,
                          prepostprocess_dropout, attention_dropout,
                          relu_dropout, preprocess_cmd, postprocess_cmd,
                          weight_sharing,
                          enc_output=enc_output,
                          caches=pre_caches,
                          bos_idx=bos_idx)
    # intra-beam topK
    topk_scores, topk_indices = layers.topk(
        input=layers.softmax(logits), k=beam_size)
    accu_scores = layers.elementwise_add(x=layers.log(topk_scores),
                                         y=pre_scores,
                                         axis=0)
    # beam_search op uses lod to differentiate branches.
    accu_scores = layers.lod_reset(accu_scores, pre_ids)
    # topK reduction across beams, also contains special handling of
    # ended beams and ended sentences (batch reduction)
    selected_ids, selected_scores, gather_idx = layers.beam_search(
        pre_ids=pre_ids,
        pre_scores=pre_scores,
        ids=topk_indices,
        scores=accu_scores,
        beam_size=beam_size,
        end_id=eos_idx,
        return_parent_idx=True)
    step_idx = layers.increment(x=step_idx, value=1.0, in_place=False)
    layers.array_write(selected_ids, i=step_idx, array=ids)
    layers.array_write(selected_scores, i=step_idx, array=scores)
    return (step_idx, selected_ids, selected_scores, gather_idx,
            pre_caches, pre_src_attn_bias)
def weighed_binary_cross_entropy(y, y_predict, beta=2, epsilon=1e-6):
    """
    Returns the weighted binary cross-entropy (WCE) loss.
    beta is the weight given to the positive class: if positives are scarce,
    set beta > 1 to weight them more heavily than the negative class.
    """
    y = fluid.layers.clip(y, epsilon, 1 - epsilon)
    y_predict = fluid.layers.clip(y_predict, epsilon, 1 - epsilon)
    ylogp = fluid.layers.elementwise_mul(y, fluid.layers.log(y_predict))
    betas = fluid.layers.fill_constant(ylogp.shape, "float32", beta)
    ylogp = fluid.layers.elementwise_mul(betas, ylogp)
    ones = fluid.layers.fill_constant(y_predict.shape, "float32", 1)
    ylogp = fluid.layers.elementwise_add(
        ylogp,
        fluid.layers.elementwise_mul(
            fluid.layers.elementwise_sub(ones, y),
            fluid.layers.log(fluid.layers.elementwise_sub(ones, y_predict))))
    zeros = fluid.layers.fill_constant(y_predict.shape, "float32", 0)
    return fluid.layers.elementwise_sub(zeros, ylogp)
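# Hedged aside (added; not from the original source): the same weighted BCE in
# NumPy. With beta > 1, a misclassified positive costs beta times more than an
# equally misclassified negative.
import numpy as np

def wce_np(y, p, beta=2, epsilon=1e-6):
    p = np.clip(p, epsilon, 1 - epsilon)
    return -(beta * y * np.log(p) + (1 - y) * np.log(1 - p))

print(wce_np(np.array([1.0, 0.0]), np.array([0.3, 0.3])))
# positive at p=0.3 costs 2 * 1.20 = 2.41; negative at p=0.3 costs 0.36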
def _de_sigmoid(x, eps=1e-7):
    # clamp x to [eps, 1/eps]
    x = L.clip(x, eps, 1 / eps)
    # reciprocal minus one: 1/x - 1 = e^(-logit)
    x = 1.0 / x - 1.0
    # clamp e^(-logit) to [eps, 1/eps]
    x = L.clip(x, eps, 1 / eps)
    # take the log, then negate
    x = -L.log(x)
    return x
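# Hedged aside (added; not from the original source): _de_sigmoid is the logit
# function, i.e. the inverse of the sigmoid. A quick NumPy round-trip check
# (eps clamping omitted):
import numpy as np

x = np.array([-3.0, 0.0, 2.5])
p = 1.0 / (1.0 + np.exp(-x))        # sigmoid
recovered = -np.log(1.0 / p - 1.0)  # same algebra as _de_sigmoid
assert np.allclose(recovered, x)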
def focal_conf_loss(self, pred_allboxes_conf, labels_pos_mask, labels_neg_mask,
                    class_vectors, labels_pos_cid2,
                    focal_loss_alpha=0.25, focal_loss_gamma=2):
    labels_pos_cid2 = P.reshape(labels_pos_cid2, (-1, ))  # [batch_size*num_priors]
    pred_allboxes_conf_r = P.reshape(
        pred_allboxes_conf,
        (-1, P.shape(pred_allboxes_conf)[2]))  # [batch_size*num_priors, num_classes]
    label_prob = P.gather(
        class_vectors, labels_pos_cid2)  # one-hot mask, (batch_size*num_priors, num_classes)

    # We can switch to sigmoid activation for training while keeping softmax at
    # inference: if one logit has the largest sigmoid value, it necessarily
    # also has the largest softmax value.
    pred_prob = P.sigmoid(pred_allboxes_conf_r)
    pred_prob = P.cast(pred_prob, 'float32')

    # focal loss
    labels_pos_mask2 = P.reshape(labels_pos_mask, (-1, ))  # [batch_size*num_priors]
    labels_neg_mask2 = P.reshape(labels_neg_mask, (-1, ))  # [batch_size*num_priors]
    prob_pos_loss = label_prob * (0 - P.log(pred_prob + 1e-9)) \
        * focal_loss_alpha * (1.0 - pred_prob) ** focal_loss_gamma
    prob_neg_loss = (1 - label_prob) * (0 - P.log(1 - pred_prob + 1e-9)) \
        * (1.0 - focal_loss_alpha) * pred_prob ** focal_loss_gamma
    focal_loss = prob_pos_loss + prob_neg_loss
    focal_loss = P.reduce_sum(focal_loss, dim=1)
    focal_loss = focal_loss * (labels_pos_mask2 + labels_neg_mask2)
    focal_loss = P.reduce_sum(focal_loss)
    return focal_loss
def log_sum_exp(x):
    """The probability of predicting background is (a_xx are network outputs):
        p = e^(a00-max) / [e^(a00-max) + e^(a01-max) + ... + e^(a80-max)]
    Taking logs:
        ln p = a00 - max - ln[e^(a00-max) + e^(a01-max) + ... + e^(a80-max)]
    Rearranging:
        a00 = ln p + max + ln[e^(a00-max) + e^(a01-max) + ... + e^(a80-max)]
    If the sample really is background, its label is p = 1, so
        a00 = max + ln[e^(a00-max) + e^(a01-max) + ... + e^(a80-max)]
    The network output must approach the right-hand side to predict background.
    """
    x_max = P.reduce_max(x)
    return P.log(P.reduce_sum(P.exp(x - x_max), 1)) + x_max
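# Hedged aside (added; not from the original source): why log_sum_exp subtracts
# the max first. The naive form overflows for large logits; the shifted form
# stays finite.
import numpy as np

x = np.array([1000.0, 1001.0, 1002.0])
# naive: np.log(np.sum(np.exp(x))) -> inf (exp overflows)
m = x.max()
lse = np.log(np.sum(np.exp(x - m))) + m  # 1002.4076..., finite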
def _get_metrics(self, inputs, outputs):
    metrics = super(Plato, self)._get_metrics(inputs, outputs)

    if self.use_bow:
        fc_out = self._calc_bow_logits(outputs["enc_out"], inputs["bow_pos"])
        bow_loss = layers.softmax_with_cross_entropy(
            logits=fc_out, label=inputs["bow_label"])
        mean_bow_loss = layers.mean(bow_loss)
        metrics["bow_loss"] = mean_bow_loss
        metrics["loss"] = metrics["loss"] + mean_bow_loss

    entropy_loss = layers.reduce_sum(
        outputs["post_probs"] * layers.log(outputs["post_probs"]), dim=1)
    mean_entropy_loss = layers.mean(entropy_loss)
    metrics["entropy_loss"] = mean_entropy_loss
    if self.use_entropy:
        metrics["loss"] = metrics["loss"] + mean_entropy_loss
    return metrics
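# Hedged aside (added; not from the original source): entropy_loss above is
# sum(p * log p), the negative entropy of the posterior; minimizing it sharpens
# the distribution. NumPy check on a uniform posterior:
import numpy as np

p = np.array([0.25, 0.25, 0.25, 0.25])
neg_entropy = np.sum(p * np.log(p))  # -1.386...; closer to 0 means sharper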
def func(self, place):
    shape = [2, 3, 7, 9]
    eps = 1e-6
    dtype = np.float64

    x = layers.data('x', shape, False, dtype)
    x.persistable = True
    y = layers.log(x)
    x_arr = np.random.uniform(0.1, 1, shape).astype(dtype)

    gradient_checker.double_grad_check([x], y, x_init=x_arr, place=place, eps=eps)
def func(self, place):
    shape = [2, 3, 7, 9]
    eps = 1e-6
    dtype = np.float64

    x = layers.data('x', shape, False, dtype)
    x.persistable = True
    y = layers.log(x)
    x_arr = np.random.uniform(0.1, 1, shape).astype(dtype)

    gradient_checker.double_grad_check(
        [x], y, x_init=x_arr, place=place, eps=eps)
    fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
    gradient_checker.double_grad_check_for_dygraph(
        self.log_wrapper, [x], y, x_init=x_arr, place=place)
    fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
def _posteriori_network(self, input_mask, embed, batch_size, src_len, tgt_len):
    """Basic posterior network implementation."""
    mask_embed = self.mask_embed
    mask_embed = layers.expand(mask_embed, [batch_size, 1, 1])
    mask_embed = self.embed_layer_norm(mask_embed)
    post_embed = layers.concat([mask_embed, embed], axis=1)

    mask = self._create_mask(input_mask,
                             auto_regressive=not self.bidirectional_context,
                             append_head=True)

    for layer in self.layers:
        post_embed = layer(post_embed, mask, None)

    post_embed = post_embed[:, 0]
    post_logits = self.post_network(post_embed)
    post_probs = layers.softmax(post_logits, axis=-1)
    post_logits = layers.log(post_probs)
    return post_embed, post_probs, post_logits
def pairwise_hinge(self):
    """pairwise model"""
    poi_repr = L.split(self.poi_repr, 2, dim=0)
    pos_repr, neg_repr = poi_repr
    pos_pred = L.cos_sim(self.query_repr, pos_repr)
    neg_pred = L.cos_sim(self.query_repr, neg_repr)

    mode = 'hinge_loss'
    # hinge: max(0, 1 - z); logistic: log(1 + e^(-z)), where z = pos - neg
    if 'hinge_loss' == mode:
        theta_z = L.relu(1 + neg_pred - pos_pred)
    elif 'logistic_loss' == mode:
        theta_z = L.log(1 + L.exp(neg_pred - pos_pred))
    self.loss = L.reduce_mean(theta_z)

    pos_cnt = L.reduce_sum(L.cast(L.greater_than(pos_pred, neg_pred), dtype="float32"))
    neg_cnt = L.reduce_sum(L.cast(L.less_than(pos_pred, neg_pred), dtype="float32"))
    self.order = pos_cnt / (1e-5 + neg_cnt)
    self.metrics = [self.loss, self.order]
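# Hedged aside (added; not from the original source): the two ranking losses
# the mode switch above chooses between, evaluated on a margin z = pos - neg.
import numpy as np

z = np.array([-0.5, 0.0, 0.5, 2.0])  # pos_pred - neg_pred
hinge = np.maximum(0.0, 1.0 - z)     # max(0, 1 - z)
logistic = np.log(1.0 + np.exp(-z))  # log(1 + e^(-z))
# hinge hits exactly zero once the margin exceeds 1; logistic only decays.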
def _decode(self, state):
    """Decoding one time step."""
    # shape: [batch_size, 1, seq_len]
    mask = state["mask"]

    # shape: [batch_size, 1]
    pred_token = state["pred_token"]
    pred_mask = state["pred_mask"]
    pred_pos = state["pred_pos"]
    pred_type = state["pred_type"]
    pred_turn = state["pred_turn"]

    # list of shape(len: num_layers): [batch_size, seq_len, hidden_dim]
    cache = state["cache"]

    pred_embed = self.embedder(pred_token, pred_pos, pred_type, pred_turn)
    pred_embed = self.embed_layer_norm(pred_embed)

    # shape: [batch_size, 1, seq_len + 1]
    mask = layers.concat([mask, 1 - pred_mask], axis=2)

    # shape: [batch_size, 1, hidden_dim]
    for l, layer in enumerate(self.layers):
        pred_embed = layer(pred_embed, mask, cache[f"layer_{l}"])

    # shape: [batch_size, 1, vocab_size]
    if self.two_layer_predictor:
        pred_embed = self.pre_predictor(pred_embed)
    if self.weight_sharing:
        token_embedding = self.embedder.token_embedding.weight
        pred_logits = layers.matmul(x=pred_embed,
                                    y=token_embedding,
                                    transpose_y=True)
    else:
        pred_logits = self.predictor(pred_embed)
    pred_logits = pred_logits[:, 0]
    pred_probs = layers.softmax(pred_logits, axis=1)
    pred_logits = layers.log(pred_probs)

    state["mask"] = mask
    return pred_logits, state
def grow_topk(self, i, logits, alive_seq, alive_log_probs, cache, enc_output, enc_bias):
    """grow_topk"""
    logits = layers.reshape(logits, [self.batch_size, self.beam_size, -1])

    candidate_log_probs = layers.log(layers.softmax(logits, axis=2))
    log_probs = candidate_log_probs + layers.unsqueeze(alive_log_probs, axes=[2])

    # length penalty: ((5 + i + 1) / 6) ** alpha
    base_1 = layers.cast(i, 'float32') + 6.0
    base_1 /= 6.0
    length_penalty = layers.pow(base_1, self.alpha)
    #length_penalty = layers.pow(((5.0 + layers.cast(i+1, 'float32')) / 6.0), self.alpha)

    curr_scores = log_probs / length_penalty
    flat_curr_scores = layers.reshape(
        curr_scores, [self.batch_size, self.beam_size * self.vocab_size])

    topk_scores, topk_ids = layers.topk(flat_curr_scores, k=self.beam_size * 2)
    topk_log_probs = topk_scores * length_penalty

    select_beam_index = topk_ids // self.vocab_size
    select_id = topk_ids % self.vocab_size
    #layers.Print(select_id, message="select_id", summarize=1024)
    #layers.Print(topk_scores, message="topk_scores", summarize=10000000)

    flat_select_beam_index = layers.reshape(select_beam_index, [-1]) \
        + self.gather_top2k_append_index
    topk_seq = layers.gather(alive_seq, flat_select_beam_index)
    topk_seq = layers.reshape(topk_seq, [self.batch_size, 2 * self.beam_size, -1])

    # concat with current ids
    topk_seq = layers.concat([topk_seq, layers.unsqueeze(select_id, axes=[2])], axis=2)
    topk_finished = layers.cast(layers.equal(select_id, self.eos_id), 'float32')

    # gather cache
    self.gather_cache(cache, flat_select_beam_index)

    # topk_seq: [batch_size, 2*beam_size, i+1]
    # topk_log_probs, topk_scores, topk_finished: [batch_size, 2*beam_size]
    return topk_seq, topk_log_probs, topk_scores, topk_finished, cache
def loss_neg_log_of_pos(self, pos_score, neg_score_n, gama=5.0):
    """
    pos_score: batch_size x 1
    neg_score_n: batch_size x n
    """
    # n x batch_size
    neg_score_n = L.transpose(neg_score_n, [1, 0])
    # 1 x batch_size
    pos_score = L.reshape(pos_score, [1, -1])

    exp_pos_score = L.exp(pos_score * gama)
    exp_neg_score_n = L.exp(neg_score_n * gama)

    # (n+1) x batch_size
    pos_neg_score = L.concat([exp_pos_score, exp_neg_score_n], axis=0)
    # 1 x batch_size
    exp_sum = L.reduce_sum(pos_neg_score, dim=0, keep_dim=True)
    # 1 x batch_size
    loss = -1.0 * L.log(exp_pos_score / exp_sum)
    # batch_size
    loss = L.reshape(loss, [-1, 1])
    return loss
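# Hedged aside (added; not from the original source): loss_neg_log_of_pos is a
# softmax cross-entropy over one positive and n negatives, with gama acting as
# an inverse temperature. NumPy equivalent for a single example:
import numpy as np

pos, negs, gama = 0.8, np.array([0.3, 0.1, 0.5]), 5.0
scores = np.concatenate([[pos], negs]) * gama
loss = -np.log(np.exp(scores[0]) / np.exp(scores).sum())
# equivalently: log-sum-exp of all scores minus the positive score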