def build_graph_attn_bias(input_mask, n_head, dtype, slot_seqlen):
    input_shape = L.shape(input_mask)
    input_batch = input_shape[0]
    input_seqlen = input_shape[1]
    num_slot = input_seqlen / slot_seqlen
    num_b = num_slot - 1
    ones = L.ones([num_b], dtype="float32")  # [num_b]
    diag_ones = L.diag(ones)  # [num_b, num_b]
    diag_ones = L.unsqueeze(diag_ones, [1, -1])  # [num_b, 1, num_b, 1]
    diag_ones = L.expand(
        diag_ones, [1, slot_seqlen, 1, slot_seqlen])  # [num_b, seqlen, num_b, seqlen]
    diag_ones = L.reshape(
        diag_ones,
        [1, num_b * slot_seqlen, num_b * slot_seqlen])  # [1, num_b*seqlen, num_b*seqlen]

    graph_attn_bias = L.concat([
        L.ones([1, num_b * slot_seqlen, slot_seqlen], dtype="float32"),
        diag_ones
    ], 2)
    graph_attn_bias = L.concat([
        L.ones([1, slot_seqlen, num_slot * slot_seqlen], dtype="float32"),
        graph_attn_bias
    ], 1)  # [1, seq, seq]

    pad_attn_bias = L.matmul(
        input_mask, input_mask, transpose_y=True)  # [batch, seq, seq]
    attn_bias = graph_attn_bias * pad_attn_bias
    attn_bias = (1. - attn_bias) * -10000.
    attn_bias = L.stack([attn_bias] * n_head, 1)  # [batch, n_head, seq, seq]
    if attn_bias.dtype != dtype:
        attn_bias = L.cast(attn_bias, dtype)
    return attn_bias
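# A minimal NumPy sketch (not part of the module, names are illustrative) of the
# graph mask built above for num_slot=3, slot_seqlen=2: the first slot is visible
# to every position, while the remaining slots only see the first slot and themselves.
import numpy as np

slot_seqlen, num_slot = 2, 3
num_b = num_slot - 1
block_diag = np.kron(np.eye(num_b), np.ones((slot_seqlen, slot_seqlen)))   # [num_b*slot, num_b*slot]
lower = np.concatenate(
    [np.ones((num_b * slot_seqlen, slot_seqlen)), block_diag], axis=1)     # prepend first-slot columns
mask = np.concatenate(
    [np.ones((slot_seqlen, num_slot * slot_seqlen)), lower], axis=0)       # prepend first-slot rows
print(mask)  # 1 = attended, 0 = masked (scaled to -10000 in the function above)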
def greedy_search_infilling(model,
                            q_ids,
                            q_sids,
                            sos_id,
                            eos_id,
                            attn_id,
                            max_encode_len=640,
                            max_decode_len=100,
                            tgt_type_id=3):
    model.eval()
    _, logits, info = model(q_ids, q_sids)
    gen_ids = L.argmax(logits, -1)
    d_batch, d_seqlen = q_ids.shape
    seqlen = L.reduce_sum(L.cast(q_ids != 0, 'int64'), 1, keep_dim=True)
    has_stopped = np.zeros([d_batch], dtype=np.bool_)
    gen_seq_len = np.zeros([d_batch], dtype=np.int64)
    output_ids = []

    past_cache = info['caches']

    cls_ids = L.ones([d_batch], dtype='int64') * sos_id
    attn_ids = L.ones([d_batch], dtype='int64') * attn_id
    ids = L.stack([cls_ids, attn_ids], -1)
    for step in range(max_decode_len):
        bias = gen_bias(q_ids, ids, step)
        pos_ids = D.to_variable(
            np.tile(np.array([[step, step + 1]], dtype=np.int64), [d_batch, 1]))
        pos_ids += seqlen
        _, logits, info = model(ids,
                                L.ones_like(ids) * tgt_type_id,
                                pos_ids=pos_ids,
                                attn_bias=bias,
                                past_cache=past_cache)
        gen_ids = L.argmax(logits, -1)

        past_cached_k, past_cached_v = past_cache
        cached_k, cached_v = info['caches']
        cached_k = [
            L.concat([pk, k[:, :1, :]], 1)
            for pk, k in zip(past_cached_k, cached_k)
        ]  # concat cached
        cached_v = [
            L.concat([pv, v[:, :1, :]], 1)
            for pv, v in zip(past_cached_v, cached_v)
        ]
        past_cache = (cached_k, cached_v)

        gen_ids = gen_ids[:, 1]
        ids = L.stack([gen_ids, attn_ids], 1)

        gen_ids = gen_ids.numpy()
        has_stopped |= (gen_ids == eos_id).astype(np.bool_)
        gen_seq_len += (1 - has_stopped.astype(np.int64))
        output_ids.append(gen_ids.tolist())
        if has_stopped.all():
            break
    output_ids = np.array(output_ids).transpose([1, 0])
    return output_ids
def simple_net(self):
    d0 = layers.data(
        "d0", shape=[10], append_batch_size=False, dtype='float32')
    d1 = layers.data(
        "d1", shape=[10], append_batch_size=False, dtype='float32')
    d2 = layers.data(
        "d2", shape=[10], append_batch_size=False, dtype='float32')
    # fill_constant npu op doesn't support int64
    i = layers.zeros(shape=[1], dtype='int32')
    i = layers.cast(i, 'int64')
    i.stop_gradient = True
    init = layers.zeros(shape=[10], dtype='float32')
    mem_array = layers.array_write(x=init, i=i)
    data_array = layers.array_write(x=d0, i=i)
    i = layers.increment(i)
    layers.array_write(d1, i, array=data_array)
    i = layers.increment(i)
    layers.array_write(d2, i, array=data_array)
    i = layers.zeros(shape=[1], dtype='int32')
    i = layers.cast(i, 'int64')
    i.stop_gradient = True
    array_len = layers.fill_constant(shape=[1], dtype='int32', value=5)
    array_len = layers.cast(array_len, 'int64')
    array_len.stop_gradient = True
    cond = layers.ones(shape=[1], dtype='int32')
    cond = layers.cast(cond, 'bool')
    j = layers.fill_constant(shape=[1], dtype='int32', value=1)
    j = layers.cast(j, 'int64')
    j.stop_gradient = True
    array_len2 = layers.fill_constant(shape=[1], dtype='int32', value=3)
    array_len2 = layers.cast(array_len2, 'int64')
    array_len2.stop_gradient = True
    cond2 = layers.logical_or(x=j, y=array_len2)
    cond2 = layers.ones(shape=[1], dtype='int32')
    cond2 = layers.cast(cond2, 'bool')
    while_op = layers.While(cond=cond)
    while_op2 = layers.While(cond=cond2)
    with while_op.block():
        d = layers.array_read(array=data_array, i=i)
        prev = layers.array_read(array=mem_array, i=i)
        result = layers.sums(input=[d, prev])

        i = layers.increment(x=i, in_place=True)
        layers.array_write(result, i=i, array=mem_array)
        layers.less_than(x=i, y=array_len, cond=cond)

        with while_op2.block():
            d2 = layers.array_read(array=data_array, i=j)
            prev2 = layers.array_read(array=mem_array, i=j)
            result2 = layers.sums(input=[d2, prev2])

            j = layers.increment(x=j, in_place=True)
            layers.array_write(result2, i=j, array=mem_array)
            layers.less_than(x=j, y=array_len2, cond=cond2)

    sum_result = layers.array_read(array=mem_array, i=j)
    loss = layers.mean(sum_result)
    return loss, sum_result
def forward(self, q, k, v, lengths, speaker_embed, start_index,
            force_monotonic=False, prev_coeffs=None, window=None):
    # add position encoding as an inductive bias
    if self.has_bias:  # multi-speaker model
        omega_q = 2 * F.sigmoid(
            F.squeeze(self.q_pos_affine(speaker_embed), axes=[-1]))
        omega_k = 2 * self.omega_initial * F.sigmoid(
            F.squeeze(self.k_pos_affine(speaker_embed), axes=[-1]))
    else:  # single-speaker case
        batch_size = q.shape[0]
        omega_q = F.ones((batch_size, ), dtype="float32")
        omega_k = F.ones((batch_size, ), dtype="float32") * self.omega_default

    q += self.position_encoding_weight * positional_encoding(q, start_index, omega_q)
    k += self.position_encoding_weight * positional_encoding(k, 0, omega_k)

    q, k, v = self.q_affine(q), self.k_affine(k), self.v_affine(v)
    activations = F.matmul(q, k, transpose_y=True)
    activations /= np.sqrt(self.attention_dim)

    if self.training:
        # mask the <pad> parts from the encoder
        mask = F.sequence_mask(lengths, dtype="float32")
        attn_bias = F.scale(1. - mask, -1000)
        activations += F.unsqueeze(attn_bias, [1])
    elif force_monotonic:
        assert window is not None
        backward_step, forward_step = window
        T_enc = k.shape[1]
        batch_size, T_dec, _ = q.shape  # actually T_dec = 1 here
        alpha = F.fill_constant((batch_size, T_dec), value=0, dtype="int64") \
            if prev_coeffs is None \
            else F.argmax(prev_coeffs, axis=-1)
        backward = F.sequence_mask(alpha - backward_step, maxlen=T_enc, dtype="bool")
        forward = F.sequence_mask(alpha + forward_step, maxlen=T_enc, dtype="bool")
        mask = F.cast(F.logical_xor(backward, forward), "float32")
        # print("mask's shape:", mask.shape)
        attn_bias = F.scale(1. - mask, -1000)
        activations += attn_bias

    # softmax
    coefficients = F.softmax(activations, axis=-1)
    # context vector
    coefficients = F.dropout(coefficients, 1. - self.keep_prob,
                             dropout_implementation='upscale_in_train')
    contexts = F.matmul(coefficients, v)
    # context normalization
    enc_lengths = F.cast(F.unsqueeze(lengths, axes=[1, 2]), "float32")
    contexts *= F.sqrt(enc_lengths)
    # out affine
    contexts = self.out_affine(contexts)
    return contexts, coefficients
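# Hedged NumPy sketch (illustrative values only) of the force_monotonic window above:
# with the previous alignment at position alpha, only encoder positions in
# [alpha - backward_step, alpha + forward_step) remain visible (mask == 1).
import numpy as np

T_enc, alpha, backward_step, forward_step = 8, 3, 1, 3
backward = np.arange(T_enc) < (alpha - backward_step)   # sequence_mask(alpha - backward_step)
forward = np.arange(T_enc) < (alpha + forward_step)     # sequence_mask(alpha + forward_step)
window = np.logical_xor(backward, forward).astype(np.float32)
print(window)  # [0. 0. 1. 1. 1. 1. 0. 0.]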
def forward(self, x, mask_in=None):
    assert len(x.shape) == 4
    if mask_in is not None or self.last_size != tuple(x.shape):
        self.last_size = tuple(x.shape)

        with dg.no_grad():
            if self.weight_maskUpdater.dtype != x.dtype:
                self.weight_maskUpdater = self.weight_maskUpdater.astype(x.dtype)

            if mask_in is None:
                # If mask is not provided, create a mask.
                if self.multi_channel:
                    mask = L.ones(x.shape, dtype=x.dtype)
                else:
                    mask = L.ones((1, 1, x.shape[2], x.shape[3]), dtype=x.dtype)
            else:
                mask = mask_in

            self.update_mask = nn.functional.conv2d(mask,
                                                    self.weight_maskUpdater,
                                                    bias=None,
                                                    stride=self.stride,
                                                    padding=self.padding,
                                                    dilation=self.dilation,
                                                    groups=1)

            # For mixed precision training, eps from 1e-8 ~ 1e-6
            eps = 1e-6
            self.mask_ratio = self.slide_winsize / (self.update_mask + eps)
            self.update_mask = L.clamp(self.update_mask, 0, 1)
            self.mask_ratio = self.mask_ratio * self.update_mask

    raw_out = super(PartialConv2D, self).forward(
        x * mask if mask_in is not None else x)

    if self.bias is not None:
        bias_view = L.reshape(self.bias, (1, self.out_channels, 1, 1))
        output = (raw_out - bias_view) * self.mask_ratio + bias_view
        output = output * self.update_mask
    else:
        output = raw_out * self.mask_ratio

    if self.return_mask:
        return output, self.update_mask
    else:
        return output
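# Hedged NumPy illustration (toy numbers, not the layer's code path) of the
# renormalisation used above: a window's output is rescaled by
# slide_winsize / (number of valid mask pixels under that window).
import numpy as np

kernel = np.ones((3, 3), dtype=np.float32)
slide_winsize = kernel.size                              # 9
mask_patch = np.array([[1, 1, 0],
                       [1, 1, 0],
                       [0, 0, 0]], dtype=np.float32)     # 4 valid pixels under the window
valid = (kernel * mask_patch).sum()                      # what the mask convolution computes
mask_ratio = slide_winsize / (valid + 1e-6)              # ~2.25
update_mask = np.clip(valid, 0, 1)                       # window stays "valid" if any pixel is
print(mask_ratio * update_mask)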
def beam_search_step(state, logits, eos_id, beam_width, is_first_step,
                     length_penalty):
    """logits.shape == [B*W, V]"""
    _, vocab_size = logits.shape

    bsz, beam_width = state.log_probs.shape
    onehot_eos = L.cast(
        F.one_hot(L.ones([1], 'int64') * eos_id, vocab_size), 'int64')  # [1, V]

    probs = L.log(L.softmax(logits))  # [B*W, V]
    probs = mask_prob(probs, onehot_eos, state.finished)  # [B*W, V]
    allprobs = L.reshape(state.log_probs, [-1, 1]) + probs  # [B*W, V]

    not_finished = 1 - L.reshape(state.finished, [-1, 1])  # [B*W, 1]
    not_eos = 1 - onehot_eos
    length_to_add = not_finished * not_eos  # [B*W, V]
    alllen = L.reshape(state.lengths, [-1, 1]) + length_to_add

    allprobs = L.reshape(allprobs, [-1, beam_width * vocab_size])
    alllen = L.reshape(alllen, [-1, beam_width * vocab_size])
    allscore = hyp_score(allprobs, alllen, length_penalty)

    if is_first_step:
        allscore = L.reshape(
            allscore,
            [bsz, beam_width, -1])[:, 0, :]  # first step only considers beam 0
    scores, idx = L.topk(allscore, k=beam_width)  # [B, W]
    next_beam_id = idx // vocab_size  # [B, W]
    next_word_id = idx % vocab_size

    gather_idx = L.concat(
        [L.where(idx != -1)[:, :1], L.reshape(idx, [-1, 1])], 1)
    next_probs = L.reshape(L.gather_nd(allprobs, gather_idx), idx.shape)
    next_len = L.reshape(L.gather_nd(alllen, gather_idx), idx.shape)

    gather_idx = L.concat(
        [L.where(next_beam_id != -1)[:, :1],
         L.reshape(next_beam_id, [-1, 1])], 1)
    next_finished = L.reshape(
        L.gather_nd(state.finished, gather_idx),
        state.finished.shape)  # gather new beam state according to new beam id

    #log.debug(gather_idx.numpy())
    #log.debug(state.finished.numpy())
    #log.debug(next_finished.numpy())

    next_finished += L.cast(next_word_id == eos_id, 'int64')
    next_finished = L.cast(next_finished > 0, 'int64')

    #log.debug(next_word_id.numpy())
    #log.debug(next_beam_id.numpy())

    next_state = BeamSearchState(log_probs=next_probs,
                                 lengths=next_len,
                                 finished=next_finished)
    output = BeamSearchOutput(scores=scores,
                              predicted_ids=next_word_id,
                              beam_parent_ids=next_beam_id)

    return output, next_state
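# Hedged toy example of the index arithmetic above: a flat top-k index over the
# [beam_width * vocab_size] axis splits into a parent beam id (//) and a word id (%).
import numpy as np

beam_width, vocab_size = 2, 5
flat_idx = np.array([7, 3])        # pretend top-k indices for one batch element
print(flat_idx // vocab_size)      # parent beam ids -> [1 0]
print(flat_idx % vocab_size)       # word ids        -> [2 3]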
def beam_search_step(state, logits, eos_id, beam_width, is_first_step,
                     length_penalty):
    """logits.shape == [B*W, V]"""
    beam_size, vocab_size = logits.shape
    # as batch_size = 1 in this hub module, the first dim (bsz * beam_size) equals beam_size
    logits_np = logits.numpy()
    for i in range(beam_size):
        logits_np[i][17963] = 0  # make [UNK] prob = 0
    logits = D.to_variable(logits_np)

    bsz, beam_width = state.log_probs.shape
    onehot_eos = L.cast(
        F.one_hot(L.ones([1], 'int64') * eos_id, vocab_size), 'int64')  # [1, V]

    probs = L.log(L.softmax(logits))  # [B*W, V]
    probs = mask_prob(probs, onehot_eos, state.finished)  # [B*W, V]
    allprobs = L.reshape(state.log_probs, [-1, 1]) + probs  # [B*W, V]

    not_finished = 1 - L.reshape(state.finished, [-1, 1])  # [B*W, 1]
    not_eos = 1 - onehot_eos
    length_to_add = not_finished * not_eos  # [B*W, V]
    alllen = L.reshape(state.lengths, [-1, 1]) + length_to_add

    allprobs = L.reshape(allprobs, [-1, beam_width * vocab_size])
    alllen = L.reshape(alllen, [-1, beam_width * vocab_size])
    allscore = hyp_score(allprobs, alllen, length_penalty)

    if is_first_step:
        allscore = L.reshape(
            allscore,
            [bsz, beam_width, -1])[:, 0, :]  # first step only considers beam 0
    scores, idx = L.topk(allscore, k=beam_width)  # [B, W]
    next_beam_id = idx // vocab_size  # [B, W]
    next_word_id = idx % vocab_size

    gather_idx = L.concat(
        [L.where(idx != -1)[:, :1], L.reshape(idx, [-1, 1])], 1)
    next_probs = L.reshape(L.gather_nd(allprobs, gather_idx), idx.shape)
    next_len = L.reshape(L.gather_nd(alllen, gather_idx), idx.shape)

    gather_idx = L.concat(
        [L.where(next_beam_id != -1)[:, :1],
         L.reshape(next_beam_id, [-1, 1])], 1)
    next_finished = L.reshape(
        L.gather_nd(state.finished, gather_idx),
        state.finished.shape)  # gather new beam state according to new beam id

    next_finished += L.cast(next_word_id == eos_id, 'int64')
    next_finished = L.cast(next_finished > 0, 'int64')

    next_state = BeamSearchState(log_probs=next_probs,
                                 lengths=next_len,
                                 finished=next_finished)
    output = BeamSearchOutput(scores=scores,
                              predicted_ids=next_word_id,
                              beam_parent_ids=next_beam_id)

    return output, next_state
def __init__(self, *args, multi_channel=False, return_mask=True, **kwargs):
    # whether the mask is multi-channel or not
    self.multi_channel = multi_channel
    self.return_mask = return_mask
    super(PartialConv2D, self).__init__(*args, **kwargs)

    if self.multi_channel:
        self.weight_maskUpdater = L.ones(
            (self.out_channels, self.in_channels,
             self.kernel_size[0], self.kernel_size[1]))
    else:
        self.weight_maskUpdater = L.ones(
            (1, 1, self.kernel_size[0], self.kernel_size[1]))

    shape = self.weight_maskUpdater.shape
    self.slide_winsize = shape[1] * shape[2] * shape[3]

    self.last_size = (None, None, None, None)
    self.update_mask = None
    self.mask_ratio = None
    self.partial_conv = True
def _build_sentence_ids(self, src_ids):
    src_shape = L.shape(src_ids)
    src_seqlen = src_shape[1]
    src_batch = src_shape[0]

    slot_seqlen = self.slot_seqlen

    zeros = L.zeros([src_batch, slot_seqlen], "int64")
    ones = L.ones([src_batch, src_seqlen - slot_seqlen], "int64")
    sentence_ids = L.concat([zeros, ones], 1)
    sentence_ids.stop_gradient = True
    return sentence_ids
def partial_trace(rho_AB, dim1, dim2, A_or_B):
    r"""Compute the partial trace of a quantum state.

    Args:
        rho_AB (ComplexVariable): the input quantum state
        dim1 (int): dimension of subsystem A
        dim2 (int): dimension of subsystem B
        A_or_B (int): 1 or 2; 1 traces out subsystem A, 2 traces out subsystem B

    Returns:
        ComplexVariable: the partial trace of the quantum state
    """
    if A_or_B == 2:
        dim1, dim2 = dim2, dim1

    idty_np = identity(dim2).astype("complex128")
    idty_B = to_variable(idty_np)

    zero_np = np_zeros([dim2, dim2], "complex128")
    res = to_variable(zero_np)

    for dim_j in range(dim1):
        row_top = pp_zeros([1, dim_j], dtype="float64")
        row_mid = ones([1, 1], dtype="float64")
        row_bot = pp_zeros([1, dim1 - dim_j - 1], dtype="float64")
        bra_j_re = concat([row_top, row_mid, row_bot], axis=1)
        bra_j_im = pp_zeros([1, dim1], dtype="float64")
        bra_j = ComplexVariable(bra_j_re, bra_j_im)

        if A_or_B == 1:
            row_tmp = pp_kron(bra_j, idty_B)
            res = elementwise_add(
                res,
                matmul(
                    matmul(row_tmp, rho_AB),
                    pp_transpose(
                        ComplexVariable(row_tmp.real, -row_tmp.imag),
                        perm=[1, 0]),
                ),
            )

        if A_or_B == 2:
            row_tmp = pp_kron(idty_B, bra_j)
            res = elementwise_add(
                res,
                matmul(
                    matmul(row_tmp, rho_AB),
                    pp_transpose(
                        ComplexVariable(row_tmp.real, -row_tmp.imag),
                        perm=[1, 0]),
                ),
            )

    return res
def compute_neuron_head_importance(args, model, dev_ds, place, model_cfg):
    n_layers, n_heads = model_cfg['num_hidden_layers'], model_cfg[
        'num_attention_heads']
    head_importance = L.zeros(shape=[n_layers, n_heads], dtype='float32')
    head_mask = L.ones(shape=[n_layers, n_heads], dtype='float32')
    head_mask.stop_gradient = False

    intermediate_weight = []
    intermediate_bias = []
    output_weight = []

    for name, w in model.named_parameters():
        if 'ffn.i' in name:
            if len(w.shape) > 1:
                intermediate_weight.append(w)
            else:
                intermediate_bias.append(w)

        if 'ffn.o' in name:
            if len(w.shape) > 1:
                output_weight.append(w)

    neuron_importance = []
    for w in intermediate_weight:
        neuron_importance.append(np.zeros(shape=[w.shape[1]], dtype='float32'))

    eval_task_names = ('mnli', 'mnli-mm') if args.task == 'mnli' else (args.task, )

    for eval_task in eval_task_names:
        for batch in dev_ds.start(place):
            ids, sids, label = batch
            out = model(ids,
                        sids,
                        labels=label,
                        head_mask=head_mask,
                        num_layers=model_cfg['num_hidden_layers'])
            loss = out[0]
            loss.backward()
            head_importance += L.abs(FD.to_variable(head_mask.gradient()))

            for w1, b1, w2, current_importance in zip(intermediate_weight,
                                                      intermediate_bias,
                                                      output_weight,
                                                      neuron_importance):
                current_importance += np.abs(
                    (np.sum(w1.numpy() * w1.gradient(), axis=0) +
                     b1.numpy() * b1.gradient()))
                current_importance += np.abs(
                    np.sum(w2.numpy() * w2.gradient(), axis=1))

    return head_importance, neuron_importance
def gen_bias(encoder_inputs, decoder_inputs, step):
    decoder_bsz, decoder_seqlen = decoder_inputs.shape[:2]
    attn_bias = L.reshape(
        L.range(0, decoder_seqlen, 1, dtype='float32') + 1, [1, -1, 1])
    decoder_bias = L.cast(
        (L.matmul(attn_bias, 1. / attn_bias, transpose_y=True) >= 1.),
        'float32')  # [1, decoderlen, decoderlen]
    encoder_bias = L.unsqueeze(
        L.cast(L.ones_like(encoder_inputs), 'float32'), [1])  # [bsz, 1, encoderlen]
    encoder_bias = L.expand(
        encoder_bias, [1, decoder_seqlen, 1])  # [bsz, decoderlen, encoderlen]
    decoder_bias = L.expand(
        decoder_bias, [decoder_bsz, 1, 1])  # [bsz, decoderlen, decoderlen]
    if step > 0:
        bias = L.concat([
            encoder_bias,
            L.ones([decoder_bsz, decoder_seqlen, step], 'float32'),
            decoder_bias
        ], -1)
    else:
        bias = L.concat([encoder_bias, decoder_bias], -1)
    return bias
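# Hedged NumPy sketch (toy sizes) of the bias layout gen_bias builds at step 0:
# every decoder slot sees the whole encoder, plus a lower-triangular causal mask
# over the two decoder slots.
import numpy as np

encoder_len, decoder_len = 4, 2
encoder_bias = np.ones((decoder_len, encoder_len))
decoder_bias = np.tril(np.ones((decoder_len, decoder_len)))   # matches the matmul(range, 1/range) >= 1 trick
print(np.concatenate([encoder_bias, decoder_bias], axis=1))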
def partial_trace(rho_AB, dim1, dim2, A_or_B):
    r"""Compute the partial trace of the composite system AB.

    Args:
        rho_AB (Variable): density matrix of the composite system AB
        dim1 (int): dimension of subsystem A
        dim2 (int): dimension of subsystem B
        A_or_B (int): 1 traces out subsystem A, 2 traces out subsystem B

    Returns:
        ComplexVariable: the resulting partial trace
    """
    # dim_total = dim1 * dim2
    if A_or_B == 2:
        dim1, dim2 = dim2, dim1

    idty_np = identity(dim2).astype("complex64")
    idty_B = to_variable(idty_np)

    zero_np = np_zeros([dim2, dim2], "complex64")
    res = to_variable(zero_np)

    for dim_j in range(dim1):
        row_top = pp_zeros([1, dim_j], dtype="float32")
        row_mid = ones([1, 1], dtype="float32")
        row_bot = pp_zeros([1, dim1 - dim_j - 1], dtype="float32")
        bra_j_re = concat([row_top, row_mid, row_bot], axis=1)
        bra_j_im = pp_zeros([1, dim1], dtype="float32")
        bra_j = ComplexVariable(bra_j_re, bra_j_im)

        if A_or_B == 1:
            row_tmp = pp_kron(bra_j, idty_B)
            res = elementwise_add(
                res,
                matmul(
                    matmul(row_tmp, rho_AB),
                    pp_transpose(
                        ComplexVariable(row_tmp.real, -row_tmp.imag),
                        perm=[1, 0]),
                ),
            )

        if A_or_B == 2:
            row_tmp = pp_kron(idty_B, bra_j)
            res += matmul(
                matmul(row_tmp, rho_AB),
                pp_transpose(
                    ComplexVariable(row_tmp.real, -row_tmp.imag),
                    perm=[1, 0]),
            )

    return res
def get_dec_attn_key_pad_mask(seq_k, num_head, dtype):
    ''' For masking out the padding part of key sequence. '''
    # Expand to fit the shape of key query attention matrix.
    padding_mask = layers.cast(seq_k == 0, dtype=dtype)
    padding_mask = layers.unsqueeze(padding_mask, axes=[1])
    len_k = seq_k.shape[1]
    triu = layers.triu(
        layers.ones(shape=[len_k, len_k], dtype=dtype), diagonal=1)
    padding_mask = padding_mask + triu
    padding_mask = layers.cast(padding_mask != 0, dtype=dtype) * -1e30  # * (-2**32 + 1)
    padding_mask = layers.expand(padding_mask, [num_head, 1, 1])
    return padding_mask
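# Hedged NumPy sketch of the combined mask above: key padding (seq_k == 0) OR'd
# with an upper-triangular future mask, then scaled to a large negative bias.
import numpy as np

seq_k = np.array([[5, 9, 0]])                          # one sequence whose last token is padding
pad = (seq_k == 0).astype(np.float32)[:, None, :]      # [1, 1, len_k]
future = np.triu(np.ones((3, 3), dtype=np.float32), k=1)
bias = ((pad + future) != 0).astype(np.float32) * -1e30
print(bias[0])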
def gradient_penalty(x, y, f):
    # interpolation: sample points on the line between real (x) and fake (y) samples
    # (x and y are assumed to be dygraph Variables of the same shape)
    shape = [x.shape[0]] + [1] * (len(x.shape) - 1)
    alpha = layers.rand(shape)
    z = x + alpha * (y - x)
    z.stop_gradient = False

    # gradient penalty: push the gradient norm of f at z towards 1
    o = f(z)
    g = dygraph.grad(o, z, grad_outputs=layers.ones_like(o), create_graph=True)[0]
    g = layers.reshape(g, [z.shape[0], -1])
    g_norm = layers.sqrt(layers.reduce_sum(layers.square(g), dim=1))
    gp = layers.reduce_mean((g_norm - 1.) ** 2)
    return gp
def __init__(self, x, y, y_aux, cfg):
    self.program = fluid.default_main_program().clone()
    with fluid.program_guard(self.program):
        model = ACGAN(cfg.latent_size, cfg.num_classes)
        self.fake, self.aux = model.network_d(x, name='d')
        self.fake_loss = layers.sigmoid_cross_entropy_with_logits(
            x=self.fake, label=y)
        self.aux_loss = layers.softmax_with_cross_entropy(
            logits=self.aux, label=y_aux)
        self.unweighted_loss = layers.reduce_sum(self.fake_loss + self.aux_loss)
        self.infer_program = self.program.clone(for_test=True)

        # we don't want the discriminator to also maximize the classification
        # accuracy of the auxiliary classifier on generated images, so we
        # don't train discriminator to produce class labels for generated
        # images (see https://openreview.net/forum?id=rJXTf9Bxg).
        # To preserve sum of sample weights for the auxiliary classifier,
        # we assign sample weight of 2 to the real images.
        fake_loss_weight = layers.ones(
            shape=[cfg.batch_size * 2, 1], dtype='float32')
        aux_loss_weight_zeros = layers.zeros(
            shape=[cfg.batch_size, 1], dtype='float32')
        aux_loss_weight_twos = layers.fill_constant(
            shape=[cfg.batch_size, 1], value=2.0, dtype='float32')
        aux_loss_weight = layers.concat(
            [aux_loss_weight_twos, aux_loss_weight_zeros])
        self.fake_loss = layers.elementwise_mul(self.fake_loss, fake_loss_weight)
        self.aux_loss = layers.elementwise_mul(self.aux_loss, aux_loss_weight)
        self.loss = layers.reduce_sum(self.fake_loss) + layers.reduce_sum(
            self.aux_loss)

        vars = []
        for var in self.program.list_vars():
            if fluid.io.is_parameter(var) and (var.name.startswith("d")):
                vars.append(var.name)

        optimizer = fluid.optimizer.Adam(
            learning_rate=cfg.adam_lr, beta1=cfg.adam_beta_1, name="net_d")
        optimizer.minimize(self.loss, parameter_list=vars)
def partial_trace(rho_AB, dim1, dim2, A_or_B):
    """
    :param rho_AB: the input density matrix
    :param dim1: dimension for system A
    :param dim2: dimension for system B
    :param A_or_B: 1 or 2, choose the system that you want to trace out
    :return: partial trace
    """
    # dim_total = dim1 * dim2
    if A_or_B == 2:
        dim1, dim2 = dim2, dim1

    idty_np = identity(dim2).astype("complex64")
    idty_B = to_variable(idty_np)

    zero_np = np_zeros([dim2, dim2], "complex64")
    res = to_variable(zero_np)

    for dim_j in range(dim1):
        row_top = pp_zeros([1, dim_j], dtype="float32")
        row_mid = ones([1, 1], dtype="float32")
        row_bot = pp_zeros([1, dim1 - dim_j - 1], dtype="float32")
        bra_j_re = concat([row_top, row_mid, row_bot], axis=1)
        bra_j_im = pp_zeros([1, dim1], dtype="float32")
        bra_j = ComplexVariable(bra_j_re, bra_j_im)

        if A_or_B == 1:
            row_tmp = pp_kron(bra_j, idty_B)
            res = elementwise_add(
                res,
                matmul(
                    matmul(row_tmp, rho_AB),
                    pp_transpose(
                        ComplexVariable(row_tmp.real, -row_tmp.imag),
                        perm=[1, 0]),
                ),
            )

        if A_or_B == 2:
            row_tmp = pp_kron(idty_B, bra_j)
            res += matmul(
                matmul(row_tmp, rho_AB),
                pp_transpose(
                    ComplexVariable(row_tmp.real, -row_tmp.imag),
                    perm=[1, 0]),
            )

    return res
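# Hedged NumPy cross-check (independent of Paddle) of what partial_trace computes:
# tracing out either qubit of a Bell state yields the maximally mixed state I/2.
import numpy as np

bell = np.array([1, 0, 0, 1], dtype=np.complex64) / np.sqrt(2)
rho_AB = np.outer(bell, bell.conj()).reshape(2, 2, 2, 2)   # indices (a, b, a', b')
rho_B = np.einsum('abac->bc', rho_AB)                      # trace out A (A_or_B == 1)
rho_A = np.einsum('abcb->ac', rho_AB)                      # trace out B (A_or_B == 2)
print(rho_A.real)   # [[0.5, 0. ], [0. , 0.5]]
print(rho_B.real)   # [[0.5, 0. ], [0. , 0.5]]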
def __init__(self, num_classes, matcher, weight_dict, eos_coef, losses):
    """ Create the criterion.
    Parameters:
        num_classes: number of object categories, omitting the special no-object category
        matcher: module able to compute a matching between targets and proposals
        weight_dict: dict containing as key the names of the losses and as values their relative weight.
        eos_coef: relative classification weight applied to the no-object category
        losses: list of all the losses to be applied. See get_loss for list of available losses.
    """
    super().__init__()
    self.num_classes = num_classes
    self.matcher = matcher
    self.weight_dict = weight_dict
    self.eos_coef = eos_coef
    self.losses = losses
    empty_weight = L.ones([self.num_classes + 1], dtype="float32")
    empty_weight[-1] = self.eos_coef
    self.empty_weight = empty_weight
def topk_pool(gw, score, graph_id, ratio):
    """Implementation of topk pooling, where k means pooling ratio.

    Args:
        gw: Graph wrapper object.
        score: The attention score of all nodes, which is used to select important nodes.
        graph_id: The graphs that the nodes belong to.
        ratio: The pooling ratio of nodes we want to select.

    Return:
        perm: The index of nodes we choose.
        ratio_length: The selected node numbers of each graph.
    """
    graph_lod = gw.graph_lod
    graph_nodes = gw.num_nodes
    num_graph = gw.num_graph

    num_nodes = L.ones(shape=[graph_nodes], dtype="float32")
    num_nodes = L.lod_reset(num_nodes, graph_lod)
    num_nodes_per_graph = L.sequence_pool(num_nodes, pool_type='sum')
    max_num_nodes = L.reduce_max(num_nodes_per_graph, dim=0)
    max_num_nodes = L.cast(max_num_nodes, dtype="int32")

    index = L.arange(0, gw.num_nodes, dtype="int64")
    offset = L.gather(graph_lod, graph_id, overwrite=False)
    index = (index - offset) + (graph_id * max_num_nodes)
    index.stop_gradient = True

    # padding
    dense_score = L.fill_constant(
        shape=[num_graph * max_num_nodes], dtype="float32", value=-999999)
    index = L.reshape(index, shape=[-1])
    dense_score = L.scatter(dense_score, index, updates=score)
    num_graph = L.cast(num_graph, dtype="int32")
    dense_score = L.reshape(dense_score, shape=[num_graph, max_num_nodes])

    # record the sorted index
    _, sort_index = L.argsort(dense_score, axis=-1, descending=True)

    # recover the index range
    graph_lod = graph_lod[:-1]
    graph_lod = L.reshape(graph_lod, shape=[-1, 1])
    graph_lod = L.cast(graph_lod, dtype="int64")
    sort_index = L.elementwise_add(sort_index, graph_lod, axis=-1)
    sort_index = L.reshape(sort_index, shape=[-1, 1])

    # use sequence_slice to choose selected node index
    pad_lod = L.arange(0, (num_graph + 1) * max_num_nodes,
                       step=max_num_nodes, dtype="int32")
    sort_index = L.lod_reset(sort_index, pad_lod)
    ratio_length = L.ceil(num_nodes_per_graph * ratio)
    ratio_length = L.cast(ratio_length, dtype="int64")
    ratio_length = L.reshape(ratio_length, shape=[-1, 1])
    offset = L.zeros(shape=[num_graph, 1], dtype="int64")
    choose_index = L.sequence_slice(
        input=sort_index, offset=offset, length=ratio_length)

    perm = L.reshape(choose_index, shape=[-1])
    return perm, ratio_length
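# Hedged NumPy sketch (toy scores) of the selection topk_pool performs: keep the
# ceil(n_i * ratio) highest-scoring nodes of every graph, returned as global indices.
import numpy as np

scores = [np.array([0.1, 0.9, 0.4]), np.array([0.7, 0.2])]   # per-graph node scores
ratio, perm, offset = 0.5, [], 0
for s in scores:
    k = int(np.ceil(len(s) * ratio))
    perm.append(np.argsort(-s)[:k] + offset)   # global indices of the kept nodes
    offset += len(s)
print(np.concatenate(perm))                    # [1 2 3]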
def __call__(self, kernel_preds, cls_preds, mask_protos,
             batch_gt_objs_tensors, batch_gt_clss_tensors,
             batch_gt_masks_tensors, batch_gt_pos_idx_tensors):
    '''
    :param kernel_preds: each element has shape [N, 256, seg_num_grid, seg_num_grid],
                         the predicted convolution kernel of every grid cell,
                         ordered from small to large receptive field.
    :param cls_preds: each element has shape [N, 80, seg_num_grid, seg_num_grid],
                      the predicted class scores of every grid cell before sigmoid(),
                      ordered from small to large receptive field.
    :param mask_protos: [bs, 256, s4, s4], the mask prototypes.
    :param batch_gt_objs_tensors: each element has shape [N, seg_num_grid, seg_num_grid, 1],
                                  the objectness of every grid cell, ordered from small to large receptive field.
    :param batch_gt_clss_tensors: each element has shape [N, seg_num_grid, seg_num_grid, 80],
                                  the ground-truth one-hot class of every grid cell, ordered from small to large receptive field.
    :param batch_gt_masks_tensors: each element has shape [N, -1, s4, s4],
                                   the ground-truth masks, ordered from small to large receptive field.
    :param batch_gt_pos_idx_tensors: each element has shape [N, -1, 3],
                                     the indices of positive samples, ordered from small to large receptive field.
    :return:
    '''
    batch_size = self.batch_size
    num_layers = len(kernel_preds)

    # ================= compute the losses =================
    num_ins = 0.  # number of positive samples in this batch of images
    loss_clss, loss_masks = [], []
    for bid in range(batch_size):
        for lid in range(num_layers):
            # ================ mask loss ======================
            mask_proto = mask_protos[bid]  # [256, s4, s4]  mask prototypes produced for this image
            kernel_pred = kernel_preds[lid][bid]  # [256, seg_num_grid, seg_num_grid]  predicted kernels per grid cell (the "mask coefficients" in YOLACT)
            kernel_pred = L.transpose(
                kernel_pred, perm=[1, 2, 0])  # [seg_num_grid, seg_num_grid, 256]

            gt_objs = batch_gt_objs_tensors[lid][bid]  # [seg_num_grid, seg_num_grid, 1]
            gt_masks = batch_gt_masks_tensors[lid][bid]  # [-1, s4, s4]
            pmidx = batch_gt_pos_idx_tensors[lid][bid]  # [-1, 3]
            gt_objs.stop_gradient = True
            gt_masks.stop_gradient = True
            pmidx.stop_gradient = True

            idx_sum = L.reduce_sum(pmidx, dim=1)
            keep = L.where(idx_sum > -1)
            keep = L.reshape(keep, (-1, ))
            keep.stop_gradient = True
            pmidx = L.gather(pmidx, keep)  # [M, 3]

            yx_idx = pmidx[:, :2]  # [M, 2]
            m_idx = pmidx[:, 2]  # [M, ]
            yx_idx.stop_gradient = True
            m_idx.stop_gradient = True

            # gather the positive samples
            gt_obj = L.gather_nd(gt_objs, yx_idx)  # [M, 1]  whether it is a genuine positive sample
            pos_krn = L.gather_nd(kernel_pred, yx_idx)  # [M, 256]  kernels (mask coefficients) of the positive samples
            gt_mask = L.gather(gt_masks, m_idx)  # [M, s4, s4]  ground-truth masks

            # number of positive samples
            num_ins += L.reduce_sum(gt_obj)

            # build the predicted masks
            mask_proto = L.transpose(mask_proto, perm=[1, 2, 0])  # [s4, s4, 256]
            masks = L.matmul(mask_proto, pos_krn, transpose_y=True)  # [s4, s4, M]
            masks = L.sigmoid(masks)  # [s4, s4, M]
            masks = L.transpose(masks, perm=[2, 0, 1])  # [M, s4, s4]
            loss_mask = self.dice_loss(masks, gt_mask, gt_obj)
            loss_masks.append(loss_mask)

            # ================ classification loss: sigmoid_focal_loss() ======================
            gamma = self.loss_gamma
            alpha = self.loss_alpha
            pred_conf = cls_preds[lid][bid]  # [80, seg_num_grid, seg_num_grid]  before sigmoid()
            pred_conf = L.transpose(
                pred_conf, perm=[1, 2, 0])  # [seg_num_grid, seg_num_grid, 80]  before sigmoid()
            pred_conf = L.sigmoid(pred_conf)  # [seg_num_grid, seg_num_grid, 80]  after sigmoid()
            gt_clss = batch_gt_clss_tensors[lid][bid]  # [seg_num_grid, seg_num_grid, 80]  ground-truth one-hot classes
            gt_clss.stop_gradient = True
            pos_loss = gt_clss * (0 - L.log(pred_conf + 1e-9)) * L.pow(
                1 - pred_conf, gamma) * alpha
            neg_loss = (1.0 - gt_clss) * (0 - L.log(1 - pred_conf + 1e-9)) * L.pow(
                pred_conf, gamma) * (1 - alpha)
            focal_loss = pos_loss + neg_loss
            focal_loss = L.reduce_sum(focal_loss, dim=[0, 1])
            loss_clss.append(focal_loss)

    loss_masks = L.concat(loss_masks, axis=0)
    loss_masks = L.reduce_sum(loss_masks) * self.ins_loss_weight
    loss_masks = loss_masks / L.elementwise_max(
        L.ones((1, ), dtype='float32'), num_ins)

    loss_clss = L.concat(loss_clss, axis=0)
    loss_clss = L.reduce_sum(loss_clss) * self.clss_loss_weight
    loss_clss = loss_clss / L.elementwise_max(
        L.ones((1, ), dtype='float32'), num_ins)

    loss_all = {"loss_masks": loss_masks, "loss_clss": loss_clss}
    return loss_all
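# Hedged NumPy restatement of the per-element focal-loss term used above
# (pred already sigmoid-activated, gt one-hot; eps keeps the logs finite).
import numpy as np

def sigmoid_focal_loss(pred, gt, gamma=2.0, alpha=0.25, eps=1e-9):
    pos = gt * -np.log(pred + eps) * (1. - pred) ** gamma * alpha
    neg = (1. - gt) * -np.log(1. - pred + eps) * pred ** gamma * (1. - alpha)
    return pos + neg

print(sigmoid_focal_loss(np.array([0.9, 0.1]), np.array([1.0, 0.0])))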
def beam_search_infilling(model,
                          q_ids,
                          q_sids,
                          sos_id,
                          eos_id,
                          attn_id,
                          max_encode_len=640,
                          max_decode_len=100,
                          beam_width=5,
                          tgt_type_id=3,
                          length_penalty=1.0):
    model.eval()
    _, __, info = model(q_ids, q_sids)
    d_batch, d_seqlen = q_ids.shape

    state = BeamSearchState(
        log_probs=L.zeros([d_batch, beam_width], 'float32'),
        lengths=L.zeros([d_batch, beam_width], 'int64'),
        finished=L.zeros([d_batch, beam_width], 'int64'))
    outputs = []

    def reorder_(t, parent_id):
        """reorder cache according to parent beam id"""
        gather_idx = L.where(parent_id != -1)[:, 0] * beam_width + L.reshape(
            parent_id, [-1])
        t = L.gather(t, gather_idx)
        return t

    def tile_(t, times):
        _shapes = list(t.shape[1:])
        ret = L.reshape(
            L.expand(L.unsqueeze(t, [1]), [1, times] + [1] * len(_shapes)),
            [-1] + _shapes)
        return ret

    cached_k, cached_v = info['caches']
    cached_k = [tile_(k, beam_width) for k in cached_k]
    cached_v = [tile_(v, beam_width) for v in cached_v]
    past_cache = (cached_k, cached_v)

    q_ids = tile_(q_ids, beam_width)
    seqlen = L.reduce_sum(L.cast(q_ids != 0, 'int64'), 1, keep_dim=True)

    cls_ids = L.ones([d_batch * beam_width], dtype='int64') * sos_id
    attn_ids = L.ones([d_batch * beam_width], dtype='int64') * attn_id  # SOS
    ids = L.stack([cls_ids, attn_ids], -1)
    for step in range(max_decode_len):
        bias = gen_bias(q_ids, ids, step)
        pos_ids = D.to_variable(
            np.tile(np.array([[step, step + 1]], dtype=np.int64),
                    [d_batch * beam_width, 1]))
        pos_ids += seqlen
        _, logits, info = model(ids,
                                L.ones_like(ids) * tgt_type_id,
                                pos_ids=pos_ids,
                                attn_bias=bias,
                                past_cache=past_cache)

        output, state = beam_search_step(state,
                                         logits[:, 1],
                                         eos_id=eos_id,
                                         beam_width=beam_width,
                                         is_first_step=(step == 0),
                                         length_penalty=length_penalty)
        outputs.append(output)
        past_cached_k, past_cached_v = past_cache
        cached_k, cached_v = info['caches']
        cached_k = [
            reorder_(L.concat([pk, k[:, :1, :]], 1), output.beam_parent_ids)
            for pk, k in zip(past_cached_k, cached_k)
        ]  # concat cached
        cached_v = [
            reorder_(L.concat([pv, v[:, :1, :]], 1), output.beam_parent_ids)
            for pv, v in zip(past_cached_v, cached_v)
        ]
        past_cache = (cached_k, cached_v)

        pred_ids_flatten = L.reshape(output.predicted_ids,
                                     [d_batch * beam_width])
        ids = L.stack([pred_ids_flatten, attn_ids], 1)

        if state.finished.numpy().all():
            break

    final_ids = L.stack([o.predicted_ids for o in outputs], 0)
    final_parent_ids = L.stack([o.beam_parent_ids for o in outputs], 0)
    final_ids = L.gather_tree(final_ids, final_parent_ids)[:, :, 0]  # pick best beam
    final_ids = L.transpose(L.reshape(final_ids, [-1, d_batch * 1]), [1, 0])
    return final_ids
def _init_state(self, inputs):
    """ Initialize decode state. """
    state = {}

    src_token = inputs["src_token"]
    src_mask = inputs["src_mask"]
    src_pos = inputs["src_pos"]
    src_type = inputs["src_type"]
    src_turn = inputs["src_turn"]

    batch_size = src_token.shape[0]
    seq_len = src_token.shape[1]

    src_embed = self.embedder(src_token, src_pos, src_type, src_turn)
    src_embed = self.embed_layer_norm(src_embed)

    mask = self._create_mask(src_mask, append_head=self.num_latent > 0)

    if self.num_latent > 0:
        src_embed = F.unsqueeze(src_embed, [1])
        src_embed = layers.expand(src_embed, [1, self.num_latent, 1, 1])
        src_embed = layers.reshape(src_embed, [-1, seq_len, self.hidden_dim])

        latent_embed = self.latent_embeddings
        latent_embed = F.unsqueeze(latent_embed, [1])
        latent_embed = layers.expand(latent_embed, [batch_size, 1, 1])
        latent_embed = self.embed_layer_norm(latent_embed)

        enc_out = layers.concat([latent_embed, src_embed], axis=1)

        mask = F.unsqueeze(mask, [1])
        mask = layers.expand(mask, [1, self.num_latent, 1, 1])
        mask = layers.reshape(mask, [-1, seq_len + 1, seq_len + 1])
    else:
        enc_out = src_embed

    cache = {}
    for l, layer in enumerate(self.layers):
        cache[f"layer_{l}"] = {}
        enc_out = layer(enc_out, mask, cache[f"layer_{l}"])

    state["cache"] = cache
    state["mask"] = mask[:, :1]
    if self.num_latent > 0:
        state["batch_size"] = batch_size * self.num_latent
        shape = [batch_size * self.num_latent, 1, 1]
    else:
        state["batch_size"] = batch_size
        shape = [batch_size, 1, 1]
    state["pred_mask"] = layers.ones(shape, self._dtype)
    state["pred_pos"] = layers.zeros(shape, "int64")
    state["pred_type"] = layers.zeros(shape, "int64")
    state["pred_turn"] = layers.zeros(shape, "int64")

    if "tgt_token" in inputs and self.num_latent > 0:
        tgt_token = inputs["tgt_token"][:, :-1]
        tgt_mask = inputs["tgt_mask"][:, :-1]
        tgt_pos = inputs["tgt_pos"][:, :-1]
        tgt_type = inputs["tgt_type"][:, :-1]
        tgt_turn = inputs["tgt_turn"][:, :-1]

        input_mask = layers.concat([src_mask, tgt_mask], axis=1)
        input_mask.stop_gradient = True
        src_embed = self.embedder(src_token, src_pos, src_type, src_turn)
        tgt_embed = self.embedder(tgt_token, tgt_pos, tgt_type, tgt_turn)
        embed = layers.concat([src_embed, tgt_embed], axis=1)
        embed = self.embed_layer_norm(embed)

        batch_size = src_token.shape[0]
        src_len = src_token.shape[1]
        tgt_len = tgt_token.shape[1]

        post_embed, post_probs, post_logits = self._posteriori_network(
            input_mask, embed, batch_size, src_len, tgt_len)
        state["post_probs"] = post_probs

    return state
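# Hedged NumPy sketch (toy sizes) of the latent expansion above: the encoded source
# is repeated num_latent times so each latent embedding gets its own copy of the
# context before the flatten to [batch_size * num_latent, seq_len, hidden_dim].
import numpy as np

batch_size, seq_len, hidden_dim, num_latent = 2, 3, 4, 5
src = np.random.rand(batch_size, seq_len, hidden_dim)
expanded = np.repeat(src[:, None], num_latent, axis=1)                   # [B, K, T, H]
expanded = expanded.reshape(batch_size * num_latent, seq_len, hidden_dim)
print(expanded.shape)   # (10, 3, 4)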