def _generate_anchors(self, feats=None):
    # Only used at eval time.
    anchor_points = []
    stride_tensor = []
    for i, stride in enumerate(self.fpn_stride):
        if feats is not None:
            _, _, h, w = feats[i].shape
        else:
            h = math.ceil(self.eval_size[0] / stride)
            w = math.ceil(self.eval_size[1] / stride)
        shift_x = paddle.arange(end=w) + self.cell_offset
        shift_y = paddle.arange(end=h) + self.cell_offset
        shift_y, shift_x = paddle.meshgrid(shift_y, shift_x)
        anchor_point = paddle.cast(
            paddle.stack([shift_x, shift_y], axis=-1), dtype='float32')
        anchor_points.append(anchor_point.reshape([-1, 2]))
        stride_tensor.append(
            paddle.full([h * w, 1], stride, dtype='float32'))
    anchor_points = paddle.concat(anchor_points)
    stride_tensor = paddle.concat(stride_tensor)
    return anchor_points, stride_tensor
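# A minimal standalone sketch of the anchor-grid idea above, assuming a single
# FPN level with stride 8, an 8x8 feature map, and a 0.5 cell offset (all
# illustrative values, not taken from the class above).
import paddle

stride, h, w, cell_offset = 8, 8, 8, 0.5
shift_x = paddle.arange(end=w) + cell_offset
shift_y = paddle.arange(end=h) + cell_offset
shift_y, shift_x = paddle.meshgrid(shift_y, shift_x)
# One (x, y) center per feature-map cell, in feature-map units; multiplying by
# the matching entry of stride_tensor maps the centers back to input pixels.
anchor_points = paddle.cast(
    paddle.stack([shift_x, shift_y], axis=-1), 'float32').reshape([-1, 2])
stride_tensor = paddle.full([h * w, 1], stride, dtype='float32')
print(anchor_points.shape, stride_tensor.shape)  # [64, 2] [64, 1]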
def pdpd_range(name: str, x, start, end, step, out_type):
    import paddle as pdpd
    pdpd.enable_static()
    with pdpd.static.program_guard(pdpd.static.Program(), pdpd.static.Program()):
        node_x = pdpd.static.data(name='x', shape=x.shape, dtype='float32')
        # The range op only supports fill_constant inputs, since dynamic ops
        # are not supported in OpenVINO.
        out = pdpd.fluid.layers.range(start, end, step, out_type)
        out = pdpd.cast(out, np.float32)
        out = pdpd.add(node_x, out)

        cpu = pdpd.static.cpu_places(1)
        exe = pdpd.static.Executor(cpu[0])
        # The startup program runs the initializers for the parameters.
        exe.run(pdpd.static.default_startup_program())
        outs = exe.run(feed={'x': x}, fetch_list=[out])

        saveModel(name, exe, feedkeys=['x'], fetchlist=[out], inputs=[x],
                  outputs=[outs[0]], target_dir=sys.argv[1])

    return outs[0]
def pdpd_scale_tensor(name: str, x, scale, bias, attrs: dict, data_type):
    import paddle as pdpd
    pdpd.enable_static()
    with pdpd.static.program_guard(pdpd.static.Program(), pdpd.static.Program()):
        node_x = pdpd.static.data(name='x', shape=x.shape, dtype=data_type)
        node_scale = pdpd.static.data(name='scale', shape=[1], dtype='float32')
        out = pdpd.scale(x=node_x, scale=node_scale, bias=bias,
                         bias_after_scale=attrs['bias_after_scale'])
        # FuzzyTest only supports FP32 for now, so cast the result to fp32.
        out = pdpd.cast(out, "float32")

        cpu = pdpd.static.cpu_places(1)
        exe = pdpd.static.Executor(cpu[0])
        # The startup program runs the initializers for the parameters.
        exe.run(pdpd.static.default_startup_program())
        outs = exe.run(feed={'x': x, 'scale': scale}, fetch_list=[out])

        saveModel(name, exe, feedkeys=['x', 'scale'], fetchlist=[out],
                  inputs=[x, np.array([scale]).astype('float32')],
                  outputs=[outs[0]], target_dir=sys.argv[1])

    return outs[0]
def degree_norm(graph, mode="indegree", p=-1):
    """Calculate the degree normalization of a graph.

    Args:
        graph: the graph object from (:code:`Graph`).
        mode: which degree to normalize ("indegree" or "outdegree").

    Return:
        A tensor with shape (num_nodes, 1).
    """
    assert mode in ['indegree', 'outdegree'], \
        "The degree_norm mode should be in ['indegree', 'outdegree']. " \
        "But received mode=%s" % mode

    if mode == "indegree":
        degree = graph.indegree() + 1
    elif mode == "outdegree":
        degree = graph.outdegree() + 1

    norm = paddle.cast(degree, dtype=paddle.get_default_dtype())
    norm = paddle.clip(norm, min=1.0)
    norm = paddle.pow(norm, p)
    norm = paddle.reshape(norm, [-1, 1])
    return norm
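# A minimal sketch of the normalization arithmetic above on a plain degree
# vector, assuming in-degrees [0, 2, 4] and the default p = -1; the graph
# object is bypassed so the snippet stays self-contained.
import paddle

degree = paddle.to_tensor([0, 2, 4]) + 1           # add 1, as degree_norm does
norm = paddle.cast(degree, paddle.get_default_dtype())
norm = paddle.pow(paddle.clip(norm, min=1.0), -1)  # 1 / (degree + 1)
print(paddle.reshape(norm, [-1, 1]))               # [[1.], [0.3333], [0.2]]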
def rect2rbox(self, bboxes):
    """
    :param bboxes: shape (n, 4), (xmin, ymin, xmax, ymax)
    :return: dbboxes: shape (n, 5), (x_ctr, y_ctr, w, h, angle)
    """
    bboxes = paddle.reshape(bboxes, [-1, 4])
    num_boxes = paddle.shape(bboxes)[0]
    x_ctr = (bboxes[:, 2] + bboxes[:, 0]) / 2.0
    y_ctr = (bboxes[:, 3] + bboxes[:, 1]) / 2.0
    edges1 = paddle.abs(bboxes[:, 2] - bboxes[:, 0])
    edges2 = paddle.abs(bboxes[:, 3] - bboxes[:, 1])
    rbox_w = paddle.maximum(edges1, edges2)
    rbox_h = paddle.minimum(edges1, edges2)
    # set angle
    inds = edges1 < edges2
    inds = paddle.cast(inds, 'int32')
    rboxes_angle = inds * np.pi / 2.0
    rboxes = paddle.stack(
        (x_ctr, y_ctr, rbox_w, rbox_h, rboxes_angle), axis=1)
    return rboxes
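# A hedged usage sketch of the conversion above, reproduced inline so it can
# run standalone: a tall 10x20 box becomes a rotated box whose long side is
# the width and whose angle is pi/2.
import numpy as np
import paddle

boxes = paddle.to_tensor([[0., 0., 10., 20.]])     # (xmin, ymin, xmax, ymax)
x_ctr = (boxes[:, 2] + boxes[:, 0]) / 2.0
y_ctr = (boxes[:, 3] + boxes[:, 1]) / 2.0
e1 = paddle.abs(boxes[:, 2] - boxes[:, 0])
e2 = paddle.abs(boxes[:, 3] - boxes[:, 1])
angle = paddle.cast(e1 < e2, 'float32') * np.pi / 2.0
rbox = paddle.stack((x_ctr, y_ctr, paddle.maximum(e1, e2),
                     paddle.minimum(e1, e2), angle), axis=1)
print(rbox)  # [[5., 10., 20., 10., 1.5708]]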
def net(self, inputs, is_infer=False):
    self.hist_item_seq = inputs[0]
    self.hist_cat_seq = inputs[1]
    self.target_item = inputs[2]
    self.target_cat = inputs[3]
    self.label = inputs[4].reshape([-1, 1])
    self.mask = inputs[5]
    self.target_item_seq = inputs[6]
    self.target_cat_seq = inputs[7]

    din_model = DINLayer(self.item_emb_size, self.cat_emb_size, self.act,
                         self.is_sparse, self.use_DataLoader, self.item_count,
                         self.cat_count)

    raw_predict = din_model.forward(self.hist_item_seq, self.hist_cat_seq,
                                    self.target_item, self.target_cat,
                                    self.label, self.mask,
                                    self.target_item_seq, self.target_cat_seq)

    avg_loss = paddle.nn.functional.binary_cross_entropy_with_logits(
        raw_predict, self.label, reduction='mean')
    self._cost = avg_loss

    self.predict = paddle.nn.functional.sigmoid(raw_predict)
    predict_2d = paddle.concat([1 - self.predict, self.predict], 1)
    label_int = paddle.cast(self.label, 'int64')
    auc, batch_auc, _ = paddle.static.auc(input=predict_2d,
                                          label=label_int,
                                          slide_steps=0)
    self.inference_target_var = auc

    if is_infer:
        fetch_dict = {'auc': auc}
        return fetch_dict
    fetch_dict = {'cost': avg_loss, 'auc': auc}
    return fetch_dict
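# A minimal sketch of the two-column probability layout that paddle.static.auc
# expects, shown in dynamic mode with made-up logits; the AUC op itself needs
# a static program, so only the concat trick is demonstrated here.
import paddle

raw_predict = paddle.to_tensor([[0.3], [-1.2], [2.0]])  # illustrative logits
p = paddle.nn.functional.sigmoid(raw_predict)
# Column 0 holds P(label == 0), column 1 holds P(label == 1).
predict_2d = paddle.concat([1 - p, p], axis=1)
print(predict_2d.shape)  # [3, 2]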
def net(self, input, is_infer=False):
    self.sparse_inputs = self._sparse_data_var[1:]
    self.dense_input = self._dense_data_var[0]
    self.label_input = self._sparse_data_var[0]
    sparse_number = self.sparse_inputs_slot - 1
    assert sparse_number == len(self.sparse_inputs)

    dcn_model = DeepCroLayer(self.sparse_feature_number,
                             self.sparse_feature_dim, self.dense_input_dim,
                             sparse_number, self.fc_sizes, self.cross_num,
                             self.clip_by_norm, self.l2_reg_cross,
                             self.is_sparse)

    pred, l2_loss = dcn_model.forward(self.sparse_inputs, self.dense_input)

    predict_2d = paddle.concat(x=[1 - pred, pred], axis=1)
    auc, batch_auc_var, _ = paddle.fluid.layers.auc(input=predict_2d,
                                                    label=self.label_input,
                                                    slide_steps=0)
    self.inference_target_var = auc
    if is_infer:
        fetch_dict = {'auc': auc}
        return fetch_dict

    cost = paddle.nn.functional.log_loss(
        input=pred, label=paddle.cast(self.label_input, dtype="float32"))
    avg_cost = paddle.mean(x=cost)
    self._cost = avg_cost + l2_loss

    fetch_dict = {'cost': avg_cost, 'auc': auc}
    return fetch_dict
def greater_equal(name: str, x, y, data_type, cast_to_fp32=False):
    paddle.enable_static()
    with paddle.static.program_guard(paddle.static.Program(), paddle.static.Program()):
        node_x = paddle.static.data(name='input_x', shape=x.shape, dtype=data_type)
        node_y = paddle.static.data(name='input_y', shape=y.shape, dtype=data_type)
        out = paddle.fluid.layers.greater_equal(x=node_x, y=node_y,
                                                name='greater_equal')
        # The FuzzyTest framework doesn't support boolean, so cast to fp32/int32.
        if cast_to_fp32:
            data_type = "float32"
        out = paddle.cast(out, data_type)

        cpu = paddle.static.cpu_places(1)
        exe = paddle.static.Executor(cpu[0])
        # The startup program runs the initializers for the parameters.
        exe.run(paddle.static.default_startup_program())
        outs = exe.run(feed={'input_x': x, 'input_y': y}, fetch_list=[out])

        saveModel(name, exe, feedkeys=['input_x', 'input_y'], fetchlist=[out],
                  inputs=[x, y], outputs=[outs[0]], target_dir=sys.argv[1])

    return outs[0]
def forward(self, predicts, batch):
    predict = predicts['predict']
    word_predict = predicts['word_out']
    gsrm_predict = predicts['gsrm_out']
    label = batch[1]

    casted_label = paddle.cast(x=label, dtype='int64')
    casted_label = paddle.reshape(x=casted_label, shape=[-1, 1])

    cost_word = self.loss_func(word_predict, label=casted_label)
    cost_gsrm = self.loss_func(gsrm_predict, label=casted_label)
    cost_vsfd = self.loss_func(predict, label=casted_label)

    cost_word = paddle.reshape(x=paddle.sum(cost_word), shape=[1])
    cost_gsrm = paddle.reshape(x=paddle.sum(cost_gsrm), shape=[1])
    cost_vsfd = paddle.reshape(x=paddle.sum(cost_vsfd), shape=[1])

    sum_cost = cost_word * 3.0 + cost_vsfd + cost_gsrm * 0.15
    return {
        'loss': sum_cost,
        'word_loss': cost_word,
        'img_loss': cost_vsfd
    }
def net(self, input, is_infer=False):
    self.sparse_inputs = input[1:self.sparse_inputs_slot]
    self.dense_input = input[-1]
    self.label_input = input[0]
    sparse_number = self.sparse_inputs_slot - 1
    assert sparse_number == len(self.sparse_inputs)

    xdeepfm_model = xDeepFMLayer(self.sparse_feature_number,
                                 self.sparse_feature_dim,
                                 self.dense_input_dim, sparse_number,
                                 self.layer_sizes_cin, self.layer_sizes_dnn)

    pred = xdeepfm_model.forward(self.sparse_inputs, self.dense_input)

    predict_2d = paddle.concat(x=[1 - pred, pred], axis=1)
    auc, batch_auc_var, _ = paddle.static.auc(input=predict_2d,
                                              label=self.label_input,
                                              slide_steps=0)
    self.inference_target_var = auc
    if is_infer:
        fetch_dict = {'auc': auc}
        return fetch_dict

    cost = paddle.nn.functional.log_loss(
        input=pred, label=paddle.cast(self.label_input, dtype="float32"))
    avg_cost = paddle.mean(x=cost)
    self._cost = avg_cost

    fetch_dict = {'cost': avg_cost, 'auc': auc}
    return fetch_dict
def forward(self, input, label, conf):
    x_emb = self.embedding(input)
    fc = self.lin_a(x_emb)
    mask = conf > 0
    mask = paddle.cast(mask, dtype="int64")
    mask.stop_gradient = True
    emb_mask = mask.max(1).flatten()
    emb_mask_inds = paddle.nonzero(emb_mask > 0).flatten()
    emb_mask_inds.stop_gradient = True
    if emb_mask_inds.numel() == 0:
        loss_box = self.phony * 0
    else:
        projection = self.lin_b(fc)
        projection = paddle.reshape(projection, shape=[-1, 1])
        output = paddle.gather(projection, emb_mask_inds)
        target = paddle.gather(label, emb_mask_inds)
        loss_box = F.smooth_l1_loss(output, target, reduction='sum',
                                    delta=1.0)
        loss_box = loss_box / len(conf)
    return loss_box
def forward(self, input, mask=None):
    """
    Args:
        input (paddle.Tensor) of shape (batch, seq_len, hidden_size):
            Tensor containing the features of the input sequence.
        mask (paddle.Tensor) of shape (batch, seq_len):
            Bool tensor, whose each element identifies whether the input word
            id is a pad token or not. Defaults to `None`.
    """
    weight = self.input_weight.tile(
        repeat_times=(paddle.shape(input)[0], 1, 1))  # tensor[batch, hidden_size, hidden_size]
    bias = self.bias.tile(
        repeat_times=(paddle.shape(input)[0], 1, 1))  # tensor[batch, 1, hidden_size]
    word_squish = paddle.bmm(input, weight) + bias  # Shape: (batch_size, seq_len, hidden_size)

    att_context_vector = self.att_context_vector.tile(
        repeat_times=(paddle.shape(input)[0], 1, 1))  # Shape: (batch_size, hidden_size, 1)
    att_score = paddle.bmm(word_squish, att_context_vector)  # tensor[batch_size, seq_len, 1]
    if mask is not None:
        # mask, remove the effect of 'PAD'
        mask = paddle.cast(mask, dtype='float32')
        mask = mask.unsqueeze(axis=-1)
        inf_tensor = paddle.full(
            shape=paddle.shape(mask), dtype='float32', fill_value=-INF)
        att_score = paddle.multiply(att_score, mask) + paddle.multiply(
            inf_tensor, (1 - mask))
    att_weight = F.softmax(att_score, axis=1)  # tensor[batch_size, seq_len, 1]

    reps = paddle.bmm(input.transpose(perm=(0, 2, 1)), att_weight).squeeze(
        -1)  # Shape: (batch_size, hidden_size)
    return reps, att_weight
def sample_logits(embedding, bias, labels, inputs, sampler):
    true_log_probs, samp_log_probs, neg_samples = sampler.sample(labels)
    n_sample = neg_samples.shape[0]
    b1, b2 = labels.shape[0], labels.shape[1]
    all_ids = paddle.concat([paddle.reshape(labels, shape=[-1]), neg_samples])
    all_w = embedding(all_ids)
    true_w = paddle.reshape(all_w[:-n_sample], shape=[b1, b2, -1])
    sample_w = paddle.reshape(all_w[-n_sample:], shape=[n_sample, -1])

    all_b = paddle.gather(bias, all_ids)
    true_b = paddle.reshape(all_b[:-n_sample], shape=[b1, b2])
    sample_b = all_b[-n_sample:]

    hit = paddle.cast((labels.unsqueeze([2]) == neg_samples),
                      dtype=global_dtype).detach()
    true_logits = paddle.sum(true_w * inputs,
                             axis=-1) + true_b - true_log_probs
    sample_logits = paddle.transpose(
        paddle.matmul(sample_w, paddle.transpose(inputs, [0, 2, 1])),
        [0, 2, 1]) + sample_b - samp_log_probs
    sample_logits = sample_logits - 1e30 * hit
    logits = paddle.concat([true_logits.unsqueeze([2]), sample_logits], -1)
    return logits
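# A minimal sketch of the "hit" masking step above: when a sampled negative
# happens to equal the true label, its logit is pushed toward -inf (here
# -1e30) so the softmax ignores the duplicate. Labels and negatives are
# made-up values.
import paddle

labels = paddle.to_tensor([[3, 7]])          # [batch=1, seq=2]
neg_samples = paddle.to_tensor([3, 5, 9])    # [n_sample=3]
hit = paddle.cast(labels.unsqueeze([2]) == neg_samples, 'float32')
sample_logits = paddle.zeros([1, 2, 3]) - 1e30 * hit
print(sample_logits)  # position (0, 0, 0) is -1e30; the rest stay 0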
def forward(self, input_ids=None, attention_mask=None, **kwargs):
    """
    The MBartEncoder forward method, overrides the `__call__()` special method.

    Args:
        input_ids (Tensor, optional):
            See :class:`MBartModel`.
        attention_mask (Tensor, optional):
            See :class:`MBartModel`.

    Returns:
        Tensor: Returns tensor `encoder_output`, which is the output at the
        last layer of the model. Its data type should be float32 and it has a
        shape of [batch_size, sequence_length, hidden_size].
    """
    if input_ids is None:
        raise ValueError("Input_ids cannot be None.")

    inputs_embeds = self.d_model**0.5 * self.embed_tokens(input_ids)
    inputs_embed_pos = self.encoder_embed_positions(input_ids.shape)
    hidden_states = inputs_embeds + inputs_embed_pos
    hidden_states = self.encoder_layernorm_embedding(hidden_states)
    encoder_input = self.encoder_dropout(hidden_states)

    if attention_mask is None:
        attention_mask = paddle.cast(
            input_ids == self.pad_token_id,
            dtype=paddle.get_default_dtype()).unsqueeze([1, 2]) * -1e4
    # For 2D attention_mask from tokenizer
    elif attention_mask.ndim == 2:
        attention_mask = paddle.unsqueeze(
            attention_mask, axis=[1, 2]).astype(paddle.get_default_dtype())
        attention_mask = (1.0 - attention_mask) * -1e4
    attention_mask.stop_gradient = True

    encoder_output = self.encoder(encoder_input, src_mask=attention_mask)
    return encoder_output
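# A small sketch of the additive-mask conversion above, assuming a made-up
# 0/1 tokenizer mask: positions to keep become 0.0 and padded positions
# become -1e4, in a shape broadcastable over [batch, heads, q_len, k_len].
import paddle

attention_mask = paddle.to_tensor([[1, 1, 0]])  # 1 = real token, 0 = pad
mask = paddle.unsqueeze(attention_mask, axis=[1, 2]).astype('float32')
additive = (1.0 - mask) * -1e4
print(additive)  # [[[[0., 0., -10000.]]]]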
def run_evaluate(args, data_loader, model, criterion, iter_steps, log_writer,
                 global_step, epoch, task_name="valid"):
    model.eval()
    all_loss = []
    local_time = time.time()
    for eval_step, batch in enumerate(data_loader):
        tokens, loss_mask, labels = batch
        with paddle.amp.auto_cast(args.use_pure_fp16,
                                  custom_black_list=[
                                      "reduce_sum",
                                      "c_softmax_with_cross_entropy",
                                      "elementwise_div",
                                  ],
                                  level='O2'):
            preds = model(tokens)
            preds = paddle.cast(preds, dtype="float32")
            loss = criterion(preds, labels, loss_mask)

        all_loss.append(float(loss))
        if eval_step >= iter_steps - 1:
            break

    average_loss = sum(all_loss) / len(all_loss)
    logger.info(
        "%s step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s" %
        (task_name, global_step, epoch, eval_step, average_loss,
         iter_steps / (time.time() - local_time)))
    log_writer.add_scalar(task_name + "_loss", average_loss, global_step)
    model.train()
def forward(self, similarities_matrix, query_img_id, gallery_img_id,
            keep_mask):
    metric_dict = dict()

    # get cmc
    choosen_indices = paddle.argsort(similarities_matrix, axis=1,
                                     descending=True)
    gallery_labels_transpose = paddle.transpose(gallery_img_id, [1, 0])
    gallery_labels_transpose = paddle.broadcast_to(
        gallery_labels_transpose,
        shape=[choosen_indices.shape[0], gallery_labels_transpose.shape[1]])
    choosen_label = paddle.index_sample(gallery_labels_transpose,
                                        choosen_indices)
    equal_flag = paddle.equal(choosen_label, query_img_id)
    if keep_mask is not None:
        keep_mask = paddle.index_sample(keep_mask.astype('float32'),
                                        choosen_indices)
        equal_flag = paddle.logical_and(equal_flag, keep_mask.astype('bool'))
    equal_flag = paddle.cast(equal_flag, 'float32')

    real_query_num = paddle.sum(equal_flag, axis=1)
    real_query_num = paddle.sum(
        paddle.greater_than(real_query_num,
                            paddle.to_tensor(0.)).astype("float32"))

    acc_sum = paddle.cumsum(equal_flag, axis=1)
    mask = paddle.greater_than(acc_sum,
                               paddle.to_tensor(0.)).astype("float32")
    all_cmc = (paddle.sum(mask, axis=0) / real_query_num).numpy()

    for k in self.topk:
        metric_dict["recall{}".format(k)] = all_cmc[k - 1]
    return metric_dict
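# A toy sketch of the CMC (cumulative match characteristic) computation above
# for two queries and three gallery entries, using a hand-written equal_flag
# matrix instead of real similarities: recall@k is the fraction of queries
# with at least one correct match in the top k.
import paddle

# Row i: whether the j-th ranked gallery item matches query i.
equal_flag = paddle.to_tensor([[0., 1., 0.],
                               [1., 0., 0.]])
acc_sum = paddle.cumsum(equal_flag, axis=1)  # running match count per rank
mask = paddle.greater_than(acc_sum, paddle.to_tensor(0.)).astype('float32')
all_cmc = paddle.sum(mask, axis=0) / 2.0     # 2 valid queries
print(all_cmc)  # [0.5, 1.0, 1.0] -> recall@1 = 0.5, recall@2 = 1.0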
def forward(
        self,
        input_ids=None,
        token_type_ids=None,
        attention_mask=None,
        mems=None,
        perm_mask=None,
        target_mapping=None,
        input_mask=None,
        head_mask=None,
        inputs_embeds=None,
        use_mems_train=False,
        use_mems_eval=False,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=False,
):
    if self.training:
        use_mems = use_mems_train
    else:
        use_mems = use_mems_eval

    # The original code for XLNet uses shapes [len, bsz] with the batch
    # dimension at the end, but we want a unified interface in the library
    # with the batch size on the first dimension, so we move the first
    # dimension (batch) to the end here.
    if input_ids is not None and inputs_embeds is not None:
        raise ValueError(
            "You cannot specify both input_ids and inputs_embeds at the same time"
        )
    elif input_ids is not None:
        input_ids = paddle.transpose(input_ids, perm=[1, 0])
        qlen, bsz = input_ids.shape[0], input_ids.shape[1]
    elif inputs_embeds is not None:
        inputs_embeds = paddle.transpose(inputs_embeds, perm=[1, 0])
        qlen, bsz = inputs_embeds.shape[0], inputs_embeds.shape[1]
    else:
        raise ValueError("You have to specify either input_ids or inputs_embeds")

    token_type_ids = token_type_ids.transpose(
        [1, 0]) if token_type_ids is not None else None
    input_mask = input_mask.transpose(
        [1, 0]) if input_mask is not None else None
    attention_mask = attention_mask.transpose(
        [1, 0]) if attention_mask is not None else None
    perm_mask = perm_mask.transpose(
        [1, 2, 0]) if perm_mask is not None else None
    target_mapping = target_mapping.transpose(
        [1, 2, 0]) if target_mapping is not None else None

    mlen = mems[0].shape[0] if mems is not None and mems[0] is not None else 0
    klen = mlen + qlen

    # Attention mask
    # Causal attention mask
    if self.attn_type == "uni":
        attn_mask = self.create_mask(qlen, mlen)
        attn_mask = paddle.unsqueeze(attn_mask, axis=[2, 3])
    elif self.attn_type == "bi":
        attn_mask = None
    else:
        raise ValueError("Unsupported attention type: {}".format(
            self.attn_type))

    # Data mask: input mask & perm mask
    assert input_mask is None or attention_mask is None, \
        "You can only use one of input_mask (uses 1 for padding) " \
        "or attention_mask (uses 0 for padding, added for compatibility " \
        "with BERT). Please choose one."
    if input_mask is None and attention_mask is not None:
        input_mask = 1.0 - attention_mask
    if input_mask is not None and perm_mask is not None:
        data_mask = paddle.unsqueeze(input_mask, axis=0) + perm_mask
    elif input_mask is not None and perm_mask is None:
        data_mask = paddle.unsqueeze(input_mask, axis=0)
    elif input_mask is None and perm_mask is not None:
        data_mask = perm_mask
    else:
        data_mask = None

    if data_mask is not None:
        # All mems can be attended to
        if mlen > 0:
            mems_mask = paddle.cast(
                paddle.zeros([data_mask.shape[0], mlen, bsz]),
                dtype=dtype_float)
            data_mask = paddle.concat([mems_mask, data_mask], axis=1)
        if attn_mask is None:
            attn_mask = paddle.unsqueeze(data_mask, axis=-1)
        else:
            attn_mask += paddle.unsqueeze(data_mask, axis=-1)

    if attn_mask is not None:
        attn_mask = paddle.cast((attn_mask > 0), dtype=dtype_float)

    if attn_mask is not None:
        non_tgt_mask = paddle.cast(-paddle.eye(qlen), dtype=dtype_float)
        if mlen > 0:
            non_tgt_mask = paddle.concat([
                paddle.cast(paddle.zeros([qlen, mlen]), dtype=dtype_float),
                non_tgt_mask
            ], axis=-1)
        non_tgt_mask = paddle.cast(
            ((attn_mask + paddle.unsqueeze(non_tgt_mask, axis=[2, 3])) > 0),
            dtype=dtype_float)
    else:
        non_tgt_mask = None

    # Word embeddings and prepare h & g hidden states
    if inputs_embeds is not None:
        word_emb_k = inputs_embeds
    else:
        word_emb_k = self.word_embedding(input_ids)
    output_h = self.dropout(word_emb_k)
    if target_mapping is not None:
        word_emb_q = self.mask_emb.expand([target_mapping.shape[0], bsz, -1])
        output_g = self.dropout(word_emb_q)
    else:
        output_g = None

    # Segment embedding
    if token_type_ids is not None:
        # Convert `token_type_ids` to one-hot `seg_mat`
        if mlen > 0:
            mem_pad = paddle.zeros(shape=[mlen, bsz], dtype='int64')
            cat_ids = paddle.concat(x=[mem_pad, token_type_ids], axis=0)
        else:
            cat_ids = token_type_ids
        # `1` indicates not in the same segment [qlen x klen x bsz]
        seg_mat = paddle.cast(
            paddle.unsqueeze(token_type_ids, axis=1) !=
            paddle.unsqueeze(cat_ids, axis=0),
            dtype='int64')
        seg_mat = paddle.cast(F.one_hot(seg_mat, num_classes=2),
                              dtype=dtype_float)
    else:
        seg_mat = None

    # Positional encoding
    pos_emb = self.relative_positional_encoding(qlen, klen, bsz=bsz)
    pos_emb = self.dropout(pos_emb)

    # Prepare head mask if needed.
    # 1.0 in head_mask indicates we keep the head.
    # attention_probs has shape bsz x n_heads x N x N.
    # Input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
    # (a head_mask for each layer), and it is converted to shape
    # [num_hidden_layers x qlen x klen x bsz x n_head].
    if head_mask is not None:
        if head_mask.dim() == 1:
            head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(
                0).unsqueeze(0)
            head_mask = head_mask.expand([self.n_layer, -1, -1, -1, -1])
        elif head_mask.dim() == 2:
            head_mask = head_mask.unsqueeze(1).unsqueeze(1).unsqueeze(1)
    else:
        head_mask = [None] * self.n_layer

    new_mems = ()
    if mems is None:
        mems = [None] * len(self.layer)

    attentions = [] if output_attentions else None
    hidden_states = [] if output_hidden_states else None
    for i, layer_module in enumerate(self.layer):
        if use_mems:
            # Cache new mems
            new_mems = new_mems + (self.cache_mem(output_h, mems[i]), )
        if output_hidden_states:
            hidden_states.append(
                (output_h, output_g) if output_g is not None else output_h)

        outputs = layer_module(
            output_h,
            output_g,
            attn_mask_h=non_tgt_mask,
            attn_mask_g=attn_mask,
            r=pos_emb,
            seg_mat=seg_mat,
            mems=mems[i],
            target_mapping=target_mapping,
            head_mask=head_mask[i],
            output_attentions=output_attentions,
        )
        output_h, output_g = outputs[:2]
        if output_attentions:
            attentions.append(outputs[2])

    # Add last hidden state
    if output_hidden_states:
        hidden_states.append(
            (output_h, output_g) if output_g is not None else output_h)

    output = self.dropout(output_g if output_g is not None else output_h)

    # Prepare outputs: we transpose back here to shape [bsz, len, hidden_dim]
    # (cf. beginning of forward() method).
    output = paddle.transpose(output, perm=[1, 0, 2])
    if not use_mems:
        new_mems = None
    if output_hidden_states:
        if output_g is not None:
            hidden_states = tuple(
                paddle.transpose(h, perm=[1, 0, 2]) for hs in hidden_states
                for h in hs)
        else:
            hidden_states = tuple(
                paddle.transpose(hs, perm=[1, 0, 2]) for hs in hidden_states)
    if output_attentions:
        if target_mapping is not None:
            # When target_mapping is provided, there are 2-tuples of attentions
            attentions = tuple(
                tuple(
                    paddle.transpose(att_stream, perm=[2, 3, 0, 1])
                    for att_stream in t) for t in attentions)
        else:
            attentions = tuple(
                paddle.transpose(t, perm=[2, 3, 0, 1]) for t in attentions)

    if not return_dict:
        return tuple(v for v in [output, new_mems, hidden_states, attentions]
                     if v is not None)
    return {
        "last_hidden_state": output,
        "mems": new_mems,
        "hidden_states": hidden_states,
        "attentions": attentions,
    }
def _append_optimize_op(self, block, param_and_grad):
    assert isinstance(block, framework.Block)
    if isinstance(param_and_grad, dict):
        param_and_grad = self._update_param_group(param_and_grad)
    param, grad = param_and_grad

    # Whether we should do weight decay for the parameter.
    with_decay = True
    if self._apply_decay_param_fun is not None \
            and not self._apply_decay_param_fun(param.name):
        with_decay = False

    moment1 = self._get_accumulator(self._moment1_acc_str, param_and_grad[0])
    moment2 = self._get_accumulator(self._moment2_acc_str, param_and_grad[0])
    beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
                                          param_and_grad[0])
    beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str,
                                          param_and_grad[0])
    find_master = self._multi_precision and param_and_grad[
        0].dtype == core.VarDesc.VarType.FP16
    master_weight = (self._master_weights[param_and_grad[0].name]
                     if find_master else None)
    lr = self._create_param_lr(param_and_grad)

    # create the adamw optimize op
    if framework._non_static_mode():
        lr_ratio_ = 1. if self._lr_ratio is None else self._lr_ratio(
            param_and_grad[0])
        _beta1 = self._beta1 if not isinstance(
            self._beta1, Variable) else self._beta1.numpy().item(0)
        _beta2 = self._beta2 if not isinstance(
            self._beta2, Variable) else self._beta2.numpy().item(0)

        lr = paddle.cast(lr, dtype="float32")
        _, _, _, _, _, _ = _C_ops.adamw(
            param_and_grad[0], param_and_grad[1], lr, moment1, moment2,
            beta1_pow_acc, beta2_pow_acc, master_weight, param_and_grad[0],
            moment1, moment2, beta1_pow_acc, beta2_pow_acc, master_weight,
            'epsilon', self._epsilon, 'lazy_mode', self._lazy_mode,
            'min_row_size_to_use_multithread', 1000, 'beta1', _beta1,
            'beta2', _beta2, "with_decay", with_decay, 'coeff', self._coeff,
            'multi_precision', find_master, 'lr_ratio', lr_ratio_)
        return None

    inputs = {
        "Param": [param_and_grad[0]],
        "Grad": [param_and_grad[1]],
        "LearningRate": [lr],
        "Moment1": [moment1],
        "Moment2": [moment2],
        "Beta1Pow": [beta1_pow_acc],
        "Beta2Pow": [beta2_pow_acc],
    }

    # Pass found_inf to adamw, to skip the update not only for param,
    # but also for momentum and beta_pow.
    found_inf = self._get_auxiliary_var('found_inf')
    if found_inf:
        inputs['SkipUpdate'] = found_inf

    outputs = {
        "ParamOut": [param_and_grad[0]],
        "Moment1Out": [moment1],
        "Moment2Out": [moment2],
        "Beta1PowOut": [beta1_pow_acc],
        "Beta2PowOut": [beta2_pow_acc],
    }
    attrs = {
        "lazy_mode": self._lazy_mode,
        "min_row_size_to_use_multithread": 1000,
        "multi_precision": find_master,
        "with_decay": with_decay,
        "coeff": self._coeff,
        "lr_ratio": 1.
        if self._lr_ratio is None else self._lr_ratio(param_and_grad[0])
    }

    if isinstance(self._beta1, Variable):
        inputs['Beta1Tensor'] = self._beta1
    else:
        attrs['beta1'] = self._beta1
    if isinstance(self._beta2, Variable):
        inputs['Beta2Tensor'] = self._beta2
    else:
        attrs['beta2'] = self._beta2
    if isinstance(self._epsilon, Variable):
        inputs['EpsilonTensor'] = self._epsilon
    else:
        attrs['epsilon'] = self._epsilon

    if find_master:
        inputs["MasterParam"] = master_weight
        outputs["MasterParamOut"] = master_weight

    adamw_op = block.append_op(type=self.type,
                               inputs=inputs,
                               outputs=outputs,
                               attrs=attrs,
                               stop_gradient=True)

    return adamw_op
def forward(self, query, key, value, key_padding_mask=None,
            incremental_state=None, attn_mask=None):
    """
    Inputs of forward function
        query: [target length, batch size, embed dim]
        key: [sequence length, batch size, embed dim]
        value: [sequence length, batch size, embed dim]
        key_padding_mask: if True, mask padding based on batch size
        incremental_state: if provided, previous time steps are cached
        need_weights: output attn_output_weights
        static_kv: key and value are static

    Outputs of forward function
        attn_output: [target length, batch size, embed dim]
        attn_output_weights: [batch size, target length, sequence length]
    """
    q_shape = paddle.shape(query)
    src_shape = paddle.shape(key)
    q = self._in_proj_q(query)
    k = self._in_proj_k(key)
    v = self._in_proj_v(value)
    q *= self.scaling
    q = paddle.transpose(
        paddle.reshape(
            q, [q_shape[0], q_shape[1], self.num_heads, self.head_dim]),
        [1, 2, 0, 3])
    k = paddle.transpose(
        paddle.reshape(
            k, [src_shape[0], q_shape[1], self.num_heads, self.head_dim]),
        [1, 2, 0, 3])
    v = paddle.transpose(
        paddle.reshape(
            v, [src_shape[0], q_shape[1], self.num_heads, self.head_dim]),
        [1, 2, 0, 3])
    if key_padding_mask is not None:
        assert key_padding_mask.shape[0] == q_shape[1]
        assert key_padding_mask.shape[1] == src_shape[0]
    attn_output_weights = paddle.matmul(q,
                                        paddle.transpose(k, [0, 1, 3, 2]))
    if attn_mask is not None:
        attn_mask = paddle.unsqueeze(paddle.unsqueeze(attn_mask, 0), 0)
        attn_output_weights += attn_mask
    if key_padding_mask is not None:
        attn_output_weights = paddle.reshape(
            attn_output_weights,
            [q_shape[1], self.num_heads, q_shape[0], src_shape[0]])
        key = paddle.unsqueeze(paddle.unsqueeze(key_padding_mask, 1), 2)
        key = paddle.cast(key, 'float32')
        y = paddle.full(shape=paddle.shape(key),
                        dtype='float32',
                        fill_value='-inf')
        y = paddle.where(key == 0., key, y)
        attn_output_weights += y
    attn_output_weights = F.softmax(
        attn_output_weights.astype('float32'),
        axis=-1,
        dtype=paddle.float32 if attn_output_weights.dtype == paddle.float16
        else attn_output_weights.dtype)
    attn_output_weights = F.dropout(attn_output_weights,
                                    p=self.dropout,
                                    training=self.training)

    attn_output = paddle.matmul(attn_output_weights, v)
    attn_output = paddle.reshape(
        paddle.transpose(attn_output, [2, 0, 1, 3]),
        [q_shape[0], q_shape[1], self.embed_dim])
    attn_output = self.out_proj(attn_output)

    return attn_output
def randint_like(x, low=0, high=None, dtype=None, name=None):
    """
    This OP returns a Tensor filled with random integers from a discrete
    uniform distribution in the range [``low``, ``high``), with the same shape
    as ``x`` (and with ``dtype`` if ``dtype`` is not None). If ``high`` is
    None (the default), the range is [0, ``low``).

    Args:
        x (Tensor): The input tensor which specifies the shape. The dtype of
            ``x`` can be bool, int32, int64, float16, float32, float64.
        low (int): The lower bound on the range of random values to generate.
            The ``low`` is included in the range. If ``high`` is None, the
            range is [0, ``low``). Default is 0.
        high (int, optional): The upper bound on the range of random values to
            generate, the ``high`` is excluded in the range. Default is None
            (see above for behavior if high = None).
        dtype (str|np.dtype, optional): The data type of the output tensor.
            Supported data types: bool, int32, int64, float16, float32,
            float64. If ``dtype`` is None, the data type is the same as x's
            data type. Default is None.
        name (str, optional): The default value is None. Normally there is no
            need for user to set this property. For more information, please
            refer to :ref:`api_guide_Name`.

    Returns:
        Tensor: A Tensor filled with random integers from a discrete uniform
        distribution in the range [``low``, ``high``), with ``shape`` and
        ``dtype``.

    Examples:
        .. code-block:: python

            import paddle

            # example 1:
            # dtype is None and the dtype of x is float16
            x = paddle.zeros((1,2)).astype("float16")
            out1 = paddle.randint_like(x, low=-5, high=5)
            print(out1)
            print(out1.dtype)
            # [[0, -3]]  # random
            # paddle.float16

            # example 2:
            # dtype is None and the dtype of x is float32
            x = paddle.zeros((1,2)).astype("float32")
            out2 = paddle.randint_like(x, low=-5, high=5)
            print(out2)
            print(out2.dtype)
            # [[0, -3]]  # random
            # paddle.float32

            # example 3:
            # dtype is None and the dtype of x is float64
            x = paddle.zeros((1,2)).astype("float64")
            out3 = paddle.randint_like(x, low=-5, high=5)
            print(out3)
            print(out3.dtype)
            # [[0, -3]]  # random
            # paddle.float64

            # example 4:
            # dtype is None and the dtype of x is int32
            x = paddle.zeros((1,2)).astype("int32")
            out4 = paddle.randint_like(x, low=-5, high=5)
            print(out4)
            print(out4.dtype)
            # [[0, -3]]  # random
            # paddle.int32

            # example 5:
            # dtype is None and the dtype of x is int64
            x = paddle.zeros((1,2)).astype("int64")
            out5 = paddle.randint_like(x, low=-5, high=5)
            print(out5)
            print(out5.dtype)
            # [[0, -3]]  # random
            # paddle.int64

            # example 6:
            # dtype is float64 and the dtype of x is float32
            x = paddle.zeros((1,2)).astype("float32")
            out6 = paddle.randint_like(x, low=-5, high=5, dtype="float64")
            print(out6)
            print(out6.dtype)
            # [[0, -1]]  # random
            # paddle.float64

            # example 7:
            # dtype is bool and the dtype of x is float32
            x = paddle.zeros((1,2)).astype("float32")
            out7 = paddle.randint_like(x, low=-5, high=5, dtype="bool")
            print(out7)
            print(out7.dtype)
            # [[True, False]]  # random
            # paddle.bool

            # example 8:
            # dtype is int32 and the dtype of x is float32
            x = paddle.zeros((1,2)).astype("float32")
            out8 = paddle.randint_like(x, low=-5, high=5, dtype="int32")
            print(out8)
            print(out8.dtype)
            # [[0, -1]]  # random
            # paddle.int32

            # example 9:
            # dtype is int64 and the dtype of x is float32
            x = paddle.zeros((1,2)).astype("float32")
            out9 = paddle.randint_like(x, low=-5, high=5, dtype="int64")
            print(out9)
            print(out9.dtype)
            # [[0, -1]]  # random
            # paddle.int64

            # example 10:
            # dtype is int64 and the dtype of x is bool
            x = paddle.zeros((1,2)).astype("bool")
            out10 = paddle.randint_like(x, low=-5, high=5, dtype="int64")
            print(out10)
            print(out10.dtype)
            # [[0, -1]]  # random
            # paddle.int64
    """
    if high is None:
        if low <= 0:
            raise ValueError(
                "If high is None, low must be greater than 0, but received low = {0}."
                .format(low))
        high = low
        low = 0
    if dtype is None:
        dtype = x.dtype
    if not isinstance(dtype, core.VarDesc.VarType):
        dtype = convert_np_dtype_to_dtype_(dtype)
    shape = x.shape

    if low >= high:
        raise ValueError(
            "randint_like's low must be less than high, but received low = {0}, "
            "high = {1}".format(low, high))

    if in_dygraph_mode():
        shape = utils.convert_shape_to_list(shape)
        out = _C_ops.randint('shape', shape, 'low', low, 'high', high, 'seed',
                             0, 'dtype', core.VarDesc.VarType.INT64)
        out = paddle.cast(out, dtype)
        return out

    check_shape(shape, 'randint_like')
    check_dtype(dtype, 'dtype',
                ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'],
                'randint_like')

    inputs = dict()
    attrs = {
        'low': low,
        'high': high,
        'seed': 0,
        'dtype': core.VarDesc.VarType.INT64
    }
    utils.get_shape_tensor_inputs(inputs=inputs,
                                  attrs=attrs,
                                  shape=shape,
                                  op_type='randint_like')

    helper = LayerHelper("randint", **locals())
    out = helper.create_variable_for_type_inference(
        dtype=core.VarDesc.VarType.INT64)
    helper.append_op(type='randint',
                     inputs=inputs,
                     outputs={'Out': out},
                     attrs=attrs)
    out.stop_gradient = True
    out = paddle.cast(out, dtype)
    return out
def normal(mean=0.0, std=1.0, shape=None, name=None):
    """
    This OP returns a Tensor filled with random values sampled from a normal
    distribution with ``mean`` and ``std`` (standard deviation).

    If ``mean`` is a Tensor, the output Tensor has the same shape and data type as ``mean``.
    If ``mean`` is not a Tensor and ``std`` is a Tensor, the output Tensor has the same shape and data type as ``std``.
    If ``mean`` and ``std`` are not a Tensor, the output Tensor has the same shape as ``shape``, with data type float32.
    If ``mean`` and ``std`` are Tensor, the num of elements of ``mean`` and ``std`` should be the same.

    Args:
        mean (float|Tensor, optional): The mean of the output Tensor's normal distribution.
            If ``mean`` is float, all elements of the output Tensor share the same mean.
            If ``mean`` is a Tensor (data type supports float32, float64), it has per-element means.
            Default is 0.0.
        std (float|Tensor, optional): The standard deviation of the output Tensor's normal distribution.
            If ``std`` is float, all elements of the output Tensor share the same standard deviation.
            If ``std`` is a Tensor (data type supports float32, float64), it has per-element standard deviations.
            Default is 1.0.
        shape (list|tuple|Tensor, optional): The shape of the output Tensor. If ``shape`` is a list or tuple,
            the elements of it should be integers or Tensors (with the shape [1], and the data type int32 or int64).
            If ``shape`` is a Tensor, it should be a 1-D Tensor (with the data type int32 or int64).
            If ``mean`` or ``std`` is a Tensor, the shape of the output Tensor is the same as ``mean`` or ``std``,
            and attr ``shape`` is ignored. Default is None.
        name (str, optional): Name for the operation (optional, default is None).
            For more information, please refer to :ref:`api_guide_Name`.

    Returns:
        A Tensor filled with random values sampled from a normal distribution with ``mean`` and ``std``.

    Examples:
        .. code-block:: python

            import paddle

            out1 = paddle.normal(shape=[2, 3])
            # [[ 0.17501129  0.32364586  1.561118  ]  # random
            #  [-1.7232178   1.1545963  -0.76156676]]  # random

            mean_tensor = paddle.to_tensor([1.0, 2.0, 3.0])
            out2 = paddle.normal(mean=mean_tensor)
            # [ 0.18644847 -1.19434458  3.93694787]  # random

            std_tensor = paddle.to_tensor([1.0, 2.0, 3.0])
            out3 = paddle.normal(mean=mean_tensor, std=std_tensor)
            # [1.00780561 3.78457445 5.81058198]  # random
    """
    if not in_dygraph_mode():
        check_type(mean, 'mean', (int, float, Variable), 'normal')
        check_type(std, 'std', (int, float, Variable), 'normal')
        if isinstance(mean, Variable):
            check_dtype(
                mean.dtype, 'mean', ['float32', 'float64'], 'normal',
                "If mean is Tensor, it's data type only support float32, float64."
            )
        if isinstance(std, Variable):
            check_dtype(
                std.dtype, 'std', ['float32', 'float64'], 'normal',
                "If std is Tensor, it's data type only support float32, float64."
            )
        if shape is not None:
            check_shape(shape, 'normal')

    if isinstance(mean, Variable):
        if isinstance(std, Variable):
            if std.dtype != mean.dtype:
                std = paddle.cast(std, mean.dtype)
            mean_shape = paddle.shape(mean)
            std = paddle.reshape(std, mean_shape)
        else:
            std = float(std)
        out = standard_normal(paddle.shape(mean), mean.dtype, name)
    elif isinstance(std, Variable):
        mean = float(mean)
        out = standard_normal(paddle.shape(std), std.dtype, name)
    else:
        return gaussian(shape=shape, mean=mean, std=std, name=name)

    out = out * std + mean
    if not in_dygraph_mode():
        out.stop_gradient = True
    return out
def create_loss(self, raw_pred, label):
    loss = paddle.nn.functional.log_loss(
        input=raw_pred, label=paddle.cast(label, "float32"))
    loss = paddle.mean(loss)
    return loss
def forward(self, bn, observed, initial_position):
    if initial_position:
        observed_ = {**initial_position, **observed}
    else:
        observed_ = observed
    bn.forward(observed_)

    q0 = [[k, v.tensor] for k, v in bn.nodes.items()
          if k not in observed.keys()]
    normals = [[k, Normal(mean=fluid.layers.zeros(shape=v.shape,
                                                  dtype='float32'),
                          std=1)] for k, v in q0]

    for e in range(self.iters):
        q1 = [[k, paddle.assign(v)] for k, v in q0]
        p0 = [[k, v.sample()] for k, v in normals]
        p1 = [[k, paddle.assign(v)] for k, v in p0]

        # leapfrog integrator
        for s in range(self.n_leapfrogs):
            observed_ = {**dict(q1), **observed}
            bn.forward(observed_)
            log_joint_ = bn.log_joint()
            q_v = [v for _, v in q1]
            q_grad = paddle.grad(log_joint_, q_v)
            for i, _ in enumerate(q_grad):
                p1[i][1] = p1[i][1] + self.step_size * q_grad[i] / 2.0
                q1[i][1] = q1[i][1] + self.step_size * p1[i][1]
                p1[i][1] = p1[i][1].detach()
                p1[i][1].stop_gradient = False
                q1[i][1] = q1[i][1].detach()
                q1[i][1].stop_gradient = False

            observed_ = {**dict(q1), **observed}
            q_v = [v for _, v in q1]
            bn.forward(observed_)
            log_joint_ = bn.log_joint()
            q_grad = paddle.grad(log_joint_, q_v)
            for i, _ in enumerate(q_grad):
                p1[i][1] = p1[i][1] + self.step_size * q_grad[i] / 2.0
                p1[i][1] = p1[i][1].detach()
                p1[i][1].stop_gradient = False

        # reverse p1
        for i, _ in enumerate(p1):
            p1[i][1] = -1 * p1[i][1]

        # M-H step
        observed_ = {**dict(q0), **observed}
        bn.forward(observed_)
        log_prob_q0 = bn.log_joint()
        log_prob_p0 = None
        for i, _ in enumerate(p0):
            len_q = len(log_prob_q0.shape)
            len_p = len(p0[i][1].shape)
            assert (len_p >= len_q)
            if len_p > len_q:
                dims = [i for i in range(len_q - len_p, 0)]
                try:
                    log_prob_p0 = log_prob_p0 + fluid.layers.reduce_sum(
                        p0[i][1], dims)
                except:
                    log_prob_p0 = fluid.layers.reduce_sum(p0[i][1], dims)
            else:
                try:
                    log_prob_p0 = log_prob_p0 + p0[i][1]
                except:
                    log_prob_p0 = p0[i][1]

        observed_ = {**dict(q1), **observed}
        bn.forward(observed_)
        log_prob_q1 = bn.log_joint()
        log_prob_p1 = None
        for i, _ in enumerate(p1):
            len_q = len(log_prob_q0.shape)
            len_p = len(p1[i][1].shape)
            assert (len_p >= len_q)
            if len_p > len_q:
                dims = [i for i in range(len_q - len_p, 0)]
                try:
                    log_prob_p1 = log_prob_p1 + fluid.layers.reduce_sum(
                        p1[i][1], dims)
                except:
                    log_prob_p1 = fluid.layers.reduce_sum(p1[i][1], dims)
            else:
                try:
                    log_prob_p1 = log_prob_p1 + p1[i][1]
                except:
                    log_prob_p1 = p1[i][1]

        assert (log_prob_q0.shape == log_prob_p1.shape)
        acceptance = log_prob_q1 + log_prob_p1 - log_prob_q0 - log_prob_p0
        for i, _ in enumerate(q1):
            event = paddle.to_tensor(np.log(np.random.rand(*q1[i][1].shape)),
                                     dtype='float32')
            a = paddle.cast(acceptance > event, dtype='float32')
            q0[i][1] = paddle.assign(a * q1[i][1] + (1.0 - a) * q0[i][1])

    sample_ = dict(q0)
    return sample_
def where(condition, x=None, y=None, name=None):
    r"""
    Return a tensor of elements selected from either $x$ or $y$, depending on $condition$.

    **Note**:
        ``paddle.where(condition)`` is identical to ``paddle.nonzero(condition, as_tuple=True)``.

    .. math::

        out_i =
        \begin{cases}
        x_i, \quad \text{if} \ condition_i \ is \ True \\
        y_i, \quad \text{if} \ condition_i \ is \ False \\
        \end{cases}

    Args:
        condition (Tensor): The condition to choose x or y. When True (nonzero), yield x, otherwise yield y.
        x (Tensor or Scalar, optional): x is a Tensor or Scalar with data type float32, float64, int32, int64.
            Either both or neither of x and y should be given.
        y (Tensor or Scalar, optional): y is a Tensor or Scalar with data type float32, float64, int32, int64.
            Either both or neither of x and y should be given.
        name (str, optional): The default value is None. Normally there is no need for user to set this property.
            For more information, please refer to :ref:`api_guide_Name`.

    Returns:
        Tensor: A Tensor with the same data type as x.

    Examples:
        .. code-block:: python

            import paddle

            x = paddle.to_tensor([0.9383, 0.1983, 3.2, 1.2])
            y = paddle.to_tensor([1.0, 1.0, 1.0, 1.0])
            out = paddle.where(x>1, x, y)
            print(out)
            #out: [1.0, 1.0, 3.2, 1.2]

            out = paddle.where(x>1)
            print(out)
            #out: (Tensor(shape=[2, 1], dtype=int64, place=CPUPlace, stop_gradient=True,
            #             [[2],
            #              [3]]),)
    """
    if np.isscalar(x):
        x = paddle.full([1], x, np.array([x]).dtype.name)
    if np.isscalar(y):
        y = paddle.full([1], y, np.array([y]).dtype.name)

    if x is None and y is None:
        return nonzero(condition, as_tuple=True)
    if x is None or y is None:
        raise ValueError("either both or neither of x and y should be given")

    if not paddle.in_dynamic_mode():
        check_variable_and_dtype(condition, 'condition', ['bool'], 'where')
        check_variable_and_dtype(
            x, 'x', ['float32', 'float64', 'int32', 'int64'], 'where')
        check_variable_and_dtype(
            y, 'y', ['float32', 'float64', 'int32', 'int64'], 'where')

    condition_shape = list(condition.shape)
    x_shape = list(x.shape)
    y_shape = list(y.shape)

    if x_shape == y_shape and condition_shape == x_shape:
        broadcast_condition = condition
        broadcast_x = x
        broadcast_y = y
    else:
        if core.is_compiled_with_xpu():
            cond_int = paddle.cast(condition, x.dtype)
            cond_not_int = paddle.cast(logical_not(condition), x.dtype)
            out1 = paddle.multiply(x, cond_int)
            out2 = paddle.multiply(y, cond_not_int)
            out = paddle.add(out1, out2)
            return out

        zeros_like_x = paddle.zeros_like(x)
        zeros_like_y = paddle.zeros_like(y)
        zeros_like_condition = paddle.zeros_like(condition)
        zeros_like_condition = paddle.cast(zeros_like_condition, x.dtype)
        cast_cond = paddle.cast(condition, x.dtype)

        broadcast_zeros = paddle.add(zeros_like_x, zeros_like_y)
        broadcast_zeros = paddle.add(broadcast_zeros, zeros_like_condition)
        broadcast_x = paddle.add(x, broadcast_zeros)
        broadcast_y = paddle.add(y, broadcast_zeros)
        broadcast_condition = paddle.add(cast_cond, broadcast_zeros)
        broadcast_condition = paddle.cast(broadcast_condition, 'bool')

    if in_dygraph_mode():
        return _C_ops.final_state_where(broadcast_condition, broadcast_x,
                                        broadcast_y)
    else:
        if _in_legacy_dygraph():
            return _C_ops.where(broadcast_condition, broadcast_x, broadcast_y)
        else:
            helper = LayerHelper("where", **locals())
            out = helper.create_variable_for_type_inference(dtype=x.dtype)
            helper.append_op(type='where',
                             inputs={
                                 'Condition': broadcast_condition,
                                 'X': broadcast_x,
                                 'Y': broadcast_y
                             },
                             outputs={'Out': [out]})
            return out
def __call__(self, seg_preds, seg_masks, cate_labels, cate_scores,
             sum_masks=None):
    # sort and keep top nms_pre
    sort_inds = self._sort_score(cate_scores, self.pre_nms_top_n)
    seg_masks = paddle.gather(seg_masks, index=sort_inds)
    seg_preds = paddle.gather(seg_preds, index=sort_inds)
    sum_masks = paddle.gather(sum_masks, index=sort_inds)
    cate_scores = paddle.gather(cate_scores, index=sort_inds)
    cate_labels = paddle.gather(cate_labels, index=sort_inds)

    seg_masks = paddle.flatten(seg_masks, start_axis=1, stop_axis=-1)
    # inter.
    inter_matrix = paddle.mm(seg_masks, paddle.transpose(seg_masks, [1, 0]))
    n_samples = paddle.shape(cate_labels)
    # union.
    sum_masks_x = paddle.expand(sum_masks, shape=[n_samples, n_samples])
    # iou.
    iou_matrix = (inter_matrix /
                  (sum_masks_x + paddle.transpose(sum_masks_x, [1, 0]) -
                   inter_matrix))
    iou_matrix = paddle.triu(iou_matrix, diagonal=1)
    # label_specific matrix.
    cate_labels_x = paddle.expand(cate_labels, shape=[n_samples, n_samples])
    label_matrix = paddle.cast(
        (cate_labels_x == paddle.transpose(cate_labels_x, [1, 0])),
        'float32')
    label_matrix = paddle.triu(label_matrix, diagonal=1)

    # IoU compensation
    compensate_iou = paddle.max((iou_matrix * label_matrix), axis=0)
    compensate_iou = paddle.expand(compensate_iou,
                                   shape=[n_samples, n_samples])
    compensate_iou = paddle.transpose(compensate_iou, [1, 0])

    # IoU decay
    decay_iou = iou_matrix * label_matrix

    # matrix nms
    if self.kernel == 'gaussian':
        decay_matrix = paddle.exp(-1 * self.sigma * (decay_iou**2))
        compensate_matrix = paddle.exp(-1 * self.sigma *
                                       (compensate_iou**2))
        decay_coefficient = paddle.min(decay_matrix / compensate_matrix,
                                       axis=0)
    elif self.kernel == 'linear':
        decay_matrix = (1 - decay_iou) / (1 - compensate_iou)
        decay_coefficient = paddle.min(decay_matrix, axis=0)
    else:
        raise NotImplementedError

    # update the score.
    cate_scores = cate_scores * decay_coefficient

    y = paddle.zeros(shape=paddle.shape(cate_scores), dtype='float32')
    keep = paddle.where(cate_scores >= self.update_threshold, cate_scores, y)
    keep = paddle.nonzero(keep)
    keep = paddle.squeeze(keep, axis=[1])
    # Prevent empty and increase fake data
    keep = paddle.concat(
        [keep, paddle.cast(paddle.shape(cate_scores)[0] - 1, 'int64')])

    seg_preds = paddle.gather(seg_preds, index=keep)
    cate_scores = paddle.gather(cate_scores, index=keep)
    cate_labels = paddle.gather(cate_labels, index=keep)

    # sort and keep top_k
    sort_inds = self._sort_score(cate_scores, self.post_nms_top_n)
    seg_preds = paddle.gather(seg_preds, index=sort_inds)
    cate_scores = paddle.gather(cate_scores, index=sort_inds)
    cate_labels = paddle.gather(cate_labels, index=sort_inds)
    return seg_preds, cate_scores, cate_labels
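# A toy sketch of the Matrix NMS gaussian decay above: given a hand-written
# upper-triangular IoU matrix between same-class masks, each candidate's score
# is decayed by the most-overlapping higher-ranked candidate, normalized by
# its own compensation term. sigma = 2.0 is an illustrative value.
import paddle

sigma = 2.0
decay_iou = paddle.to_tensor([[0.0, 0.8, 0.1],
                              [0.0, 0.0, 0.2],
                              [0.0, 0.0, 0.0]])
compensate_iou = paddle.max(decay_iou, axis=0)  # per-candidate max IoU
compensate_iou = paddle.expand(compensate_iou.unsqueeze(0),
                               [3, 3]).transpose([1, 0])
decay = (paddle.exp(-sigma * decay_iou**2) /
         paddle.exp(-sigma * compensate_iou**2))
print(paddle.min(decay, axis=0))  # heavy decay for the 0.8-IoU duplicate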
def dec2bin(self, x, bits):
    mask = paddle.arange(bits - 1, -1, -1, dtype=paddle.float32)
    mask = paddle.cast(2**mask, dtype=paddle.int64)
    return paddle.not_equal(
        x.unsqueeze(-1).bitwise_and(mask),
        paddle.full(shape=[1], fill_value=0, dtype=paddle.int64))
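# A standalone usage sketch of the dec2bin logic above: 2**mask builds the
# bit-value ladder [4, 2, 1] for bits=3, and the bitwise_and / not_equal pair
# extracts one boolean per bit, most significant first.
import paddle

x = paddle.to_tensor([5], dtype=paddle.int64)
bits = 3
mask = paddle.cast(2**paddle.arange(bits - 1, -1, -1, dtype=paddle.float32),
                   dtype=paddle.int64)
out = paddle.not_equal(x.unsqueeze(-1).bitwise_and(mask),
                       paddle.full([1], 0, dtype=paddle.int64))
print(out)  # [[True, False, True]] -> 5 == 0b101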
def median(x, axis=None, keepdim=False, name=None):
    """
    Compute the median along the specified axis.

    Args:
        x (Tensor): The input Tensor, it's data type can be bool, float16, float32, float64, int32, int64.
        axis (int, optional): The axis along which to perform median calculations. ``axis`` should be int.
            ``axis`` should be in range [-D, D), where D is the dimensions of ``x``.
            If ``axis`` is less than 0, it works the same way as :math:`axis + D`.
            If ``axis`` is None, median is calculated over all elements of ``x``. Default is None.
        keepdim (bool, optional): Whether to reserve the reduced dimension(s) in the output Tensor.
            If ``keepdim`` is True, the dimensions of the output Tensor are the same as ``x``
            except in the reduced dimensions (it is of size 1 in this case). Otherwise, the shape of
            the output Tensor is squeezed in ``axis``. Default is False.
        name (str, optional): Name for the operation (optional, default is None).
            For more information, please refer to :ref:`api_guide_Name`.

    Returns:
        Tensor, results of median along ``axis`` of ``x``. If data type of ``x`` is float64,
        data type of results will be float64, otherwise data type will be float32.

    Examples:
        .. code-block:: python

            import paddle

            x = paddle.arange(12).reshape([3, 4])
            # x is [[0 , 1 , 2 , 3 ],
            #       [4 , 5 , 6 , 7 ],
            #       [8 , 9 , 10, 11]]

            y1 = paddle.median(x)
            # y1 is [5.5]

            y2 = paddle.median(x, axis=0)
            # y2 is [4., 5., 6., 7.]

            y3 = paddle.median(x, axis=1)
            # y3 is [1.5, 5.5, 9.5]

            y4 = paddle.median(x, axis=0, keepdim=True)
            # y4 is [[4., 5., 6., 7.]]
    """
    if not isinstance(x, Variable):
        raise TypeError("In median, the input x should be a Tensor.")
    is_flatten = axis is None
    dims = len(x.shape)
    if is_flatten:
        x = paddle.flatten(x)
        axis = 0
    else:
        if not isinstance(axis, int) or not (axis < dims and axis >= -dims):
            raise ValueError(
                "In median, axis should be none or an integer in range [-rank(x), rank(x))."
            )
        if axis < 0:
            axis += dims
    sz = x.shape[axis]
    kth = sz >> 1
    tensor_topk, idx = paddle.topk(x, kth + 1, axis=axis, largest=False)
    dtype = 'float64' if x.dtype == core.VarDesc.VarType.FP64 else 'float32'
    if sz & 1 == 0:
        # Even length: the median is the mean of the two middle values.
        out_tensor = paddle.slice(
            tensor_topk, axes=[axis], starts=[kth - 1],
            ends=[kth]) + paddle.slice(
                tensor_topk, axes=[axis], starts=[kth], ends=[kth + 1])
        out_tensor = paddle.cast(out_tensor, dtype=dtype) / 2
    else:
        out_tensor = paddle.cast(paddle.slice(tensor_topk,
                                              axes=[axis],
                                              starts=[kth],
                                              ends=[kth + 1]),
                                 dtype=dtype)
    if not keepdim or is_flatten:
        if not is_flatten:
            newshape = x.shape[:axis] + x.shape[axis + 1:]
        elif not keepdim:
            newshape = [1]
        else:
            newshape = [1] * dims
    else:
        newshape = out_tensor.shape
    out_tensor = out_tensor.reshape(newshape, name=name)
    return out_tensor
def create_loss(self, pred, label):
    cost = paddle.nn.functional.log_loss(
        input=pred, label=paddle.cast(label, dtype="float32"))
    avg_cost = paddle.mean(x=cost)
    return avg_cost
def beam_search(self, x, beam_width, eos, embed):
    def _inflate(tensor, times, dim):
        repeat_dims = [1] * tensor.dim()
        repeat_dims[dim] = times
        output = paddle.tile(tensor, repeat_dims)
        return output

    # https://github.com/IBM/pytorch-seq2seq/blob/fede87655ddce6c94b38886089e05321dc9802af/seq2seq/models/TopKDecoder.py
    batch_size, l, d = x.shape
    x = paddle.tile(paddle.transpose(x.unsqueeze(1), perm=[1, 0, 2, 3]),
                    [beam_width, 1, 1, 1])
    inflated_encoder_feats = paddle.reshape(
        paddle.transpose(x, perm=[1, 0, 2, 3]), [-1, l, d])

    # Initialize the decoder
    state = self.decoder.get_initial_state(embed, tile_times=beam_width)

    pos_index = paddle.reshape(paddle.arange(batch_size) * beam_width,
                               shape=[-1, 1])

    # Initialize the scores
    sequence_scores = paddle.full(shape=[batch_size * beam_width, 1],
                                  fill_value=-float('Inf'))
    index = [i * beam_width for i in range(0, batch_size)]
    sequence_scores[index] = 0.0

    # Initialize the input vector
    y_prev = paddle.full(shape=[batch_size * beam_width],
                         fill_value=self.num_classes)

    # Store decisions for backtracking
    stored_scores = list()
    stored_predecessors = list()
    stored_emitted_symbols = list()

    for i in range(self.max_len_labels):
        output, state = self.decoder(inflated_encoder_feats, state, y_prev)
        state = paddle.unsqueeze(state, axis=0)
        log_softmax_output = paddle.nn.functional.log_softmax(output, axis=1)

        sequence_scores = _inflate(sequence_scores, self.num_classes, 1)
        sequence_scores += log_softmax_output
        scores, candidates = paddle.topk(paddle.reshape(
            sequence_scores, [batch_size, -1]),
                                         beam_width,
                                         axis=1)

        # Reshape input = (bk, 1) and sequence_scores = (bk, 1)
        y_prev = paddle.reshape(candidates % self.num_classes,
                                shape=[batch_size * beam_width])
        sequence_scores = paddle.reshape(
            scores, shape=[batch_size * beam_width, 1])

        # Update fields for next timestep
        pos_index = paddle.expand_as(pos_index, candidates)
        predecessors = paddle.cast(
            candidates / self.num_classes + pos_index, dtype='int64')
        predecessors = paddle.reshape(predecessors,
                                      shape=[batch_size * beam_width, 1])
        state = paddle.index_select(state,
                                    index=predecessors.squeeze(),
                                    axis=1)

        # Update sequence scores and erase scores for <eos> symbol so that
        # they aren't expanded
        stored_scores.append(sequence_scores.clone())
        y_prev = paddle.reshape(y_prev, shape=[-1, 1])
        eos_prev = paddle.full_like(y_prev, fill_value=eos)
        mask = eos_prev == y_prev
        mask = paddle.nonzero(mask)
        if mask.dim() > 0:
            sequence_scores = sequence_scores.numpy()
            mask = mask.numpy()
            sequence_scores[mask] = -float('inf')
            sequence_scores = paddle.to_tensor(sequence_scores)

        # Cache results for backtracking
        stored_predecessors.append(predecessors)
        y_prev = paddle.squeeze(y_prev)
        stored_emitted_symbols.append(y_prev)

    # Do backtracking to return the optimal values
    # ====== backtrack ======
    # Initialize return variables given different types
    p = list()
    l = [[self.max_len_labels] * beam_width
         for _ in range(batch_size)]  # Placeholder for lengths of top-k sequences

    # The last step output of the beams are not sorted,
    # thus they are sorted here.
    sorted_score, sorted_idx = paddle.topk(
        paddle.reshape(stored_scores[-1], shape=[batch_size, beam_width]),
        beam_width)

    # Initialize the sequence scores with the sorted last step beam scores
    s = sorted_score.clone()

    batch_eos_found = [0] * batch_size  # the number of EOS found
    # in the backward loop below for each batch

    t = self.max_len_labels - 1
    # Initialize the back pointer with the sorted order of the last step beams.
    # Add pos_index for indexing variable with b*k as the first dimension.
    t_predecessors = paddle.reshape(
        sorted_idx + pos_index.expand_as(sorted_idx),
        shape=[batch_size * beam_width])
    while t >= 0:
        # Re-order the variables with the back pointer
        current_symbol = paddle.index_select(stored_emitted_symbols[t],
                                             index=t_predecessors,
                                             axis=0)
        t_predecessors = paddle.index_select(
            stored_predecessors[t].squeeze(), index=t_predecessors, axis=0)
        eos_indices = stored_emitted_symbols[t] == eos
        eos_indices = paddle.nonzero(eos_indices)
        if eos_indices.dim() > 0:
            for i in range(eos_indices.shape[0] - 1, -1, -1):
                # Indices of the EOS symbol for both variables
                # with b*k as the first dimension, and b, k for
                # the first two dimensions
                idx = eos_indices[i]
                b_idx = int(idx[0] / beam_width)
                # The indices of the replacing position
                # according to the replacement strategy noted above
                res_k_idx = beam_width - (batch_eos_found[b_idx] %
                                          beam_width) - 1
                batch_eos_found[b_idx] += 1
                res_idx = b_idx * beam_width + res_k_idx

                # Replace the old information in return variables
                # with the new ended sequence information
                t_predecessors[res_idx] = stored_predecessors[t][idx[0]]
                current_symbol[res_idx] = stored_emitted_symbols[t][idx[0]]
                s[b_idx, res_k_idx] = stored_scores[t][idx[0], 0]
                l[b_idx][res_k_idx] = t + 1

        # record the back tracked results
        p.append(current_symbol)
        t -= 1

    # Sort and re-order again as the added ended sequences may change
    # the order (very unlikely)
    s, re_sorted_idx = s.topk(beam_width)
    for b_idx in range(batch_size):
        l[b_idx] = [
            l[b_idx][k_idx.item()] for k_idx in re_sorted_idx[b_idx, :]
        ]

    re_sorted_idx = paddle.reshape(
        re_sorted_idx + pos_index.expand_as(re_sorted_idx),
        [batch_size * beam_width])

    # Reverse the sequences and re-order at the same time.
    # It is reversed because the backtracking happens in reverse time order.
    p = [
        paddle.reshape(paddle.index_select(step, re_sorted_idx, 0),
                       shape=[batch_size, beam_width, -1])
        for step in reversed(p)
    ]
    p = paddle.concat(p, -1)[:, 0, :]
    return p, paddle.ones_like(p)
def do_eval(args):
    paddle.set_device(args.device)
    model_class, tokenizer_class = MODEL_CLASSES["gpt"]
    tokenizer = tokenizer_class.from_pretrained(args.model_name)

    if args.init_checkpoint_path is not None:
        model = GPTForPretraining(
            GPTModel(
                **model_class.pretrained_init_configuration[args.model_name]))

        logger.info("Load model checkpoint from %s" %
                    args.init_checkpoint_path)
        model_dict = paddle.load(os.path.join(args.init_checkpoint_path))
        model.set_dict(model_dict)
    else:
        model = model_class.from_pretrained(args.model_name)

    tic_eval = time.time()
    eval_data_loader = create_eval_dataset(args)
    model.eval()
    total_score = 0
    score_name = "loss" if not args.cloze_eval else "number correct"
    with paddle.no_grad():
        for step, batch in enumerate(eval_data_loader):
            tokens, loss_mask, attention_mask, position_ids, labels = batch
            preds = model(tokens, position_ids, attention_mask)
            if not args.cloze_eval:
                masked_lm_loss = paddle.nn.functional.cross_entropy(
                    preds, labels, reduction="none")
                loss = paddle.sum(masked_lm_loss * loss_mask)
                total_score += loss.numpy() / (args.num_tokenized_tokens - 1)
            else:
                outputs = paddle.argmax(preds, -1)
                acc = paddle.cast(outputs == labels, 'float32')
                acc = paddle.where(paddle.cast(loss_mask, 'bool'), acc,
                                   paddle.ones_like(acc))
                acc = paddle.sum(paddle.prod(acc, -1))
                total_score += acc.numpy()
            if step % args.logging_steps == 0:
                logger.info(
                    "step %d, batch: %d, %s: %f, speed: %.2f step/s" %
                    (step, step, score_name, total_score,
                     args.logging_steps / (time.time() - tic_eval)))
                tic_eval = time.time()

    if not args.cloze_eval:
        total_loss = float(total_score)
        ppl = math.exp(min(20, total_loss))
        token_ratio = (args.num_tokenized_tokens - 1) / (
            args.num_original_tokens - 1)
        adjusted_ppl = math.exp(min(20, total_loss * token_ratio))
        string = ' validation results on {} | '.format(args.eval_path)
        string += 'avg loss: {:.4E} | '.format(total_loss)
        string += 'ppl: {:.4E} | '.format(ppl)
        string += 'adjusted ppl: {:.4E} | '.format(adjusted_ppl)
        string += 'token ratio: {} |'.format(token_ratio)
    else:
        num_correct = float(total_score)
        acc = float(num_correct / args.num_examples)
        string = ' validation results on {} | '.format(args.eval_path)
        string += 'number correct: {:.4E} | '.format(num_correct)
        string += 'total examples: {:.4E} | '.format(args.num_examples)
        string += 'avg accuracy: {:.4E}'.format(acc)
    logger.info(string)
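# A small numeric sketch of the adjusted-perplexity arithmetic above, with
# made-up token counts: the per-token loss is rescaled by the ratio of
# tokenizer tokens to original tokens before exponentiating, so models with
# different tokenizations stay comparable.
import math

total_loss = 3.2               # illustrative average loss per token
num_tokenized_tokens = 120000  # illustrative
num_original_tokens = 100000   # illustrative
token_ratio = (num_tokenized_tokens - 1) / (num_original_tokens - 1)
ppl = math.exp(min(20, total_loss))
adjusted_ppl = math.exp(min(20, total_loss * token_ratio))
print(round(ppl, 2), round(adjusted_ppl, 2))  # ~24.53 ~46.53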