def forward(self, input): if _global_parallel_strategy == "dp": auto.shard_tensor(input, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [0, -1, -1] }) elif _global_parallel_strategy == "dp_mp": auto.shard_tensor(input, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [0, -1, -1] }) q = self.q_proj(input) q = tensor.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim]) q = tensor.transpose(x=q, perm=[0, 2, 1, 3]) k = self.k_proj(input) v = self.v_proj(input) if _global_parallel_strategy == "mp": auto.shard_tensor(self.q_proj.weight, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [-1, 0] }) auto.shard_tensor(self.k_proj.weight, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [-1, 0] }) auto.shard_tensor(self.v_proj.weight, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [-1, 0] }) elif _global_parallel_strategy == "dp_mp": auto.shard_tensor(self.q_proj.weight, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [-1, 1] }) auto.shard_tensor(self.k_proj.weight, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [-1, 1] }) auto.shard_tensor(self.v_proj.weight, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [-1, 1] }) k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim]) k = tensor.transpose(x=k, perm=[0, 2, 1, 3]) v = tensor.reshape(x=v, shape=[0, 0, self.num_heads, self.head_dim]) v = tensor.transpose(x=v, perm=[0, 2, 1, 3]) # scale dot product attention product = layers.matmul(x=q, y=k, transpose_y=True, alpha=self.head_dim**-0.5) if self.attn_mask is not None: product = product + self.attn_mask weights = F.softmax(product) if self.dropout_ratio: weights = F.dropout(weights, self.dropout_ratio, training=self.training, mode="upscale_in_train") out = tensor.matmul(weights, v) # combine heads out = tensor.transpose(out, perm=[0, 2, 1, 3]) out = tensor.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) # project to output out = self.out_proj(out) if _global_parallel_strategy == "mp": auto.shard_tensor(self.out_proj.weight, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [0, -1] }) elif _global_parallel_strategy == "dp_mp": auto.shard_tensor(self.out_proj.weight, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [1, -1] }) return out
def greedy_search(self, src_word, max_len=256, waitk=-1, caches=None, bos_id=None):
    """
    greedy_search uses a streaming reader. It does not need to call the
    encoder many times; each sub-sentence only needs to call the encoder
    once. Therefore it takes the previous states (caches) and the last
    generated token id from the previous call.
    """
    src_max_len = paddle.shape(src_word)[-1]
    base_attn_bias = paddle.cast(
        src_word == self.bos_id,
        dtype=paddle.get_default_dtype()).unsqueeze([1, 2]) * -1e9
    src_slf_attn_bias = base_attn_bias
    src_slf_attn_bias.stop_gradient = True
    trg_src_attn_bias = paddle.tile(base_attn_bias, [1, 1, 1, 1])
    src_pos = paddle.cast(src_word != self.bos_id,
                          dtype="int64") * paddle.arange(
                              start=0, end=src_max_len)
    src_emb = self.src_word_embedding(src_word)
    src_pos_emb = self.src_pos_embedding(src_pos)
    src_emb = src_emb + src_pos_emb
    enc_input = F.dropout(
        src_emb, p=self.dropout,
        training=self.training) if self.dropout else src_emb
    enc_outputs = [self.encoder(enc_input, src_mask=src_slf_attn_bias)]

    # constant number
    batch_size = enc_outputs[-1].shape[0]
    max_len = (enc_outputs[-1].shape[1] + 20) if max_len is None else max_len
    end_token_tensor = paddle.full(
        shape=[batch_size, 1], fill_value=self.eos_id, dtype="int64")

    predict_ids = []
    log_probs = paddle.full(
        shape=[batch_size, 1], fill_value=0, dtype="float32")
    if not bos_id:
        trg_word = paddle.full(
            shape=[batch_size, 1], fill_value=self.bos_id, dtype="int64")
    else:
        trg_word = paddle.full(
            shape=[batch_size, 1], fill_value=bos_id, dtype="int64")

    # init states (caches) for transformer
    if not caches:
        caches = self.decoder.gen_cache(enc_outputs[-1], do_zip=False)

    for i in range(max_len):
        trg_pos = paddle.full(
            shape=trg_word.shape, fill_value=i, dtype="int64")
        trg_emb = self.trg_word_embedding(trg_word)
        trg_pos_emb = self.trg_pos_embedding(trg_pos)
        trg_emb = trg_emb + trg_pos_emb
        dec_input = F.dropout(
            trg_emb, p=self.dropout,
            training=self.training) if self.dropout else trg_emb

        if waitk < 0 or i >= len(enc_outputs):
            # if the decoder step is full sent or longer than all source
            # step, then read the whole src
            _e = enc_outputs[-1]
            dec_output, caches = self.decoder(
                dec_input, [_e], None,
                trg_src_attn_bias[:, :, :, :_e.shape[1]], caches)
        else:
            _e = enc_outputs[i]
            dec_output, caches = self.decoder(
                dec_input, [_e], None,
                trg_src_attn_bias[:, :, :, :_e.shape[1]], caches)

        dec_output = paddle.reshape(
            dec_output, shape=[-1, dec_output.shape[-1]])

        logits = self.linear(dec_output)
        step_log_probs = paddle.log(F.softmax(logits, axis=-1))
        log_probs = paddle.add(x=step_log_probs, y=log_probs)
        scores = log_probs
        topk_scores, topk_indices = paddle.topk(x=scores, k=1)

        finished = paddle.equal(topk_indices, end_token_tensor)
        trg_word = topk_indices
        log_probs = topk_scores
        predict_ids.append(topk_indices)

        if paddle.all(finished).numpy():
            break

    predict_ids = paddle.stack(predict_ids, axis=0)
    finished_seq = paddle.transpose(predict_ids, [1, 2, 0])
    finished_scores = topk_scores
    return finished_seq, finished_scores, caches
def get_prediction(self, score, delta):
    bbox_prob = F.softmax(score)
    return delta, bbox_prob
def main(args):
    """The main function.

    Args:
        args: configs.

    Raises:
        ValueError: if the args is invalid.
    """
    model_config = json.load(open(args.model_config, 'r'))
    paddle.set_device("gpu" if args.use_cuda else "cpu")
    rank = 0
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()
        # the current process id
        rank = paddle.distributed.get_rank()

    train_dataset = SecondaryStructureDataset(
        base_path=args.train_data, mode='train', num_classes=3)
    train_loader = create_dataloader(
        train_dataset,
        mode='train',
        batch_size=args.batch_size,
        pad_token_id=ProteinTokenizer.padding_token_id)
    test_dataset = SecondaryStructureDataset(
        base_path=args.train_data, mode='test', num_classes=3)
    test_loader = create_dataloader(
        test_dataset,
        mode='test',
        batch_size=args.batch_size,
        pad_token_id=ProteinTokenizer.padding_token_id)

    if model_config["model_type"] == "transformer":
        model = TransformerSeqClassificationModel(
            vocab_size=len(ProteinTokenizer.vocab),
            num_class=model_config['class_num'],
            emb_dim=model_config['hidden_size'])
    elif model_config["model_type"] == "lstm":
        model = LstmSeqClassificationModel(
            vocab_size=len(ProteinTokenizer.vocab),
            num_class=model_config['class_num'],
            emb_dim=model_config['hidden_size'])
    else:
        raise ValueError("Not available {}".format(model_config["model_type"]))

    if os.path.exists(args.init_model):
        param_state_dict = paddle.load(args.init_model)
        model.set_dict(param_state_dict)
        print("Loaded model parameters from %s" % args.init_model)

    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    if args.warmup_steps > 0:
        lr_scheduler = paddle.optimizer.lr.NoamDecay(
            1 / (args.warmup_steps * (args.lr**2)), args.warmup_steps)
    else:
        lr_scheduler = args.lr
    max_grad_norm = 0.1
    grad_clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=max_grad_norm)
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        parameters=model.parameters(),
        weight_decay=model_config['weight_decay'],
        grad_clip=grad_clip)
    criterion = paddle.nn.CrossEntropyLoss(ignore_index=-1)
    metric = ClassificationMetric()

    for epoch in range(args.epoch):
        loss_sum = 0
        model.train()
        start_time = time.time()
        for i, (texts, seq_lens, labels) in enumerate(train_loader, start=1):
            labels = labels.reshape([-1, 1])
            mask = labels != -1
            logits = model(texts, seq_lens)
            logits = logits.reshape([-1, logits.shape[-1]])
            # mask, remove the effect of 'PAD'
            mask = paddle.cast(mask, dtype='float32')
            inf_tensor = paddle.full(
                shape=mask.shape, dtype='float32', fill_value=-1. * 1e12)
            logits = paddle.multiply(logits, mask) + paddle.multiply(
                inf_tensor, (1 - mask))
            probs = F.softmax(logits, axis=1)
            loss = criterion(probs, labels)
            loss_sum += loss.numpy()
            loss.backward()
            optimizer.step()
            optimizer.clear_gradients()
            probs = probs.numpy()
            labels = labels.numpy()
            metric.update(probs, labels)
            if i % 10 == 0:
                print('epoch %d, step %d, avg loss %.5f' %
                      (epoch, i, loss_sum / 10))
                metric.show()
                loss_sum = 0
                metric.clear()

        if rank == 0:
            print('Test:')
            avg_loss = eval(model, test_loader, criterion, metric)
            print("Average loss: %.5f" % avg_loss)
            print("Save model epoch%d." % epoch)
            param_path = os.path.join(args.model, 'epoch%d' % epoch,
                                      'saved_params.pdparams')
            opt_path = os.path.join(args.model, 'epoch%d' % epoch,
                                    'saved_opt.pdopt')
            paddle.save(model.state_dict(), param_path)
            paddle.save(optimizer.state_dict(), opt_path)
def do_predict():
    paddle.set_device(args.device)

    no_entity_label = "O"
    ignore_label = -1

    tokenizer = ErnieTokenizer.from_pretrained("ernie-1.0")
    label_map = load_dict(args.tag_path)
    id2label = {val: key for key, val in label_map.items()}
    model = ErnieForTokenClassification.from_pretrained(
        "ernie-1.0", num_classes=len(label_map))

    print("============start predict==========")
    if not args.init_ckpt or not os.path.isfile(args.init_ckpt):
        raise Exception("init checkpoint {} does not exist".format(
            args.init_ckpt))
    else:
        state_dict = paddle.load(args.init_ckpt)
        model.set_dict(state_dict)
        print("Loaded parameters from %s" % args.init_ckpt)

    # load data from predict file
    sentences = read_by_lines(args.predict_data)  # origin data format
    sentences = [json.loads(sent) for sent in sentences]

    encoded_inputs_list = []
    for sent in sentences:
        sent = sent["text"].replace(" ", "\002")
        input_ids, token_type_ids, seq_len = convert_example_to_feature(
            [list(sent), []],
            tokenizer,
            max_seq_len=args.max_seq_len,
            is_test=True)
        encoded_inputs_list.append((input_ids, token_type_ids, seq_len))

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]),  # input_ids
        Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]),  # token_type_ids
        Stack()  # sequence lens
    ): fn(samples)
    # Separates data into some batches.
    batch_encoded_inputs = [
        encoded_inputs_list[i:i + args.batch_size]
        for i in range(0, len(encoded_inputs_list), args.batch_size)
    ]
    results = []
    model.eval()
    for batch in batch_encoded_inputs:
        input_ids, token_type_ids, seq_lens = batchify_fn(batch)
        input_ids = paddle.to_tensor(input_ids)
        token_type_ids = paddle.to_tensor(token_type_ids)
        logits = model(input_ids, token_type_ids)
        probs = F.softmax(logits, axis=-1)
        probs_ids = paddle.argmax(probs, -1).numpy()
        probs = probs.numpy()
        for p_list, p_ids, seq_len in zip(probs.tolist(), probs_ids.tolist(),
                                          seq_lens.tolist()):
            prob_one = [
                p_list[index][pid]
                for index, pid in enumerate(p_ids[1:seq_len - 1])
            ]
            label_one = [id2label[pid] for pid in p_ids[1:seq_len - 1]]
            results.append({"probs": prob_one, "labels": label_one})
    assert len(results) == len(sentences)
    for sent, ret in zip(sentences, results):
        sent["pred"] = ret
    sentences = [json.dumps(sent, ensure_ascii=False) for sent in sentences]
    write_by_lines(args.predict_save_path, sentences)
    print("save data {} to {}".format(len(sentences), args.predict_save_path))
def embedded_gaussian(self, theta_x, phi_x):
    pairwise_weight = paddle.matmul(theta_x, phi_x)
    if self.use_scale:
        pairwise_weight /= theta_x.shape[-1]**0.5
    pairwise_weight = F.softmax(pairwise_weight, -1)
    return pairwise_weight
def forward(self, query, key, value, attn_mask=None, use_cache=False, cache=None): r""" Applies multi-head attention to map queries and a set of key-value pairs to outputs. """ key = query if key is None else key value = query if value is None else value # compute q ,k ,v if use_cache is False: if self.fuse: q, k, v = self._fuse_prepare_qkv(query) else: q, k, v = self._prepare_qkv(query, key, value, use_cache, cache) else: q, k, v, cache = self._prepare_qkv(query, key, value, use_cache, cache) # scale dot product attention product = layers.matmul(x=q, y=k, transpose_y=True, alpha=self.head_dim**-0.5) if attn_mask is not None: product = product + attn_mask weights = F.softmax(product) if self.dropout: weights = F.dropout(weights, self.dropout, training=self.training, mode="upscale_in_train") out = tensor.matmul(weights, v) # combine heads out = tensor.transpose(out, perm=[0, 2, 1, 3]) out = tensor.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) # project to output out = self.out_proj(out) if _global_parallel_strategy == "mp": auto.shard_tensor(self.out_proj.weight, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [0, -1] }) elif _global_parallel_strategy == "dp_mp": auto.shard_tensor(self.out_proj.weight, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [1, -1] }) outs = [out] if self.need_weights: outs.append(weights) if use_cache: outs.append(cache) return out if len(outs) == 1 else tuple(outs)
def forward(self, w, r_emb, r_w_bias, r_bias, attn_mask=None, mems=None):
    qlen, bsz = w.shape[1], w.shape[0]

    if mems is not None:
        cat = paddle.concat([mems, w], 1)
        if self.normalize_before:
            w_heads = self.qkv_proj(self.layer_norm(cat))
        else:
            w_heads = self.qkv_proj(cat)
        w_head_q, w_head_k, w_head_v = paddle.chunk(
            w_heads, chunks=3, axis=-1)
        # keep only the queries for the current segment (time axis is 1)
        w_head_q = w_head_q[:, -qlen:, :]
    else:
        if self.normalize_before:
            w_heads = self.qkv_proj(self.layer_norm(w))
        else:
            w_heads = self.qkv_proj(w)
        w_head_q, w_head_k, w_head_v = paddle.chunk(
            w_heads, chunks=3, axis=-1)

    klen = w_head_k.shape[1]

    w_head_q = paddle.reshape(
        w_head_q,
        shape=[w_head_q.shape[0], w_head_q.shape[1], self.n_head, self.d_head])
    w_head_k = paddle.reshape(
        w_head_k,
        shape=[w_head_k.shape[0], w_head_k.shape[1], self.n_head, self.d_head])
    w_head_v = paddle.reshape(
        w_head_v,
        shape=[w_head_v.shape[0], w_head_v.shape[1], self.n_head, self.d_head])

    if klen > r_emb.shape[0]:
        r_emb_pad = r_emb[0:1].expand(klen - r_emb.shape[0], -1, -1)
        r_emb = paddle.concat([r_emb_pad, r_emb], 0)
        r_bias_pad = r_bias[0:1].expand(klen - r_bias.shape[0], -1)
        r_bias = paddle.concat([r_bias_pad, r_bias], 0)
    else:
        r_emb = r_emb[-klen:]
        r_bias = r_bias[-klen:]

    rw_head_q = w_head_q + r_w_bias.unsqueeze([0])

    AC = einsum('bind,bjnd->bnij', rw_head_q, w_head_k)
    r_emb = r_emb.unsqueeze([0]).expand([bsz, -1, -1, -1])
    B_ = einsum('bind,bjnd->bnij', w_head_q, r_emb)
    D_ = r_bias.unsqueeze([0, 2])
    BD = self._rel_shift(B_ + D_)

    attn_score = AC + BD
    attn_score = attn_score * self.scale

    if attn_mask is not None:
        attn_score = attn_score - float('inf') * attn_mask

    attn_prob = F.softmax(attn_score, axis=-1)
    attn_prob = self.attn_drop(attn_prob)

    attn_vec = einsum('bnij,bjnd->bind', attn_prob, w_head_v)
    attn_vec = paddle.reshape(
        attn_vec,
        shape=[attn_vec.shape[0], attn_vec.shape[1], self.n_head * self.d_head])

    attn_out = self.o_net(attn_vec)
    attn_out = self.drop(attn_out)

    if self.normalize_before:
        output = w + attn_out
    else:
        output = self.layer_norm(w + attn_out)

    return output
def forward(self, logit, label): """ Forward computation. Args: logit (Tensor): Logit tensor, the data type is float32, float64. Shape is (N, C), where C is number of classes, and if shape is more than 2D, this is (N, C, D1, D2,..., Dk), k >= 1. label (Tensor): Label tensor, the data type is int64. Shape is (N), where each value is 0 <= label[i] <= C-1, and if shape is more than 2D, this is (N, D1, D2,..., Dk), k >= 1. """ if len(label.shape) != len(logit.shape): label = paddle.unsqueeze(label, 1) # get the label after ohem n, c, h, w = logit.shape label = label.reshape((-1, )) valid_mask = (label != self.ignore_index).astype('int64') num_valid = valid_mask.sum() label = label * valid_mask prob = F.softmax(logit, axis=1) prob = prob.transpose((1, 0, 2, 3)).reshape((c, -1)) if self.min_kept < num_valid and num_valid > 0: # let the value which ignored greater than 1 prob = prob + (1 - valid_mask) # get the prob of relevant label label_onehot = F.one_hot(label, c) label_onehot = label_onehot.transpose((1, 0)) prob = prob * label_onehot prob = paddle.sum(prob, axis=0) threshold = self.thresh if self.min_kept > 0: index = prob.argsort() threshold_index = index[min(len(index), self.min_kept) - 1] threshold_index = int(threshold_index.numpy()[0]) if prob[threshold_index] > self.thresh: threshold = prob[threshold_index] kept_mask = (prob < threshold).astype('int64') label = label * kept_mask valid_mask = valid_mask * kept_mask # make the invalid region as ignore label = label + (1 - valid_mask) * self.ignore_index label = label.reshape((n, 1, h, w)) valid_mask = valid_mask.reshape((n, 1, h, w)).astype('float32') loss = F.softmax_with_cross_entropy(logit, label, ignore_index=self.ignore_index, axis=1) loss = loss * valid_mask avg_loss = paddle.mean(loss) / (paddle.mean(valid_mask) + self.EPS) label.stop_gradient = True valid_mask.stop_gradient = True return avg_loss
def forward(self, hidden, target, keep_order=False): assert (hidden.shape[0] == target.shape[0]) if self.num_clusters == 0: logit = self._compute_logits(hidden, self.out_layers_weight[0], self.out_layers_bias[0], self.out_projs[0]) nll = -paddle.log(F.softmax(logit, axis=-1)) idx = paddle.concat([ paddle.arange(0, nll.shape[0]).unsqueeze([1]), target.unsqueeze(1) ], axis=1) nll = paddle.gather_nd(nll, idx) else: weights, biases = [], [] for i in range(len(self.cutoffs)): if self.div_val == 1: l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] weight_i = self.out_layers_weight[0][l_idx:r_idx] bias_i = self.out_layers_bias[0][l_idx:r_idx] else: weight_i = self.out_layers_weight[i] bias_i = self.out_layers_bias[i] if i == 0: weight_i = paddle.concat([weight_i, self.cluster_weight], axis=0) bias_i = paddle.concat([bias_i, self.cluster_bias], axis=0) weights.append(weight_i) biases.append(bias_i) head_weight, head_bias, head_proj = weights[0], biases[ 0], self.out_projs[0] head_logit = self._compute_logits(hidden, head_weight, head_bias, head_proj) head_logprob = paddle.log(F.softmax(head_logit, axis=-1)) nll = paddle.zeros_like(target, dtype=hidden.dtype) offset = 0 cutoff_values = [0] + self.cutoffs for i in range(len(cutoff_values) - 1): l_idx, r_idx = cutoff_values[i], cutoff_values[i + 1] mask_i = paddle.cast( target >= l_idx, dtype=paddle.get_default_dtype()) * paddle.cast( target < r_idx, dtype="int64") indices_i = paddle.nonzero(mask_i).squeeze([1]) if paddle.numel(indices_i) == 0: continue target_i = paddle.gather(target, indices_i, axis=0) - l_idx head_logprob_i = paddle.gather(head_logprob, indices_i, axis=0) if i == 0: target_i_idx = paddle.concat([ paddle.arange(0, head_logprob_i.shape[0]).unsqueeze( [1]), target_i.unsqueeze([1]) ], axis=1) logprob_i = head_logprob_i.gather_nd(target_i_idx) else: weight_i, bias_i, proj_i = weights[i], biases[ i], self.out_projs[i].weight if self.out_projs[ i] is not None else None hidden_i = paddle.gather(hidden, indices_i, axis=0) tail_logit_i = self._compute_logits( hidden_i, weight_i, bias_i, proj_i) tail_logprob_i = paddle.log( F.softmax(tail_logit_i, axis=-1)) target_i_idx = paddle.concat([ paddle.arange(0, tail_logprob_i.shape[0]).unsqueeze( [1]), target_i.unsqueeze([1]) ], axis=1) logprob_i = tail_logprob_i.gather_nd(target_i_idx) logprob_i = head_logprob_i[:, -i] + logprob_i if self.keep_order or keep_order: nll = paddle.scatter(nll, indices_i, -logprob_i) else: index = paddle.arange(offset, offset + logprob_i.shape[0], 1) nll = paddle.scatter(nll, index, -logprob_i) offset += logprob_i.shape[0] return nll
def forward(self, w, r, r_w_bias, r_r_bias, attn_mask=None, mems=None): qlen, rlen, bsz = w.shape[1], r.shape[1], w.shape[0] if mems is not None: cat = paddle.concat([mems, w], axis=1) if self.normalize_before: w_heads = self.qkv_proj(self.layer_norm(cat)) else: w_heads = self.qkv_proj(cat) r_head_k = self.r_proj(r) w_head_q, w_head_k, w_head_v = paddle.chunk(w_heads, chunks=3, axis=-1) w_head_q = w_head_q[:, -qlen:, :] else: if self.normalize_before: w_heads = self.qkv_proj(self.layer_norm(w)) else: w_heads = self.qkv_proj(w) r_head_k = self.r_proj(r) w_head_q, w_head_k, w_head_v = paddle.chunk(w_heads, chunks=3, axis=-1) klen = w_head_k.shape[1] w_head_q = paddle.reshape(w_head_q, shape=[bsz, qlen, self.n_head, self.d_head]) w_head_k = paddle.reshape(w_head_k, shape=[bsz, klen, self.n_head, self.d_head]) w_head_v = paddle.reshape(w_head_v, shape=[bsz, klen, self.n_head, self.d_head]) r_head_k = paddle.reshape(r_head_k, shape=[bsz, rlen, self.n_head, self.d_head]) rw_head_q = w_head_q + r_w_bias AC = einsum('bind,bjnd->bnij', rw_head_q, w_head_k) rr_head_q = w_head_q + r_r_bias BD = einsum('bind,bjnd->bnij', rr_head_q, r_head_k) BD = self._rel_shift(BD) attn_score = AC + BD attn_score = attn_score * self.scale if attn_mask is not None: attn_score = attn_score - 1e30 * attn_mask attn_prob = F.softmax(attn_score, axis=-1) attn_prob = self.attn_drop(attn_prob) attn_vec = einsum('bnij,bjnd->bind', attn_prob, w_head_v) attn_vec = paddle.reshape(attn_vec, shape=[ attn_vec.shape[0], attn_vec.shape[1], self.n_head * self.d_head ]) attn_out = self.o_proj(attn_vec) attn_out = self.drop(attn_out) if self.normalize_before: output = w + attn_out else: output = self.layer_norm(w + attn_out) return output
def do_train(): paddle.set_device(args.device) set_seed(args.seed) train_ds, dev_ds = load_dataset("chnsenticorp", splits=["train", "dev"]) model = FasterErnieForSequenceClassification.from_pretrained( 'ernie-1.0', num_classes=len(train_ds.label_list), max_seq_len=args.max_seq_length) train_data_loader = create_dataloader( train_ds, mode='train', batch_size=args.batch_size) dev_data_loader = create_dataloader( dev_ds, mode='dev', batch_size=args.batch_size) if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt): state_dict = paddle.load(args.init_from_ckpt) model.set_dict(state_dict) num_training_steps = len(train_data_loader) * args.epochs lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, args.warmup_proportion) # Generate parameter names needed to perform weight decay. # All bias and LayerNorm parameters are excluded. decay_params = [ p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ] optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, parameters=model.parameters(), weight_decay=args.weight_decay, apply_decay_param_fun=lambda x: x in decay_params) criterion = paddle.nn.loss.CrossEntropyLoss() metric = paddle.metric.Accuracy() if args.use_amp: scaler = paddle.amp.GradScaler(init_loss_scaling=args.scale_loss) global_step = 0 tic_train = time.time() total_train_time = 0 for epoch in range(1, args.epochs + 1): for step, batch in enumerate(train_data_loader, start=1): texts, labels = batch["text"], batch["label"] texts = to_tensor(texts) with paddle.amp.auto_cast( args.use_amp, custom_white_list=["fused_feedforward", "fused_attention"]): logits, predictions = model(texts) loss = criterion(logits, labels) probs = F.softmax(logits, axis=1) correct = metric.compute(logits, labels) metric.update(correct) acc = metric.accumulate() if args.use_amp: scaler.scale(loss).backward() scaler.minimize(optimizer, loss) else: loss.backward() optimizer.step() lr_scheduler.step() optimizer.clear_grad() global_step += 1 if global_step % args.logging_steps == 0: time_diff = time.time() - tic_train total_train_time += time_diff print( "global step %d, epoch: %d, batch: %d, loss: %.5f, accuracy: %.5f, speed: %.2f step/s" % (global_step, epoch, step, loss, acc, args.logging_steps / time_diff)) tic_train = time.time() if global_step % args.save_steps == 0: save_dir = os.path.join(args.save_dir, "model_%d" % global_step) if not os.path.exists(save_dir): os.makedirs(save_dir) evaluate(model, criterion, metric, dev_data_loader) model.save_pretrained(save_dir) tic_train = time.time() print("Speed: %.2f steps/s" % (global_step / total_train_time))
def run(self):
    args = self.args
    paddle.set_device(args.device)
    rank = paddle.distributed.get_rank()
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args.seed)

    train_ds = load_dataset(read, file_path=args.train_path, lazy=False)
    dev_ds = load_dataset(read, file_path=args.dev_path, lazy=False)

    model = paddlenlp.transformers.BertForSequenceClassification.from_pretrained(
        'bert-base-chinese', num_classes=2)
    tokenizer = paddlenlp.transformers.BertTokenizer.from_pretrained(
        'bert-base-chinese')

    trans_func = partial(
        convert_example,
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
        Stack(dtype="int64")  # label
    ): [data for data in fn(samples)]
    train_data_loader = create_dataloader(
        train_ds,
        mode='train',
        batch_size=args.batch_size,
        batchify_fn=batchify_fn,
        trans_fn=trans_func)
    dev_data_loader = create_dataloader(
        dev_ds,
        mode='dev',
        batch_size=args.batch_size,
        batchify_fn=batchify_fn,
        trans_fn=trans_func)

    if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt):
        state_dict = paddle.load(args.init_from_ckpt)
        model.set_dict(state_dict)
    model = paddle.DataParallel(model)

    num_training_steps = len(train_data_loader) * args.epochs

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps,
                                         args.warmup_proportion)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    criterion = paddle.nn.loss.CrossEntropyLoss()
    metric = paddle.metric.Accuracy()

    global_step = 0
    tic_train = time.time()
    for epoch in range(1, args.epochs + 1):
        for step, batch in enumerate(train_data_loader, start=1):
            input_ids, token_type_ids, labels = batch
            logits = model(input_ids, token_type_ids)
            probs = functional.softmax(logits, axis=1)
            loss = criterion(probs, labels)
            # labels can be class indices or one-hot vectors
            correct = metric.compute(probs, labels)
            # update the number of correct predictions and the total count
            metric.update(correct)
            acc = metric.accumulate()

            global_step += 1
            if global_step % 10 == 0 and rank == 0:
                print(
                    "global step %d, epoch: %d, batch: %d, loss: %.5f, accu: %.5f, speed: %.2f step/s"
                    % (global_step, epoch, step, loss, acc,
                       10 / (time.time() - tic_train)))
                tic_train = time.time()
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            # save the model every 100 steps
            if global_step % 100 == 0 and rank == 0:
                save_dir = os.path.join(args.save_dir,
                                        "model_%d" % global_step)
                if not os.path.exists(save_dir):
                    os.makedirs(save_dir)
                self.evaluate(model, criterion, metric, dev_data_loader)
                model._layers.save_pretrained(save_dir)
                tokenizer.save_pretrained(save_dir)
def forward(self, input_ids, position_ids): if _global_parallel_strategy == "dp": auto.shard_tensor(input_ids, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [0, -1] }) elif _global_parallel_strategy == "dp_mp": auto.shard_tensor(input_ids, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [0, -1] }) input_embeddings = self.word_embeddings(input_ids) position_embeddings = self.position_embeddings(position_ids) if _global_parallel_strategy == "mp": auto.shard_tensor(self.word_embeddings.weight, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [0, -1] }) elif _global_parallel_strategy == "dp_mp": auto.shard_tensor(self.word_embeddings.weight, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [1, -1] }) embeddings = input_embeddings + position_embeddings embeddings = self.dropout1(embeddings) # Pre-norm target = self.norm1(embeddings) # The following is the attention part q = self.q_proj(target) q = tensor.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim]) q = tensor.transpose(x=q, perm=[0, 2, 1, 3]) k = self.k_proj(target) v = self.v_proj(target) if _global_parallel_strategy == "mp": auto.shard_tensor(self.q_proj.weight, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [-1, 0] }) auto.shard_tensor(self.k_proj.weight, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [-1, 0] }) auto.shard_tensor(self.v_proj.weight, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [-1, 0] }) elif _global_parallel_strategy == "dp_mp": auto.shard_tensor(self.q_proj.weight, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [-1, 1] }) auto.shard_tensor(self.k_proj.weight, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [-1, 1] }) auto.shard_tensor(self.v_proj.weight, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [-1, 1] }) k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim]) k = tensor.transpose(x=k, perm=[0, 2, 1, 3]) v = tensor.reshape(x=v, shape=[0, 0, self.num_heads, self.head_dim]) v = tensor.transpose(x=v, perm=[0, 2, 1, 3]) # scale dot product attention product = layers.matmul(x=q, y=k, transpose_y=True, alpha=self.head_dim**-0.5) if self.attn_mask is not None: product = product + self.attn_mask weights = F.softmax(product) if self.dropout_ratio: weights = F.dropout(weights, self.dropout_ratio, training=self.training, mode="upscale_in_train") out = tensor.matmul(weights, v) # combine heads out = tensor.transpose(out, perm=[0, 2, 1, 3]) out = tensor.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) # project to output out = self.out_proj(out) if _global_parallel_strategy == "mp": auto.shard_tensor(self.out_proj.weight, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [0, -1] }) elif _global_parallel_strategy == "dp_mp": auto.shard_tensor(self.out_proj.weight, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [1, -1] }) # Add residual residual = embeddings + self.dropout2(out) # Pre-norm out0 = self.norm2(residual) # The following is the MLP part out1 = self.linear0(out0) out2 = F.gelu(out1, approximate=True) out3 = self.linear1(out2) if _global_parallel_strategy == "mp": auto.shard_tensor(self.linear0.weight, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [-1, 0] }) auto.shard_tensor(self.linear1.weight, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [0, -1] }) elif _global_parallel_strategy == "dp_mp": 
auto.shard_tensor(self.linear0.weight, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [-1, 1] }) auto.shard_tensor(self.linear1.weight, dist_attr={ "process_mesh": _global_process_mesh, "dims_mapping": [1, -1] }) # Add residual final = residual + self.dropout3(out3) return final
def evaluate(model, eval_dataset, aug_eval=False, scales=1.0, flip_horizontal=False, flip_vertical=False, is_slide=False, stride=None, crop_size=None, num_workers=0, print_detail=True, auc_roc=False): """ Launch evalution. Args: model(nn.Layer): A sementic segmentation model. eval_dataset (paddle.io.Dataset): Used to read and process validation datasets. aug_eval (bool, optional): Whether to use mulit-scales and flip augment for evaluation. Default: False. scales (list|float, optional): Scales for augment. It is valid when `aug_eval` is True. Default: 1.0. flip_horizontal (bool, optional): Whether to use flip horizontally augment. It is valid when `aug_eval` is True. Default: True. flip_vertical (bool, optional): Whether to use flip vertically augment. It is valid when `aug_eval` is True. Default: False. is_slide (bool, optional): Whether to evaluate by sliding window. Default: False. stride (tuple|list, optional): The stride of sliding window, the first is width and the second is height. It should be provided when `is_slide` is True. crop_size (tuple|list, optional): The crop size of sliding window, the first is width and the second is height. It should be provided when `is_slide` is True. num_workers (int, optional): Num workers for data loader. Default: 0. print_detail (bool, optional): Whether to print detailed information about the evaluation process. Default: True. auc_roc(bool, optional): whether add auc_roc metric Returns: float: The mIoU of validation datasets. float: The accuracy of validation datasets. """ model.eval() nranks = paddle.distributed.ParallelEnv().nranks local_rank = paddle.distributed.ParallelEnv().local_rank if nranks > 1: # Initialize parallel environment if not done. if not paddle.distributed.parallel.parallel_helper._is_parallel_ctx_initialized( ): paddle.distributed.init_parallel_env() batch_sampler = paddle.io.DistributedBatchSampler( eval_dataset, batch_size=1, shuffle=False, drop_last=False) loader = paddle.io.DataLoader( eval_dataset, batch_sampler=batch_sampler, num_workers=num_workers, return_list=True, ) total_iters = len(loader) intersect_area_all = 0 pred_area_all = 0 label_area_all = 0 logits_all = None label_all = None if print_detail: logger.info( "Start evaluating (total_samples: {}, total_iters: {})...".format( len(eval_dataset), total_iters)) #TODO(chenguowei): fix log print error with multi-gpus progbar_val = progbar.Progbar( target=total_iters, verbose=1 if nranks < 2 else 2) reader_cost_averager = TimeAverager() batch_cost_averager = TimeAverager() batch_start = time.time() with paddle.no_grad(): for iter, (im, label) in enumerate(loader): reader_cost_averager.record(time.time() - batch_start) label = label.astype('int64') ori_shape = label.shape[-2:] if aug_eval: pred, logits = infer.aug_inference( model, im, ori_shape=ori_shape, transforms=eval_dataset.transforms.transforms, scales=scales, flip_horizontal=flip_horizontal, flip_vertical=flip_vertical, is_slide=is_slide, stride=stride, crop_size=crop_size) else: pred, logits = infer.inference( model, im, ori_shape=ori_shape, transforms=eval_dataset.transforms.transforms, is_slide=is_slide, stride=stride, crop_size=crop_size) intersect_area, pred_area, label_area = metrics.calculate_area( pred, label, eval_dataset.num_classes, ignore_index=eval_dataset.ignore_index) # Gather from all ranks if nranks > 1: intersect_area_list = [] pred_area_list = [] label_area_list = [] paddle.distributed.all_gather(intersect_area_list, intersect_area) paddle.distributed.all_gather(pred_area_list, pred_area) 
paddle.distributed.all_gather(label_area_list, label_area) # Some image has been evaluated and should be eliminated in last iter if (iter + 1) * nranks > len(eval_dataset): valid = len(eval_dataset) - iter * nranks intersect_area_list = intersect_area_list[:valid] pred_area_list = pred_area_list[:valid] label_area_list = label_area_list[:valid] for i in range(len(intersect_area_list)): intersect_area_all = intersect_area_all + intersect_area_list[ i] pred_area_all = pred_area_all + pred_area_list[i] label_area_all = label_area_all + label_area_list[i] else: intersect_area_all = intersect_area_all + intersect_area pred_area_all = pred_area_all + pred_area label_area_all = label_area_all + label_area if auc_roc: logits = F.softmax(logits, axis=1) if logits_all is None: logits_all = logits.numpy() label_all = label.numpy() else: logits_all = np.concatenate( [logits_all, logits.numpy()]) # (KN, C, H, W) label_all = np.concatenate([label_all, label.numpy()]) batch_cost_averager.record( time.time() - batch_start, num_samples=len(label)) batch_cost = batch_cost_averager.get_average() reader_cost = reader_cost_averager.get_average() if local_rank == 0 and print_detail: progbar_val.update(iter + 1, [('batch_cost', batch_cost), ('reader cost', reader_cost)]) reader_cost_averager.reset() batch_cost_averager.reset() batch_start = time.time() class_iou, miou = metrics.mean_iou(intersect_area_all, pred_area_all, label_area_all) class_acc, acc = metrics.accuracy(intersect_area_all, pred_area_all) kappa = metrics.kappa(intersect_area_all, pred_area_all, label_area_all) class_dice, mdice = metrics.dice(intersect_area_all, pred_area_all, label_area_all) if auc_roc: auc_roc = metrics.auc_roc( logits_all, label_all, num_classes=eval_dataset.num_classes) auc_infor = ' Auc_roc: {:.4f}'.format(auc_roc) if print_detail: infor = "[EVAL] #Images: {} mIoU: {:.4f} Acc: {:.4f} Kappa: {:.4f} Dice: {:.4f}".format( len(eval_dataset), miou, acc, kappa, mdice) infor = infor + auc_infor if auc_roc else infor logger.info(infor) logger.info("[EVAL] Class IoU: \n" + str(np.round(class_iou, 4))) logger.info("[EVAL] Class Acc: \n" + str(np.round(class_acc, 4))) return miou, acc, class_iou, class_acc, kappa
def do_train(): set_seed(args.seed) paddle.set_device("gpu" if args.n_gpu else "cpu") world_size = paddle.distributed.get_world_size() if world_size > 1: paddle.distributed.init_parallel_env() train_ds, dev_ds, test_ds = load_dataset("chnsenticorp", splits=["train", "dev", "test"]) # If you wanna use bert/roberta/electra pretrained model, # model = ppnlp.transformers.BertForSequenceClassification.from_pretrained('bert-base-chinese', num_class=2) # model = ppnlp.transformers.RobertaForSequenceClassification.from_pretrained('roberta-wwm-ext', num_class=2) # model = ppnlp.transformers.ElectraForSequenceClassification.from_pretrained('chinese-electra-small', num_classes=2) model = ppnlp.transformers.ErnieForSequenceClassification.from_pretrained( 'ernie-tiny', num_classes=len(train_ds.label_list)) # If you wanna use bert/roberta/electra pretrained model, # tokenizer = ppnlp.transformers.BertTokenizer.from_pretrained('bert-base-chinese') # tokenizer = ppnlp.transformers.RobertaTokenizer.from_pretrained('roberta-wwm-ext') # tokenizer = ppnlp.transformers.ElectraTokenizer.from_pretrained('chinese-electra-small', num_classes=2) # ErnieTinyTokenizer is special for ernie-tiny pretained model. tokenizer = ppnlp.transformers.ErnieTinyTokenizer.from_pretrained( 'ernie-tiny') trans_func = partial(convert_example, tokenizer=tokenizer, max_seq_length=args.max_seq_length) batchify_fn = lambda samples, fn=Tuple( Pad(axis=0, pad_val=tokenizer.pad_token_id), # input Pad(axis=0, pad_val=tokenizer.pad_token_type_id), # segment Stack(dtype="int64") # label ): [data for data in fn(samples)] train_data_loader = create_dataloader(train_ds, mode='train', batch_size=args.batch_size, batchify_fn=batchify_fn, trans_fn=trans_func) dev_data_loader = create_dataloader(dev_ds, mode='dev', batch_size=args.batch_size, batchify_fn=batchify_fn, trans_fn=trans_func) test_data_loader = create_dataloader(test_ds, mode='test', batch_size=args.batch_size, batchify_fn=batchify_fn, trans_fn=trans_func) if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt): state_dict = paddle.load(args.init_from_ckpt) model.set_dict(state_dict) model = paddle.DataParallel(model) num_training_steps = len(train_data_loader) * args.epochs lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, args.warmup_proportion) # Generate parameter names needed to perform weight decay. # All bias and LayerNorm parameters are excluded. 
decay_params = [ p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ] optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, parameters=model.parameters(), weight_decay=args.weight_decay, apply_decay_param_fun=lambda x: x in decay_params) criterion = paddle.nn.loss.CrossEntropyLoss() metric = paddle.metric.Accuracy() global_step = 0 tic_train = time.time() for epoch in range(1, args.epochs + 1): for step, batch in enumerate(train_data_loader, start=1): input_ids, token_type_ids, labels = batch logits = model(input_ids, token_type_ids) loss = criterion(logits, labels) probs = F.softmax(logits, axis=1) correct = metric.compute(probs, labels) metric.update(correct) acc = metric.accumulate() global_step += 1 if global_step % 10 == 0 and paddle.distributed.get_rank() == 0: print( "global step %d, epoch: %d, batch: %d, loss: %.5f, accu: %.5f, speed: %.2f step/s" % (global_step, epoch, step, loss, acc, 10 / (time.time() - tic_train))) tic_train = time.time() loss.backward() optimizer.step() lr_scheduler.step() optimizer.clear_grad() if global_step % 100 == 0 and paddle.distributed.get_rank() == 0: save_dir = os.path.join(args.save_dir, "model_%d" % global_step) if not os.path.exists(save_dir): os.makedirs(save_dir) evaluate(model, criterion, metric, dev_data_loader) model._layers.save_pretrained(save_dir) tokenizer.save_pretrained(save_dir) if paddle.distributed.get_rank() == 0: print('Evaluating on test data.') evaluate(model, criterion, metric, test_data_loader)
def gaussian(self, theta_x, phi_x):
    pairwise_weight = paddle.matmul(theta_x, phi_x)
    pairwise_weight = F.softmax(pairwise_weight, axis=-1)
    return pairwise_weight
def forward(self, queries, keys, values, relation_k, relation_v, attn_bias=None, past_cache=None): """relational attention forward. seq_len in `shape` means num queries/keys/values of attention Args: queries (TYPE): shape = [batch, seq_len, num_heads * hidden] keys (TYPE): shape = queries.shape values (TYPE): shape = queries.shape relation_k (TYPE): shape = [batch, seq_len, seq_len, hidden] relation_v (TYPE): shape = relation_k.shape attn_bias (TYPE): used as sequence mask. Default is None past_cache (TYPE): Default is None Returns: TODO Raises: NULL """ assert len(queries.shape) == len(keys.shape) == len(values.shape) == 3 #bsz, q_len, q_dim = queries.shape #bsz, k_len, k_dim = keys.shape #bsz, v_len, v_dim = values.shape #assert k_len == v_len q = self.q(queries) k = self.k(keys) v = self.v(values) cache = (k, v) if past_cache is not None: cached_k, cached_v = past_cache k = paddle.concat([cached_k, k], 1) v = paddle.concat([cached_v, v], 1) def _transpose(inputs): """reshape and transpose Args: inputs: shape = [batch, seq_len, heads * hidden] Returns: shape = [batch, heads, seq_len, hidden] """ hidden_size = inputs.shape[-1] // self.n_head outputs = inputs.reshape([0, 0, self.n_head, hidden_size]) return outputs.transpose([0, 2, 1, 3]) q, k, v = [_transpose(x) for x in (q, k, v)] q = q.scale(self.d_key**-0.5) scores = relative_attention_logits(q, k, relation_k) if attn_bias is not None: scores += attn_bias scores = F.softmax(scores) scores = self.dropout(scores) out = relative_attention_values(scores, v, relation_v) # input: [batch, heads, seq_len, hidden] # output: [batch, seq_len, heads * hidden] out = out.transpose([0, 2, 1, 3]) out = out.reshape([0, 0, out.shape[2] * out.shape[3]]) out = self.o(out) return out, cache
def deep_match(item_his_eb, context_his_eb, mask, match_mask, mid_his_batch, EMBEDDING_DIM, item_vectors, item_biases, n_mid): query = context_his_eb query = self.query_layer(query) # [1, 50, 64] # print(f'query_prelu: {query.shape}') query = self.query_prelu(query) inputs = paddle.concat( [ query, item_his_eb, query - item_his_eb, query * item_his_eb ], axis=-1) # B,T,E att_layer1 = self.att_layer1_layer(inputs) att_layer1 = F.sigmoid(att_layer1) att_layer2 = self.att_layer2_layer(att_layer1) att_layer2 = F.sigmoid(att_layer2) att_layer3 = self.att_layer3_layer(att_layer2) # B,T,1 scores = paddle.transpose(att_layer3, [0, 2, 1]) # B,1,T # mask bool_mask = paddle.equal(mask, paddle.ones_like(mask)) # B,T key_masks = paddle.unsqueeze(bool_mask, axis=1) # B,1,T paddings = paddle.ones_like(scores) * (-2**32 + 1) scores = paddle.where(key_masks, scores, paddings) # tril scores_tile = paddle.tile( paddle.sum(scores, axis=1), [1, paddle.shape(scores)[-1]]) # B, T*T scores_tile = paddle.reshape(scores_tile, [ -1, paddle.shape(scores)[-1], paddle.shape(scores)[-1] ]) # B, T, T diag_vals = paddle.ones_like(scores_tile) # B, T, T tril = paddle.tril(diag_vals) paddings = paddle.ones_like(tril) * (-2**32 + 1) scores_tile = paddle.where( paddle.equal(tril, paddle.to_tensor(0.0)), paddings, scores_tile) # B, T, T scores_tile = F.softmax(scores_tile) # B, T, T att_dm_item_his_eb = paddle.matmul(scores_tile, item_his_eb) # B, T, E dnn_layer1 = self.dnn_layer1_layer(att_dm_item_his_eb) # print(f'dnn_layer1_prelu: {dnn_layer1.shape}') dnn_layer1 = self.dnn_layer1_prelu(dnn_layer1) # target mask user_vector = dnn_layer1[:, -1, :] # B, E user_vector2 = dnn_layer1[:, -2, :] * paddle.reshape( match_mask, [-1, paddle.shape(match_mask)[1], 1])[:, -2, :] # B, E num_sampled = 2000 labels = paddle.reshape(mid_his_batch[:, -1], [-1, 1]) # B, 1 # not sample # [B, E] * [E_size, cate_size] logits = paddle.matmul( user_vector2, item_vectors, transpose_y=True) logits = paddle.add(logits, item_biases) loss = F.cross_entropy(input=logits, label=labels) return loss, user_vector, scores
def forward(self, fpn_fms, rcnn_rois, labels=None, bbox_targets=None): # stride: 64,32,16,8,4 -> 4, 8, 16, 32 fpn_fms = fpn_fms[1:][::-1] stride = [4, 8, 16, 32] pool_features = roi_pooler(fpn_fms, rcnn_rois, stride, (7, 7), "ROIAlignV2") flatten_feature = torch.flatten(pool_features, start_axis=1) flatten_feature = F.relu_(self.fc1(flatten_feature)) flatten_feature = F.relu_(self.fc2(flatten_feature)) pred_emd_cls_0 = self.emd_pred_cls_0(flatten_feature) pred_emd_delta_0 = self.emd_pred_delta_0(flatten_feature) pred_emd_cls_1 = self.emd_pred_cls_1(flatten_feature) pred_emd_delta_1 = self.emd_pred_delta_1(flatten_feature) pred_emd_scores_0 = F.softmax(pred_emd_cls_0, axis=-1) pred_emd_scores_1 = F.softmax(pred_emd_cls_1, axis=-1) # cons refine feature boxes_feature_0 = cat( (pred_emd_delta_0[:, 4:], pred_emd_scores_0[:, 1][:, None]), axis=1).repeat(1, 4) boxes_feature_1 = cat( (pred_emd_delta_1[:, 4:], pred_emd_scores_1[:, 1][:, None]), axis=1).repeat(1, 4) boxes_feature_0 = cat((flatten_feature, boxes_feature_0), axis=1) boxes_feature_1 = cat((flatten_feature, boxes_feature_1), axis=1) refine_feature_0 = F.relu_(self.fc3(boxes_feature_0)) refine_feature_1 = F.relu_(self.fc3(boxes_feature_1)) # refine pred_ref_cls_0 = self.ref_pred_cls_0(refine_feature_0) pred_ref_delta_0 = self.ref_pred_delta_0(refine_feature_0) pred_ref_cls_1 = self.ref_pred_cls_1(refine_feature_1) pred_ref_delta_1 = self.ref_pred_delta_1(refine_feature_1) if self.training: loss0 = emd_loss_softmax(pred_emd_delta_0, pred_emd_cls_0, pred_emd_delta_1, pred_emd_cls_1, bbox_targets, labels) loss1 = emd_loss_softmax(pred_emd_delta_1, pred_emd_cls_1, pred_emd_delta_0, pred_emd_cls_0, bbox_targets, labels) loss2 = emd_loss_softmax(pred_ref_delta_0, pred_ref_cls_0, pred_ref_delta_1, pred_ref_cls_1, bbox_targets, labels) loss3 = emd_loss_softmax(pred_ref_delta_1, pred_ref_cls_1, pred_ref_delta_0, pred_ref_cls_0, bbox_targets, labels) loss_rcnn = cat([loss0, loss1], axis=1) loss_ref = cat([loss2, loss3], axis=1) # requires_grad = False _, min_indices_rcnn = loss_rcnn.min(dim=1) _, min_indices_ref = loss_ref.min(dim=1) loss_rcnn = loss_rcnn[torch.arange(loss_rcnn.shape[0]), min_indices_rcnn] loss_rcnn = loss_rcnn.mean() loss_ref = loss_ref[torch.arange(loss_ref.shape[0]), min_indices_ref] loss_ref = loss_ref.mean() loss_dict = {} loss_dict['loss_rcnn_emd'] = loss_rcnn loss_dict['loss_ref_emd'] = loss_ref return loss_dict else: class_num = pred_ref_cls_0.shape[-1] - 1 tag = torch.arange(class_num).type_as(pred_ref_cls_0) + 1 tag = tag.repeat(pred_ref_cls_0.shape[0], 1).reshape(-1, 1) pred_scores_0 = F.softmax(pred_ref_cls_0, axis=-1)[:, 1:].reshape(-1, 1) pred_scores_1 = F.softmax(pred_ref_cls_1, axis=-1)[:, 1:].reshape(-1, 1) pred_delta_0 = pred_ref_delta_0[:, 4:].reshape(-1, 4) pred_delta_1 = pred_ref_delta_1[:, 4:].reshape(-1, 4) base_rois = rcnn_rois[:, 1:5].repeat(1, class_num).reshape(-1, 4) pred_bbox_0 = restore_bbox(base_rois, pred_delta_0, True) pred_bbox_1 = restore_bbox(base_rois, pred_delta_1, True) pred_bbox_0 = cat([pred_bbox_0, pred_scores_0, tag], axis=1) pred_bbox_1 = cat([pred_bbox_1, pred_scores_1, tag], axis=1) pred_bbox = cat((pred_bbox_0, pred_bbox_1), axis=1) return pred_bbox
# cross-entropy cost function criterion = paddle.nn.loss.CrossEntropyLoss() # accuracy metric metric = paddle.metric.Accuracy() print(args) # train global_step = 0 tic_train = time.time() for epoch in range(1, args.epochs + 1): for step, batch in enumerate(train_data_loader, start=1): input_ids, pinyin_ids, labels = batch batch_size, length = input_ids.shape pinyin_ids = paddle.reshape(pinyin_ids, [batch_size, length, 8]) logits = model(input_ids, pinyin_ids) loss = criterion(logits, labels) probs = F.softmax(logits, axis=1) correct = metric.compute(probs, labels) metric.update(correct) acc = metric.accumulate() global_step += 1 if global_step % 10 == 0: print( "global step %d, epoch: %d, batch: %d, loss: %.5f, accu: %.5f, speed: %.2f step/s" % (global_step, epoch, step, loss, acc, 10 / (time.time() - tic_train))) tic_train = time.time() loss.backward() optimizer.step() lr_scheduler.step()
def sample(self, input_ids, logits_processors, max_length, pad_token_id, eos_token_id, top_k=None, top_p=None, temperature=None, min_tokens_to_keep=1, **model_kwargs): def TopKProcess(probs, top_k, min_tokens_to_keep): top_k = min(max(top_k, min_tokens_to_keep), probs.shape[-1]) # Remove all tokens with a probability less than the last token of the top-k topk_probs, _ = paddle.topk(probs, k=top_k) probs = paddle.where(probs >= topk_probs[:, -1:], probs, paddle.full_like(probs, 0.0)) return probs def TopPProcess(probs, top_p, min_tokens_to_keep): sorted_probs = paddle.sort(probs, descending=True) sorted_indices = paddle.argsort(probs, descending=True) cumulative_probs = paddle.cumsum(sorted_probs, axis=-1) # Remove tokens with cumulative probs above the top_p, But keep at # least min_tokens_to_keep tokens sorted_indices_to_remove = cumulative_probs > top_p if min_tokens_to_keep > 1: # Set 'min_tokens_to_keep - 1' because the first token is kept sorted_indices_to_remove[:, :min_tokens_to_keep - 1] = 0 # Keep the first token sorted_indices_to_remove = paddle.cast(sorted_indices_to_remove, dtype='int64') sorted_indices_to_remove[:, 1:] = ( sorted_indices_to_remove[:, :-1].clone()) sorted_indices_to_remove[:, 0] = 0 # Scatter sorted tensors to original indexing sorted_indices = sorted_indices + paddle.arange( probs.shape[0]).unsqueeze(-1) * probs.shape[-1] condition = paddle.scatter(sorted_indices_to_remove.flatten(), sorted_indices.flatten(), sorted_indices_to_remove.flatten()) condition = paddle.cast(condition, 'bool').reshape(probs.shape) probs = paddle.where(condition, paddle.full_like(probs, 0.0), probs) return probs batch_size, cur_len = input_ids.shape origin_len = cur_len unfinished_flag = paddle.full([batch_size, 1], True, dtype='bool') scores = paddle.full([batch_size, 1], 0.0, dtype=paddle.get_default_dtype()) while cur_len < max_length: # prepare model inputs & get model output model_inputs = self.prepare_inputs_for_generation( input_ids, **model_kwargs) outputs = self(**model_inputs) logits = outputs[0] if isinstance(outputs, tuple) else outputs # [batch_size, vocab_size] logits = logits[:, -1, :] # pre-process distribution logits = self.adjust_logits_during_generation(logits) logits = logits_processors(input_ids, logits) # sample origin_probs = F.softmax(logits) origin_probs = paddle.log(origin_probs) if temperature is not None and temperature != 1.0: logits = logits / temperature probs = F.softmax(logits) if top_k is not None and top_k != 0: probs = TopKProcess(probs, top_k, min_tokens_to_keep) if top_p is not None and top_p < 1.0: probs = TopPProcess(probs, top_p, min_tokens_to_keep) next_tokens = paddle.multinomial(probs) next_scores = paddle.index_sample(origin_probs, next_tokens) if eos_token_id is not None: next_tokens = paddle.where( unfinished_flag, next_tokens, paddle.full_like(next_tokens, pad_token_id)) scores = self.update_scores_for_generation(scores, next_scores, cur_len - origin_len, unfinished_flag) cur_len += 1 input_ids = paddle.concat([input_ids, next_tokens], axis=1) if eos_token_id is not None: unfinished_flag = paddle.logical_and( unfinished_flag, next_tokens != eos_token_id) # Stop when there is a </s> in all sentences if not paddle.any(unfinished_flag): break model_kwargs = self.update_model_kwargs_for_generation( outputs, model_kwargs, is_encoder_decoder=self.is_encoder_decoder) return input_ids[:, origin_len:], scores
parser.add_argument("--top_k", type=int, default=1, help="Show top k predicted results") parser.add_argument("--checkpoint", type=str, required=True, help="Checkpoint of model.") args = parser.parse_args() # yapf: enable if __name__ == '__main__': paddle.set_device(args.device) feature_extractor = LogMelSpectrogram(sr=16000, n_fft=512, hop_length=320, n_mels=64, f_min=50) model = SoundClassifier(backbone=cnn14(pretrained=False, extract_embedding=True), num_class=len(ESC50.label_list)) model.set_state_dict(paddle.load(args.checkpoint)) model.eval() waveform, sr = load_audio(args.wav) feats = feature_extractor(paddle.to_tensor(waveform).unsqueeze(0)) logits = model(feats) probs = F.softmax(logits, axis=1).numpy() sorted_indices = (-probs[0]).argsort() msg = f'[{args.wav}]\n' for idx in sorted_indices[:args.top_k]: msg += f'{ESC50.label_list[idx]}: {probs[0][idx]}\n' print(msg)
def beam_search(self, input_ids, beam_scorer, logits_processors, max_length, pad_token_id, eos_token_id, **model_kwargs): batch_size = len(beam_scorer._beam_hyps) num_beams = beam_scorer.num_beams batch_beam_size, cur_len = input_ids.shape origin_len = cur_len assert ( num_beams * batch_size == batch_beam_size ), "Batch dimension of `input_ids` should be {}, but received {}.".format( num_beams * batch_size, batch_beam_size) beam_scores = paddle.zeros((batch_size, num_beams), dtype=paddle.get_default_dtype()) beam_scores[:, 1:] = -1e9 beam_scores = paddle.reshape(beam_scores, [-1]) while cur_len < max_length: # prepare model inputs & get model output model_inputs = self.prepare_inputs_for_generation( input_ids, **model_kwargs) outputs = self(**model_inputs) logits = outputs[0] if isinstance(outputs, tuple) else outputs # [batch_size, vocab_size] logits = logits[:, -1, :] # pre-process distribution logits = self.adjust_logits_during_generation(logits) logits = logits_processors(input_ids, logits) # beam search # [batch_size * num_beams, vocab_size] next_scores = F.softmax(logits) next_scores = paddle.log(next_scores) next_scores = next_scores + beam_scores.unsqueeze(-1) # reshape for beam search vocab_size = next_scores.shape[-1] next_scores = next_scores.reshape( [batch_size, num_beams * vocab_size]) next_scores, next_tokens = paddle.topk(next_scores, 2 * num_beams, axis=1) next_indices = next_tokens // vocab_size next_tokens = next_tokens % vocab_size # stateless beam_outputs = beam_scorer.process( input_ids, next_scores, next_tokens, next_indices, origin_len=origin_len, pad_token_id=pad_token_id, eos_token_id=eos_token_id, ) beam_scores = beam_outputs["next_beam_scores"] beam_next_tokens = beam_outputs["next_beam_tokens"] beam_idx = beam_outputs["next_beam_indices"] cur_len += 1 input_ids = paddle.concat([ paddle.index_select(input_ids, beam_idx), beam_next_tokens.unsqueeze(-1) ], axis=-1) if beam_scorer.is_done: break model_kwargs = self.update_model_kwargs_for_generation( outputs, model_kwargs, is_encoder_decoder=self.is_encoder_decoder) if model_kwargs["cache"] is not None: # reorder the cache model_kwargs["cache"] = map_structure( lambda x: paddle.index_select(x, beam_idx), model_kwargs["cache"]) pred_ids, scores = beam_scorer.finalize(input_ids, beam_scores, next_tokens, next_indices, pad_token_id=pad_token_id, eos_token_id=eos_token_id) return pred_ids[:, origin_len:], scores
def forward(self, input_ids=None, token_type_ids=None, position_ids=None, attention_mask=None, query_input_ids=None, query_token_type_ids=None, query_position_ids=None, query_attention_mask=None, title_input_ids=None, title_token_type_ids=None, title_position_ids=None, title_attention_mask=None, seq_lengths=None, labels=None): if self.task != 'text-matching': result = self.model(input_ids, token_type_ids, position_ids, attention_mask) else: query_result = self.model(query_input_ids, query_token_type_ids, query_position_ids, query_attention_mask) title_result = self.model(title_input_ids, title_token_type_ids, title_position_ids, title_attention_mask) if self.task == 'seq-cls': logits = result probs = F.softmax(logits, axis=1) if labels is not None: loss = self.criterion(logits, labels) correct = self.metric.compute(probs, labels) acc = self.metric.update(correct) return probs, loss, {'acc': acc} return probs elif self.task == 'token-cls': logits = result token_level_probs = F.softmax(logits, axis=-1) preds = token_level_probs.argmax(axis=-1) if labels is not None: loss = self.criterion(logits, labels.unsqueeze(-1)) num_infer_chunks, num_label_chunks, num_correct_chunks = \ self.metric.compute(None, seq_lengths, preds, labels) self.metric.update(num_infer_chunks.numpy(), num_label_chunks.numpy(), num_correct_chunks.numpy()) _, _, f1_score = map(float, self.metric.accumulate()) return token_level_probs, loss, {'f1_score': f1_score} return token_level_probs elif self.task == 'text-matching': query_token_embedding, _ = query_result query_token_embedding = self.dropout(query_token_embedding) query_attention_mask = paddle.unsqueeze( (query_input_ids != self.model.pad_token_id).astype( self.model.pooler.dense.weight.dtype), axis=2) query_token_embedding = query_token_embedding * query_attention_mask query_sum_embedding = paddle.sum(query_token_embedding, axis=1) query_sum_mask = paddle.sum(query_attention_mask, axis=1) query_mean = query_sum_embedding / query_sum_mask title_token_embedding, _ = title_result title_token_embedding = self.dropout(title_token_embedding) title_attention_mask = paddle.unsqueeze( (title_input_ids != self.model.pad_token_id).astype( self.model.pooler.dense.weight.dtype), axis=2) title_token_embedding = title_token_embedding * title_attention_mask title_sum_embedding = paddle.sum(title_token_embedding, axis=1) title_sum_mask = paddle.sum(title_attention_mask, axis=1) title_mean = title_sum_embedding / title_sum_mask sub = paddle.abs(paddle.subtract(query_mean, title_mean)) projection = paddle.concat([query_mean, title_mean, sub], axis=-1) logits = self.classifier(projection) probs = F.softmax(logits) if labels is not None: loss = self.criterion(logits, labels) correct = self.metric.compute(probs, labels) acc = self.metric.update(correct) return probs, loss, {'acc': acc} return probs else: sequence_output, pooled_output = result return sequence_output, pooled_output
def GetBaselineOut(self):
    paddle.disable_static(place=paddle.CUDAPlace(0))
    tensor_query = paddle.to_tensor(self.query, stop_gradient=False)

    cache_kv = None
    if self.has_cache_kv:
        cache_kv = paddle.to_tensor(self.cache_kv, stop_gradient=False)

    if self.has_attn_mask:
        attn_mask = paddle.to_tensor(self.attn_mask, stop_gradient=False)
    else:
        attn_mask = None
    residual = tensor_query

    ln1_out = tensor_query
    if self.pre_layer_norm:
        ln1_out = self.norm1(tensor_query)

    q = self.q_proj(ln1_out)
    q = tensor.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim])
    q_out = tensor.transpose(x=q, perm=[0, 2, 1, 3])
    k = self.k_proj(ln1_out)
    v = self.v_proj(ln1_out)
    k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim])
    k_out = tensor.transpose(x=k, perm=[0, 2, 1, 3])
    v = tensor.reshape(x=v, shape=[0, 0, self.num_heads, self.head_dim])
    v_out = tensor.transpose(x=v, perm=[0, 2, 1, 3])

    if self.has_cache_kv:
        # [1, B, n_head, cache_seq_len, head_dim]
        cache_k, cache_v = paddle.split(cache_kv, 2)
        cache_k = paddle.squeeze(cache_k, axis=0)
        cache_v = paddle.squeeze(cache_v, axis=0)
        # [B, n_head, cache_seq_len + seq_len, head_dim]
        # out_seq_len = cache_seq_len + seq_len
        k_out = paddle.concat([cache_k, k_out], axis=-2)
        v_out = paddle.concat([cache_v, v_out], axis=-2)

    # [B, n_head, seq_len, head_dim] * [B, n_head, out_seq_len, head_dim]
    # --> [B, n_head, seq_len, out_seq_len]
    qk_out = layers.matmul(x=q_out,
                           y=k_out,
                           transpose_y=True,
                           alpha=self.head_dim**-0.5)

    if attn_mask is not None:
        attn_mask = _convert_attention_mask(attn_mask, qk_out.dtype)
        attn_mask_out = qk_out + attn_mask
        softmax_out = F.softmax(attn_mask_out)
    else:
        softmax_out = F.softmax(qk_out)

    if self.dropout_prob:
        dropout_out = F.dropout(softmax_out,
                                self.dropout_prob,
                                training=self.training,
                                mode="upscale_in_train")
        # [B, n_head, seq_len, out_seq_len] * [B, n_head, out_seq_len, head_dim]
        # --> [B, n_head, seq_len, head_dim]
        qktv_out = tensor.matmul(dropout_out, v_out)
    else:
        qktv_out = tensor.matmul(softmax_out, v_out)

    fmha_out = tensor.transpose(qktv_out, perm=[0, 2, 1, 3])
    out_linear_in = tensor.reshape(
        x=fmha_out, shape=[0, 0, fmha_out.shape[2] * fmha_out.shape[3]])
    out = self.out_proj(out_linear_in)

    residual_out = residual + self.dropout(out)
    if not self.pre_layer_norm:
        final_out = self.norm1(residual_out)
    else:
        final_out = residual_out

    if self.has_cache_kv:
        return final_out

    paddle.autograd.backward([final_out], [paddle.to_tensor(self.dout)],
                             retain_graph=True)
    return final_out, tensor_query.grad
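A small hedged example of the KV-cache concatenation above: cached keys/values are joined with the new ones along the sequence axis, so the current step attends over cache_seq_len + seq_len positions. The shapes below are placeholders chosen for illustration.

import paddle

B, n_head, head_dim = 2, 4, 8
cache_k = paddle.randn([B, n_head, 6, head_dim])   # 6 cached positions
new_k = paddle.randn([B, n_head, 1, head_dim])     # 1 new position
k_out = paddle.concat([cache_k, new_k], axis=-2)
print(k_out.shape)   # [2, 4, 7, 8]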
def search_mobilenetv2_block(config, args, image_size):
    image_shape = [3, image_size, image_size]
    transform = T.Compose([T.Transpose(), T.Normalize([127.5], [127.5])])
    if args.data == 'cifar10':
        train_dataset = paddle.vision.datasets.Cifar10(mode='train',
                                                       transform=transform,
                                                       backend='cv2')
        val_dataset = paddle.vision.datasets.Cifar10(mode='test',
                                                     transform=transform,
                                                     backend='cv2')
    elif args.data == 'imagenet':
        train_dataset = imagenet_reader.ImageNetDataset(mode='train')
        val_dataset = imagenet_reader.ImageNetDataset(mode='val')

    places = static.cuda_places() if args.use_gpu else static.cpu_places()
    place = places[0]
    if args.is_server:
        sa_nas = SANAS(config,
                       server_addr=(args.server_address, args.port),
                       search_steps=args.search_steps,
                       is_server=True)
    else:
        sa_nas = SANAS(config,
                       server_addr=(args.server_address, args.port),
                       search_steps=args.search_steps,
                       is_server=False)

    for step in range(args.search_steps):
        archs = sa_nas.next_archs()[0]

        train_program = static.Program()
        test_program = static.Program()
        startup_program = static.Program()
        with static.program_guard(train_program, startup_program):
            data_shape = [None] + image_shape
            data = static.data(name='data', shape=data_shape, dtype='float32')
            label = static.data(name='label', shape=[None, 1], dtype='int64')
            if args.data == 'cifar10':
                paddle.assign(paddle.reshape(label, [-1, 1]), label)
            train_loader = paddle.io.DataLoader(train_dataset,
                                                places=places,
                                                feed_list=[data, label],
                                                drop_last=True,
                                                batch_size=args.batch_size,
                                                return_list=False,
                                                shuffle=True,
                                                use_shared_memory=True,
                                                num_workers=4)
            val_loader = paddle.io.DataLoader(val_dataset,
                                              places=place,
                                              feed_list=[data, label],
                                              drop_last=False,
                                              batch_size=args.batch_size,
                                              return_list=False,
                                              shuffle=False)
            data = conv_bn_layer(input=data,
                                 num_filters=32,
                                 filter_size=3,
                                 stride=2,
                                 padding='SAME',
                                 act='relu6',
                                 name='mobilenetv2_conv1')
            data = archs(data)[0]
            data = conv_bn_layer(input=data,
                                 num_filters=1280,
                                 filter_size=1,
                                 stride=1,
                                 padding='SAME',
                                 act='relu6',
                                 name='mobilenetv2_last_conv')
            data = F.adaptive_avg_pool2d(data,
                                         output_size=[1, 1],
                                         name='mobilenetv2_last_pool')
            output = static.nn.fc(
                x=data,
                size=args.class_dim,
                weight_attr=ParamAttr(name='mobilenetv2_fc_weights'),
                bias_attr=ParamAttr(name='mobilenetv2_fc_offset'))

            softmax_out = F.softmax(output)
            cost = F.cross_entropy(softmax_out, label=label)
            avg_cost = paddle.mean(cost)
            acc_top1 = paddle.metric.accuracy(input=softmax_out,
                                              label=label,
                                              k=1)
            acc_top5 = paddle.metric.accuracy(input=softmax_out,
                                              label=label,
                                              k=5)
            test_program = train_program.clone(for_test=True)

            optimizer = paddle.optimizer.Momentum(
                learning_rate=0.1,
                momentum=0.9,
                weight_decay=paddle.regularizer.L2Decay(1e-4))
            optimizer.minimize(avg_cost)

        current_flops = flops(train_program)
        print('step: {}, current_flops: {}'.format(step, current_flops))
        if current_flops > int(321208544):
            continue

        exe = static.Executor(place)
        exe.run(startup_program)

        build_strategy = static.BuildStrategy()
        train_compiled_program = static.CompiledProgram(
            train_program).with_data_parallel(loss_name=avg_cost.name,
                                              build_strategy=build_strategy)
        for epoch_id in range(args.retain_epoch):
            for batch_id, data in enumerate(train_loader()):
                fetches = [avg_cost.name]
                s_time = time.time()
                outs = exe.run(train_compiled_program,
                               feed=data,
                               fetch_list=fetches)[0]
                batch_time = time.time() - s_time
                if batch_id % 10 == 0:
                    _logger.info(
                        'TRAIN: steps: {}, epoch: {}, batch: {}, cost: {}, batch_time: {}ms'
                        .format(step, epoch_id, batch_id, outs[0], batch_time))

        reward = []
        for batch_id, data in enumerate(val_loader()):
            test_fetches = [avg_cost.name, acc_top1.name, acc_top5.name]
            batch_reward = exe.run(test_program,
                                   feed=data,
                                   fetch_list=test_fetches)
            reward_avg = np.mean(np.array(batch_reward), axis=1)
            reward.append(reward_avg)
            _logger.info(
                'TEST: step: {}, batch: {}, avg_cost: {}, acc_top1: {}, acc_top5: {}'
                .format(step, batch_id, batch_reward[0], batch_reward[1],
                        batch_reward[2]))

        finally_reward = np.mean(np.array(reward), axis=0)
        _logger.info(
            'FINAL TEST: avg_cost: {}, acc_top1: {}, acc_top5: {}'.format(
                finally_reward[0], finally_reward[1], finally_reward[2]))

        sa_nas.reward(float(finally_reward[1]))
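A condensed sketch of the control flow in the search loop above: sample an architecture from the SANAS controller, skip it if it exceeds the FLOPs budget, otherwise train briefly, evaluate, and report top-1 accuracy back as the reward. `build_program`, `train_and_eval`, `search_steps`, and `flops_budget` are hypothetical names standing in for the logic shown in full above.

for step in range(search_steps):                 # search_steps: assumed given
    archs = sa_nas.next_archs()[0]               # sample a candidate block
    program = build_program(archs)               # hypothetical helper
    if flops(program) > flops_budget:            # flops_budget: assumed given
        continue                                 # too expensive, skip training
    acc_top1 = train_and_eval(program)           # hypothetical helper
    sa_nas.reward(float(acc_top1))               # feedback to the controller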
def forward(self, logits, label):
    """
    Forward computation.

    Args:
        logits (tuple|list): (seg_logit, edge_logit) Tensor, the data type is
            float32 or float64. Shape is (N, C), where C is the number of
            classes, and if the shape is more than 2D, this is
            (N, C, D1, D2, ..., Dk), k >= 1. The channel number C of
            edge_logit is 1.
        label (Tensor): Label tensor, the data type is int64. Shape is (N),
            where each value satisfies 0 <= label[i] <= C-1, and if the shape
            is more than 2D, this is (N, D1, D2, ..., Dk), k >= 1.
    """
    seg_logit, edge_logit = logits[0], logits[1]
    if len(label.shape) != len(seg_logit.shape):
        label = paddle.unsqueeze(label, 1)
    if edge_logit.shape != label.shape:
        raise ValueError(
            'The shape of edge_logit should equal that of label, but they are {} != {}'
            .format(edge_logit.shape, label.shape))

    # Filter out non-edge regions
    filler = paddle.ones_like(label) * self.ignore_index
    label = paddle.where(edge_logit > self.edge_threshold, label, filler)

    # ohem
    n, c, h, w = seg_logit.shape
    label = label.reshape((-1, ))
    valid_mask = (label != self.ignore_index).astype('int64')
    num_valid = valid_mask.sum()
    label = label * valid_mask

    prob = F.softmax(seg_logit, axis=1)
    prob = prob.transpose((1, 0, 2, 3)).reshape((c, -1))

    if self.min_kept < num_valid and num_valid > 0:
        # push ignored positions above 1 so they are never picked as hard examples
        prob = prob + (1 - valid_mask)

        # get the prob of the ground-truth label
        label_onehot = F.one_hot(label, c)
        label_onehot = label_onehot.transpose((1, 0))
        prob = prob * label_onehot
        prob = paddle.sum(prob, axis=0)

        threshold = self.thresh
        if self.min_kept > 0:
            index = prob.argsort()
            threshold_index = index[min(len(index), self.min_kept) - 1]
            threshold_index = int(threshold_index.numpy()[0])
            if prob[threshold_index] > self.thresh:
                threshold = prob[threshold_index]
            kept_mask = (prob < threshold).astype('int64')
            label = label * kept_mask
            valid_mask = valid_mask * kept_mask

    # mark the invalid region as ignore
    label = label + (1 - valid_mask) * self.ignore_index

    label = label.reshape((n, 1, h, w))
    valid_mask = valid_mask.reshape((n, 1, h, w)).astype('float32')
    loss = F.softmax_with_cross_entropy(
        seg_logit, label, ignore_index=self.ignore_index, axis=1)
    loss = loss * valid_mask
    avg_loss = paddle.mean(loss) / (paddle.mean(valid_mask) + self.EPS)

    label.stop_gradient = True
    valid_mask.stop_gradient = True
    return avg_loss
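A minimal numpy illustration of the OHEM thresholding above: pixels whose predicted probability for their ground-truth class is already high are dropped, keeping at least `min_kept` of the hardest pixels. All values below are made up.

import numpy as np

prob_true_class = np.array([0.9, 0.2, 0.6, 0.05, 0.8])    # per-pixel prob of the GT class
min_kept, base_thresh = 2, 0.7
idx = np.argsort(prob_true_class)                           # hardest (lowest prob) first
threshold = max(base_thresh,
                prob_true_class[idx[min(len(idx), min_kept) - 1]])
kept_mask = prob_true_class < threshold
print(kept_mask)   # [False  True  True  True False] -> only hard pixels contribute to the loss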
def soft_cross_entropy(inp, target):
    inp_likelihood = F.log_softmax(inp, axis=-1)
    target_prob = F.softmax(target, axis=-1)
    return -1. * paddle.mean(paddle.sum(inp_likelihood * target_prob, axis=-1))
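A hedged usage sketch: this is a common knowledge-distillation loss, with student logits as `inp` and (detached) teacher logits as `target`. The random tensors below are placeholders only.

import paddle

student_logits = paddle.randn([8, 10])    # [batch, num_classes]
teacher_logits = paddle.randn([8, 10])
loss = soft_cross_entropy(student_logits, teacher_logits.detach())
print(float(loss))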
def do_train(args):
    set_seed(args)
    paddle.set_device("gpu" if args.n_gpu else "cpu")
    world_size = paddle.distributed.get_world_size()
    if world_size > 1:
        paddle.distributed.init_parallel_env()

    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]

    train_dataset, dev_dataset, test_dataset = ppnlp.datasets.ChnSentiCorp.get_datasets(
        ['train', 'dev', 'test'])

    if args.model_name == 'ernie-tiny':
        # ErnieTinyTokenizer is special for the ernie-tiny pretrained model.
        tokenizer = ppnlp.transformers.ErnieTinyTokenizer.from_pretrained(
            args.model_name)
    else:
        tokenizer = tokenizer_class.from_pretrained(args.model_name)

    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         label_list=train_dataset.get_labels(),
                         max_seq_length=args.max_seq_length)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # segment
        Stack(dtype="int64")  # label
    ): [data for data in fn(samples)]
    train_data_loader = create_dataloader(train_dataset,
                                          mode='train',
                                          batch_size=args.batch_size,
                                          batchify_fn=batchify_fn,
                                          trans_fn=trans_func)
    dev_data_loader = create_dataloader(dev_dataset,
                                        mode='dev',
                                        batch_size=args.batch_size,
                                        batchify_fn=batchify_fn,
                                        trans_fn=trans_func)
    test_data_loader = create_dataloader(test_dataset,
                                         mode='test',
                                         batch_size=args.batch_size,
                                         batchify_fn=batchify_fn,
                                         trans_fn=trans_func)

    model = model_class.from_pretrained(args.model_name,
                                        num_classes=len(
                                            train_dataset.get_labels()))
    if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt):
        state_dict = paddle.load(args.init_from_ckpt)
        model.set_dict(state_dict)
    model = paddle.DataParallel(model)

    num_training_steps = len(train_data_loader) * args.epochs
    num_warmup_steps = int(args.warmup_proption * num_training_steps)

    def get_lr_factor(current_step):
        if current_step < num_warmup_steps:
            return float(current_step) / float(max(1, num_warmup_steps))
        else:
            return max(
                0.0,
                float(num_training_steps - current_step) /
                float(max(1, num_training_steps - num_warmup_steps)))

    lr_scheduler = paddle.optimizer.lr.LambdaDecay(
        args.learning_rate,
        lr_lambda=lambda current_step: get_lr_factor(current_step))
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in [
            p.name for n, p in model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        ])
    criterion = paddle.nn.loss.CrossEntropyLoss()
    metric = paddle.metric.Accuracy()

    global_step = 0
    tic_train = time.time()
    for epoch in range(1, args.epochs + 1):
        for step, batch in enumerate(train_data_loader, start=1):
            input_ids, segment_ids, labels = batch
            logits = model(input_ids, segment_ids)
            loss = criterion(logits, labels)
            probs = F.softmax(logits, axis=1)
            correct = metric.compute(probs, labels)
            metric.update(correct)
            acc = metric.accumulate()

            global_step += 1
            if global_step % 10 == 0 and paddle.distributed.get_rank() == 0:
                print(
                    "global step %d, epoch: %d, batch: %d, loss: %.5f, accu: %.5f, speed: %.2f step/s"
                    % (global_step, epoch, step, loss, acc,
                       10 / (time.time() - tic_train)))
                tic_train = time.time()
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_gradients()
            if global_step % 100 == 0 and paddle.distributed.get_rank() == 0:
                save_dir = os.path.join(args.save_dir,
                                        "model_%d" % global_step)
                if not os.path.exists(save_dir):
                    os.makedirs(save_dir)
                evaluate(model, criterion, metric, dev_data_loader)
                model._layers.save_pretrained(save_dir)
                tokenizer.save_pretrained(save_dir)

    if paddle.distributed.get_rank() == 0:
        print('Evaluating on test data.')
        evaluate(model, criterion, metric, test_data_loader)
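A quick numeric check of the warmup/linear-decay factor defined by `get_lr_factor` above, with assumed values (10 warmup steps out of 100 total): the factor ramps from 0 to 1 during warmup, then decays linearly back to 0.

num_warmup_steps, num_training_steps = 10, 100

def get_lr_factor(step):
    if step < num_warmup_steps:
        return float(step) / float(max(1, num_warmup_steps))
    return max(0.0, float(num_training_steps - step) /
               float(max(1, num_training_steps - num_warmup_steps)))

print(get_lr_factor(5), get_lr_factor(10), get_lr_factor(55), get_lr_factor(100))
# 0.5 1.0 0.5 0.0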