def __init__(
    self,
    model_name_or_path: Union[str, Path],
    top_k: int = 10,
    use_gpu: bool = True,
):
    """
    :param model_name_or_path: Directory of a saved model or the name of a public model,
        e.g. 'rocketqa-zh-dureader-cross-encoder'.
    :param top_k: The maximum number of documents to return.
    :param use_gpu: Whether to use all available GPUs or the CPU. Falls back on CPU if no GPU is available.
    """
    # Save init parameters to enable export of component config as YAML.
    self.set_config(
        model_name_or_path=model_name_or_path,
        top_k=top_k,
    )
    self.top_k = top_k

    self.devices, _ = initialize_device_settings(use_cuda=use_gpu, multi_gpu=True)
    self.transformer_model = ErnieCrossEncoder(model_name_or_path)
    self.tokenizer = ErnieTokenizer.from_pretrained(model_name_or_path)
    self.transformer_model.eval()

    if len(self.devices) > 1:
        self.model = paddle.DataParallel(self.transformer_model)
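# Hedged usage sketch: the constructor above belongs to a cross-encoder
# ranking component; the class name `ErnieRanker` below is an assumption for
# illustration, not taken from the source.
#
# ranker = ErnieRanker(
#     model_name_or_path="rocketqa-zh-dureader-cross-encoder",
#     top_k=5,
#     use_gpu=True,
# )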
def __init__(self):
    """
    Initialize with the necessary elements.
    """
    self.tokenizer = ErnieTokenizer.from_pretrained("ernie-1.0")
    self.rev_dict = self.tokenizer.vocab.idx_to_token
    self.rev_lookup = np.vectorize(lambda i: self.rev_dict[i])
    self._model = None
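# A minimal sketch (assuming paddlenlp and numpy are installed) of what the
# vectorized reverse lookup above does: it maps an array of token ids back to
# surface tokens elementwise. The sample ids are illustrative only.
import numpy as np
from paddlenlp.transformers import ErnieTokenizer

tokenizer = ErnieTokenizer.from_pretrained("ernie-1.0")
rev_dict = tokenizer.vocab.idx_to_token
rev_lookup = np.vectorize(lambda i: rev_dict[i])

sample_ids = np.array([[1, 75, 38, 2]])  # hypothetical model output
print(rev_lookup(sample_ids))            # decodes each id to its token string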
def do_predict():
    paddle.set_device(args.device)

    tokenizer = ErnieTokenizer.from_pretrained("ernie-1.0")
    label_map = load_dict(args.tag_path)
    id2label = {val: key for key, val in label_map.items()}
    model = ErnieForTokenClassification.from_pretrained("ernie-1.0", num_classes=len(label_map))

    no_entity_label = "O"
    ignore_label = len(label_map)

    print("============start predict==========")
    if not args.init_ckpt or not os.path.isfile(args.init_ckpt):
        raise Exception("init checkpoints {} not exist".format(args.init_ckpt))
    else:
        state_dict = paddle.load(args.init_ckpt)
        model.set_dict(state_dict)
        print("Loaded parameters from %s" % args.init_ckpt)

    # Load data from the predict file.
    sentences = read_by_lines(args.predict_data)  # origin data format
    sentences = [json.loads(sent) for sent in sentences]

    encoded_inputs_list = []
    for sent in sentences:
        # Replace spaces with "\002" so every character stays aligned with its
        # label after tokenization.
        sent = sent["text"].replace(" ", "\002")
        input_ids, token_type_ids, seq_len = convert_example_to_feature(
            [list(sent), []], tokenizer, max_seq_len=args.max_seq_len, is_test=True)
        encoded_inputs_list.append((input_ids, token_type_ids, seq_len))

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token], dtype='int32'),  # input_ids
        Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token], dtype='int32'),  # token_type_ids
        Stack(dtype='int64')  # sequence lens
    ): fn(samples)

    # Separate the data into batches.
    batch_encoded_inputs = [
        encoded_inputs_list[i:i + args.batch_size]
        for i in range(0, len(encoded_inputs_list), args.batch_size)
    ]

    results = []
    model.eval()
    for batch in batch_encoded_inputs:
        input_ids, token_type_ids, seq_lens = batchify_fn(batch)
        input_ids = paddle.to_tensor(input_ids)
        token_type_ids = paddle.to_tensor(token_type_ids)
        logits = model(input_ids, token_type_ids)
        probs = F.softmax(logits, axis=-1)
        probs_ids = paddle.argmax(probs, -1).numpy()
        probs = probs.numpy()
        for p_list, p_ids, seq_len in zip(probs.tolist(), probs_ids.tolist(), seq_lens.tolist()):
            prob_one = [p_list[index][pid] for index, pid in enumerate(p_ids[1:seq_len - 1])]
            label_one = [id2label[pid] for pid in p_ids[1:seq_len - 1]]
            results.append({"probs": prob_one, "labels": label_one})

    assert len(results) == len(sentences)
    for sent, ret in zip(sentences, results):
        sent["pred"] = ret
    sentences = [json.dumps(sent, ensure_ascii=False) for sent in sentences]
    write_by_lines(args.predict_save_path, sentences)
    print("save data {} to {}".format(len(sentences), args.predict_save_path))
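# Toy illustration of the batch-slicing idiom used above: chunk a flat list
# into fixed-size batches (the last batch may be shorter).
encoded = list(range(10))
batch_size = 4
batches = [encoded[i:i + batch_size] for i in range(0, len(encoded), batch_size)]
print(batches)  # [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]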
def do_predict(args):
    place = paddle.set_device("gpu")
    paddle.seed(args.seed)

    tokenizer = ErnieTokenizer.from_pretrained('ernie-1.0')

    trans_func = partial(
        convert_example,
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length,
        pad_to_max_seq_len=args.pad_to_max_seq_len)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),       # query_input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # query_segment
        Pad(axis=0, pad_val=tokenizer.pad_token_id),       # title_input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # title_segment
    ): [data for data in fn(samples)]

    valid_ds = load_dataset(read_text_pair, data_path=args.text_pair_file, lazy=False)
    valid_data_loader = create_dataloader(
        valid_ds,
        mode="predict",
        batch_size=args.batch_size,
        batchify_fn=batchify_fn,
        trans_fn=trans_func)

    pretrained_model = ErnieModel.from_pretrained("ernie-1.0")
    model = SemanticIndexingPredictor(
        pretrained_model, args.output_emb_size, dropout=args.dropout, use_fp16=args.use_fp16)
    model.eval()
    model.load(args.params_path)
    model = enable_faster_encoder(model, use_fp16=args.use_fp16)

    cosine_sims = []
    for batch_data in valid_data_loader:
        query_input_ids, query_token_type_ids, title_input_ids, title_token_type_ids = batch_data
        query_input_ids = paddle.to_tensor(query_input_ids)
        query_token_type_ids = paddle.to_tensor(query_token_type_ids)
        title_input_ids = paddle.to_tensor(title_input_ids)
        title_token_type_ids = paddle.to_tensor(title_token_type_ids)
        batch_cosine_sim = model(
            query_input_ids=query_input_ids,
            title_input_ids=title_input_ids,
            query_token_type_ids=query_token_type_ids,
            title_token_type_ids=title_token_type_ids).numpy()
        cosine_sims.append(batch_cosine_sim)

    cosine_sims = np.concatenate(cosine_sims, axis=0)
    for cosine in cosine_sims:
        print('{}'.format(cosine))
    model = disable_faster_encoder(model)
def __init__(self):
    """
    Initialize with the necessary elements.
    """
    assets_path = os.path.join(self.directory, "assets")
    gen_checkpoint_path = os.path.join(assets_path, "ernie_gen_lover_words.pdparams")
    self.model = ErnieForGeneration.from_pretrained("ernie-1.0")
    model_state = paddle.load(gen_checkpoint_path)
    self.model.set_dict(model_state)

    self.tokenizer = ErnieTokenizer.from_pretrained("ernie-1.0")
    self.rev_dict = self.tokenizer.vocab.idx_to_token
    self.rev_dict[self.tokenizer.vocab['[PAD]']] = ''  # replace [PAD]
    self.rev_dict[self.tokenizer.vocab['[UNK]']] = ''  # replace [UNK]
    self.rev_lookup = np.vectorize(lambda i: self.rev_dict[i])
def do_predict():
    paddle.set_device(args.device)

    # Read the label map.
    label_map_path = os.path.join(args.data_path, "predicate2id.json")
    if not (os.path.exists(label_map_path) and os.path.isfile(label_map_path)):
        sys.exit("{} does not exist or is not a file.".format(label_map_path))
    with open(label_map_path, 'r', encoding='utf8') as fp:
        label_map = json.load(fp)
    # Each real predicate contributes two classes; the two auxiliary labels are kept as-is.
    num_classes = (len(label_map.keys()) - 2) * 2 + 2

    # Load the pretrained ERNIE model.
    model = ErnieForTokenClassification.from_pretrained("ernie-1.0", num_classes=num_classes)
    tokenizer = ErnieTokenizer.from_pretrained("ernie-1.0")
    criterion = BCELossForDuIE()

    # Load the dataset.
    test_dataset = DuIEDataset.from_file(args.predict_data_file, tokenizer,
                                         args.max_seq_length, True)
    collator = DataCollator()
    test_batch_sampler = paddle.io.BatchSampler(
        test_dataset, batch_size=args.batch_size, shuffle=False, drop_last=True)
    test_data_loader = DataLoader(
        dataset=test_dataset,
        batch_sampler=test_batch_sampler,
        collate_fn=collator,
        return_list=True)

    # Load model parameters.
    if not (os.path.exists(args.init_checkpoint) and os.path.isfile(args.init_checkpoint)):
        sys.exit("wrong directory: init checkpoints {} not exist".format(args.init_checkpoint))
    state_dict = paddle.load(args.init_checkpoint)
    model.set_dict(state_dict)

    # Run predictions.
    print("\n=====start predicting=====")
    evaluate(model, criterion, test_data_loader, args.predict_data_file, "predict")
    print("=====predicting complete=====")
def __init__(self, line=4, word=7):
    """
    Initialize with the necessary elements.
    """
    if line not in [4, 8]:
        raise ValueError("The line can only be 4 or 8.")
    if word not in [5, 7]:
        raise ValueError("The word can only be 5 or 7.")
    self.line = line

    assets_path = os.path.join(self.directory, "assets")
    gen_checkpoint_path = os.path.join(
        assets_path, "ernie_gen_acrostic_poetry_L%sW%s.pdparams" % (line, word))
    self.model = ErnieForGeneration.from_pretrained("ernie-1.0")
    model_state = paddle.load(gen_checkpoint_path)
    self.model.set_dict(model_state)

    self.tokenizer = ErnieTokenizer.from_pretrained("ernie-1.0")
    self.rev_dict = self.tokenizer.vocab.idx_to_token
    self.rev_dict[self.tokenizer.vocab['[PAD]']] = ''  # replace [PAD]
    self.rev_dict[self.tokenizer.vocab['[UNK]']] = ''  # replace [UNK]
    self.rev_lookup = np.vectorize(lambda i: self.rev_dict[i])
def __init__(self, model_config):
    super(ErnieInputEncoderV2, self).__init__()

    self.config = model_config
    self.enc_value_with_col = model_config.enc_value_with_col
    if model_config.pretrain_model_type == 'BERT':
        self.tokenizer = BertTokenizer.from_pretrained(model_config.pretrain_model)
        self.special_token_dict = {
            'table': '[unused1]',
            'column': '[unused2]',
            'value': '[unused3]',
            'text': '[unused11]',
            'real': '[unused12]',
            'number': '[unused13]',
            'time': '[unused14]',
            'binary': '[unused15]',
            'boolean': '[unused16]',
            'bool': '[unused17]',
            'others': '[unused18]',
        }
    else:
        self.tokenizer = ErnieTokenizer.from_pretrained(model_config.pretrain_model)
        # Low-frequency tokens are repurposed as special tokens.
        # Another candidate: overchicstoretvhome
        self.special_token_dict = {
            'table': 'blogabstract',
            'column': 'wx17house',
            'value': 'fluke62max',
            'text': 'googlemsn',
            'real': 'sputniknews',
            'number': 'sputniknews',
            'time': 'pixstyleme3c',
            'binary': 'pixnetfacebookyahoo',
            'boolean': 'pixnetfacebookyahoo',
            'bool': 'pixnetfacebookyahoo',
            'others': 'ubuntuforumwikilinuxpastechat',
        }

    self._need_bool_value = self.config.grammar_type != 'nl2sql'
def prepare(self):
    import paddle
    import paddlenlp
    from paddlenlp.data import Stack, Tuple, Pad
    from paddlenlp.transformers import ErnieForSequenceClassification, ErnieTokenizer
    from tutorials.assets.utils import convert_example, aggregate_subwords_and_importances

    MODEL_NAME = "ernie-2.0-base-en"
    model = ErnieForSequenceClassification.from_pretrained(MODEL_NAME, num_classes=2)
    tokenizer = ErnieTokenizer.from_pretrained(MODEL_NAME)
    self.paddle_model = model
    self.tokenizer = tokenizer

    def preprocess_fn(data):
        examples = []
        if not isinstance(data, list):
            data = [data]
        for text in data:
            input_ids, segment_ids = convert_example(
                text, tokenizer, max_seq_length=128, is_test=True)
            examples.append((input_ids, segment_ids))

        batchify_fn = lambda samples, fn=Tuple(
            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input id
            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # segment id
        ): fn(samples)

        input_ids, segment_ids = batchify_fn(examples)
        return paddle.to_tensor(input_ids, stop_gradient=False), paddle.to_tensor(
            segment_ids, stop_gradient=False)

    self.preprocess_fn = preprocess_fn
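# Self-contained sketch (independent of the tutorial helpers above) of what
# convert_example roughly produces: the tokenizer itself already returns the
# two id sequences that preprocess_fn batches together. The sample sentence is
# illustrative only.
from paddlenlp.transformers import ErnieTokenizer

tokenizer = ErnieTokenizer.from_pretrained("ernie-2.0-base-en")
encoded = tokenizer("the movie was great", max_seq_len=128)
print(encoded["input_ids"])       # token ids, wrapped in [CLS] ... [SEP]
print(encoded["token_type_ids"])  # all zeros for a single-segment input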
def do_train():
    paddle.set_device(args.device)
    world_size = paddle.distributed.get_world_size()
    rank = paddle.distributed.get_rank()
    if world_size > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args)

    no_entity_label = "O"
    ignore_label = -1

    tokenizer = ErnieTokenizer.from_pretrained("ernie-1.0")
    label_map = load_dict(args.tag_path)
    id2label = {val: key for key, val in label_map.items()}
    model = ErnieForTokenClassification.from_pretrained("ernie-1.0", num_classes=len(label_map))
    model = paddle.DataParallel(model)

    print("============start train==========")
    train_ds = DuEventExtraction(args.train_data, args.tag_path)
    dev_ds = DuEventExtraction(args.dev_data, args.tag_path)
    test_ds = DuEventExtraction(args.test_data, args.tag_path)

    trans_func = partial(
        convert_example_to_feature,
        tokenizer=tokenizer,
        label_vocab=train_ds.label_vocab,
        max_seq_len=args.max_seq_len,
        no_entity_label=no_entity_label,
        ignore_label=ignore_label,
        is_test=False)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token], dtype='int32'),  # input ids
        Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token], dtype='int32'),  # token type ids
        Stack(dtype='int64'),  # sequence lens
        Pad(axis=0, pad_val=ignore_label, dtype='int64')  # labels
    ): fn(list(map(trans_func, samples)))

    batch_sampler = paddle.io.DistributedBatchSampler(
        train_ds, batch_size=args.batch_size, shuffle=True)
    train_loader = paddle.io.DataLoader(
        dataset=train_ds, batch_sampler=batch_sampler, collate_fn=batchify_fn)
    dev_loader = paddle.io.DataLoader(
        dataset=dev_ds, batch_size=args.batch_size, collate_fn=batchify_fn)
    test_loader = paddle.io.DataLoader(
        dataset=test_ds, batch_size=args.batch_size, collate_fn=batchify_fn)

    num_training_steps = len(train_loader) * args.num_epoch

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=args.learning_rate,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    metric = ChunkEvaluator(label_list=train_ds.label_vocab.keys(), suffix=False)
    criterion = paddle.nn.loss.CrossEntropyLoss(ignore_index=ignore_label)

    step, best_f1 = 0, 0.0
    model.train()
    for epoch in range(args.num_epoch):
        for idx, (input_ids, token_type_ids, seq_lens, labels) in enumerate(train_loader):
            logits = model(input_ids, token_type_ids).reshape([-1, train_ds.label_num])
            loss = paddle.mean(criterion(logits, labels.reshape([-1])))
            loss.backward()
            optimizer.step()
            optimizer.clear_grad()
            loss_item = loss.numpy().item()
            if step > 0 and step % args.skip_step == 0 and rank == 0:
                print(f'train epoch: {epoch} - step: {step} (total: {num_training_steps}) - loss: {loss_item:.6f}')
            if step > 0 and step % args.valid_step == 0 and rank == 0:
                p, r, f1, avg_loss = evaluate(model, criterion, metric, len(label_map), dev_loader)
                print(f'dev step: {step} - loss: {avg_loss:.5f}, precision: {p:.5f}, recall: {r:.5f}, '
                      f'f1: {f1:.5f} current best {best_f1:.5f}')
                if f1 > best_f1:
                    best_f1 = f1
                    print(f'==============================================save best model '
                          f'best performance {best_f1:.5f}')
                    paddle.save(model.state_dict(), '{}/best.pdparams'.format(args.checkpoints))
            step += 1

    # Save the final model.
    if rank == 0:
        paddle.save(model.state_dict(), '{}/final.pdparams'.format(args.checkpoints))
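# Self-contained sketch of the decay filter above: parameter *paths* containing
# "bias" or "norm" are excluded from weight decay, so only true weight matrices
# decay. The toy layer names are chosen to mirror transformer naming.
import paddle

class Toy(paddle.nn.Layer):
    def __init__(self):
        super().__init__()
        self.linear = paddle.nn.Linear(4, 4)
        self.norm = paddle.nn.LayerNorm(4)

model = Toy()
decay_params = [
    p.name for n, p in model.named_parameters()
    if not any(nd in n for nd in ["bias", "norm"])
]
print(decay_params)  # only the Linear weight survives the filter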
""" def __call__(self, examples: List[Dict[str, Union[list, np.ndarray]]]): batched_input_ids = np.stack([x['input_ids'] for x in examples]) seq_lens = np.stack([x['seq_lens'] for x in examples]) tok_to_orig_start_index = np.stack( [x['tok_to_orig_start_index'] for x in examples]) tok_to_orig_end_index = np.stack( [x['tok_to_orig_end_index'] for x in examples]) labels = np.stack([x['labels'] for x in examples]) return (batched_input_ids, seq_lens, tok_to_orig_start_index, tok_to_orig_end_index, labels) if __name__ == "__main__": tokenizer = ErnieTokenizer.from_pretrained("ernie-1.0") d = DuIEDataset.from_file("./data/train_data.json", tokenizer) sampler = paddle.io.RandomSampler(data_source=d) batch_sampler = paddle.io.BatchSampler(sampler=sampler, batch_size=2) collator = DataCollator() loader = paddle.io.DataLoader(dataset=d, batch_sampler=batch_sampler, collate_fn=collator, return_list=True) for dd in loader(): model_input = { "input_ids": dd[0], "seq_len": dd[1], "tok_to_orig_start_index": dd[2], "tok_to_orig_end_index": dd[3],
def train():
    paddle.set_device("gpu" if args.n_gpu else "cpu")
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    model = ErnieForGeneration.from_pretrained(args.model_name_or_path)
    if "ernie-tiny" in args.model_name_or_path:
        tokenizer = ErnieTinyTokenizer.from_pretrained(args.model_name_or_path)
    elif "ernie" in args.model_name_or_path:
        tokenizer = ErnieTokenizer.from_pretrained(args.model_name_or_path)
    elif "roberta" in args.model_name_or_path or "rbt" in args.model_name_or_path:
        tokenizer = RobertaTokenizer.from_pretrained(args.model_name_or_path)
    elif "electra" in args.model_name_or_path:
        tokenizer = ElectraTokenizer.from_pretrained(args.model_name_or_path)
    else:
        tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)
    if args.init_checkpoint:
        model_state = paddle.load(args.init_checkpoint)
        model.set_state_dict(model_state)

    train_dataset, dev_dataset = Poetry.get_datasets(['train', 'dev'])

    attn_id = tokenizer.vocab['[ATTN]'] if '[ATTN]' in tokenizer.vocab else tokenizer.vocab['[MASK]']
    tgt_type_id = model.sent_emb.weight.shape[0] - 1

    trans_func = convert_example(
        tokenizer=tokenizer,
        attn_id=attn_id,
        tgt_type_id=tgt_type_id,
        max_encode_len=args.max_encode_len,
        max_decode_len=args.max_decode_len,
        noise_prob=args.noise_prob,
        use_random_noice=args.use_random_noice)

    train_dataset = train_dataset.apply(trans_func, lazy=True)
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_dataset, batch_size=args.batch_size, shuffle=True)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # src_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # src_pids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # src_sids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_pids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_sids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # attn_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_labels
    ): after_padding(fn(samples))
    train_data_loader = DataLoader(
        dataset=train_dataset,
        batch_sampler=train_batch_sampler,
        collate_fn=batchify_fn,
        num_workers=0,
        return_list=True)

    dev_dataset = dev_dataset.apply(trans_func, lazy=True)
    dev_batch_sampler = paddle.io.BatchSampler(
        dev_dataset, batch_size=args.batch_size, shuffle=False)
    dev_data_loader = DataLoader(
        dataset=dev_dataset,
        batch_sampler=dev_batch_sampler,
        collate_fn=batchify_fn,
        num_workers=0,
        return_list=True)

    label_num = model.word_emb.weight.shape[0]
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    max_steps = len(train_data_loader) * args.num_epochs
    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, max_steps, args.warmup_proportion)
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        grad_clip=nn.ClipGradByGlobalNorm(1.0),
        apply_decay_param_fun=lambda x: x in [
            p.name for n, p in model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        ])

    rouge1 = Rouge1()
    rouge2 = Rouge2()

    global_step = 1
    tic_train = time.time()
    for epoch in range(args.num_epochs):
        for step, batch in enumerate(train_data_loader, start=1):
            (src_ids, src_sids, src_pids, tgt_ids, tgt_sids, tgt_pids, attn_ids,
             mask_src_2_src, mask_tgt_2_srctgt, mask_attn_2_srctgtattn,
             tgt_labels, _) = batch
            # Encode the source once and cache its key/value states.
            _, __, info = model(
                src_ids,
                sent_ids=src_sids,
                pos_ids=src_pids,
                attn_bias=mask_src_2_src,
                encode_only=True)
            cached_k, cached_v = info['caches']
            # Encode the target with the source cache attached.
            _, __, info = model(
                tgt_ids,
                sent_ids=tgt_sids,
                pos_ids=tgt_pids,
                attn_bias=mask_tgt_2_srctgt,
                past_cache=(cached_k, cached_v),
                encode_only=True)
            cached_k2, cached_v2 = info['caches']
            past_cache_k = [paddle.concat([k, k2], 1) for k, k2 in zip(cached_k, cached_k2)]
            past_cache_v = [paddle.concat([v, v2], 1) for v, v2 in zip(cached_v, cached_v2)]
            if args.label_smooth > 0.:
                tgt_labels = nn.functional.label_smooth(
                    nn.functional.one_hot(tgt_labels, label_num),
                    epsilon=args.label_smooth)
            loss, _, __ = model(
                attn_ids,
                sent_ids=tgt_sids,
                pos_ids=tgt_pids,
                attn_bias=mask_attn_2_srctgtattn,
                past_cache=(past_cache_k, past_cache_v),
                tgt_labels=tgt_labels,
                tgt_pos=paddle.nonzero(attn_ids == attn_id))

            if global_step % args.logging_steps == 0:
                if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
                    logger.info(
                        "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s, lr: %.3e"
                        % (global_step, epoch, step, loss,
                           args.logging_steps / (time.time() - tic_train),
                           lr_scheduler.get_lr()))
                    tic_train = time.time()

            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_gradients()
            if global_step % args.save_steps == 0 and (
                    (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0):
                evaluate(model, dev_data_loader, tokenizer, rouge1, rouge2,
                         attn_id, tgt_type_id, args)
                output_dir = os.path.join(args.output_dir, "model_%d" % global_step)
                if not os.path.exists(output_dir):
                    os.makedirs(output_dir)
                model_to_save = model._layers if isinstance(model, paddle.DataParallel) else model
                model_to_save.save_pretrained(output_dir)
                tokenizer.save_pretrained(output_dir)
            global_step += 1
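# Hedged sketch of the label-smoothing step above, with a toy label tensor
# standing in for tgt_labels (5 classes, epsilon=0.1).
import paddle
import paddle.nn.functional as F

labels = paddle.to_tensor([1, 3])
one_hot = F.one_hot(labels, num_classes=5)
smoothed = F.label_smooth(one_hot, epsilon=0.1)
# Each off-target class gets epsilon / 5 = 0.02; the target keeps
# 1 - epsilon + epsilon / 5 = 0.92.
print(smoothed)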
    sentences = [example[0] for example in ds.data]
    results = parse_decodes(sentences, all_preds, all_lens, label_vocab)
    return results


if __name__ == '__main__':
    paddle.set_device(args.device)

    # Create dataset, tokenizer and dataloader.
    train_ds, dev_ds, test_ds = load_dataset(datafiles=(
        os.path.join(args.data_dir, 'train.txt'),
        os.path.join(args.data_dir, 'dev.txt'),
        os.path.join(args.data_dir, 'test.txt')))
    label_vocab = load_dict(os.path.join(args.data_dir, 'tag.dic'))
    tokenizer = ErnieTokenizer.from_pretrained('ernie-1.0')

    trans_func = partial(convert_to_features, tokenizer=tokenizer, label_vocab=label_vocab)
    train_ds.map(trans_func)
    dev_ds.map(trans_func)
    test_ds.map(trans_func)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int32'),       # input_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int32'),  # token_type_ids
        Stack(dtype='int64'),  # seq_len
def predict():
    paddle.set_device("gpu" if args.use_gpu else "cpu")

    model = ErnieForGeneration.from_pretrained(args.model_name_or_path)
    if "ernie-tiny" in args.model_name_or_path:
        tokenizer = ErnieTinyTokenizer.from_pretrained(args.model_name_or_path)
    elif "ernie" in args.model_name_or_path:
        tokenizer = ErnieTokenizer.from_pretrained(args.model_name_or_path)
    elif "roberta" in args.model_name_or_path or "rbt" in args.model_name_or_path:
        tokenizer = RobertaTokenizer.from_pretrained(args.model_name_or_path)
    elif "electra" in args.model_name_or_path:
        tokenizer = ElectraTokenizer.from_pretrained(args.model_name_or_path)
    else:
        tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)

    dev_dataset = Poetry.get_datasets(['dev'])
    attn_id = tokenizer.vocab['[ATTN]'] if '[ATTN]' in tokenizer.vocab else tokenizer.vocab['[MASK]']
    tgt_type_id = model.sent_emb.weight.shape[0] - 1

    trans_func = convert_example(
        tokenizer=tokenizer,
        attn_id=attn_id,
        tgt_type_id=tgt_type_id,
        max_encode_len=args.max_encode_len,
        max_decode_len=args.max_decode_len)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # src_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # src_pids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # src_sids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_pids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_sids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # attn_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_labels
    ): after_padding(fn(samples))
    dev_dataset = dev_dataset.apply(trans_func, lazy=True)
    test_batch_sampler = paddle.io.BatchSampler(
        dev_dataset, batch_size=args.batch_size, shuffle=False)
    data_loader = DataLoader(
        dataset=dev_dataset,
        batch_sampler=test_batch_sampler,
        collate_fn=batchify_fn,
        num_workers=0,
        return_list=True)

    if args.init_checkpoint:
        model_state = paddle.load(args.init_checkpoint)
        model.set_state_dict(model_state)

    model.eval()
    vocab = tokenizer.vocab
    eos_id = vocab[tokenizer.sep_token]
    sos_id = vocab[tokenizer.cls_token]
    pad_id = vocab[tokenizer.pad_token]
    unk_id = vocab[tokenizer.unk_token]
    vocab_size = len(vocab)
    evaluated_sentences = []
    evaluated_sentences_ids = []
    logger.info("Predicting...")
    for data in data_loader:
        (src_ids, src_sids, src_pids, _, _, _, _, _, _, _, _,
         raw_tgt_labels) = data  # never use the target during inference
        # Use greedy_search_infilling or beam_search_infilling to get predictions.
        output_ids = beam_search_infilling(
            model,
            src_ids,
            src_sids,
            eos_id=eos_id,
            sos_id=sos_id,
            attn_id=attn_id,
            pad_id=pad_id,
            unk_id=unk_id,
            vocab_size=vocab_size,
            max_decode_len=args.max_decode_len,
            max_encode_len=args.max_encode_len,
            beam_width=args.beam_width,
            length_penalty=args.length_penalty,
            tgt_type_id=tgt_type_id)

        for source_ids, target_ids, predict_ids in zip(
                src_ids.numpy().tolist(), raw_tgt_labels.numpy().tolist(), output_ids.tolist()):
            if eos_id in predict_ids:
                predict_ids = predict_ids[:predict_ids.index(eos_id)]
            source_sentence = ''.join(
                map(post_process, vocab.to_tokens(source_ids[1:source_ids.index(eos_id)])))
            tgt_sentence = ''.join(
                map(post_process, vocab.to_tokens(target_ids[1:target_ids.index(eos_id)])))
            predict_sentence = ''.join(map(post_process, vocab.to_tokens(predict_ids)))
            print("source :%s\ntarget :%s\npredict:%s\n"
                  % (source_sentence, tgt_sentence, predict_sentence))
def train():
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    model = ErnieForGeneration.from_pretrained(args.model_name_or_path)
    if "ernie-tiny" in args.model_name_or_path:
        tokenizer = ErnieTinyTokenizer.from_pretrained(args.model_name_or_path)
    elif "ernie" in args.model_name_or_path:
        tokenizer = ErnieTokenizer.from_pretrained(args.model_name_or_path)
    elif "roberta" in args.model_name_or_path or "rbt" in args.model_name_or_path:
        tokenizer = RobertaTokenizer.from_pretrained(args.model_name_or_path)
    elif "electra" in args.model_name_or_path:
        tokenizer = ElectraTokenizer.from_pretrained(args.model_name_or_path)
    else:
        tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)
    if args.init_checkpoint:
        model_state = paddle.load(args.init_checkpoint)
        model.set_state_dict(model_state)

    train_dataset, dev_dataset = load_dataset('poetry', splits=('train', 'dev'), lazy=False)

    attn_id = tokenizer.vocab['[ATTN]'] if '[ATTN]' in tokenizer.vocab else tokenizer.vocab['[MASK]']
    tgt_type_id = model.sent_emb.weight.shape[0] - 1

    trans_func = convert_example(
        tokenizer=tokenizer,
        attn_id=attn_id,
        tgt_type_id=tgt_type_id,
        max_encode_len=args.max_encode_len,
        max_decode_len=args.max_decode_len,
        noise_prob=args.noise_prob,
        use_random_noice=args.use_random_noice)

    train_dataset = train_dataset.map(trans_func)
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_dataset, batch_size=args.batch_size, shuffle=True)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),       # src_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),       # src_pids
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # src_tids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),       # tgt_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),       # tgt_pids
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # tgt_tids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),       # attn_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),       # tgt_labels
    ): after_padding(fn(samples))
    train_data_loader = DataLoader(
        dataset=train_dataset,
        batch_sampler=train_batch_sampler,
        collate_fn=batchify_fn,
        num_workers=0,
        return_list=True)
    dev_dataset = dev_dataset.map(trans_func)
    dev_data_loader = DataLoader(
        dataset=dev_dataset,
        batch_size=args.batch_size,
        collate_fn=batchify_fn,
        num_workers=0,
        return_list=True)

    label_num = model.word_emb.weight.shape[0]
    train_model = StackModel(model)
    if paddle.distributed.get_world_size() > 1:
        # All 'forward' outputs derived from the module parameters used in
        # DataParallel must participate in the loss and the subsequent gradient
        # calculations, so StackModel makes the model output only the loss in
        # its 'forward' function.
        train_model = paddle.DataParallel(train_model)

    max_steps = len(train_data_loader) * args.num_epochs
    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, max_steps, args.warmup_proportion)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        grad_clip=nn.ClipGradByGlobalNorm(1.0),
        apply_decay_param_fun=lambda x: x in decay_params)

    rouge1 = Rouge1()
    rouge2 = Rouge2()

    global_step = 1
    tic_train = time.time()
    for epoch in range(args.num_epochs):
        for step, batch in enumerate(train_data_loader, start=1):
            (src_ids, src_tids, src_pids, tgt_ids, tgt_tids, tgt_pids, attn_ids,
             mask_src_2_src, mask_tgt_2_srctgt, mask_attn_2_srctgtattn,
             tgt_labels, _) = batch
            if args.label_smooth > 0.:
                tgt_labels = nn.functional.label_smooth(
                    nn.functional.one_hot(tgt_labels, label_num),
                    epsilon=args.label_smooth)
            tgt_pos = paddle.nonzero(attn_ids == attn_id)
            loss = train_model(src_ids, src_tids, src_pids, tgt_ids, tgt_tids,
                               tgt_pids, attn_ids, mask_src_2_src,
                               mask_tgt_2_srctgt, mask_attn_2_srctgtattn,
                               tgt_labels, tgt_pos)
            if global_step % args.logging_steps == 0:
                if paddle.distributed.get_rank() == 0:
                    logger.info(
                        "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s, lr: %.3e"
                        % (global_step, epoch, step, loss,
                           args.logging_steps / (time.time() - tic_train),
                           lr_scheduler.get_lr()))
                    tic_train = time.time()

            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            if global_step % args.save_steps == 0 and paddle.distributed.get_rank() == 0:
                evaluate(model, dev_data_loader, tokenizer, rouge1, rouge2,
                         attn_id, tgt_type_id, args)
                output_dir = os.path.join(args.output_dir, "model_%d" % global_step)
                if not os.path.exists(output_dir):
                    os.makedirs(output_dir)
                model_to_save = model._layers if isinstance(model, paddle.DataParallel) else model
                model_to_save.save_pretrained(output_dir)
                tokenizer.save_pretrained(output_dir)
            global_step += 1
def evaluate():
    paddle.set_device("gpu" if args.use_gpu else "cpu")

    model = ErnieForGeneration.from_pretrained(args.model_name_or_path)
    if "ernie-tiny" in args.model_name_or_path:
        tokenizer = ErnieTinyTokenizer.from_pretrained(args.model_name_or_path)
    elif "ernie" in args.model_name_or_path:
        tokenizer = ErnieTokenizer.from_pretrained(args.model_name_or_path)
    elif "roberta" in args.model_name_or_path or "rbt" in args.model_name_or_path:
        tokenizer = RobertaTokenizer.from_pretrained(args.model_name_or_path)
    elif "electra" in args.model_name_or_path:
        tokenizer = ElectraTokenizer.from_pretrained(args.model_name_or_path)
    else:
        tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)

    dev_dataset = Poetry.get_datasets(['dev'])
    attn_id = tokenizer.vocab['[ATTN]'] if '[ATTN]' in tokenizer.vocab else tokenizer.vocab['[MASK]']
    tgt_type_id = model.sent_emb.weight.shape[0] - 1

    trans_func = convert_example(
        tokenizer=tokenizer,
        attn_id=attn_id,
        tgt_type_id=tgt_type_id,
        max_encode_len=args.max_encode_len,
        max_decode_len=args.max_decode_len)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # src_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # src_pids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # src_sids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_pids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_sids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # attn_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # tgt_labels
    ): after_padding(fn(samples))
    dev_dataset = dev_dataset.apply(trans_func, lazy=True)
    dev_batch_sampler = paddle.io.BatchSampler(
        dev_dataset, batch_size=args.batch_size, shuffle=False)
    data_loader = DataLoader(
        dataset=dev_dataset,
        batch_sampler=dev_batch_sampler,
        collate_fn=batchify_fn,
        num_workers=0,
        return_list=True)

    rouge1 = Rouge1()
    rouge2 = Rouge2()

    if args.init_checkpoint:
        model_state = paddle.load(args.init_checkpoint)
        model.set_state_dict(model_state)

    model.eval()
    vocab = tokenizer.vocab
    eos_id = vocab[tokenizer.sep_token]
    sos_id = vocab[tokenizer.cls_token]
    pad_id = vocab[tokenizer.pad_token]
    unk_id = vocab[tokenizer.unk_token]
    vocab_size = len(vocab)
    evaluated_sentences_ids = []
    reference_sentences_ids = []
    logger.info("Evaluating...")
    for data in tqdm(data_loader):
        (src_ids, src_sids, src_pids, _, _, _, _, _, _, _, _,
         raw_tgt_labels) = data  # never use the target during inference
        # Use greedy_search_infilling or beam_search_infilling to get predictions.
        output_ids = beam_search_infilling(
            model,
            src_ids,
            src_sids,
            eos_id=eos_id,
            sos_id=sos_id,
            attn_id=attn_id,
            pad_id=pad_id,
            unk_id=unk_id,
            vocab_size=vocab_size,
            max_decode_len=args.max_decode_len,
            max_encode_len=args.max_encode_len,
            beam_width=args.beam_width,
            length_penalty=args.length_penalty,
            tgt_type_id=tgt_type_id)

        for ids in output_ids.tolist():
            if eos_id in ids:
                ids = ids[:ids.index(eos_id)]
            evaluated_sentences_ids.append(ids)

        for ids in raw_tgt_labels.numpy().tolist():
            ids = ids[:ids.index(eos_id)]
            reference_sentences_ids.append(ids)

    score1 = rouge1.score(evaluated_sentences_ids, reference_sentences_ids)
    score2 = rouge2.score(evaluated_sentences_ids, reference_sentences_ids)
    logger.info("Rouge-1: %.5f ,Rouge-2: %.5f" % (score1 * 100, score2 * 100))
def do_predict(args):
    paddle.set_device(args.device)

    pinyin_vocab = Vocab.load_vocabulary(
        args.pinyin_vocab_file_path, unk_token='[UNK]', pad_token='[PAD]')
    tokenizer = ErnieTokenizer.from_pretrained(args.model_name_or_path)
    ernie = ErnieModel.from_pretrained(args.model_name_or_path)
    model = ErnieForCSC(
        ernie,
        pinyin_vocab_size=len(pinyin_vocab),
        pad_pinyin_id=pinyin_vocab[pinyin_vocab.pad_token])

    eval_ds = load_dataset(read_test_ds, data_path=args.test_file, lazy=False)
    trans_func = partial(
        convert_example,
        tokenizer=tokenizer,
        pinyin_vocab=pinyin_vocab,
        max_seq_length=args.max_seq_length,
        is_test=True)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int64'),       # input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int64'),  # segment
        Pad(axis=0, pad_val=pinyin_vocab.token_to_idx[pinyin_vocab.pad_token],
            dtype='int64'),  # pinyin
        Stack(axis=0, dtype='int64'),  # length
    ): [data for data in fn(samples)]

    test_data_loader = create_dataloader(
        eval_ds,
        mode='test',
        batch_size=args.batch_size,
        batchify_fn=batchify_fn,
        trans_fn=trans_func)

    if args.ckpt_path:
        model_dict = paddle.load(args.ckpt_path)
        model.set_dict(model_dict)
        logger.info("Load model from checkpoints: {}".format(args.ckpt_path))

    model.eval()
    corr_preds = []
    det_preds = []
    lengths = []
    for step, batch in enumerate(test_data_loader):
        input_ids, token_type_ids, pinyin_ids, length = batch
        det_error_probs, corr_logits = model(input_ids, pinyin_ids, token_type_ids)
        # corr_logits shape: [B, T, V]
        det_pred = det_error_probs.argmax(axis=-1).numpy()
        char_preds = corr_logits.argmax(axis=-1).numpy()
        length = length.numpy()

        corr_preds += [pred for pred in char_preds]
        det_preds += [prob for prob in det_pred]
        lengths += [l for l in length]

    write_sighan_result_to_file(args, corr_preds, det_preds, lengths, tokenizer)
token_example[ "start_positions"] = token_start_index - 1 while offsets[token_end_index][1] >= end_char: token_end_index -= 1 token_example["end_positions"] = token_end_index + 1 token_example["answerable_label"] = 1 return tokenized_examples if __name__ == "__main__": from paddlenlp.transformers import ErnieTokenizer data_path = "./dataset/dev.json" pretrain_model_path = "./finetuned_model" tokenizer = ErnieTokenizer.from_pretrained(pretrain_model_path) a = tokenizer(["今天天气不错", "酿豆腐按到法"], ["如何进行", "你在吗"], stride=5, max_seq_len=10) da = DataHelper(tokenizer=tokenizer, batch_size=2, doc_stride=128, max_seq_length=512) data_loader = da.get_iterator(data_path, part_feature=False) samples_label = next(iter(data_loader)) print(len(samples_label)) data_loader = da.get_iterator(data_path, part_feature=True) samples_no_label = next(iter(data_loader)) print(len(samples_no_label))
    if isinstance(datafiles, str):
        return MapDataset(list(read(datafiles)))
    elif isinstance(datafiles, (list, tuple)):
        return [MapDataset(list(read(datafile))) for datafile in datafiles]


train_ds, dev_ds, test_ds = load_dataset(datafiles=(
    './waybill_data/train.txt', './waybill_data/dev.txt', './waybill_data/test.txt'))
label_vocab = load_dict('./conf/tag.dic')

# Set the name of the pretrained model to use.
MODEL_NAME = "ernie-1.0"
tokenizer = ErnieTokenizer.from_pretrained(MODEL_NAME)

trans_func = partial(convert_example, tokenizer=tokenizer, label_vocab=label_vocab)
train_ds.map(trans_func)
dev_ds.map(trans_func)
test_ds.map(trans_func)

ignore_label = -1
batchify_fn = lambda samples, fn=Tuple(
    Pad(axis=0, pad_val=tokenizer.pad_token_id),       # input_ids
    Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # token_type_ids
    Stack(),                                           # seq_len
    Pad(axis=0, pad_val=ignore_label)                  # labels
): fn(samples)
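# Minimal, self-contained illustration of the Tuple/Pad/Stack collation pattern
# above, using toy features instead of real tokenizer output.
from paddlenlp.data import Pad, Stack, Tuple

samples = [
    ([1, 5, 7], [0, 0, 0], 3, [2, 4, 9]),
    ([1, 6], [0, 0], 2, [2, 4]),
]
demo_batchify_fn = lambda samples, fn=Tuple(
    Pad(axis=0, pad_val=0),   # input_ids
    Pad(axis=0, pad_val=0),   # token_type_ids
    Stack(),                  # seq_len
    Pad(axis=0, pad_val=-1),  # labels, padded with ignore_label
): fn(samples)

input_ids, token_type_ids, seq_len, labels = demo_batchify_fn(samples)
print(input_ids)  # [[1 5 7] [1 6 0]]: shorter samples padded to max length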
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--input_file",
        default=None,
        type=str,
        required=True,
        help="The input train corpus: either a directory with .txt files or a path to a single file.")
    parser.add_argument(
        "--output_file",
        default=None,
        type=str,
        required=True,
        help="The output file where the pretraining data will be written.")
    parser.add_argument(
        "--model_name",
        choices=['bert-base-uncased', 'bert-base-chinese', 'bert-wwm-chinese', 'ernie-1.0'],
        default="bert-base-chinese",
        help="Select which model to pretrain, defaults to bert-base-chinese.")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help="The maximum total input sequence length after WordPiece tokenization.\n"
        "Sequences longer than this will be truncated, and sequences shorter\n"
        "than this will be padded.")
    parser.add_argument(
        "--max_word_length",
        default=4,
        type=int,
        help="The maximum total Chinese character length in a word.")
    parser.add_argument(
        "--dupe_factor",
        default=10,
        type=int,
        help="Number of times to duplicate the input data (with different masks).")
    parser.add_argument(
        "--max_predictions_per_seq",
        default=20,
        type=int,
        help="Maximum number of masked LM predictions per sequence.")
    parser.add_argument(
        "--masked_lm_prob",
        default=0.15,
        type=float,
        help="Masked LM probability.")
    parser.add_argument(
        "--short_seq_prob",
        default=0.1,
        type=float,
        help="Probability to create a sequence shorter than the maximum sequence length.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        default=True,
        help="Whether to lower case the input text. True for uncased models, False for cased models.")
    parser.add_argument(
        '--random_seed',
        type=int,
        default=10000,
        help="Random seed for initialization.")
    parser.add_argument(
        '--check',
        action='store_true',
        default=False,
        help="Whether to check the pretraining data creation.")
    args = parser.parse_args()

    if args.model_name.startswith("bert"):
        tokenizer = BertTokenizer.from_pretrained(
            args.model_name, do_lower_case=args.do_lower_case)
    elif args.model_name.startswith("ernie"):
        tokenizer = ErnieTokenizer.from_pretrained(args.model_name)

    input_files = []
    if os.path.isfile(args.input_file):
        input_files.append(args.input_file)
    elif os.path.isdir(args.input_file):
        input_files = [
            os.path.join(args.input_file, f)
            for f in os.listdir(args.input_file)
            if (os.path.isfile(os.path.join(args.input_file, f)) and f.endswith('.txt'))
        ]
    else:
        raise ValueError("{} is not a valid path".format(args.input_file))

    rng = random.Random(args.random_seed)
    instances = create_training_instances(
        input_files, tokenizer, args.model_name, args.max_seq_length,
        args.max_word_length, args.dupe_factor, args.short_seq_prob,
        args.masked_lm_prob, args.max_predictions_per_seq, rng)

    write_instance_to_example_file(instances, tokenizer, args.max_seq_length,
                                   args.max_predictions_per_seq, args.output_file)
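# Illustrative invocation (the script name and paths are placeholders, not
# taken from the source):
#
#   python create_pretraining_data.py \
#       --input_file ./corpus \
#       --output_file ./pretraining_data \
#       --model_name ernie-1.0 \
#       --max_seq_length 128 \
#       --masked_lm_prob 0.15 \
#       --max_predictions_per_seq 20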
def do_train():
    paddle.set_device(args.device)
    world_size = paddle.distributed.get_world_size()
    rank = paddle.distributed.get_rank()
    if world_size > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args.seed)

    label_map = load_dict(args.tag_path)
    id2label = {val: key for key, val in label_map.items()}

    model = ErnieForSequenceClassification.from_pretrained(
        "ernie-1.0", num_classes=len(label_map))
    model = paddle.DataParallel(model)
    tokenizer = ErnieTokenizer.from_pretrained("ernie-1.0")

    print("============start train==========")
    train_ds = DuEventExtraction(args.train_data, args.tag_path)
    dev_ds = DuEventExtraction(args.dev_data, args.tag_path)
    test_ds = DuEventExtraction(args.test_data, args.tag_path)

    trans_func = partial(
        convert_example,
        tokenizer=tokenizer,
        label_map=label_map,
        max_seq_len=args.max_seq_len)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token], dtype='int32'),  # input ids
        Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token], dtype='int32'),  # token type ids
        Stack(dtype="int64")  # label
    ): fn(list(map(trans_func, samples)))

    batch_sampler = paddle.io.DistributedBatchSampler(
        train_ds, batch_size=args.batch_size, shuffle=True)
    train_loader = paddle.io.DataLoader(
        dataset=train_ds, batch_sampler=batch_sampler, collate_fn=batchify_fn)
    dev_loader = paddle.io.DataLoader(
        dataset=dev_ds, batch_size=args.batch_size, collate_fn=batchify_fn)
    test_loader = paddle.io.DataLoader(
        dataset=test_ds, batch_size=args.batch_size, collate_fn=batchify_fn)

    num_training_steps = len(train_loader) * args.num_epoch

    metric = paddle.metric.Accuracy()
    criterion = paddle.nn.loss.CrossEntropyLoss()

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=args.learning_rate,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    step, best_performance = 0, 0.0
    model.train()
    for epoch in range(args.num_epoch):
        for idx, (input_ids, token_type_ids, labels) in enumerate(train_loader):
            logits = model(input_ids, token_type_ids)
            loss = criterion(logits, labels)
            probs = F.softmax(logits, axis=1)
            correct = metric.compute(probs, labels)
            metric.update(correct)
            acc = metric.accumulate()

            loss.backward()
            optimizer.step()
            optimizer.clear_grad()
            loss_item = loss.numpy().item()
            if step > 0 and step % args.skip_step == 0 and rank == 0:
                print(f'train epoch: {epoch} - step: {step} (total: {num_training_steps}) '
                      f'- loss: {loss_item:.6f} acc {acc:.5f}')
            if step > 0 and step % args.valid_step == 0 and rank == 0:
                loss_dev, acc_dev = evaluate(model, criterion, metric, dev_loader)
                print(f'dev step: {step} - loss: {loss_dev:.6f} accuracy: {acc_dev:.5f}, '
                      f'current best {best_performance:.5f}')
                if acc_dev > best_performance:
                    best_performance = acc_dev
                    print(f'==============================================save best model '
                          f'best performance {best_performance:.5f}')
                    paddle.save(model.state_dict(), '{}/best.pdparams'.format(args.checkpoints))
            step += 1

    # Save the final model.
    if rank == 0:
        paddle.save(model.state_dict(), '{}/final.pdparams'.format(args.checkpoints))
def do_predict():
    set_seed(args.seed)
    paddle.set_device(args.device)

    label_map = load_dict(args.tag_path)
    id2label = {val: key for key, val in label_map.items()}

    model = ErnieForSequenceClassification.from_pretrained(
        "ernie-1.0", num_classes=len(label_map))
    model = paddle.DataParallel(model)
    tokenizer = ErnieTokenizer.from_pretrained("ernie-1.0")

    print("============start predict==========")
    if not args.init_ckpt or not os.path.isfile(args.init_ckpt):
        raise Exception("init checkpoints {} not exist".format(args.init_ckpt))
    else:
        state_dict = paddle.load(args.init_ckpt)
        model.set_dict(state_dict)
        print("Loaded parameters from %s" % args.init_ckpt)

    # Load data from the predict file.
    sentences = read_by_lines(args.predict_data)  # origin data format
    sentences = [json.loads(sent) for sent in sentences]

    encoded_inputs_list = []
    for sent in sentences:
        input_sent = [sent["text"]]  # only text_a
        if "text_b" in sent:
            input_sent = [[sent["text"], sent["text_b"]]]  # add text_b
        example = data_2_examples(input_sent)[0]
        input_ids, token_type_ids = convert_example(
            example, tokenizer, max_seq_len=args.max_seq_len, is_test=True)
        encoded_inputs_list.append((input_ids, token_type_ids))

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]),
        Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]),
    ): fn(samples)

    # Separate the data into batches.
    batch_encoded_inputs = [
        encoded_inputs_list[i:i + args.batch_size]
        for i in range(0, len(encoded_inputs_list), args.batch_size)
    ]

    results = []
    model.eval()
    for batch in batch_encoded_inputs:
        input_ids, token_type_ids = batchify_fn(batch)
        input_ids = paddle.to_tensor(input_ids)
        token_type_ids = paddle.to_tensor(token_type_ids)
        logits = model(input_ids, token_type_ids)
        probs = F.softmax(logits, axis=1)
        probs_ids = paddle.argmax(probs, -1).numpy()
        probs = probs.numpy()
        for prob_one, p_id in zip(probs.tolist(), probs_ids.tolist()):
            label_probs = {}
            for idx, p in enumerate(prob_one):
                label_probs[id2label[idx]] = p
            results.append({"probs": label_probs, "label": id2label[p_id]})

    assert len(results) == len(sentences)
    for sent, ret in zip(sentences, results):
        sent["pred"] = ret
    sentences = [json.dumps(sent, ensure_ascii=False) for sent in sentences]
    write_by_lines(args.predict_save_path, sentences)
    print("save data {} to {}".format(len(sentences), args.predict_save_path))
def do_train():
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    # Read the label map.
    label_map_path = os.path.join(args.data_path, "predicate2id.json")
    if not (os.path.exists(label_map_path) and os.path.isfile(label_map_path)):
        sys.exit("{} does not exist or is not a file.".format(label_map_path))
    with open(label_map_path, 'r', encoding='utf8') as fp:
        label_map = json.load(fp)
    # Each real predicate contributes two classes; the two auxiliary labels are kept as-is.
    num_classes = (len(label_map.keys()) - 2) * 2 + 2

    # Load the pretrained ERNIE model.
    model = ErnieForTokenClassification.from_pretrained("ernie-1.0", num_classes=num_classes)
    model = paddle.DataParallel(model)
    tokenizer = ErnieTokenizer.from_pretrained("ernie-1.0")
    criterion = BCELossForDuIE()

    # Load the datasets.
    train_dataset = DuIEDataset.from_file(
        os.path.join(args.data_path, 'train_data.json'), tokenizer,
        args.max_seq_length, True)
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_dataset, batch_size=args.batch_size, shuffle=True, drop_last=True)
    collator = DataCollator()
    train_data_loader = DataLoader(
        dataset=train_dataset,
        batch_sampler=train_batch_sampler,
        collate_fn=collator,
        return_list=True)

    eval_file_path = os.path.join(args.data_path, 'dev_data.json')
    test_dataset = DuIEDataset.from_file(eval_file_path, tokenizer, args.max_seq_length, True)
    test_batch_sampler = paddle.io.BatchSampler(
        test_dataset, batch_size=args.batch_size, shuffle=False, drop_last=True)
    test_data_loader = DataLoader(
        dataset=test_dataset,
        batch_sampler=test_batch_sampler,
        collate_fn=collator,
        return_list=True)

    # Define the learning rate strategy.
    steps_by_epoch = len(train_data_loader)
    num_training_steps = steps_by_epoch * args.num_train_epochs
    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, args.warmup_ratio)
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in [
            p.name for n, p in model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        ])

    # Start training.
    global_step = 0
    logging_steps = 50
    save_steps = 10000
    tic_train = time.time()
    for epoch in range(args.num_train_epochs):
        print("\n=====start training of %d epochs=====" % epoch)
        tic_epoch = time.time()
        model.train()
        for step, batch in enumerate(train_data_loader):
            input_ids, seq_lens, tok_to_orig_start_index, tok_to_orig_end_index, labels = batch
            logits = model(input_ids=input_ids)
            # Mask out ids 0/1/2 ([PAD]/[CLS]/[SEP] in the ERNIE vocab) so they
            # do not contribute to the loss.
            mask = (input_ids != 0).logical_and((input_ids != 1)).logical_and((input_ids != 2))
            loss = criterion(logits, labels, mask)
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            loss_item = loss.numpy().item()

            if global_step % logging_steps == 0 and paddle.distributed.get_rank() == 0:
                print("epoch: %d / %d, steps: %d / %d, loss: %f, speed: %.2f step/s"
                      % (epoch, args.num_train_epochs, step, steps_by_epoch,
                         loss_item, logging_steps / (time.time() - tic_train)))
                tic_train = time.time()

            if global_step % save_steps == 0 and global_step != 0 and paddle.distributed.get_rank() == 0:
                print("\n=====start evaluating ckpt of %d steps=====" % global_step)
                precision, recall, f1 = evaluate(
                    model, criterion, test_data_loader, eval_file_path, "eval")
                print("precision: %.2f\t recall: %.2f\t f1: %.2f\t"
                      % (100 * precision, 100 * recall, 100 * f1))
                if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
                    print("saving checkpoint model_%d.pdparams to %s"
                          % (global_step, args.output_dir))
                    paddle.save(model.state_dict(),
                                os.path.join(args.output_dir, "model_%d.pdparams" % global_step))
                model.train()  # back to train mode
            global_step += 1
        tic_epoch = time.time() - tic_epoch
        print("epoch time footprint: %d hour %d min %d sec"
              % (tic_epoch // 3600, (tic_epoch % 3600) // 60, tic_epoch % 60))

    # Do a final evaluation.
    if paddle.distributed.get_rank() == 0:
        print("\n=====start evaluating last ckpt of %d steps=====" % global_step)
        precision, recall, f1 = evaluate(model, criterion, test_data_loader,
                                         eval_file_path, "eval")
        print("precision: %.2f\t recall: %.2f\t f1: %.2f\t"
              % (100 * precision, 100 * recall, 100 * f1))
        if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
            paddle.save(model.state_dict(),
                        os.path.join(args.output_dir, "model_%d.pdparams" % global_step))
        print("\n=====training complete=====")
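# Toy sketch of the special-token mask above: ids 0, 1 and 2 ([PAD], [CLS] and
# [SEP] in the ERNIE vocab, to the best of my knowledge) are excluded from the
# loss. The sample ids are illustrative only.
import paddle

input_ids = paddle.to_tensor([[1, 75, 38, 2, 0, 0]])
mask = (input_ids != 0).logical_and(input_ids != 1).logical_and(input_ids != 2)
print(mask)  # [[False, True, True, False, False, False]]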
def do_train(args):
    set_seed(args)
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    pinyin_vocab = Vocab.load_vocabulary(
        args.pinyin_vocab_file_path, unk_token='[UNK]', pad_token='[PAD]')

    tokenizer = ErnieTokenizer.from_pretrained(args.model_name_or_path)
    ernie = ErnieModel.from_pretrained(args.model_name_or_path)

    model = ErnieForCSC(
        ernie,
        pinyin_vocab_size=len(pinyin_vocab),
        pad_pinyin_id=pinyin_vocab[pinyin_vocab.pad_token])

    train_ds, eval_ds = load_dataset('sighan-cn', splits=['train', 'dev'])

    # Extend the current training dataset with extra training datasets from a
    # directory. Each extra dataset file must have the ".txt" suffix, and every
    # line must hold a pair of sentences (wrong, right) separated by a tab:
    # "城府宫员表示,这是过去三十六小时内第三期强烈的余震。\t政府官员表示,这是过去三十六小时内第三起强烈的余震。\n"
    if args.extra_train_ds_dir is not None and os.path.exists(args.extra_train_ds_dir):
        data = train_ds.data
        data_files = [
            os.path.join(args.extra_train_ds_dir, data_file)
            for data_file in os.listdir(args.extra_train_ds_dir)
            if data_file.endswith(".txt")
        ]
        for data_file in data_files:
            ds = load_dataset(read_train_ds, data_path=data_file, splits=["train"], lazy=False)
            data += ds.data
        train_ds = MapDataset(data)

    det_loss_act = paddle.nn.CrossEntropyLoss(ignore_index=args.ignore_label, use_softmax=False)
    corr_loss_act = paddle.nn.CrossEntropyLoss(ignore_index=args.ignore_label, reduction='none')

    trans_func = partial(
        convert_example,
        tokenizer=tokenizer,
        pinyin_vocab=pinyin_vocab,
        max_seq_length=args.max_seq_length)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),       # input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
        Pad(axis=0, pad_val=pinyin_vocab.token_to_idx[pinyin_vocab.pad_token]),  # pinyin
        Pad(axis=0, dtype="int64"),   # detection label
        Pad(axis=0, dtype="int64"),   # correction label
        Stack(axis=0, dtype="int64")  # length
    ): [data for data in fn(samples)]

    train_data_loader = create_dataloader(
        train_ds,
        mode='train',
        batch_size=args.batch_size,
        batchify_fn=batchify_fn,
        trans_fn=trans_func)
    eval_data_loader = create_dataloader(
        eval_ds,
        mode='eval',
        batch_size=args.batch_size,
        batchify_fn=batchify_fn,
        trans_fn=trans_func)

    num_training_steps = args.max_steps if args.max_steps > 0 else len(
        train_data_loader) * args.epochs
    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
                                         args.warmup_proportion)
    logger.info("Total training step: {}".format(num_training_steps))

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    global_steps = 1
    best_f1 = -1
    tic_train = time.time()
    for epoch in range(args.epochs):
        for step, batch in enumerate(train_data_loader, start=1):
            input_ids, token_type_ids, pinyin_ids, det_labels, corr_labels, length = batch
            det_error_probs, corr_logits = model(input_ids, pinyin_ids, token_type_ids)

            # Chinese Spelling Correction combines two tasks: detection and
            # correction. Detection decides whether each Chinese character is
            # misspelled; correction maps each potentially wrong character to
            # the right one. Both losses are therefore minimized jointly.
            # See https://aclanthology.org/2021.findings-acl.198.pdf for the
            # loss design details.
            det_loss = det_loss_act(det_error_probs, det_labels)
            corr_loss = corr_loss_act(corr_logits, corr_labels) * det_error_probs.max(axis=-1)
            loss = (det_loss + corr_loss).mean()

            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()

            if global_steps % args.logging_steps == 0:
                logger.info(
                    "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                    % (global_steps, epoch, step, loss,
                       args.logging_steps / (time.time() - tic_train)))
                tic_train = time.time()
            if global_steps % args.save_steps == 0:
                if paddle.distributed.get_rank() == 0:
                    logger.info("Eval:")
                    det_f1, corr_f1 = evaluate(model, eval_data_loader)
                    f1 = (det_f1 + corr_f1) / 2
                    model_file = "model_%d" % global_steps
                    if f1 > best_f1:
                        # Save the best model.
                        paddle.save(model.state_dict(),
                                    os.path.join(args.output_dir, "best_model.pdparams"))
                        logger.info("Save best model at {} step.".format(global_steps))
                        best_f1 = f1
                        model_file = model_file + "_best"
                    model_file = model_file + ".pdparams"
                    paddle.save(model.state_dict(),
                                os.path.join(args.output_dir, model_file))
                    logger.info("Save model at {} step.".format(global_steps))
            if args.max_steps > 0 and global_steps >= args.max_steps:
                return
            global_steps += 1