def predict_cls(self, args, ext_results):
    test_ds = MapDataset(ext_results)
    trans_func = partial(convert_example_to_feature_cls,
                         tokenizer=self.tokenizer,
                         label2id=self.cls_label2id,
                         max_seq_len=args.cls_max_seq_len,
                         is_test=True)
    test_ds = test_ds.map(trans_func, lazy=False)
    batch_list = [
        test_ds[idx:idx + args.batch_size]
        for idx in range(0, len(test_ds), args.batch_size)
    ]
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=self.tokenizer.pad_token_id, dtype="int64"),
        Pad(axis=0, pad_val=self.tokenizer.pad_token_type_id, dtype="int64"),
        Stack(dtype="int64")): fn(samples)

    results = []
    for batch_data in batch_list:
        input_ids, token_type_ids, _ = batchify_fn(batch_data)
        self.cls_input_handles[0].copy_from_cpu(input_ids)
        self.cls_input_handles[1].copy_from_cpu(token_type_ids)
        self.cls_predictor.run()
        logits = self.cls_output_hanle.copy_to_cpu()
        predictions = logits.argmax(axis=1).tolist()
        results.extend(predictions)

    return results

def load_ds(datafiles):
    '''
    input:
        datafiles -- str or list[str] -- path(s) of the train or dev set(s)
    output:
        MapDataset, or a list of MapDataset when several paths are given
    '''

    def read(ds_file):
        with open(ds_file, 'r', encoding='utf-8') as fp:
            next(fp)  # Skip header
            for line in fp:
                data = line.rstrip('\n').split('\t')
                if len(data) == 2:
                    yield {'text': data[1], 'label': int(data[0])}
                elif len(data) == 3:
                    yield {'text': data[2], 'label': int(data[1])}

    if isinstance(datafiles, str):
        return MapDataset(list(read(datafiles)))
    elif isinstance(datafiles, (list, tuple)):
        return [MapDataset(list(read(datafile))) for datafile in datafiles]

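# Quick usage sketch for load_ds above. The file name and rows are made up for
# illustration; the layout (a header row, then "label<TAB>text" lines) matches
# what read() expects for the two-column case.
import os
import tempfile

sample = "label\ttext\n1\t这是一条测试新闻\n0\t另一条测试新闻\n"
with tempfile.NamedTemporaryFile("w", suffix=".tsv", delete=False,
                                 encoding="utf-8") as f:
    f.write(sample)
    tmp_path = f.name

single_ds = load_ds(tmp_path)            # str  -> one MapDataset
ds_a, ds_b = load_ds([tmp_path, tmp_path])  # list -> list of MapDataset
print(single_ds[0])  # {'text': '这是一条测试新闻', 'label': 1}
os.remove(tmp_path)
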
def main():
    """Main entry: build the datasets, convert text to ids, and train."""
    paddle.set_device("gpu:1")
    set_seed(2021)  # Fix the random seed

    logger.info("Building the datasets")
    data_file = os.path.join(work_root, "data/NewsTrain.txt")
    datasets, labels = load_data(data_file)
    save_dict_obj(labels, os.path.join(work_root, "data/news_labels_info.json"))
    for i in range(3):
        random.shuffle(datasets)
    train_data_num = int(len(datasets) * 0.8)
    train_dataset, valid_dataset = datasets[:train_data_num], datasets[
        train_data_num:]
    train_dataset, valid_dataset = MapDataset(train_dataset), MapDataset(
        valid_dataset)

    logger.info("Converting text to token ids")
    tokenizer = paddlenlp.transformers.ErnieTinyTokenizer.from_pretrained(
        'ernie-tiny')
    trans_func = partial(convert_example, tokenizer=tokenizer, max_seq_length=64)
    # Less explicit than a hand-written Torch collate_fn, but pad_val is handy:
    # each batch is padded dynamically to its own longest sequence.
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
        Stack(dtype="int64")  # label
    ): fn(samples)
    train_loader = create_dataloader(train_dataset,
                                     mode="train",
                                     batch_size=512,
                                     batchify_fn=batchify_fn,
                                     trans_fn=trans_func)
    valid_loader = create_dataloader(valid_dataset,
                                     mode="valid",
                                     batch_size=256,
                                     batchify_fn=batchify_fn,
                                     trans_fn=trans_func)

    epochs = 5  # Number of training epochs
    num_training_steps = len(train_loader) * epochs
    num_classes = len(labels)
    model, optimizer, criterion, lr_scheduler = create_classification_model(
        num_classes, num_training_steps)

    logger.info("Training the model")
    metric = paddle.metric.Accuracy()
    train(model, optimizer, criterion, lr_scheduler, metric, tokenizer,
          train_loader, valid_loader, epochs=epochs)

def _load_dataset(self, datafiles):

    def read(data_path):
        with open(data_path, 'r', encoding='utf-8') as fp:
            for line in fp:
                order, words, labels = line.strip('\n').split('\t')
                yield {'tokens': words, 'labels': labels}

    if isinstance(datafiles, str):
        return MapDataset(list(read(datafiles)))
    elif isinstance(datafiles, (list, tuple)):
        return [MapDataset(list(read(datafile))) for datafile in datafiles]

def load_dataset(datafiles):

    def read(data_path):
        with open(data_path, 'r', encoding='utf-8') as fp:
            next(fp)  # Skip header
            for line in fp:
                words, labels = line.strip('\n').split('\t')
                words = words.split('\002')
                labels = labels.split('\002')
                yield words, labels

    if isinstance(datafiles, str):
        return MapDataset(list(read(datafiles)))
    elif isinstance(datafiles, (list, tuple)):
        return [MapDataset(list(read(datafile))) for datafile in datafiles]

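# Illustrative file format for the '\002'-delimited reader above: a header row,
# then one example per line with the tokens and their labels each joined by
# '\002' (CTRL-B) and separated from each other by a tab. The file name and the
# tiny example are made up for this sketch.
sample_lines = [
    "text_a\tlabel",  # header, skipped by next(fp)
    "黑\002龙\002江\002省\tB-LOC\002I-LOC\002I-LOC\002I-LOC",
]
with open("demo_seq_label.tsv", "w", encoding="utf-8") as f:
    f.write("\n".join(sample_lines) + "\n")

demo_ds = load_dataset("demo_seq_label.tsv")
words, labels = demo_ds[0]
assert len(words) == len(labels) == 4
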
def load_dataset(datafiles):

    def read(data_path):
        with open(data_path, 'r', encoding='utf-8') as fp:
            for line in fp:
                example = json.loads(line)
                words = example["tokens"]
                tags = example["tags"]
                cls_label = example["cls_label"]
                yield words, tags, cls_label

    if isinstance(datafiles, str):
        return MapDataset(list(read(datafiles)))
    elif isinstance(datafiles, (list, tuple)):
        return [MapDataset(list(read(datafile))) for datafile in datafiles]

def news_title_classification():
    """Text classification: classify news titles."""
    data = json.loads(request.data)
    if isinstance(data["texts"], str):
        data["texts"] = [data["texts"]]
    if isinstance(data["texts"], list):
        datasets = MapDataset([{"text": text} for text in data["texts"]])
        trans_func = partial(convert_example,
                             tokenizer=tokenizer,
                             max_seq_length=64)
        batchify_fn = lambda samples, fn=Tuple(
            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
            Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
            Stack(dtype="int64")  # label
        ): fn(samples)
        data_loader = create_dataloader(datasets,
                                        mode="test",
                                        batch_size=32,
                                        batchify_fn=batchify_fn,
                                        trans_fn=trans_func)
        labels = inference(model, data_loader)
        labels_text = [labels_info[str(label)] for label in labels]
        return jsonify(status="Success", results=labels_text)
    else:
        return jsonify(status="Failure", message="Incorrect parameter data type.")

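# Client-side sketch for the handler above. The route path is not shown in this
# snippet, so "/news_title_classification" is an assumption; port 8280 matches
# the app.run() call further down. The handler reads the request body as JSON
# with a "texts" field that may be a single string or a list of strings.
import json
import requests  # third-party HTTP client, assumed to be installed

resp = requests.post(
    "http://127.0.0.1:8280/news_title_classification",  # hypothetical route
    data=json.dumps({"texts": ["国足晋级世界杯", "央行宣布降准"]}),
    headers={"Content-Type": "application/json"})
print(resp.json())  # e.g. {"status": "Success", "results": [...]}
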
def _pre_process_text(self, input_texts, max_seq_len=128, batch_size=1):
    infer_data = []
    max_predict_len = max_seq_len - self.summary_num - 1
    short_input_texts = self._split_long_text2short_text_list(
        input_texts, max_predict_len)
    for text in short_input_texts:
        tokens = ["[CLS%i]" % i
                  for i in range(1, self.summary_num)] + list(text)
        tokenized_input = self._tokenizer(tokens,
                                          return_length=True,
                                          is_split_into_words=True,
                                          max_seq_len=max_seq_len)
        infer_data.append([
            tokenized_input['input_ids'], tokenized_input['token_type_ids'],
            tokenized_input['seq_len']
        ])
    infer_ds = MapDataset(infer_data)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=self._tokenizer.pad_token_id,
            dtype='int64'),  # input_ids
        Pad(axis=0, pad_val=self._tokenizer.pad_token_type_id,
            dtype='int64'),  # token_type_ids
        Stack(dtype='int64'),  # seq_len
    ): fn(samples)
    infer_data_loader = paddle.io.DataLoader(infer_ds,
                                             collate_fn=batchify_fn,
                                             num_workers=0,
                                             batch_size=batch_size,
                                             shuffle=False,
                                             return_list=True)
    return infer_data_loader, short_input_texts

def predict_cls(args, ext_results):
    # Load the label dict and the tokenizer
    model_name = "skep_ernie_1.0_large_ch"
    cls_label2id, cls_id2label = load_dict(args.cls_label_path)
    tokenizer = SkepTokenizer.from_pretrained(model_name)

    test_ds = MapDataset(ext_results)
    trans_func = partial(convert_example_to_feature_cls,
                         tokenizer=tokenizer,
                         label2id=cls_label2id,
                         max_seq_len=args.cls_max_seq_len,
                         is_test=True)
    test_ds = test_ds.map(trans_func, lazy=False)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),
        Stack(dtype="int64")
    ): fn(samples)

    # Keep shuffle=False so predictions stay aligned with the input order
    test_batch_sampler = paddle.io.BatchSampler(test_ds,
                                                batch_size=args.batch_size,
                                                shuffle=False)
    test_loader = paddle.io.DataLoader(test_ds,
                                       batch_sampler=test_batch_sampler,
                                       collate_fn=batchify_fn)
    print("test data loaded.")

    # Load the classification model
    cls_state_dict = paddle.load(args.cls_model_path)
    cls_model = SkepForSequenceClassification.from_pretrained(
        model_name, num_classes=len(cls_label2id))
    cls_model.load_dict(cls_state_dict)
    print("classification model loaded.")

    cls_model.eval()
    results = []
    for batch_data in test_loader:
        input_ids, token_type_ids, seq_lens = batch_data
        logits = cls_model(input_ids, token_type_ids=token_type_ids)
        predictions = logits.argmax(axis=1).numpy().tolist()
        results.extend(predictions)

    results = [cls_id2label[pred_id] for pred_id in results]
    return results

def build_data_loader(args, tokenizer):
    """Build corpus_data_loader and text_data_loader."""
    id2corpus = gen_id2corpus(args.corpus_file)
    # convert_example function's input must be a dict
    corpus_list = [{idx: text} for idx, text in id2corpus.items()]
    corpus_ds = MapDataset(corpus_list)

    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         max_seq_length=args.max_seq_length)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # text_input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # text_segment
    ): [data for data in fn(samples)]

    corpus_data_loader = create_dataloader(corpus_ds,
                                           mode='predict',
                                           batch_size=args.batch_size,
                                           batchify_fn=batchify_fn,
                                           trans_fn=trans_func)

    # Build the text data_loader
    text_list, text2similar_text = gen_text_file(args.similar_text_pair_file)
    text_ds = MapDataset(text_list)
    text_data_loader = create_dataloader(text_ds,
                                         mode='predict',
                                         batch_size=args.batch_size,
                                         batchify_fn=batchify_fn,
                                         trans_fn=trans_func)

    return {
        "text_data_loader": text_data_loader,
        "corpus_data_loader": corpus_data_loader,
        "id2corpus": id2corpus,
        "text2similar_text": text2similar_text,
        "text_list": text_list
    }

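# Usage sketch for build_data_loader. The file paths are placeholders and the
# "ernie-1.0" tokenizer choice is an assumption; any tokenizer compatible with
# the convert_example used above would do.
from argparse import Namespace

from paddlenlp.transformers import ErnieTokenizer

args = Namespace(corpus_file="data/corpus.csv",                # one passage per line
                 similar_text_pair_file="data/dev_pairs.csv",  # "query\tsimilar_text" lines
                 max_seq_length=64,
                 batch_size=32)
tokenizer = ErnieTokenizer.from_pretrained("ernie-1.0")
loaders = build_data_loader(args, tokenizer)
corpus_data_loader = loaders["corpus_data_loader"]
text_data_loader = loaders["text_data_loader"]
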
def load_ds_xnli(datafiles):

    def read(ds_file):
        with open(ds_file, 'r', encoding='utf-8') as fp:
            # next(fp)  # Skip header
            for line in fp:
                first, second, third = line.strip().split('\t', 2)
                yield {
                    "sentence1": first,
                    "sentence2": second,
                    "label": third
                }

    if isinstance(datafiles, str):
        return MapDataset(list(read(datafiles)))
    elif isinstance(datafiles, (list, tuple)):
        return [MapDataset(list(read(datafile))) for datafile in datafiles]

def _create_examples(self, datasets, phase, task_label_description):
    """Creates examples for the train, dev and test sets."""
    examples = []
    if phase == "train":
        for example in datasets:
            true_label = example["label"]
            neg_examples = []
            for label, label_description in task_label_description.items():
                new_example = dict()
                new_example["sentence1"] = example["text"]
                new_example["sentence2"] = example["target"][
                    "span1_text"] + label_description + example["target"][
                        "span2_text"]

                # TODO: handle imbalanced examples; the imbalance may hurt model performance
                if true_label == label:
                    new_example["label"] = 1
                    examples.append(new_example)
                else:
                    new_example["label"] = 0
                    neg_examples.append(new_example)
            neg_examples = random.sample(neg_examples, self.neg_num)
            examples.extend(neg_examples)

    elif phase == "dev":
        for example in datasets:
            true_label = str(example["label"])
            for label, label_description in task_label_description.items():
                new_example = dict()
                new_example["sentence1"] = example["text"]
                new_example["sentence2"] = example["target"][
                    "span1_text"] + label_description + example["target"][
                        "span2_text"]

                # Record the index of true_label in task_label_description for evaluation
                true_label_index = list(
                    task_label_description.keys()).index(true_label)
                new_example["label"] = true_label_index
                examples.append(new_example)

    elif phase == "test":
        for example in datasets:
            for label, label_description in task_label_description.items():
                new_example = dict()
                new_example["sentence1"] = example["text"]
                new_example["sentence2"] = example["target"][
                    "span1_text"] + label_description + example["target"][
                        "span2_text"]
                examples.append(new_example)

    return MapDataset(examples)

def load_dataset(datafiles):

    def read(data_path):
        with open(data_path, 'r', encoding='utf-8') as fp:
            if "infer" in data_path:
                next(fp)  # Skip header of the inference file
            for line in fp:
                line = line.strip()
                if "infer" in data_path:
                    words = list(line)
                    yield [words]
                else:
                    words, labels = line.split("\t")
                    words = words.split(CHAR_DELIMITER)
                    labels = labels.split(CHAR_DELIMITER)
                    assert len(words) == len(labels), \
                        "The words %s do not match the labels %s" % (words, labels)
                    yield [words, labels]

    if isinstance(datafiles, str):
        return MapDataset(list(read(datafiles)))
    elif isinstance(datafiles, (list, tuple)):
        return [MapDataset(list(read(datafile))) for datafile in datafiles]

def create_data_loader(args, dataset_class, trans_func, batchify_fn, mode):
    dataset = dataset_class(args.data_dir, mode)
    dataset = MapDataset(dataset).map(trans_func, lazy=True)
    if mode == 'train':
        batch_sampler = DistributedBatchSampler(dataset,
                                                batch_size=args.batch_size,
                                                shuffle=True)
    else:
        batch_sampler = BatchSampler(dataset,
                                     batch_size=args.test_batch_size,
                                     shuffle=False)
    data_loader = DataLoader(dataset,
                             batch_sampler=batch_sampler,
                             collate_fn=batchify_fn,
                             return_list=True)
    return data_loader

def _create_examples(self, datasets, phase, task_label_description):
    """Creates examples for the train, dev and test sets."""
    examples = []
    if phase == "train":
        for example in datasets:
            neg_examples = []
            true_label_index = int(example["answer"])
            candidates = example["candidates"]
            for idx, candidate in enumerate(candidates):
                new_example = dict()
                new_example["sentence1"] = example["content"]
                new_example["sentence2"] = "位置#idiom#处的成语应该填写" + candidate
                if idx == true_label_index:
                    new_example["label"] = 1
                    examples.append(new_example)
                else:
                    new_example["label"] = 0
                    neg_examples.append(new_example)
            examples.extend(neg_examples)

    elif phase == "dev":
        for example in datasets:
            true_label = str(example["answer"])
            candidates = example["candidates"]
            for idx, candidate in enumerate(candidates):
                new_example = dict()
                new_example["sentence1"] = example["content"]
                new_example["sentence2"] = "位置#idiom#处的成语应该填写" + candidate

                # Use the index of the true candidate as the label for evaluation
                true_label_index = int(true_label)
                new_example["label"] = true_label_index
                examples.append(new_example)

    elif phase == "test":
        for example in datasets:
            candidates = example["candidates"]
            for idx, candidate in enumerate(candidates):
                new_example = dict()
                new_example["sentence1"] = example["content"]
                new_example["sentence2"] = "位置#idiom#处的成语应该填写" + candidate
                examples.append(new_example)

    return MapDataset(examples)

def convert_features_to_dataset(features):
    """
    Converts a list of feature dictionaries (one for each sample) into a Paddle Dataset.

    :param features: A list of dictionaries. Each dictionary corresponds to one sample.
                     Its keys are the names of the feature types and its values are the
                     features themselves.
    :return: A Paddle dataset and a list of tensor names.
    """
    # features can be an empty list in cases where down sampling occurs
    if len(features) == 0:
        return None, None

    tensor_names = list(features[0].keys())
    all_tensors = []
    for t_name in tensor_names:
        try:
            # Check whether a non-integer will be silently converted to Paddle long
            check = features[0][t_name]
            if isinstance(check, numbers.Number):
                base = check
            # extract a base variable from nested lists or tuples
            elif isinstance(check, list):
                base = list(flatten_list(check))[0]
            # extract a base variable from numpy arrays
            else:
                base = check.ravel()[0]
            if not np.issubdtype(type(base), np.integer):
                logger.warning(
                    f"Problem during conversion to Paddle tensors:\n"
                    f"A non-integer value for feature '{t_name}' with a value of: "
                    f"'{base}' will be converted to a Paddle tensor of dtype long.")
        except Exception:
            logger.debug(f"Could not determine type for feature '{t_name}'. "
                         "Converting now to a tensor of default type long.")

        # Convert all remaining python objects to Paddle long tensors
        cur_tensor = [sample[t_name] for sample in features]
        all_tensors.append(cur_tensor)

    # TODO(tianxin): When set to IterDataset, this throws an Exception with paddle.io.BatchSampler.
    # all_tensors: List[List[all_token_ids], List[all_segment_ids]]
    # list(zip(*all_tensors)): List[([token_ids], [segment_ids]), ([token_ids], [segment_ids])]
    # For Question Answering, tensor_names is:
    #   ['input_ids', 'padding_mask', 'segment_ids', 'passage_start_t', 'start_of_word',
    #    'labels', 'id', 'seq_2_start_t', 'span_mask']
    dataset = MapDataset(list(zip(*all_tensors)))
    return dataset, tensor_names

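# Minimal sketch of calling convert_features_to_dataset. The feature keys below
# are a reduced, illustrative subset (real QA features carry many more fields,
# as listed in the comment above).
features = [
    {"input_ids": [101, 2769, 102, 0], "segment_ids": [0, 0, 0, 0], "labels": 1},
    {"input_ids": [101, 872, 1962, 102], "segment_ids": [0, 0, 0, 0], "labels": 0},
]
dataset, tensor_names = convert_features_to_dataset(features)
print(tensor_names)  # ['input_ids', 'segment_ids', 'labels']
print(dataset[0])    # ([101, 2769, 102, 0], [0, 0, 0, 0], 1)
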
def gen_pair(dataset, pool_size=100):
    """
    Generate triplets randomly based on the dataset.

    Args:
        dataset: A `MapDataset` or `IterDataset` or a tuple of those.
            Each example is composed of 2 texts: example["query"], example["title"]
        pool_size: the number of examples used for randomly sampling negative examples

    Return:
        dataset: A `MapDataset` or `IterDataset` or a tuple of those.
            Each example is composed of 3 texts: example["query"],
            example["title"] (the positive) and example["neg_title"]
    """
    if len(dataset) < pool_size:
        pool_size = len(dataset)

    new_examples = []
    pool = []
    tmp_examples = []

    for example in dataset:
        label = example["label"]

        # Filter out negative examples
        if label == 0:
            continue

        tmp_examples.append(example)
        pool.append(example["title"])

        if len(pool) >= pool_size:
            np.random.shuffle(pool)
            for idx, example in enumerate(tmp_examples):
                example["neg_title"] = pool[idx]
                new_examples.append(example)
            tmp_examples = []
            pool = []
        else:
            continue

    return MapDataset(new_examples)

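# Usage sketch for gen_pair: a toy matching dataset where label 1 marks a
# relevant (query, title) pair. The texts are made up; with such a small pool
# a sampled "negative" may coincide with the positive title, which is a property
# of the random-pool strategy itself, not of this sketch.
toy = MapDataset([
    {"query": "深度学习框架", "title": "PaddlePaddle简介", "label": 1},
    {"query": "机器翻译", "title": "神经机器翻译综述", "label": 1},
    {"query": "无关样本", "title": "随机负例", "label": 0},  # filtered out
])
triplets = gen_pair(toy, pool_size=2)
for ex in triplets:
    print(ex["query"], ex["title"], ex["neg_title"])
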
model = paddle.DataParallel(model)

# Load the pretrained semantic model
if args.params_path and os.path.isfile(args.params_path):
    state_dict = paddle.load(args.params_path)
    model.set_dict(state_dict)
    logger.info("Loaded parameters from %s" % args.params_path)
else:
    raise ValueError(
        "Please set --params_path with correct pretrained model file")

id2corpus = gen_id2corpus(args.corpus_file)
# convert_example function's input must be a dict
corpus_list = [{idx: text} for idx, text in id2corpus.items()]
corpus_ds = MapDataset(corpus_list)

corpus_data_loader = create_dataloader(corpus_ds,
                                       mode='predict',
                                       batch_size=args.batch_size,
                                       batchify_fn=batchify_fn,
                                       trans_fn=trans_func)

# Need a better way to get the inner model of DataParallel
inner_model = model._layers

final_index = build_index(args, corpus_data_loader, inner_model)

text_list, text2similar_text = gen_text_file(args.similar_text_pair_file)

query_ds = MapDataset(text_list)

def train(args):
    # Load the data
    trainset = IMDBDataset(is_training=True)
    testset = IMDBDataset(is_training=False)

    # Wrap the data as MapDataset
    train_ds = MapDataset(trainset, label_list=[0, 1])
    test_ds = MapDataset(testset, label_list=[0, 1])

    # Create the XLNet tokenizer
    tokenizer = XLNetTokenizer.from_pretrained(args.model_name_or_path)
    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         label_list=train_ds.label_list,
                         max_seq_length=args.max_seq_length)

    # Build train_data_loader and dev_data_loader
    train_ds = train_ds.map(trans_func, lazy=True)
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_ds, batch_size=args.batch_size, shuffle=True)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, pad_right=False),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id,
            pad_right=False),  # token_type
        Pad(axis=0, pad_val=0, pad_right=False),  # attention_mask
        Stack(dtype="int64" if train_ds.label_list else "float32"),  # label
    ): fn(samples)
    train_data_loader = DataLoader(dataset=train_ds,
                                   batch_sampler=train_batch_sampler,
                                   collate_fn=batchify_fn,
                                   num_workers=0,
                                   return_list=True)

    dev_ds = MapDataset(testset)
    dev_ds = dev_ds.map(trans_func, lazy=True)
    dev_batch_sampler = paddle.io.BatchSampler(dev_ds,
                                               batch_size=args.batch_size,
                                               shuffle=False)
    dev_data_loader = DataLoader(dataset=dev_ds,
                                 batch_sampler=dev_batch_sampler,
                                 collate_fn=batchify_fn,
                                 num_workers=0,
                                 return_list=True)

    # Training configuration
    # Fix the random seed
    set_seed(args)
    # Select the running device
    use_gpu = paddle.get_device().startswith("gpu")
    if use_gpu:
        paddle.set_device('gpu:0')
    num_classes = len(train_ds.label_list)
    model = XLNetForSequenceClassification.from_pretrained(
        args.model_name_or_path, num_classes=num_classes)
    # paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()
        model = paddle.DataParallel(model)

    # Set up the lr_scheduler
    if args.max_steps > 0:
        num_training_steps = args.max_steps
        num_train_epochs = ceil(num_training_steps / len(train_data_loader))
    else:
        num_training_steps = len(train_data_loader) * args.num_train_epochs
        num_train_epochs = args.num_train_epochs
    warmup = args.warmup_steps if args.warmup_steps > 0 else args.warmup_proportion
    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps, warmup)

    # Set up the optimizer
    clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=args.max_grad_norm)
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "layer_norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        beta1=0.9,
        beta2=0.999,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        grad_clip=clip,
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    # Model training
    metric = Accuracy()
    # Define the loss function
    loss_fct = paddle.nn.loss.CrossEntropyLoss(
    ) if train_ds.label_list else paddle.nn.loss.MSELoss()

    global_step = 0
    tic_train = time.time()
    model.train()
    for epoch in range(num_train_epochs):
        for step, batch in enumerate(train_data_loader):
            global_step += 1
            input_ids, token_type_ids, attention_mask, labels = batch
            logits = model(input_ids, token_type_ids, attention_mask)
            loss = loss_fct(logits, labels)
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()

            if global_step % args.logging_steps == 0:
                print(
                    "global step %d/%d, epoch: %d, batch: %d, rank_id: %s, loss: %f, lr: %.10f, speed: %.4f step/s"
                    % (global_step, num_training_steps, epoch, step,
                       paddle.distributed.get_rank(), loss, optimizer.get_lr(),
                       args.logging_steps / (time.time() - tic_train)))
                tic_train = time.time()

            if global_step % args.save_steps == 0 or global_step == num_training_steps:
                tic_eval = time.time()
                evaluate(model, loss_fct, metric, dev_data_loader)
                print("eval done total : %s s" % (time.time() - tic_eval))
                if (not paddle.distributed.get_world_size() > 1
                    ) or paddle.distributed.get_rank() == 0:
                    output_dir = os.path.join(
                        args.output_dir,
                        "%s_ft_model_%d" % (args.task_name, global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    # Need a better way to get the inner model of DataParallel
                    model_to_save = model._layers if isinstance(
                        model, paddle.DataParallel) else model
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
                    if global_step == num_training_steps:
                        exit(0)
                tic_train += time.time() - tic_eval

else:
    paddle.set_device("gpu:2")
    is_server = True
    model_dir = os.path.join(work_root, "models/epoch_{}".format(3))
    labels_info = load_json_obj(
        os.path.join(work_root, "data/news_labels_info.json"))
    model = paddlenlp.transformers.ErnieForSequenceClassification.from_pretrained(
        model_dir)
    tokenizer = paddlenlp.transformers.ErnieTinyTokenizer.from_pretrained(
        model_dir)
    if is_server:
        app.run(host="0.0.0.0", port=8280)  # Start the HTTP service
    else:
        test_data_file = os.path.join(work_root, "data/NewsTest.txt")
        test_datasets, _ = load_data(test_data_file)
        test_datasets = MapDataset(test_datasets[:1000])
        trans_func = partial(convert_example,
                             tokenizer=tokenizer,
                             max_seq_length=64)
        batchify_fn = lambda samples, fn=Tuple(
            Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
            Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
            Stack(dtype="int64")  # label
        ): fn(samples)
        test_loader = create_dataloader(test_datasets,
                                        mode="test",
                                        batch_size=32,
                                        batchify_fn=batchify_fn,
                                        trans_fn=trans_func)
        labels = inference(model, test_loader)

# pretrained_model = ErnieModel.from_pretrained("ernie-1.0")
model = SimCSE(pretrained_model, output_emb_size=output_emb_size)

# Load the pretrained semantic model
if params_path and os.path.isfile(params_path):
    state_dict = paddle.load(params_path)
    model.set_dict(state_dict)
    print("Loaded parameters from %s" % params_path)
else:
    raise ValueError(
        "Please set --params_path with correct pretrained model file")

# convert_example function's input must be a dict
corpus_list = [{idx: text} for idx, text in id2corpus.items()]
corpus_ds = MapDataset(corpus_list)

corpus_data_loader = create_dataloader(corpus_ds,
                                       mode='predict',
                                       batch_size=batch_size,
                                       batchify_fn=batchify_fn,
                                       trans_fn=trans_func)

all_embeddings = []
model.eval()

with paddle.no_grad():
    for batch_data in corpus_data_loader:
        input_ids, token_type_ids = batch_data
        input_ids = paddle.to_tensor(input_ids)
        token_type_ids = paddle.to_tensor(token_type_ids)

def do_train(args):
    set_seed(args)
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    pinyin_vocab = Vocab.load_vocabulary(args.pinyin_vocab_file_path,
                                         unk_token='[UNK]',
                                         pad_token='[PAD]')

    tokenizer = ErnieTokenizer.from_pretrained(args.model_name_or_path)
    ernie = ErnieModel.from_pretrained(args.model_name_or_path)

    model = ErnieForCSC(ernie,
                        pinyin_vocab_size=len(pinyin_vocab),
                        pad_pinyin_id=pinyin_vocab[pinyin_vocab.pad_token])

    train_ds, eval_ds = load_dataset('sighan-cn', splits=['train', 'dev'])

    # Extend the current training dataset with extra training datasets from a
    # directory. The file names of the extra datasets have to end with ".txt".
    # Each line must contain a pair of sentences separated by a tab, such as:
    # "城府宫员表示,这是过去三十六小时内第三期强烈的余震。\t政府官员表示,这是过去三十六小时内第三起强烈的余震。\n"
    if args.extra_train_ds_dir is not None and os.path.exists(
            args.extra_train_ds_dir):
        data = train_ds.data
        data_files = [
            os.path.join(args.extra_train_ds_dir, data_file)
            for data_file in os.listdir(args.extra_train_ds_dir)
            if data_file.endswith(".txt")
        ]
        for data_file in data_files:
            ds = load_dataset(read_train_ds,
                              data_path=data_file,
                              splits=["train"],
                              lazy=False)
            data += ds.data
        train_ds = MapDataset(data)

    det_loss_act = paddle.nn.CrossEntropyLoss(ignore_index=args.ignore_label,
                                              use_softmax=False)
    corr_loss_act = paddle.nn.CrossEntropyLoss(ignore_index=args.ignore_label,
                                               reduction='none')

    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         pinyin_vocab=pinyin_vocab,
                         max_seq_length=args.max_seq_length)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
        Pad(axis=0, pad_val=pinyin_vocab.token_to_idx[pinyin_vocab.pad_token]),  # pinyin
        Pad(axis=0, dtype="int64"),  # detection label
        Pad(axis=0, dtype="int64"),  # correction label
        Stack(axis=0, dtype="int64")  # length
    ): [data for data in fn(samples)]

    train_data_loader = create_dataloader(train_ds,
                                          mode='train',
                                          batch_size=args.batch_size,
                                          batchify_fn=batchify_fn,
                                          trans_fn=trans_func)

    eval_data_loader = create_dataloader(eval_ds,
                                         mode='eval',
                                         batch_size=args.batch_size,
                                         batchify_fn=batchify_fn,
                                         trans_fn=trans_func)

    num_training_steps = args.max_steps if args.max_steps > 0 else len(
        train_data_loader) * args.epochs

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps,
                                         args.warmup_proportion)

    logger.info("Total training step: {}".format(num_training_steps))
    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    global_steps = 1
    best_f1 = -1
    tic_train = time.time()
    for epoch in range(args.epochs):
        for step, batch in enumerate(train_data_loader, start=1):
            input_ids, token_type_ids, pinyin_ids, det_labels, corr_labels, length = batch
            det_error_probs, corr_logits = model(input_ids, pinyin_ids,
                                                 token_type_ids)
            # Chinese Spelling Correction has two tasks: detection and correction.
            # The detection task detects whether each Chinese character has a spelling error.
            # The correction task corrects each potentially wrong character to the right one.
            # So we need to minimize the detection loss and the correction loss simultaneously.
            # See more loss design details at https://aclanthology.org/2021.findings-acl.198.pdf
            det_loss = det_loss_act(det_error_probs, det_labels)
            corr_loss = corr_loss_act(
                corr_logits, corr_labels) * det_error_probs.max(axis=-1)
            loss = (det_loss + corr_loss).mean()

            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()

            if global_steps % args.logging_steps == 0:
                logger.info(
                    "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                    % (global_steps, epoch, step, loss,
                       args.logging_steps / (time.time() - tic_train)))
                tic_train = time.time()
            if global_steps % args.save_steps == 0:
                if paddle.distributed.get_rank() == 0:
                    logger.info("Eval:")
                    det_f1, corr_f1 = evaluate(model, eval_data_loader)
                    f1 = (det_f1 + corr_f1) / 2
                    model_file = "model_%d" % global_steps
                    if f1 > best_f1:
                        # Save the best model
                        paddle.save(
                            model.state_dict(),
                            os.path.join(args.output_dir,
                                         "best_model.pdparams"))
                        logger.info("Save best model at {} step.".format(
                            global_steps))
                        best_f1 = f1
                        model_file = model_file + "_best"
                    model_file = model_file + ".pdparams"
                    paddle.save(model.state_dict(),
                                os.path.join(args.output_dir, model_file))
                    logger.info("Save model at {} step.".format(global_steps))
            if args.max_steps > 0 and global_steps >= args.max_steps:
                return
            global_steps += 1