def main():
    args = parse_args()
    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    tokenizer = tokenizer_class.from_pretrained(
        os.path.dirname(args.model_name_or_path))

    if args.version_2_with_negative:
        raw_dataset = load_dataset('squad_v2', split='validation')
    else:
        raw_dataset = load_dataset('squad', split='validation')
    column_names = raw_dataset.column_names
    dataset = raw_dataset.map(partial(prepare_validation_features,
                                      tokenizer=tokenizer,
                                      args=args),
                              batched=True,
                              remove_columns=column_names,
                              num_proc=4)

    batchify_fn = lambda samples, fn=Dict({
        "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id),
        "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id)
    }): fn(samples)

    # Create the predictor once, after the data pipeline is ready.
    predictor = Predictor.create_predictor(args)
    predictor.predict(dataset, raw_dataset, args=args, collate_fn=batchify_fn)
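# The snippets in this file all build their collate function from
# paddlenlp.data's Dict/Pad/Stack helpers. A minimal, self-contained sketch
# (toy values, assuming paddlenlp is installed) of what that pattern does:

from paddlenlp.data import Dict, Pad

samples = [
    {"input_ids": [1, 2, 3], "token_type_ids": [0, 0, 0]},
    {"input_ids": [4, 5], "token_type_ids": [0, 0]},
]
demo_batchify_fn = Dict({
    "input_ids": Pad(axis=0, pad_val=0),      # pad to the longest sample
    "token_type_ids": Pad(axis=0, pad_val=0),
})
input_ids, token_type_ids = demo_batchify_fn(samples)
# input_ids -> [[1 2 3], [4 5 0]]: the shorter sample is right-padded
# with pad_val, and one array is returned per key, in Dict key order.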
def init_roberta_var(args):
    if args.language == "ch":
        tokenizer = RobertaTokenizer.from_pretrained(args.from_pretrained)
    else:
        tokenizer = RobertaBPETokenizer.from_pretrained(args.from_pretrained)
    model = RobertaForSequenceClassification.from_pretrained(
        args.from_pretrained,
        hidden_dropout_prob=0,
        attention_probs_dropout_prob=0,
        dropout=0,
        num_labels=2,
        name='',
        return_inter_score=True)

    map_fn = partial(map_fn_senti, tokenizer=tokenizer, language=args.language)

    dev_ds = SentiData().read(os.path.join(args.data_dir, 'dev'),
                              args.language)
    dev_ds.map(map_fn, batched=True)
    dev_batch_sampler = paddle.io.BatchSampler(dev_ds,
                                               batch_size=args.batch_size,
                                               shuffle=False)
    batchify_fn = lambda samples, fn=Dict({
        "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id),
        # Segment ids are padded with pad_token_type_id, not pad_token_id.
        "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id)
    }): fn(samples)
    dataloader = paddle.io.DataLoader(dataset=dev_ds,
                                      batch_sampler=dev_batch_sampler,
                                      collate_fn=batchify_fn,
                                      return_list=True)

    return model, tokenizer, dataloader
def init_roberta_var(args):
    if args.language == 'ch':
        tokenizer = RobertaTokenizer.from_pretrained(args.from_pretrained)
    else:
        tokenizer = RobertaBPETokenizer.from_pretrained(args.from_pretrained)
    model = RobertaForQuestionAnswering.from_pretrained(args.from_pretrained)

    map_fn = functools.partial(map_fn_DuCheckList,
                               args=args,
                               tokenizer=tokenizer)
    dev_ds = RCInterpret().read(os.path.join(args.data_dir, 'dev'))
    dev_ds.map(map_fn, batched=True)
    dev_batch_sampler = paddle.io.BatchSampler(dev_ds,
                                               batch_size=args.batch_size,
                                               shuffle=False)
    batchify_fn = lambda samples, fn=Dict({
        "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id),
        "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id)
    }): fn(samples)
    dev_dataloader = paddle.io.DataLoader(dataset=dev_ds,
                                          batch_sampler=dev_batch_sampler,
                                          collate_fn=batchify_fn,
                                          return_list=True)

    return model, tokenizer, dev_dataloader, dev_ds
def main():
    args = parse_args()
    predictor = Predictor.create_predictor(args)

    args.task_name = args.task_name.lower()
    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    sentence1_key, sentence2_key = task_to_keys[args.task_name]

    test_ds = load_dataset('glue', args.task_name, split="test")
    tokenizer = tokenizer_class.from_pretrained(
        os.path.dirname(args.model_path))

    def preprocess_function(examples):
        # Tokenize the texts.
        texts = ((examples[sentence1_key], ) if sentence2_key is None else
                 (examples[sentence1_key], examples[sentence2_key]))
        result = tokenizer(*texts, max_seq_len=args.max_seq_length)
        if "label" in examples:
            # In all cases, rename the column to labels because the model
            # will expect that.
            result["labels"] = examples["label"]
        return result

    test_ds = test_ds.map(preprocess_function)
    batchify_fn = lambda samples, fn=Dict({
        'input_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id,
                         dtype="int64"),  # input
        'token_type_ids': Pad(axis=0, pad_val=tokenizer.pad_token_type_id,
                              dtype="int64"),  # segment
    }): fn(samples)

    predictor.predict(test_ds,
                      batch_size=args.batch_size,
                      collate_fn=batchify_fn)
def do_eval(args):
    paddle.set_device(args.device)

    # Create dataset, tokenizer and dataloader.
    train_ds, eval_ds = load_dataset('msra_ner',
                                     splits=('train', 'test'),
                                     lazy=False)
    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)

    label_list = train_ds.label_list
    label_num = len(label_list)
    no_entity_id = label_num - 1
    trans_func = partial(tokenize_and_align_labels,
                         tokenizer=tokenizer,
                         no_entity_id=no_entity_id,
                         max_seq_len=args.max_seq_length)
    ignore_label = -100
    batchify_fn = lambda samples, fn=Dict({
        'input_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id,
                         dtype='int32'),  # input
        'token_type_ids': Pad(axis=0, pad_val=tokenizer.pad_token_type_id,
                              dtype='int32'),  # segment
        'seq_len': Stack(dtype='int64'),
        'labels': Pad(axis=0, pad_val=ignore_label, dtype='int64')  # label
    }): fn(samples)
    eval_ds = eval_ds.map(trans_func)
    eval_data_loader = DataLoader(dataset=eval_ds,
                                  collate_fn=batchify_fn,
                                  num_workers=0,
                                  batch_size=args.batch_size,
                                  return_list=True)

    # Define the model network and its loss.
    model = BertForTokenClassification.from_pretrained(args.model_name_or_path,
                                                       num_classes=label_num)
    if args.init_checkpoint_path:
        model_dict = paddle.load(args.init_checkpoint_path)
        model.set_dict(model_dict)
    loss_fct = paddle.nn.loss.CrossEntropyLoss(ignore_index=ignore_label)
    metric = ChunkEvaluator(label_list=label_list)

    model.eval()
    metric.reset()
    for step, batch in enumerate(eval_data_loader):
        input_ids, token_type_ids, length, labels = batch
        logits = model(input_ids, token_type_ids)
        loss = loss_fct(logits, labels)
        avg_loss = paddle.mean(loss)
        preds = logits.argmax(axis=2)
        num_infer_chunks, num_label_chunks, num_correct_chunks = metric.compute(
            length, preds, labels)
        metric.update(num_infer_chunks.numpy(), num_label_chunks.numpy(),
                      num_correct_chunks.numpy())
    precision, recall, f1_score = metric.accumulate()
    print("eval loss: %f, precision: %f, recall: %f, f1: %f" %
          (avg_loss, precision, recall, f1_score))
def init_roberta_var(args):
    if args.language == 'ch':
        tokenizer = RobertaTokenizer.from_pretrained(args.from_pretrained)
    else:
        tokenizer = RobertaBPETokenizer.from_pretrained(args.from_pretrained)
    model = RobertaForQuestionAnswering.from_pretrained(
        args.from_pretrained, num_classes=args.num_classes)

    map_fn = partial(map_fn_DuCheckList, args=args, tokenizer=tokenizer)
    dev_ds = RCInterpret().read(args.data_dir)
    dev_ds.map(map_fn, batched=True)
    dev_batch_sampler = paddle.io.BatchSampler(dev_ds,
                                               batch_size=args.batch_size,
                                               shuffle=False)
    batchify_fn = lambda samples, fn=Dict({
        "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id),
        "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id),
        "offset_mapping": Pad(axis=0, pad_val=tokenizer.pad_token_id),
        "overflow_to_sample": Stack(dtype='int32'),
    }): fn(samples)
    dev_dataloader = paddle.io.DataLoader(dataset=dev_ds,
                                          batch_sampler=dev_batch_sampler,
                                          collate_fn=batchify_fn,
                                          return_list=True)

    return model, tokenizer, dev_dataloader, dev_ds
def load_squad_dataset(args):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    features_fn = prepare_train_features if args.is_training else prepare_validation_features
    if args.is_training:
        raw_dataset = load_dataset('squad', split='train')
    else:
        raw_dataset = load_dataset('squad', split='validation')
    column_names = raw_dataset.column_names
    dataset = raw_dataset.map(partial(features_fn,
                                      tokenizer=tokenizer,
                                      args=args),
                              batched=True,
                              remove_columns=column_names,
                              num_proc=4)

    # The effective batch size folds in gradient accumulation, device steps
    # and replication.
    bs = args.micro_batch_size * args.grad_acc_factor * args.batches_per_step \
        * args.num_replica
    args.batch_size = bs

    # Drop the last incomplete batch only during training.
    batch_sampler = BatchSampler(dataset,
                                 batch_size=bs,
                                 shuffle=args.shuffle,
                                 drop_last=args.is_training)

    if args.is_training:
        collate_fn = lambda samples, fn=Dict({
            "input_ids": Stack(),
            "token_type_ids": Stack(),
            "position_ids": Stack(),
            "input_mask": Stack(),
            "start_positions": Stack(),
            "end_positions": Stack()
        }): fn(samples)
    else:
        collate_fn = lambda samples, fn=Dict({
            "input_ids": Stack(),
            "token_type_ids": Stack(),
            "position_ids": Stack(),
            "input_mask": Stack()
        }): fn(samples)

    data_loader = DataLoader(dataset=dataset,
                             batch_sampler=batch_sampler,
                             collate_fn=collate_fn,
                             return_list=True)
    return raw_dataset, data_loader
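# Worked example (hypothetical argument values) of the effective batch size
# computed above: with micro_batch_size=2, grad_acc_factor=4,
# batches_per_step=8 and num_replica=1,
#
#     bs = 2 * 4 * 8 * 1 = 64
#
# so the BatchSampler yields 64 samples per host-side step, which the device
# then consumes as 8 steps of 4 accumulated micro-batches of 2 samples each.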
def __init__(self, tokenizer, batch_pad=None):
    self.mask_token_id = tokenizer.mask_token_id
    self.pad_token_id = tokenizer.pad_token_id
    self.token_len = tokenizer.vocab_size
    if batch_pad is None:
        # Default collate function: pad inputs and the special-tokens mask.
        self.batch_pad = lambda samples, fn=Dict({
            'input_ids': Pad(axis=0, pad_val=self.pad_token_id,
                             dtype='int64'),  # input
            # 'token_type_ids': Pad(axis=0, pad_val=0, dtype='int64'),  # segment
            'special_tokens_mask': Pad(axis=0, pad_val=True,
                                       dtype='int64')  # special tokens mask
        }): fn(samples)
    else:
        self.batch_pad = batch_pad
def __init__(self, tokenizer, batch_size, doc_stride, max_seq_length):
    self.tokenizer = tokenizer
    self.batch_size = batch_size
    self.doc_stride = doc_stride
    self.max_seq_length = max_seq_length
    self._train_input_fn = Dict({
        "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id),
        "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id),
        "start_positions": Stack(dtype="int64"),
        "end_positions": Stack(dtype="int64"),
        "answerable_label": Stack(dtype="int64")
    })
    self._dev_input_fn = Dict({
        "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id),
        "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id)
    })
def create_test_dataloader(args):
    '''
    Build the dataloader used for testing.
    Create dataset, tokenizer and dataloader.
    input:
        args: the arguments provided by the config file
    return:
        test_data_loader
    '''
    no_entity_id = 0

    # Load the dataset.
    test_ds = load_dataset('TEDTalk', splits=('test'), lazy=False)

    # Build the dataloader.
    model_name_or_path = args.model_name_or_path
    tokenizer = ElectraTokenizer.from_pretrained(model_name_or_path)

    trans_func = partial(tokenize_and_align_labels,
                         tokenizer=tokenizer,
                         no_entity_id=no_entity_id,
                         max_seq_len=args.max_seq_length)

    batchify_fn = lambda samples, fn=Dict({
        'input_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id,
                         dtype='int32'),  # input
        'token_type_ids': Pad(axis=0, pad_val=tokenizer.pad_token_type_id,
                              dtype='int32'),  # segment
        'seq_len': Stack(dtype='int64'),  # seq_len
        'labels': Pad(axis=0, pad_val=args.ignore_label, dtype='int64')  # label
    }): fn(samples)

    test_ds = test_ds.map(trans_func)
    test_data_loader = DataLoader(dataset=test_ds,
                                  collate_fn=batchify_fn,
                                  num_workers=0,
                                  batch_size=args.batch_size,
                                  return_list=True)
    return test_data_loader
def get_train_dataloader(tokenizer, args):
    splits = "train"
    data_dir = args.data_dir
    filename = os.path.join(data_dir, "cmrc2018_" + splits + ".pkl")
    if os.path.exists(filename):
        ds = load_pickle(filename)
    else:
        ds = load_dataset("cmrc2018", splits=splits)
        ds.map(
            partial(prepare_train_features_paddlenlp,
                    tokenizer=tokenizer,
                    args=args),
            batched=True,
            lazy=False,
        )
        save_pickle(ds, filename)

    batch_sampler = BatchSampler(ds,
                                 batch_size=args.train_batch_size,
                                 shuffle=True)

    batchify_fn = lambda samples, fn=Dict({
        "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id),
        "token_type_ids": Pad(axis=0, pad_val=0),
        "pinyin_ids": Pad(axis=0, pad_val=0),
        "start_positions": Stack(dtype="int64"),
        "end_positions": Stack(dtype="int64"),
    }): fn(samples)

    data_loader = DataLoader(
        dataset=ds,
        batch_sampler=batch_sampler,
        collate_fn=batchify_fn,
        num_workers=args.num_workers,
        return_list=True,
    )
    return data_loader
def evaluate(args, is_test=True):
    # Load the model.
    model_state = paddle.load(args.model_path)
    model = ErnieForQuestionAnswering.from_pretrained(args.model_name)
    model.load_dict(model_state)
    model.eval()

    # Load the data.
    train_ds, dev_ds, test_ds = load_dataset('dureader_robust',
                                             splits=('train', 'dev', 'test'))
    tokenizer = paddlenlp.transformers.ErnieTokenizer.from_pretrained(
        args.model_name)
    test_trans_func = partial(prepare_validation_features,
                              max_seq_length=args.max_seq_length,
                              doc_stride=args.doc_stride,
                              tokenizer=tokenizer)
    test_ds.map(test_trans_func, batched=True, num_workers=4)
    test_batch_sampler = paddle.io.BatchSampler(test_ds,
                                                batch_size=args.batch_size,
                                                shuffle=False)
    test_batchify_fn = lambda samples, fn=Dict({
        "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id),
        "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id)
    }): fn(samples)
    test_data_loader = paddle.io.DataLoader(dataset=test_ds,
                                            batch_sampler=test_batch_sampler,
                                            collate_fn=test_batchify_fn,
                                            return_list=True)

    all_start_logits = []
    all_end_logits = []
    tic_eval = time.time()
    for batch in test_data_loader:
        input_ids, token_type_ids = batch
        start_logits_tensor, end_logits_tensor = model(input_ids,
                                                       token_type_ids)
        for idx in range(start_logits_tensor.shape[0]):
            # Report timing every 1000 processed examples.
            if len(all_start_logits) % 1000 == 0 and len(all_start_logits):
                print("Processing example: %d" % len(all_start_logits))
                print('time per 1000:', time.time() - tic_eval)
                tic_eval = time.time()
            all_start_logits.append(start_logits_tensor.numpy()[idx])
            all_end_logits.append(end_logits_tensor.numpy()[idx])

    all_predictions, _, _ = compute_prediction(
        test_data_loader.dataset.data, test_data_loader.dataset.new_data,
        (all_start_logits, all_end_logits), False, 20, 30)

    if is_test:
        # Can also write all_nbest_json and scores_diff_json files if needed.
        with open('prediction.json', "w", encoding='utf-8') as writer:
            writer.write(
                json.dumps(all_predictions, ensure_ascii=False, indent=4) +
                "\n")
    else:
        squad_evaluate(examples=test_data_loader.dataset.data,
                       preds=all_predictions,
                       is_whitespace_splited=False)

    count = 0
    for example in test_data_loader.dataset.data:
        count += 1
        print()
        print('Question:', example['question'])
        print('Context:', ''.join(example['context']))
        print('Answer:', all_predictions[example['id']])
        if count >= 5:
            break
    model.train()
def test_dict(self):
    batchify_fn = Dict({'text': Pad(axis=0, pad_val=0), 'label': Stack()})
    result = batchify_fn(self.input)
    self.check_output_equal(result[0], self.expected_result[0])
    self.check_output_equal(result[1], self.expected_result[1])
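# For reference, a minimal sketch of the two helpers this test combines
# (toy values, not the test fixtures above): Pad aligns variable-length
# fields, while Stack simply stacks equal-shaped ones such as scalar labels.

from paddlenlp.data import Stack

labels = Stack(dtype="int64")([0, 1, 1])
# labels -> array([0, 1, 1]); combined with Pad inside Dict, the collate
# function yields one array per key, in the key order given to Dict.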
def train(args):
    # Load the datasets.
    train_ds, dev_ds, test_ds = load_dataset('dureader_robust',
                                             splits=('train', 'dev', 'test'))
    tokenizer = paddlenlp.transformers.ErnieTokenizer.from_pretrained(
        args.model_name)
    train_trans_func = partial(prepare_train_features,
                               max_seq_length=args.max_seq_length,
                               doc_stride=args.doc_stride,
                               tokenizer=tokenizer)
    train_ds.map(train_trans_func, batched=True, num_workers=4)
    dev_trans_func = partial(prepare_validation_features,
                             max_seq_length=args.max_seq_length,
                             doc_stride=args.doc_stride,
                             tokenizer=tokenizer)
    dev_ds.map(dev_trans_func, batched=True, num_workers=4)
    test_ds.map(dev_trans_func, batched=True, num_workers=4)

    # Define the BatchSamplers.
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_ds, batch_size=args.batch_size, shuffle=True)
    dev_batch_sampler = paddle.io.BatchSampler(dev_ds,
                                               batch_size=args.batch_size,
                                               shuffle=False)
    test_batch_sampler = paddle.io.BatchSampler(test_ds,
                                                batch_size=args.batch_size,
                                                shuffle=False)

    # Define the batchify functions.
    train_batchify_fn = lambda samples, fn=Dict({
        "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id),
        "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id),
        "start_positions": Stack(dtype="int64"),
        "end_positions": Stack(dtype="int64")
    }): fn(samples)
    dev_batchify_fn = lambda samples, fn=Dict({
        "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id),
        "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id)
    }): fn(samples)

    # Construct the DataLoaders.
    train_data_loader = paddle.io.DataLoader(
        dataset=train_ds,
        batch_sampler=train_batch_sampler,
        collate_fn=train_batchify_fn,
        return_list=True)
    dev_data_loader = paddle.io.DataLoader(
        dataset=dev_ds,
        batch_sampler=dev_batch_sampler,
        collate_fn=dev_batchify_fn,
        return_list=True)
    test_data_loader = paddle.io.DataLoader(
        dataset=test_ds,
        batch_sampler=test_batch_sampler,
        collate_fn=dev_batchify_fn,
        return_list=True)

    # Training configuration.
    num_training_steps = len(train_data_loader) * args.epochs
    use_gpu = paddle.get_device().startswith("gpu")
    if use_gpu:
        paddle.set_device('gpu:0')
    lr_scheduler = paddlenlp.transformers.LinearDecayWithWarmup(
        args.learning_rate, num_training_steps, args.warmup_proportion)
    model = ErnieForQuestionAnswering.from_pretrained(args.model_name)
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    # Training loop.
    model.train()
    criterion = CrossEntropyLossForRobust()
    global_step = 0
    for epoch in range(1, args.epochs + 1):
        for step, batch in enumerate(train_data_loader, start=1):
            global_step += 1
            input_ids, segment_ids, start_positions, end_positions = batch
            logits = model(input_ids=input_ids, token_type_ids=segment_ids)
            loss = criterion(logits, (start_positions, end_positions))

            if global_step % 100 == 0:
                print("global step %d, epoch: %d, batch: %d, loss: %.5f" %
                      (global_step, epoch, step, loss))

            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()

        paddle.save(model.state_dict(), args.save_model_path)
        # Save the optimizer state, not a second copy of the model weights.
        paddle.save(optimizer.state_dict(), args.save_opt_path)
        evaluate(model=model, data_loader=dev_data_loader)
def batchify_fn(data):
    _batchify_fn = lambda samples, fn=Dict({
        'input_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id,
                         dtype='int64'),
        'token_type_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id,
                              dtype='int64'),
        'position_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id,
                            dtype='int64'),
        'attention_mask': Pad(axis=0, pad_val=0, dtype='float32'),
    }): fn(samples)
    ent_label = [x['ent_label'] for x in data]
    spo_label = [x['spo_label'] for x in data]
    input_ids, token_type_ids, position_ids, masks = _batchify_fn(data)
    batch_size, batch_len = input_ids.shape
    num_classes = len(train_ds.label_list)

    # Create one-hot labels.
    #
    # For example,
    # - text:
    #   [CLS], 局, 部, 皮, 肤, 感, 染, 引, 起, 的, 皮, 疹, 等, [SEP]
    #
    # - ent_label (obj: `list`):
    #   [(0, 5), (9, 10)]  # ['局部皮肤感染', '皮疹']
    #
    # - one_hot_ent_label:  # shape (sequence_length, 2)
    #   [[ 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],  # start index
    #    [ 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0]]  # end index
    #
    # - spo_label (obj: `list`):
    #   [(0, 23, 9)]  # [('局部皮肤感染', '相关(导致)', '皮疹')], where entities
    #   are encoded by their start indexes.
    #
    # - one_hot_spo_label:  # shape (num_predicate, sequence_length, sequence_length)
    #   [...,
    #    [..., [0, ..., 1, ..., 0], ...],  # for predicate '相关(导致)'
    #    ...]                              # the value at [23, 1, 10] is set as 1
    #
    one_hot_ent_label = np.zeros([batch_size, batch_len, 2], dtype=np.float32)
    one_hot_spo_label = np.zeros(
        [batch_size, num_classes, batch_len, batch_len], dtype=np.float32)
    for idx, ent_idxs in enumerate(ent_label):
        # Shift index by 1 because input_ids start with [CLS] here.
        for x, y in ent_idxs:
            x = x + 1
            y = y + 1
            if x > 0 and x < batch_len and y < batch_len:
                one_hot_ent_label[idx, x, 0] = 1
                one_hot_ent_label[idx, y, 1] = 1
    for idx, spo_idxs in enumerate(spo_label):
        for s, p, o in spo_idxs:
            s_id = s[0] + 1
            o_id = o[0] + 1
            if s_id > 0 and s_id < batch_len and o_id < batch_len:
                one_hot_spo_label[idx, p, s_id, o_id] = 1

    # one_hot_xxx_label are used for loss computation.
    # xxx_label are used for metric computation.
    ent_label = [one_hot_ent_label, ent_label]
    spo_label = [one_hot_spo_label, spo_label]

    return input_ids, token_type_ids, position_ids, masks, ent_label, spo_label
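# A quick standalone check (toy size, values taken from the docstring example
# above) of the [CLS] index shift: char spans (0, 5) and (9, 10) become start
# positions 1 and 10, and end positions 6 and 11.

import numpy as np

batch_len = 14
one_hot = np.zeros([batch_len, 2], dtype=np.float32)
for x, y in [(0, 5), (9, 10)]:
    x, y = x + 1, y + 1              # shift past [CLS]
    if 0 < x < batch_len and y < batch_len:
        one_hot[x, 0] = 1            # start index
        one_hot[y, 1] = 1            # end index
print(one_hot[:, 0])  # 1.0 at positions 1 and 10, as in the comment above
print(one_hot[:, 1])  # 1.0 at positions 6 and 11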
def do_predict(args):
    paddle.set_device(args.device)

    # Create dataset, tokenizer and dataloader.
    train_examples, predict_examples = load_dataset('msra_ner',
                                                    split=('train', 'test'))
    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)

    label_list = train_examples.features['ner_tags'].feature.names
    label_num = len(label_list)
    no_entity_id = 0

    def tokenize_and_align_labels(examples):
        tokenized_inputs = tokenizer(
            examples['tokens'],
            max_seq_len=args.max_seq_length,
            # We use this argument because the texts in our dataset are lists
            # of words (with a label for each word).
            is_split_into_words=True,
            return_length=True)
        labels = []
        for i, label in enumerate(examples['ner_tags']):
            label_ids = label
            if len(tokenized_inputs['input_ids'][i]) - 2 < len(label_ids):
                label_ids = label_ids[:len(tokenized_inputs['input_ids'][i]) -
                                      2]
            label_ids = [no_entity_id] + label_ids + [no_entity_id]
            label_ids += [no_entity_id] * (
                len(tokenized_inputs['input_ids'][i]) - len(label_ids))
            labels.append(label_ids)
        tokenized_inputs["labels"] = labels
        return tokenized_inputs

    ignore_label = -100
    batchify_fn = lambda samples, fn=Dict({
        'input_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        'token_type_ids': Pad(axis=0,
                              pad_val=tokenizer.pad_token_type_id),  # segment
        'seq_len': Stack(),
        'labels': Pad(axis=0, pad_val=ignore_label)  # label
    }): fn(samples)

    id2label = dict(enumerate(label_list))

    predict_examples = predict_examples.select(
        range(len(predict_examples) - 1))
    predict_ds = predict_examples.map(tokenize_and_align_labels, batched=True)
    predict_data_loader = DataLoader(dataset=predict_ds,
                                     collate_fn=batchify_fn,
                                     num_workers=0,
                                     batch_size=args.batch_size,
                                     return_list=True)

    # Define the model network.
    model = BertForTokenClassification.from_pretrained(args.model_name_or_path,
                                                       num_classes=label_num)
    if args.init_checkpoint_path:
        model_dict = paddle.load(args.init_checkpoint_path)
        model.set_dict(model_dict)

    model.eval()
    pred_list = []
    len_list = []
    for step, batch in enumerate(predict_data_loader):
        input_ids, token_type_ids, length, labels = batch
        logits = model(input_ids, token_type_ids)
        pred = paddle.argmax(logits, axis=-1)
        pred_list.append(pred.numpy())
        len_list.append(length.numpy())

    preds = parse_decodes(predict_examples, id2label, pred_list, len_list)

    file_path = "results.txt"
    with open(file_path, "w", encoding="utf8") as fout:
        fout.write("\n".join(preds))
    # Print some examples.
    print(
        "The results have been saved in the file: %s, some examples are shown below: "
        % file_path)
    print("\n".join(preds[:10]))
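# A toy trace (hypothetical values) of the label alignment above: word labels
# are truncated to fit, wrapped with no_entity_id for the [CLS]/[SEP] slots,
# then padded out to the tokenized length.

no_entity_id = 0
input_ids_len = 6                    # e.g. [CLS] + 4 subwords + [SEP]
label_ids = [1, 2, 1, 2, 1]          # 5 word labels, one too many
if input_ids_len - 2 < len(label_ids):
    label_ids = label_ids[:input_ids_len - 2]            # -> [1, 2, 1, 2]
label_ids = [no_entity_id] + label_ids + [no_entity_id]  # [CLS]/[SEP] slots
label_ids += [no_entity_id] * (input_ids_len - len(label_ids))
print(label_ids)                     # [0, 1, 2, 1, 2, 0]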
def do_train(args):
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    # Create dataset, tokenizer and dataloader.
    if args.dataset == "peoples_daily_ner":
        train_ds, dev_ds, test_ds = load_dataset(
            args.dataset, splits=('train', 'dev', 'test'), lazy=False)
    else:
        train_ds, test_ds = load_dataset(args.dataset,
                                         splits=('train', 'test'),
                                         lazy=False)

    AutoForTokenClassification, AutoTokenizer = MODEL_CLASSES[args.model_type]
    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)

    label_list = train_ds.label_list
    label_num = len(label_list)
    no_entity_id = label_num - 1
    trans_func = partial(tokenize_and_align_labels,
                         tokenizer=tokenizer,
                         no_entity_id=no_entity_id,
                         max_seq_len=args.max_seq_length)
    train_ds = train_ds.map(trans_func)

    ignore_label = -100
    batchify_fn = lambda samples, fn=Dict({
        'input_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id,
                         dtype='int32'),  # input
        'token_type_ids': Pad(axis=0, pad_val=tokenizer.pad_token_type_id,
                              dtype='int32'),  # segment
        'seq_len': Stack(dtype='int64'),  # seq_len
        'labels': Pad(axis=0, pad_val=ignore_label, dtype='int64')  # label
    }): fn(samples)

    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_ds, batch_size=args.batch_size, shuffle=True, drop_last=True)
    train_data_loader = DataLoader(dataset=train_ds,
                                   collate_fn=batchify_fn,
                                   num_workers=0,
                                   batch_sampler=train_batch_sampler,
                                   return_list=True)
    test_ds = test_ds.map(trans_func)
    test_data_loader = DataLoader(dataset=test_ds,
                                  collate_fn=batchify_fn,
                                  num_workers=0,
                                  batch_size=args.batch_size,
                                  return_list=True)
    if args.dataset == "peoples_daily_ner":
        dev_ds = dev_ds.map(trans_func)
        dev_data_loader = DataLoader(dataset=dev_ds,
                                     collate_fn=batchify_fn,
                                     num_workers=0,
                                     batch_size=args.batch_size,
                                     return_list=True)

    # Define the model network and its loss.
    model = AutoForTokenClassification.from_pretrained(
        args.model_name_or_path, num_classes=label_num)
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    num_training_steps = args.max_steps if args.max_steps > 0 else len(
        train_data_loader) * args.num_train_epochs
    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps,
                                         args.warmup_steps)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    loss_fct = paddle.nn.loss.CrossEntropyLoss(ignore_index=ignore_label)
    metric = ChunkEvaluator(label_list=label_list)

    global_step = 0
    last_step = args.num_train_epochs * len(train_data_loader)
    tic_train = time.time()
    for epoch in range(args.num_train_epochs):
        for step, batch in enumerate(train_data_loader):
            global_step += 1
            input_ids, token_type_ids, _, labels = batch
            logits = model(input_ids, token_type_ids)
            loss = loss_fct(logits, labels)
            avg_loss = paddle.mean(loss)
            if global_step % args.logging_steps == 0:
                print(
                    "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                    % (global_step, epoch, step, avg_loss,
                       args.logging_steps / (time.time() - tic_train)))
                tic_train = time.time()
            avg_loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            if global_step % args.save_steps == 0 or global_step == num_training_steps:
                if paddle.distributed.get_rank() == 0:
                    if args.dataset == "peoples_daily_ner":
                        evaluate(model, loss_fct, metric, dev_data_loader,
                                 label_num, "valid")
                    evaluate(model, loss_fct, metric, test_data_loader,
                             label_num, "test")
                    paddle.save(
                        model.state_dict(),
                        os.path.join(args.output_dir,
                                     "model_%d.pdparams" % global_step))
            if global_step >= num_training_steps:
                return
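# A toy check of the weight-decay filter used above (made-up parameter
# names; real names come from model.named_parameters()): any parameter whose
# structured name contains "bias" or "norm" is excluded from decay.

params = [("encoder.linear.weight", "linear_0.w_0"),
          ("encoder.linear.bias", "linear_0.b_0"),
          ("encoder.norm.weight", "layer_norm_0.w_0")]
decay_params = [pname for n, pname in params
                if not any(nd in n for nd in ["bias", "norm"])]
print(decay_params)  # ['linear_0.w_0'] -- bias and norm params are excluded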
                token_end_index -= 1
            tokenized_examples[i]["end_positions"] = token_end_index + 1

    return tokenized_examples

train_ds.map(prepare_train_features, lazy=False)

print(train_ds[0])
print(train_ds[1])
print(len(train_ds))
print('-----------------------------------------------------')

train_batchify_fn = lambda samples, fn=Dict({
    "input_ids": Pad(axis=0,
                     pad_val=tokenizer.vocab[tokenizer.pad_token]),  # input
    "segment_ids": Pad(axis=0,
                       pad_val=tokenizer.vocab[tokenizer.pad_token]),  # segment
    "start_positions": Stack(dtype="int64"),  # start_pos
    "end_positions": Stack(dtype="int64")  # end_pos
}): fn(samples)

train_data_loader = DataLoader(dataset=train_ds,
                               batch_size=8,
                               collate_fn=train_batchify_fn,
                               return_list=True)

for batch in train_data_loader:
    print(batch[0])
    print(batch[1])
    print(batch[2])
    print(batch[3])
    break
def do_train(args):
    paddle.set_device("gpu" if args.n_gpu else "cpu")
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)

    set_seed(args)

    train_ds, dev_ds, test_ds = load_dataset('dureader_yesno',
                                             splits=['train', 'dev', 'test'])

    trans_func = partial(convert_example, tokenizer=tokenizer)

    train_batchify_fn = lambda samples, fn=Dict({
        'input_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id),
        'token_type_ids': Pad(axis=0, pad_val=tokenizer.pad_token_type_id),
        'labels': Stack(dtype="int64")
    }): fn(samples)

    test_batchify_fn = lambda samples, fn=Dict({
        'input_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id),
        'token_type_ids': Pad(axis=0, pad_val=tokenizer.pad_token_type_id),
        'id': Stack()
    }): fn(samples)

    train_ds = train_ds.map(trans_func, lazy=True)
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_ds, batch_size=args.batch_size, shuffle=True)
    train_data_loader = DataLoader(dataset=train_ds,
                                   batch_sampler=train_batch_sampler,
                                   collate_fn=train_batchify_fn,
                                   return_list=True)

    dev_ds = dev_ds.map(trans_func, lazy=True)
    dev_batch_sampler = paddle.io.BatchSampler(dev_ds,
                                               batch_size=args.batch_size,
                                               shuffle=False)
    dev_data_loader = DataLoader(dataset=dev_ds,
                                 batch_sampler=dev_batch_sampler,
                                 collate_fn=train_batchify_fn,
                                 return_list=True)

    test_ds = test_ds.map(trans_func, lazy=True)
    test_batch_sampler = paddle.io.BatchSampler(test_ds,
                                                batch_size=args.batch_size,
                                                shuffle=False)
    test_data_loader = DataLoader(dataset=test_ds,
                                  batch_sampler=test_batch_sampler,
                                  collate_fn=test_batchify_fn,
                                  return_list=True)

    model = model_class.from_pretrained(args.model_name_or_path,
                                        num_classes=len(train_ds.label_list))

    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    num_training_steps = args.max_steps if args.max_steps > 0 else len(
        train_data_loader) * args.num_train_epochs

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps,
                                         args.warmup_proportion)
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in [
            p.name for n, p in model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        ])
    criterion = paddle.nn.loss.CrossEntropyLoss()
    metric = paddle.metric.Accuracy()

    global_step = 0
    tic_train = time.time()
    for epoch in range(args.num_train_epochs):
        for step, batch in enumerate(train_data_loader):
            global_step += 1
            input_ids, segment_ids, label = batch
            logits = model(input_ids=input_ids, token_type_ids=segment_ids)
            loss = criterion(logits, label)
            if global_step % args.logging_steps == 0:
                print(
                    "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                    % (global_step, epoch, step, loss,
                       args.logging_steps / (time.time() - tic_train)))
                tic_train = time.time()
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            if global_step % args.save_steps == 0 or global_step == num_training_steps:
                if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
                    evaluate(model, metric, dev_data_loader)
                    output_dir = os.path.join(args.output_dir,
                                              "model_%d" % global_step)
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    # need better way to get inner model of DataParallel
                    model_to_save = model._layers if isinstance(
                        model, paddle.DataParallel) else model
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
                    print('Saving checkpoint to:', output_dir)

    if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
        predictions = predict(model, test_data_loader)
        with open('prediction.json', "w") as writer:
            writer.write(
                json.dumps(predictions, ensure_ascii=False, indent=4) + "\n")
def run(args):
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()
    rank = paddle.distributed.get_rank()

    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)

    set_seed(args)

    train_examples, dev_examples, test_examples = load_dataset(
        'cmrc2018', split=["train", "validation", "test"])
    column_names = train_examples.column_names
    if rank == 0:
        if os.path.exists(args.model_name_or_path):
            print("init checkpoint from %s" % args.model_name_or_path)

    model = AutoModelForQuestionAnswering.from_pretrained(
        args.model_name_or_path)

    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    def prepare_train_features(examples):
        # Tokenize our examples with truncation and maybe padding, but keep
        # the overflows using a stride. This results in one example possibly
        # giving several features when a context is long, each of those
        # features having a context that overlaps a bit the context of the
        # previous feature.
        # NOTE: Almost the same functionality as HuggingFace's
        # prepare_train_features function. The main difference is that
        # HuggingFace uses ArrowTable as basic data structure, while we use a
        # list of dictionaries instead.
        contexts = examples['context']
        questions = examples['question']

        tokenized_examples = tokenizer(questions,
                                       contexts,
                                       stride=args.doc_stride,
                                       max_seq_len=args.max_seq_length)

        # Since one example might give us several features if it has a long
        # context, we need a map from a feature to its corresponding example.
        # This key gives us just that.
        sample_mapping = tokenized_examples.pop("overflow_to_sample")
        # The offset mappings will give us a map from token to character
        # position in the original context. This will help us compute the
        # start_positions and end_positions.
        offset_mapping = tokenized_examples.pop("offset_mapping")

        # Let's label those examples!
        tokenized_examples["start_positions"] = []
        tokenized_examples["end_positions"] = []

        for i, offsets in enumerate(offset_mapping):
            # We will label impossible answers with the index of the CLS token.
            input_ids = tokenized_examples["input_ids"][i]
            cls_index = input_ids.index(tokenizer.cls_token_id)

            # Grab the sequence corresponding to that example (to know what is
            # the context and what is the question).
            sequence_ids = tokenized_examples['token_type_ids'][i]

            # One example can give several spans, this is the index of the
            # example containing this span of text.
            sample_index = sample_mapping[i]
            answers = examples['answers'][sample_index]

            # If no answers are given, set the cls_index as answer.
            if len(answers["answer_start"]) == 0:
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Start/end character index of the answer in the text.
                start_char = answers["answer_start"][0]
                end_char = start_char + len(answers["text"][0])

                # Start token index of the current span in the text.
                token_start_index = 0
                while sequence_ids[token_start_index] != 1:
                    token_start_index += 1

                # End token index of the current span in the text.
                token_end_index = len(input_ids) - 1
                while sequence_ids[token_end_index] != 1:
                    token_end_index -= 1
                token_end_index -= 1

                # Detect if the answer is out of the span (in which case this
                # feature is labeled with the CLS index).
                if not (offsets[token_start_index][0] <= start_char
                        and offsets[token_end_index][1] >= end_char):
                    tokenized_examples["start_positions"].append(cls_index)
                    tokenized_examples["end_positions"].append(cls_index)
                else:
                    # Otherwise move the token_start_index and token_end_index
                    # to the two ends of the answer.
                    # Note: we could go after the last offset if the answer is
                    # the last word (edge case).
                    while token_start_index < len(offsets) and offsets[
                            token_start_index][0] <= start_char:
                        token_start_index += 1
                    tokenized_examples["start_positions"].append(
                        token_start_index - 1)
                    while offsets[token_end_index][1] >= end_char:
                        token_end_index -= 1
                    tokenized_examples["end_positions"].append(
                        token_end_index + 1)

        return tokenized_examples

    def prepare_validation_features(examples):
        # Tokenize our examples with truncation and maybe padding, but keep
        # the overflows using a stride, exactly as in prepare_train_features.
        # NOTE: Almost the same functionality as HuggingFace's
        # prepare_validation_features function, operating on a list of
        # dictionaries instead of an ArrowTable.
        contexts = examples['context']
        questions = examples['question']

        tokenized_examples = tokenizer(questions,
                                       contexts,
                                       stride=args.doc_stride,
                                       max_seq_len=args.max_seq_length,
                                       return_attention_mask=True)

        # Since one example might give us several features if it has a long
        # context, we need a map from a feature to its corresponding example.
        sample_mapping = tokenized_examples.pop("overflow_to_sample")

        # For evaluation, we will need to convert our predictions to
        # substrings of the context, so we keep the corresponding example_id
        # and we will store the offset mappings.
        tokenized_examples["example_id"] = []

        for i in range(len(tokenized_examples["input_ids"])):
            # Grab the sequence corresponding to that example (to know what is
            # the context and what is the question).
            sequence_ids = tokenized_examples['token_type_ids'][i]
            context_index = 1

            # One example can give several spans, this is the index of the
            # example containing this span of text.
            sample_index = sample_mapping[i]
            tokenized_examples["example_id"].append(
                examples["id"][sample_index])

            # Set to None the offset_mapping that are not part of the context
            # so it's easy to determine if a token position is part of the
            # context or not.
            tokenized_examples["offset_mapping"][i] = [
                (o if sequence_ids[k] == context_index else None)
                for k, o in enumerate(tokenized_examples["offset_mapping"][i])
            ]

        return tokenized_examples

    if args.do_train:
        args.batch_size = int(args.batch_size /
                              args.gradient_accumulation_steps)
        train_ds = train_examples.map(prepare_train_features,
                                      batched=True,
                                      remove_columns=column_names,
                                      num_proc=1)
        train_batch_sampler = paddle.io.DistributedBatchSampler(
            train_ds, batch_size=args.batch_size, shuffle=True)
        train_batchify_fn = lambda samples, fn=Dict({
            "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id),
            "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id),
            "start_positions": Stack(dtype="int64"),
            "end_positions": Stack(dtype="int64")
        }): fn(samples)
        train_data_loader = DataLoader(dataset=train_ds,
                                       batch_sampler=train_batch_sampler,
                                       collate_fn=train_batchify_fn,
                                       return_list=True)

        dev_ds = dev_examples.map(prepare_validation_features,
                                  batched=True,
                                  remove_columns=column_names,
                                  num_proc=1)
        dev_batch_sampler = paddle.io.BatchSampler(
            dev_ds, batch_size=args.eval_batch_size, shuffle=False)
        dev_batchify_fn = lambda samples, fn=Dict({
            "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id),
            "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id)
        }): fn(samples)
        dev_data_loader = DataLoader(dataset=dev_ds,
                                     batch_sampler=dev_batch_sampler,
                                     collate_fn=dev_batchify_fn,
                                     return_list=True)

        num_training_steps = int(
            args.max_steps /
            args.gradient_accumulation_steps) if args.max_steps > 0 else int(
                len(train_data_loader) * args.num_train_epochs /
                args.gradient_accumulation_steps)

        lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                             num_training_steps,
                                             args.warmup_proportion)

        # Generate parameter names needed to perform weight decay.
        # All bias and LayerNorm parameters are excluded.
        decay_params = [
            p.name for n, p in model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        ]
        optimizer = paddle.optimizer.AdamW(
            learning_rate=lr_scheduler,
            epsilon=args.adam_epsilon,
            parameters=model.parameters(),
            weight_decay=args.weight_decay,
            apply_decay_param_fun=lambda x: x in decay_params)
        criterion = CrossEntropyLossForSQuAD()

        global_step = 0
        tic_train = time.time()
        for epoch in range(args.num_train_epochs):
            for step, batch in enumerate(train_data_loader):
                input_ids, token_type_ids, start_positions, end_positions = batch
                logits = model(input_ids=input_ids,
                               token_type_ids=token_type_ids)
                loss = criterion(logits, (start_positions, end_positions))
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    global_step += 1
                    optimizer.step()
                    lr_scheduler.step()
                    optimizer.clear_grad()
                    if global_step % args.logging_steps == 0:
                        print(
                            "global step %d/%d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                            % (global_step, num_training_steps, epoch,
                               step + 1, loss,
                               args.logging_steps / (time.time() - tic_train)))
                        tic_train = time.time()
                    if global_step % args.save_steps == 0 or global_step == num_training_steps:
                        if rank == 0:
                            output_dir = os.path.join(
                                args.output_dir, "model_%d" % global_step)
                            if not os.path.exists(output_dir):
                                os.makedirs(output_dir)
                            # need better way to get inner model of DataParallel
                            model_to_save = model._layers if isinstance(
                                model, paddle.DataParallel) else model
                            model_to_save.save_pretrained(output_dir)
                            tokenizer.save_pretrained(output_dir)
                            print('Saving checkpoint to:', output_dir)
                        if global_step == num_training_steps:
                            break
        evaluate(model, dev_examples, dev_data_loader, args)

    if args.do_predict and rank == 0:
        test_ds = test_examples.map(prepare_validation_features,
                                    batched=True,
                                    remove_columns=column_names,
                                    num_proc=1)
        test_batch_sampler = paddle.io.BatchSampler(
            test_ds, batch_size=args.eval_batch_size, shuffle=False)
        test_batchify_fn = lambda samples, fn=Dict({
            "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id),
            "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id)
        }): fn(samples)
        test_data_loader = DataLoader(dataset=test_ds,
                                      batch_sampler=test_batch_sampler,
                                      collate_fn=test_batchify_fn,
                                      return_list=True)
        evaluate(model, test_examples, test_data_loader, args, do_eval=False)
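# A self-contained walk-through (toy values, not tied to any dataset) of the
# span-labeling loop in prepare_train_features above: given per-token
# character offsets and the answer's character span, recover the token span.

offsets = [(0, 0), (0, 2), (2, 5), (5, 9), (9, 12), (0, 0)]  # (0, 0) = special tokens
sequence_ids = [0, 1, 1, 1, 1, 1]    # 1 marks context tokens (incl. final [SEP])
start_char, end_char = 2, 9          # answer covers characters [2, 9)

token_start_index = 0
while sequence_ids[token_start_index] != 1:
    token_start_index += 1           # -> 1, first context token
token_end_index = len(offsets) - 1
while sequence_ids[token_end_index] != 1:
    token_end_index -= 1
token_end_index -= 1                 # -> 4, step back past the trailing [SEP]

# The answer lies inside this span, so tighten both ends.
while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
    token_start_index += 1
start_position = token_start_index - 1   # -> 2, the token covering char 2
while offsets[token_end_index][1] >= end_char:
    token_end_index -= 1
end_position = token_end_index + 1       # -> 3, the token ending at char 9
print(start_position, end_position)      # 2 3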
def do_predict(args):
    paddle.set_device("gpu" if args.use_gpu else "cpu")

    train_ds, predict_ds = load_dataset('msra_ner',
                                        splits=('train', 'test'),
                                        lazy=False)
    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)

    label_list = train_ds.label_list
    label_num = len(label_list)
    no_entity_id = label_num - 1
    trans_func = partial(tokenize_and_align_labels,
                         tokenizer=tokenizer,
                         no_entity_id=no_entity_id,
                         max_seq_len=args.max_seq_length)
    ignore_label = -100
    batchify_fn = lambda samples, fn=Dict({
        'input_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        # Segment ids are padded with pad_token_type_id, not pad_token_id.
        'token_type_ids': Pad(axis=0,
                              pad_val=tokenizer.pad_token_type_id),  # segment
        'seq_len': Stack(),
        'labels': Pad(axis=0, pad_val=ignore_label)  # label
    }): fn(samples)
    raw_data = predict_ds.data

    id2label = dict(enumerate(predict_ds.label_list))

    predict_ds = predict_ds.map(trans_func)
    predict_data_loader = DataLoader(dataset=predict_ds,
                                     collate_fn=batchify_fn,
                                     num_workers=0,
                                     batch_size=args.batch_size,
                                     return_list=True)

    model = BertForTokenClassification.from_pretrained(args.model_name_or_path,
                                                       num_classes=label_num)
    if args.init_checkpoint_path:
        model_dict = paddle.load(args.init_checkpoint_path)
        model.set_dict(model_dict)

    model.eval()
    pred_list = []
    len_list = []
    for step, batch in enumerate(predict_data_loader):
        input_ids, token_type_ids, length, labels = batch
        logits = model(input_ids, token_type_ids)
        pred = paddle.argmax(logits, axis=-1)
        pred_list.append(pred.numpy())
        len_list.append(length.numpy())

    preds = parse_decodes(raw_data, id2label, pred_list, len_list)

    file_path = "results.txt"
    with open(file_path, "w", encoding="utf8") as fout:
        fout.write("\n".join(preds))
    # Print some examples.
    print(
        "The results have been saved in the file: %s, some examples are shown below: "
        % file_path)
    print("\n".join(preds[:10]))
def do_eval(args):
    paddle.set_device(args.device)

    # Create dataset, tokenizer and dataloader.
    train_ds, eval_ds = load_dataset('msra_ner', split=('train', 'test'))
    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)

    label_list = train_ds.features['ner_tags'].feature.names
    label_num = len(label_list)
    no_entity_id = 0

    def tokenize_and_align_labels(examples):
        tokenized_inputs = tokenizer(
            examples['tokens'],
            max_seq_len=args.max_seq_length,
            # We use this argument because the texts in our dataset are lists
            # of words (with a label for each word).
            is_split_into_words=True,
            return_length=True)
        labels = []
        for i, label in enumerate(examples['ner_tags']):
            label_ids = label
            if len(tokenized_inputs['input_ids'][i]) - 2 < len(label_ids):
                label_ids = label_ids[:len(tokenized_inputs['input_ids'][i]) -
                                      2]
            label_ids = [no_entity_id] + label_ids + [no_entity_id]
            label_ids += [no_entity_id] * (
                len(tokenized_inputs['input_ids'][i]) - len(label_ids))
            labels.append(label_ids)
        tokenized_inputs["labels"] = labels
        return tokenized_inputs

    ignore_label = -100
    batchify_fn = lambda samples, fn=Dict({
        'input_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id,
                         dtype='int32'),  # input
        'token_type_ids': Pad(axis=0, pad_val=tokenizer.pad_token_type_id,
                              dtype='int32'),  # segment
        'seq_len': Stack(dtype='int64'),
        'labels': Pad(axis=0, pad_val=ignore_label, dtype='int64')  # label
    }): fn(samples)

    eval_ds = eval_ds.select(range(len(eval_ds) - 1))
    eval_ds = eval_ds.map(tokenize_and_align_labels, batched=True)
    eval_data_loader = DataLoader(dataset=eval_ds,
                                  collate_fn=batchify_fn,
                                  num_workers=0,
                                  batch_size=args.batch_size,
                                  return_list=True)

    # Define the model network and its loss.
    model = BertForTokenClassification.from_pretrained(args.model_name_or_path,
                                                       num_classes=label_num)
    if args.init_checkpoint_path:
        model_dict = paddle.load(args.init_checkpoint_path)
        model.set_dict(model_dict)
    loss_fct = paddle.nn.loss.CrossEntropyLoss(ignore_index=ignore_label)
    metric = ChunkEvaluator(label_list=label_list)

    model.eval()
    metric.reset()
    for step, batch in enumerate(eval_data_loader):
        input_ids, token_type_ids, length, labels = batch
        logits = model(input_ids, token_type_ids)
        loss = loss_fct(logits, labels)
        avg_loss = paddle.mean(loss)
        preds = logits.argmax(axis=2)
        num_infer_chunks, num_label_chunks, num_correct_chunks = metric.compute(
            length, preds, labels)
        metric.update(num_infer_chunks.numpy(), num_label_chunks.numpy(),
                      num_correct_chunks.numpy())
    precision, recall, f1_score = metric.accumulate()
    print("eval loss: %f, precision: %f, recall: %f, f1: %f" %
          (avg_loss, precision, recall, f1_score))
def run(args):
    paddle.set_device("gpu" if args.n_gpu else "cpu")
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    task_name = args.task_name.lower()
    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)

    set_seed(args)

    if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
        if os.path.exists(args.model_name_or_path):
            print("init checkpoint from %s" % args.model_name_or_path)

    model = model_class.from_pretrained(args.model_name_or_path)

    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    def prepare_train_features(examples):
        # Tokenize our examples with truncation and maybe padding, but keep
        # the overflows using a stride. This results in one example possibly
        # giving several features when a context is long, each of those
        # features having a context that overlaps a bit the context of the
        # previous feature.
        # NOTE: Almost the same functionality as HuggingFace's
        # prepare_train_features function. The main difference is that
        # HuggingFace uses ArrowTable as basic data structure, while we use a
        # list of dictionaries instead.
        contexts = [examples[i]['context'] for i in range(len(examples))]
        questions = [examples[i]['question'] for i in range(len(examples))]

        tokenized_examples = tokenizer(questions,
                                       contexts,
                                       stride=args.doc_stride,
                                       max_seq_len=args.max_seq_length)

        # Let's label those examples!
        for i, tokenized_example in enumerate(tokenized_examples):
            # We will label impossible answers with the index of the CLS token.
            input_ids = tokenized_example["input_ids"]
            cls_index = input_ids.index(tokenizer.cls_token_id)

            # The offset mappings will give us a map from token to character
            # position in the original context. This will help us compute the
            # start_positions and end_positions.
            offsets = tokenized_example['offset_mapping']

            # Grab the sequence corresponding to that example (to know what is
            # the context and what is the question).
            sequence_ids = tokenized_example['token_type_ids']

            # One example can give several spans, this is the index of the
            # example containing this span of text.
            sample_index = tokenized_example['overflow_to_sample']
            answers = examples[sample_index]['answers']
            answer_starts = examples[sample_index]['answer_starts']

            # Start/end character index of the answer in the text.
            start_char = answer_starts[0]
            end_char = start_char + len(answers[0])

            # Start token index of the current span in the text.
            token_start_index = 0
            while sequence_ids[token_start_index] != 1:
                token_start_index += 1

            # End token index of the current span in the text.
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != 1:
                token_end_index -= 1
            # Minus one more to reach actual text.
            token_end_index -= 1

            # Detect if the answer is out of the span (in which case this
            # feature is labeled with the CLS index).
            if not (offsets[token_start_index][0] <= start_char
                    and offsets[token_end_index][1] >= end_char):
                tokenized_examples[i]["start_positions"] = cls_index
                tokenized_examples[i]["end_positions"] = cls_index
            else:
                # Otherwise move the token_start_index and token_end_index to
                # the two ends of the answer.
                # Note: we could go after the last offset if the answer is the
                # last word (edge case).
                while token_start_index < len(offsets) and offsets[
                        token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples[i][
                    "start_positions"] = token_start_index - 1
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples[i]["end_positions"] = token_end_index + 1

        return tokenized_examples

    if args.do_train:
        if args.train_file:
            train_ds = load_dataset(task_name, data_files=args.train_file)
        else:
            train_ds = load_dataset(task_name, splits='train')
        train_ds.map(prepare_train_features, batched=True)
        train_batch_sampler = paddle.io.DistributedBatchSampler(
            train_ds, batch_size=args.batch_size, shuffle=True)
        train_batchify_fn = lambda samples, fn=Dict({
            "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id),
            "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id),
            "start_positions": Stack(dtype="int64"),
            "end_positions": Stack(dtype="int64")
        }): fn(samples)
        train_data_loader = DataLoader(dataset=train_ds,
                                       batch_sampler=train_batch_sampler,
                                       collate_fn=train_batchify_fn,
                                       return_list=True)

        num_training_steps = args.max_steps if args.max_steps > 0 else len(
            train_data_loader) * args.num_train_epochs
        num_train_epochs = math.ceil(num_training_steps /
                                     len(train_data_loader))

        lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                             num_training_steps,
                                             args.warmup_proportion)

        # Generate parameter names needed to perform weight decay.
        # All bias and LayerNorm parameters are excluded.
        decay_params = [
            p.name for n, p in model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        ]
        optimizer = paddle.optimizer.AdamW(
            learning_rate=lr_scheduler,
            epsilon=args.adam_epsilon,
            parameters=model.parameters(),
            weight_decay=args.weight_decay,
            apply_decay_param_fun=lambda x: x in decay_params)
        criterion = CrossEntropyLossForSQuAD()

        global_step = 0
        tic_train = time.time()
        for epoch in range(num_train_epochs):
            for step, batch in enumerate(train_data_loader):
                global_step += 1
                input_ids, token_type_ids, start_positions, end_positions = batch
                logits = model(input_ids=input_ids,
                               token_type_ids=token_type_ids)
                loss = criterion(logits, (start_positions, end_positions))

                if global_step % args.logging_steps == 0:
                    print(
                        "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                        % (global_step, epoch + 1, step + 1, loss,
                           args.logging_steps / (time.time() - tic_train)))
                    tic_train = time.time()
                loss.backward()
                optimizer.step()
                lr_scheduler.step()
                optimizer.clear_grad()

                if global_step % args.save_steps == 0 or global_step == num_training_steps:
                    if (not args.n_gpu > 1
                        ) or paddle.distributed.get_rank() == 0:
                        output_dir = os.path.join(args.output_dir,
                                                  "model_%d" % global_step)
                        if not os.path.exists(output_dir):
                            os.makedirs(output_dir)
                        # need better way to get inner model of DataParallel
                        model_to_save = model._layers if isinstance(
                            model, paddle.DataParallel) else model
                        model_to_save.save_pretrained(output_dir)
                        tokenizer.save_pretrained(output_dir)
                        print('Saving checkpoint to:', output_dir)
                    if global_step == num_training_steps:
                        break

    def prepare_validation_features(examples):
        # Tokenize our examples with truncation and maybe padding, but keep
        # the overflows using a stride, as in prepare_train_features.
        # NOTE: Almost the same functionality as HuggingFace's
        # prepare_validation_features function. The main difference is that
        # HuggingFace uses ArrowTable as basic data structure, while we use a
        # list of dictionaries instead.
        contexts = [examples[i]['context'] for i in range(len(examples))]
        questions = [examples[i]['question'] for i in range(len(examples))]

        tokenized_examples = tokenizer(questions,
                                       contexts,
                                       stride=args.doc_stride,
                                       max_seq_len=args.max_seq_length)

        # For validation, there is no need to compute start and end positions.
        for i, tokenized_example in enumerate(tokenized_examples):
            # Grab the sequence corresponding to that example (to know what is
            # the context and what is the question).
            sequence_ids = tokenized_example['token_type_ids']

            # One example can give several spans, this is the index of the
            # example containing this span of text.
            sample_index = tokenized_example['overflow_to_sample']
            tokenized_examples[i]["example_id"] = examples[sample_index]['id']

            # Set to None the offset_mapping that are not part of the context
            # so it's easy to determine if a token position is part of the
            # context or not.
            tokenized_examples[i]["offset_mapping"] = [
                (o if sequence_ids[k] == 1 else None)
                for k, o in enumerate(tokenized_example["offset_mapping"])
            ]

        return tokenized_examples

    if args.do_predict and paddle.distributed.get_rank() == 0:
        if args.predict_file:
            dev_ds = load_dataset(task_name, data_files=args.predict_file)
        else:
            dev_ds = load_dataset(task_name, splits='dev')
        dev_ds.map(prepare_validation_features, batched=True)
        dev_batch_sampler = paddle.io.BatchSampler(dev_ds,
                                                   batch_size=args.batch_size,
                                                   shuffle=False)
        dev_batchify_fn = lambda samples, fn=Dict({
            "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id),
            "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id)
        }): fn(samples)
        dev_data_loader = DataLoader(dataset=dev_ds,
                                     batch_sampler=dev_batch_sampler,
                                     collate_fn=dev_batchify_fn,
                                     return_list=True)
        evaluate(model, dev_data_loader, args)
def do_train(args):
    paddle.set_device("gpu" if args.n_gpu else "cpu")
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    train_ds, test_ds = load_dataset('msra_ner',
                                     splits=('train', 'test'),
                                     lazy=False)
    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)

    label_list = train_ds.label_list
    label_num = len(label_list)
    no_entity_id = label_num - 1
    trans_func = partial(tokenize_and_align_labels,
                         tokenizer=tokenizer,
                         no_entity_id=no_entity_id,
                         max_seq_len=args.max_seq_length)
    train_ds = train_ds.map(trans_func)

    ignore_label = -100
    batchify_fn = lambda samples, fn=Dict({
        'input_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        'token_type_ids': Pad(axis=0,
                              pad_val=tokenizer.pad_token_type_id),  # segment
        'seq_len': Stack(),  # seq_len
        'labels': Pad(axis=0, pad_val=ignore_label)  # label
    }): fn(samples)

    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_ds, batch_size=args.batch_size, shuffle=True, drop_last=True)
    train_data_loader = DataLoader(dataset=train_ds,
                                   collate_fn=batchify_fn,
                                   num_workers=0,
                                   batch_sampler=train_batch_sampler,
                                   return_list=True)
    test_ds = test_ds.map(trans_func)
    test_data_loader = DataLoader(dataset=test_ds,
                                  collate_fn=batchify_fn,
                                  num_workers=0,
                                  batch_size=args.batch_size,
                                  return_list=True)

    model = BertForTokenClassification.from_pretrained(args.model_name_or_path,
                                                       num_classes=label_num)
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    num_training_steps = args.max_steps if args.max_steps > 0 else len(
        train_data_loader) * args.num_train_epochs
    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps,
                                         args.warmup_steps)
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in [
            p.name for n, p in model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        ])

    loss_fct = paddle.nn.loss.CrossEntropyLoss(ignore_index=ignore_label)
    metric = ChunkEvaluator(label_list=label_list)

    global_step = 0
    last_step = args.num_train_epochs * len(train_data_loader)
    tic_train = time.time()
    for epoch in range(args.num_train_epochs):
        for step, batch in enumerate(train_data_loader):
            global_step += 1
            input_ids, token_type_ids, _, labels = batch
            logits = model(input_ids, token_type_ids)
            loss = loss_fct(logits.reshape([-1, label_num]),
                            labels.reshape([-1]))
            avg_loss = paddle.mean(loss)
            if global_step % args.logging_steps == 0:
                print(
                    "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                    % (global_step, epoch, step, avg_loss,
                       args.logging_steps / (time.time() - tic_train)))
                tic_train = time.time()
            avg_loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            if global_step % args.save_steps == 0 or global_step == last_step:
                if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
                    evaluate(model, loss_fct, metric, test_data_loader,
                             label_num)
                    paddle.save(
                        model.state_dict(),
                        os.path.join(args.output_dir,
                                     "model_%d.pdparams" % global_step))
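# Note the reshape before the loss here, where the otherwise similar do_train
# above feeds the 3-D logits directly. A minimal sketch (hypothetical shapes)
# of why both forms agree: paddle's CrossEntropyLoss treats the last axis as
# the class axis by default, so flattening is optional.

import paddle

logits = paddle.rand([2, 5, 7])         # [batch, seq_len, num_labels]
labels = paddle.randint(0, 7, [2, 5])   # one label id per token
loss_fct = paddle.nn.loss.CrossEntropyLoss(ignore_index=-100)
flat = loss_fct(logits.reshape([-1, 7]), labels.reshape([-1]))
full = loss_fct(logits, labels)         # class axis defaults to the last one
# flat and full give the same mean loss over non-ignored positions.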
def do_train():
    paddle.set_device(args.device)
    rank = paddle.distributed.get_rank()
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args.seed)

    train_ds, dev_ds = load_dataset('cblue', 'CMeEE', splits=['train', 'dev'])

    model = ElectraForBinaryTokenClassification.from_pretrained(
        'ernie-health-chinese',
        num_classes=[len(x) for x in train_ds.label_list])
    tokenizer = ElectraTokenizer.from_pretrained('ernie-health-chinese')

    label_list = train_ds.label_list
    pad_label_id = [len(label_list[0]) - 1, len(label_list[1]) - 1]
    ignore_label_id = -100

    trans_func = partial(convert_example_ner,
                         tokenizer=tokenizer,
                         max_seq_length=args.max_seq_length,
                         pad_label_id=pad_label_id)

    batchify_fn = lambda samples, fn=Dict({
        'input_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int64'),
        'token_type_ids': Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int64'),
        'position_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int64'),
        'attention_mask': Pad(axis=0, pad_val=0, dtype='float32'),
        'label_oth': Pad(axis=0, pad_val=pad_label_id[0], dtype='int64'),
        'label_sym': Pad(axis=0, pad_val=pad_label_id[1], dtype='int64')
    }): fn(samples)

    train_data_loader = create_dataloader(train_ds,
                                          mode='train',
                                          batch_size=args.batch_size,
                                          batchify_fn=batchify_fn,
                                          trans_fn=trans_func)
    dev_data_loader = create_dataloader(dev_ds,
                                        mode='dev',
                                        batch_size=args.batch_size,
                                        batchify_fn=batchify_fn,
                                        trans_fn=trans_func)

    if args.init_from_ckpt:
        if not os.path.isfile(args.init_from_ckpt):
            raise ValueError('init_from_ckpt is not a valid model filename.')
        state_dict = paddle.load(args.init_from_ckpt)
        model.set_dict(state_dict)
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    num_training_steps = len(train_data_loader) * args.epochs
    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
                                         args.warmup_proportion)
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ['bias', 'norm'])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    criterion = paddle.nn.functional.softmax_with_cross_entropy
    metric = NERChunkEvaluator(label_list)

    if args.use_amp:
        scaler = paddle.amp.GradScaler(init_loss_scaling=args.scale_loss)

    global_step = 0
    tic_train = time.time()
    total_train_time = 0
    for epoch in range(1, args.epochs + 1):
        for step, batch in enumerate(train_data_loader, start=1):
            input_ids, token_type_ids, position_ids, masks, label_oth, label_sym = batch
            with paddle.amp.auto_cast(
                    args.use_amp,
                    custom_white_list=['layer_norm', 'softmax', 'gelu'],
            ):
                logits = model(input_ids, token_type_ids, position_ids)
                loss_mask = paddle.unsqueeze(masks, 2)
                losses = [(criterion(x, y.unsqueeze(2)) * loss_mask).mean()
                          for x, y in zip(logits, [label_oth, label_sym])]
                loss = losses[0] + losses[1]

            lengths = paddle.sum(masks, axis=1)
            preds = [paddle.argmax(x, axis=-1) for x in logits]
            correct = metric.compute(lengths, preds, [label_oth, label_sym])
            metric.update(correct)
            _, _, f1 = metric.accumulate()

            if args.use_amp:
                scaler.scale(loss).backward()
                scaler.minimize(optimizer, loss)
            else:
                loss.backward()
                optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()

            global_step += 1
            if global_step % args.logging_steps == 0 and rank == 0:
                time_diff = time.time() - tic_train
                total_train_time += time_diff
                print(
                    'global step %d, epoch: %d, batch: %d, loss: %.5f, '
                    'loss symptom: %.5f, loss others: %.5f, f1: %.5f, '
                    'speed: %.2f step/s, learning_rate: %f'
                    % (global_step, epoch, step, loss, losses[1], losses[0],
                       f1, args.logging_steps / time_diff,
                       lr_scheduler.get_lr()))
                tic_train = time.time()

            if global_step % args.valid_steps == 0 and rank == 0:
                evaluate(model, criterion, metric, dev_data_loader)
                tic_train = time.time()

            if global_step % args.save_steps == 0 and rank == 0:
                save_dir = os.path.join(args.save_dir, 'model_%d' % global_step)
                if not os.path.exists(save_dir):
                    os.makedirs(save_dir)
                if paddle.distributed.get_world_size() > 1:
                    model._layers.save_pretrained(save_dir)
                else:
                    model.save_pretrained(save_dir)
                tokenizer.save_pretrained(save_dir)
                tic_train = time.time()

    print('Speed: %.2f steps/s' % (global_step / total_train_time))
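# A minimal sketch of the AMP pattern used in the loop above (assumes a Paddle
# install; the model and shapes here are made up): auto_cast runs whitelisted
# ops in fp16, and GradScaler scales the loss so small fp16 gradients do not
# underflow before the optimizer step.
import paddle

demo_model = paddle.nn.Linear(4, 2)
demo_opt = paddle.optimizer.AdamW(learning_rate=1e-3,
                                  parameters=demo_model.parameters())
demo_scaler = paddle.amp.GradScaler(init_loss_scaling=2.0**15)

x = paddle.randn([8, 4])
with paddle.amp.auto_cast(custom_white_list=['layer_norm', 'softmax', 'gelu']):
    demo_loss = demo_model(x).mean()
demo_scaler.scale(demo_loss).backward()    # backward on the scaled loss
demo_scaler.minimize(demo_opt, demo_loss)  # unscale grads, step, update scale
demo_opt.clear_grad()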
def run(args):
    if args.do_train:
        assert args.batch_size % args.gradient_accumulation_steps == 0, \
            "Please make sure argument `batch_size` is divisible by `gradient_accumulation_steps`."
    paddle.set_device(args.device)
    set_seed(args)

    max_seq_length = args.max_seq_length
    max_num_choices = 10

    def preprocess_function(examples, do_predict=False):
        SPIECE_UNDERLINE = '▁'

        def _is_chinese_char(cp):
            if ((cp >= 0x4E00 and cp <= 0x9FFF) or
                    (cp >= 0x3400 and cp <= 0x4DBF) or
                    (cp >= 0x20000 and cp <= 0x2A6DF) or
                    (cp >= 0x2A700 and cp <= 0x2B73F) or
                    (cp >= 0x2B740 and cp <= 0x2B81F) or
                    (cp >= 0x2B820 and cp <= 0x2CEAF) or
                    (cp >= 0xF900 and cp <= 0xFAFF) or
                    (cp >= 0x2F800 and cp <= 0x2FA1F)):
                return True
            return False

        def is_fuhao(c):
            # Chinese and full-width punctuation ("fuhao" means punctuation).
            return c in (
                '。', ',', '!', '?', ';', '、', ':', '(', ')',
                '-', '~', '「', '《', '》', ',', '」', '"', '“', '”',
                '$', '『', '』', '—', ';', '。', '(', ')', '-', '~', '。',
                '‘', '’')

        def _tokenize_chinese_chars(text):
            """Adds whitespace around any CJK character."""
            output = []
            is_blank = False
            for index, char in enumerate(text):
                cp = ord(char)
                if is_blank:
                    output.append(char)
                    if text[index - 12:index + 1].startswith("#idiom"):
                        is_blank = False
                        output.append(SPIECE_UNDERLINE)
                else:
                    if text[index:index + 6] == "#idiom":
                        is_blank = True
                        if len(output) > 0 and output[-1] != SPIECE_UNDERLINE:
                            output.append(SPIECE_UNDERLINE)
                        output.append(char)
                    elif _is_chinese_char(cp) or is_fuhao(char):
                        if len(output) > 0 and output[-1] != SPIECE_UNDERLINE:
                            output.append(SPIECE_UNDERLINE)
                        output.append(char)
                        output.append(SPIECE_UNDERLINE)
                    else:
                        output.append(char)
            return "".join(output)

        def is_whitespace(c):
            if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(
                    c) == 0x202F or c == SPIECE_UNDERLINE:
                return True
            return False

        def add_tokens_for_around(tokens, pos, num_tokens):
            num_l = num_tokens // 2
            num_r = num_tokens - num_l

            if pos >= num_l and (len(tokens) - 1 - pos) >= num_r:
                tokens_l = tokens[pos - num_l:pos]
                tokens_r = tokens[pos + 1:pos + 1 + num_r]
            elif pos <= num_l:
                tokens_l = tokens[:pos]
                right_len = num_tokens - len(tokens_l)
                tokens_r = tokens[pos + 1:pos + 1 + right_len]
            elif (len(tokens) - 1 - pos) <= num_r:
                tokens_r = tokens[pos + 1:]
                left_len = num_tokens - len(tokens_r)
                tokens_l = tokens[pos - left_len:pos]
            else:
                raise ValueError('impossible')
            return tokens_l, tokens_r

        max_tokens_for_doc = max_seq_length - 3
        num_tokens = max_tokens_for_doc - 5
        num_examples = len(examples.data["candidates"])
        if do_predict:
            result = {"input_ids": [], "token_type_ids": [], "example_ids": []}
        else:
            result = {
                "input_ids": [],
                "token_type_ids": [],
                "labels": [],
                "example_ids": []
            }
        for idx in range(num_examples):
            candidate = 0
            options = examples.data['candidates'][idx]
            # Each content may have several sentences.
            for context in examples.data['content'][idx]:
                context = context.replace("“", "\"").replace("”", "\"") \
                    .replace("——", "--").replace("—", "-").replace("―", "-") \
                    .replace("…", "...").replace("‘", "\'").replace("’", "\'")
                context = _tokenize_chinese_chars(context)
                paragraph_text = context.strip()
                doc_tokens = []
                prev_is_whitespace = True
                for c in paragraph_text:
                    if is_whitespace(c):
                        prev_is_whitespace = True
                    else:
                        if prev_is_whitespace:
                            doc_tokens.append(c)
                        else:
                            doc_tokens[-1] += c
                        prev_is_whitespace = False
                all_doc_tokens = []
                for (i, token) in enumerate(doc_tokens):
                    if '#idiom' in token:
                        sub_tokens = [str(token)]
                    else:
                        sub_tokens = tokenizer.tokenize(token)
                    for sub_token in sub_tokens:
                        all_doc_tokens.append(sub_token)
                tags = [blank for blank in doc_tokens if '#idiom' in blank]
                # Each sentence may have several tags.
                for tag_index, tag in enumerate(tags):
                    pos = all_doc_tokens.index(tag)
                    tmp_l, tmp_r = add_tokens_for_around(
                        all_doc_tokens, pos, num_tokens)
                    num_l = len(tmp_l)
                    num_r = len(tmp_r)
                    tokens_l = []
                    for token in tmp_l:
                        if '#idiom' in token and token != tag:
                            # Mask tags that are not considered in this new
                            # sample. Each idiom has four characters, so four
                            # mask tokens are used.
                            tokens_l.extend(['[MASK]'] * 4)
                        else:
                            tokens_l.append(token)
                    tokens_l = tokens_l[-num_l:]
                    del tmp_l
                    tokens_r = []
                    for token in tmp_r:
                        if '#idiom' in token and token != tag:
                            tokens_r.extend(['[MASK]'] * 4)
                        else:
                            tokens_r.append(token)
                    tokens_r = tokens_r[:num_r]
                    del tmp_r
                    tokens_list = []
                    # Each tag has ten choices, and the shape of each new
                    # example is [num_choices, seq_len].
                    for i, elem in enumerate(options):
                        option = tokenizer.tokenize(elem)
                        tokens = option + ['[SEP]'] + tokens_l + [
                            '[unused1]'
                        ] + tokens_r
                        tokens_list.append(tokens)
                    new_data = tokenizer(tokens_list, is_split_into_words=True)
                    # Final shape of input_ids: [batch_size, num_choices, seq_len]
                    result["input_ids"].append(new_data["input_ids"])
                    result["token_type_ids"].append(new_data["token_type_ids"])
                    result["example_ids"].append(idx)
                    if not do_predict:
                        label = examples.data["answers"][idx]["candidate_id"][
                            candidate]
                        result["labels"].append(label)
                    candidate += 1
            if (idx + 1) % 10000 == 0:
                logger.info("%d samples have been processed." % (idx + 1))
        return result

    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    model = AutoModelForMultipleChoice.from_pretrained(
        args.model_name_or_path, num_choices=max_num_choices)
    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    train_ds, dev_ds, test_ds = load_dataset(
        "clue", "chid", split=["train", "validation", "test"])

    if args.do_train:
        args.batch_size = int(args.batch_size /
                              args.gradient_accumulation_steps)
        column_names = train_ds.column_names
        with main_process_first(desc="train dataset map pre-processing"):
            train_ds = train_ds.map(
                partial(preprocess_function),
                batched=True,
                batch_size=len(train_ds),
                num_proc=args.num_proc,
                remove_columns=column_names,
                load_from_cache_file=not args.overwrite_cache,
                desc="Running tokenizer on train dataset")
        batchify_fn = lambda samples, fn=Dict({
            'input_ids': Pad(axis=1, pad_val=tokenizer.pad_token_id),  # input
            'token_type_ids': Pad(axis=1, pad_val=tokenizer.pad_token_type_id),  # segment
            'labels': Stack(dtype="int64"),  # label
            'example_ids': Stack(dtype="int64"),  # example id
        }): fn(samples)
        train_batch_sampler = paddle.io.DistributedBatchSampler(
            train_ds, batch_size=args.batch_size, shuffle=True)
        train_data_loader = paddle.io.DataLoader(
            dataset=train_ds,
            batch_sampler=train_batch_sampler,
            collate_fn=batchify_fn,
            num_workers=0,
            return_list=True)
        with main_process_first(desc="evaluate dataset map pre-processing"):
            dev_ds = dev_ds.map(partial(preprocess_function),
                                batched=True,
                                batch_size=len(dev_ds),
                                remove_columns=column_names,
                                num_proc=args.num_proc,
                                load_from_cache_file=not args.overwrite_cache,
                                desc="Running tokenizer on validation dataset")
        dev_batch_sampler = paddle.io.BatchSampler(
            dev_ds, batch_size=args.eval_batch_size, shuffle=False)
        dev_data_loader = paddle.io.DataLoader(dataset=dev_ds,
                                               batch_sampler=dev_batch_sampler,
                                               collate_fn=batchify_fn,
                                               return_list=True)

        num_training_steps = int(
            args.max_steps /
            args.gradient_accumulation_steps) if args.max_steps >= 0 else int(
                len(train_data_loader) * args.num_train_epochs /
                args.gradient_accumulation_steps)
        warmup = args.warmup_steps if args.warmup_steps > 0 else args.warmup_proportion
        lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                             num_training_steps, warmup)

        # Generate parameter names needed to perform weight decay.
        # All bias and LayerNorm parameters are excluded.
        decay_params = [
            p.name for n, p in model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        ]
        grad_clip = paddle.nn.ClipGradByGlobalNorm(args.max_grad_norm)
        optimizer = paddle.optimizer.AdamW(
            learning_rate=lr_scheduler,
            parameters=model.parameters(),
            weight_decay=args.weight_decay,
            apply_decay_param_fun=lambda x: x in decay_params,
            grad_clip=grad_clip)
        loss_fct = nn.CrossEntropyLoss()

        model.train()
        global_step = 0
        best_acc = 0.0
        tic_train = time.time()
        for epoch in range(args.num_train_epochs):
            for step, batch in enumerate(train_data_loader):
                input_ids, segment_ids, labels, example_ids = batch
                logits = model(input_ids=input_ids, token_type_ids=segment_ids)
                loss = loss_fct(logits, labels)
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    global_step += 1
                    optimizer.step()
                    lr_scheduler.step()
                    optimizer.clear_grad()
                    if global_step % args.logging_steps == 0:
                        logger.info(
                            "global step %d/%d, epoch: %d, batch: %d, loss: %.5f, speed: %.2f step/s"
                            % (global_step, num_training_steps, epoch,
                               step + 1, loss,
                               args.logging_steps / (time.time() - tic_train)))
                        tic_train = time.time()
                    if global_step >= num_training_steps:
                        logger.info("best_result: %.2f" % (best_acc * 100))
                        return
            tic_eval = time.time()
            acc = evaluate(model, dev_data_loader)
            logger.info("eval acc: %.5f, eval done total : %s s" %
                        (acc, time.time() - tic_eval))
            if paddle.distributed.get_rank() == 0 and acc > best_acc:
                best_acc = acc
                if args.save_best_model:
                    model_to_save = model._layers if isinstance(
                        model, paddle.DataParallel) else model
                    if not os.path.exists(args.output_dir):
                        os.makedirs(args.output_dir)
                    model_to_save.save_pretrained(args.output_dir)
                    tokenizer.save_pretrained(args.output_dir)
        logger.info("best_result: %.2f" % (best_acc * 100))

    if args.do_predict:
        column_names = test_ds.column_names
        test_ds = test_ds.map(partial(preprocess_function, do_predict=True),
                              batched=True,
                              batch_size=len(test_ds),
                              remove_columns=column_names,
                              num_proc=args.num_proc)
        test_batch_sampler = paddle.io.BatchSampler(
            test_ds, batch_size=args.eval_batch_size, shuffle=False)
        batchify_fn = lambda samples, fn=Dict({
            'input_ids': Pad(axis=1, pad_val=tokenizer.pad_token_id),  # input
            'token_type_ids': Pad(axis=1, pad_val=tokenizer.pad_token_type_id),  # segment
            'example_ids': Stack(dtype="int64"),  # example id
        }): fn(samples)
        test_data_loader = paddle.io.DataLoader(
            dataset=test_ds,
            batch_sampler=test_batch_sampler,
            collate_fn=batchify_fn,
            return_list=True)

        result = {}
        idx = 623377  # id of the first test example
        preds = evaluate(model, test_data_loader, do_predict=True)
        for pred in preds:
            result["#idiom" + str(idx) + "#"] = pred
            idx += 1

        if not os.path.exists(args.output_dir):
            os.makedirs(args.output_dir)
        with open(os.path.join(args.output_dir, 'chid11_predict.json'),
                  "w") as writer:
            json.dump(result, writer, indent=2)
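# Why the multiple-choice collators above pad on axis=1 (a sketch; the values
# are made up): each sample is already [num_choices, seq_len] with a uniform
# seq_len within the sample but ragged lengths across samples, so padding on
# axis=1 aligns them into a dense [batch_size, num_choices, max_seq_len] array.
from paddlenlp.data import Pad

sample_a = [[1, 2, 3], [4, 5, 6]]  # 2 choices, seq_len 3
sample_b = [[7, 8], [9, 10]]       # 2 choices, seq_len 2
batch = Pad(axis=1, pad_val=0)([sample_a, sample_b])
print(batch.shape)                 # (2, 2, 3); sample_b is padded with zeros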
def run(args):
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)

    set_seed(args)
    if paddle.distributed.get_rank() == 0:
        if os.path.exists(args.model_name_or_path):
            print("init checkpoint from %s" % args.model_name_or_path)

    model = model_class.from_pretrained(args.model_name_or_path)

    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    def prepare_train_features(examples):
        # Tokenize our examples with truncation and maybe padding, but keep
        # the overflows using a stride. This results in one example possibly
        # giving several features when a context is long, each of those
        # features having a context that overlaps a bit with the context of
        # the previous feature.
        contexts = [examples[i]['context'] for i in range(len(examples))]
        questions = [examples[i]['question'] for i in range(len(examples))]

        tokenized_examples = tokenizer(questions,
                                       contexts,
                                       stride=args.doc_stride,
                                       max_seq_len=args.max_seq_length)

        for i, tokenized_example in enumerate(tokenized_examples):
            # We will label impossible answers with the index of the CLS token.
            input_ids = tokenized_example["input_ids"]
            cls_index = input_ids.index(tokenizer.cls_token_id)

            # The offset mappings give us a map from token to character
            # position in the original context. This will help us compute
            # the start_positions and end_positions.
            offsets = tokenized_example['offset_mapping']

            # Grab the sequence corresponding to that example (to know what
            # is the context and what is the question).
            sequence_ids = tokenized_example['token_type_ids']

            # One example can give several spans; this is the index of the
            # example containing this span of text.
            sample_index = tokenized_example['overflow_to_sample']
            answers = examples[sample_index]['answers']
            answer_starts = examples[sample_index]['answer_starts']

            # If no answers are given, set the cls_index as answer.
            if len(answer_starts) == 0:
                tokenized_examples[i]["start_positions"] = cls_index
                tokenized_examples[i]["end_positions"] = cls_index
                tokenized_examples[i]['answerable_label'] = 0
            else:
                # Start/end character index of the answer in the text.
                start_char = answer_starts[0]
                end_char = start_char + len(answers[0])

                # Start token index of the current span in the text.
                token_start_index = 0
                while sequence_ids[token_start_index] != 1:
                    token_start_index += 1

                # End token index of the current span in the text.
                token_end_index = len(input_ids) - 2
                while sequence_ids[token_end_index] != 1:
                    token_end_index -= 1

                # Detect if the answer is out of the span (in which case this
                # feature is labeled with the CLS index).
                if not (offsets[token_start_index][0] <= start_char
                        and offsets[token_end_index][1] >= end_char):
                    tokenized_examples[i]["start_positions"] = cls_index
                    tokenized_examples[i]["end_positions"] = cls_index
                    tokenized_examples[i]['answerable_label'] = 0
                else:
                    # Otherwise move token_start_index and token_end_index to
                    # the two ends of the answer. Note: we could go after the
                    # last offset if the answer is the last word (edge case).
                    while token_start_index < len(offsets) and offsets[
                            token_start_index][0] <= start_char:
                        token_start_index += 1
                    tokenized_examples[i][
                        "start_positions"] = token_start_index - 1
                    while offsets[token_end_index][1] >= end_char:
                        token_end_index -= 1
                    tokenized_examples[i]["end_positions"] = token_end_index + 1
                    tokenized_examples[i]['answerable_label'] = 1

        return tokenized_examples

    if args.do_train:
        assert args.train_file is not None, "--train_file should be set when training!"
        train_ds = DuReaderChecklist().read(args.train_file)
        train_ds.map(prepare_train_features, batched=True)
        train_batch_sampler = paddle.io.DistributedBatchSampler(
            train_ds, batch_size=args.batch_size, shuffle=True)

        train_batchify_fn = lambda samples, fn=Dict({
            "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id),
            "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id),
            "start_positions": Stack(dtype="int64"),
            "end_positions": Stack(dtype="int64"),
            "answerable_label": Stack(dtype="int64")
        }): fn(samples)

        train_data_loader = DataLoader(dataset=train_ds,
                                       batch_sampler=train_batch_sampler,
                                       collate_fn=train_batchify_fn,
                                       return_list=True)

        num_training_steps = args.max_steps if args.max_steps > 0 else len(
            train_data_loader) * args.num_train_epochs
        if paddle.distributed.get_rank() == 0:
            dev_count = paddle.fluid.core.get_cuda_device_count()
            print("Device count: %d" % dev_count)
            print("Num train examples: %d" % len(train_ds.data))
            print("Max train steps: %d" % num_training_steps)

        lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                             num_training_steps,
                                             args.warmup_proportion)
        optimizer = paddle.optimizer.AdamW(
            learning_rate=lr_scheduler,
            epsilon=args.adam_epsilon,
            parameters=model.parameters(),
            weight_decay=args.weight_decay,
            apply_decay_param_fun=lambda x: x in [
                p.name for n, p in model.named_parameters()
                if not any(nd in n for nd in ["bias", "norm"])
            ])
        criterion = CrossEntropyLossForChecklist()

        global_step = 0
        tic_train = time.time()
        for epoch in range(args.num_train_epochs):
            for step, batch in enumerate(train_data_loader):
                global_step += 1
                input_ids, segment_ids, start_positions, end_positions, answerable_label = batch
                logits = model(input_ids=input_ids, token_type_ids=segment_ids)
                loss = criterion(
                    logits, (start_positions, end_positions, answerable_label))

                if global_step % args.logging_steps == 0:
                    print(
                        "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                        % (global_step, epoch, step, loss,
                           args.logging_steps / (time.time() - tic_train)))
                    tic_train = time.time()

                loss.backward()
                optimizer.step()
                lr_scheduler.step()
                optimizer.clear_grad()

                if global_step % args.save_steps == 0 or global_step == num_training_steps:
                    if paddle.distributed.get_rank() == 0:
                        output_dir = os.path.join(args.output_dir,
                                                  "model_%d" % global_step)
                        if not os.path.exists(output_dir):
                            os.makedirs(output_dir)
                        # Need a better way to get the inner model of DataParallel.
                        model_to_save = model._layers if isinstance(
                            model, paddle.DataParallel) else model
                        model_to_save.save_pretrained(output_dir)
                        tokenizer.save_pretrained(output_dir)
                        print('Saving checkpoint to:', output_dir)

    def prepare_validation_features(examples):
        # Tokenize our examples with truncation and maybe padding, but keep
        # the overflows using a stride, as in prepare_train_features.
        contexts = [examples[i]['context'] for i in range(len(examples))]
        questions = [examples[i]['question'] for i in range(len(examples))]

        tokenized_examples = tokenizer(questions,
                                       contexts,
                                       stride=args.doc_stride,
                                       max_seq_len=args.max_seq_length)

        # For validation, there is no need to compute start and end positions.
        for i, tokenized_example in enumerate(tokenized_examples):
            # Grab the sequence corresponding to that example (to know what
            # is the context and what is the question).
            sequence_ids = tokenized_example['token_type_ids']

            # One example can give several spans; this is the index of the
            # example containing this span of text.
            sample_index = tokenized_example['overflow_to_sample']
            tokenized_examples[i]["example_id"] = examples[sample_index]['id']

            # Set to None the offset_mapping entries that are not part of the
            # context, so it is easy to determine whether a token position is
            # part of the context or not.
            tokenized_examples[i]["offset_mapping"] = [
                (o if sequence_ids[k] == 1 else None)
                for k, o in enumerate(tokenized_example["offset_mapping"])
            ]

        return tokenized_examples

    if args.do_pred:
        input_files = []
        assert args.predict_file is not None, "--predict_file should be set when predicting!"
        for input_pattern in args.predict_file:
            input_files.extend(glob.glob(input_pattern))
        assert len(input_files) > 0, 'Can not find predict_file {}'.format(
            args.predict_file)
        for input_file in input_files:
            print('Run prediction on {}'.format(input_file))
            prefix = os.path.basename(input_file)
            prefix = re.sub('.json', '', prefix)
            dev_ds = DuReaderChecklist().read(input_file)
            dev_ds.map(prepare_validation_features, batched=True)

            dev_batch_sampler = paddle.io.BatchSampler(
                dev_ds, batch_size=args.batch_size, shuffle=False)
            dev_batchify_fn = lambda samples, fn=Dict({
                "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id),
                "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id)
            }): fn(samples)
            dev_data_loader = DataLoader(dataset=dev_ds,
                                         batch_sampler=dev_batch_sampler,
                                         collate_fn=dev_batchify_fn,
                                         return_list=True)

            if paddle.distributed.get_rank() == 0:
                evaluate(model, dev_data_loader, args, prefix=prefix)
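# A minimal worked example of the span-labeling logic in prepare_train_features
# above: given character offsets for the context tokens, find the token
# start/end indices that cover the gold answer span. The offsets and answer
# span here are made up for illustration.
offsets = [(0, 2), (2, 5), (5, 9), (9, 12)]  # (char_start, char_end) per token
start_char, end_char = 5, 12                 # answer covers tokens 2..3

token_start = 0
while token_start < len(offsets) and offsets[token_start][0] <= start_char:
    token_start += 1
token_start -= 1  # last token whose span starts at or before the answer

token_end = len(offsets) - 1
while offsets[token_end][1] >= end_char:
    token_end -= 1
token_end += 1    # first token whose span ends at or after the answer

print(token_start, token_end)  # 2 3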
def run(args):
    max_seq_length = args.max_seq_length
    max_num_choices = 4

    def preprocess_function(examples, do_predict=False):

        def _truncate_seq_tuple(tokens_a, tokens_b, tokens_c, max_length):
            """Truncates a sequence tuple in place to the maximum length."""
            # This is a simple heuristic which will always truncate the longer
            # sequence one token at a time. This makes more sense than
            # truncating an equal percent of tokens from each, since if one
            # sequence is very short then each token that's truncated likely
            # contains more information than a longer sequence.
            while True:
                total_length = len(tokens_a) + len(tokens_b) + len(tokens_c)
                if total_length <= max_length:
                    break
                if len(tokens_a) >= len(tokens_b) and len(tokens_a) >= len(
                        tokens_c):
                    tokens_a.pop()
                elif len(tokens_b) >= len(tokens_a) and len(tokens_b) >= len(
                        tokens_c):
                    tokens_b.pop()
                else:
                    tokens_c.pop()

        num_examples = len(examples.data["question"])
        if do_predict:
            result = {"input_ids": [], "token_type_ids": []}
        else:
            result = {"input_ids": [], "token_type_ids": [], "labels": []}
        for idx in range(num_examples):
            text = '\n'.join(examples.data["context"][idx]).lower()
            question = examples.data["question"][idx].lower()
            choice_list = examples.data["choice"][idx]
            choice_list = [choice.lower() for choice in choice_list]
            if not do_predict:
                answer = examples.data["answer"][idx].lower()
                label = choice_list.index(answer)

            tokens_t = tokenizer.tokenize(text)
            tokens_q = tokenizer.tokenize(question)

            tokens_t_list = []
            tokens_c_list = []

            # Pad each new example on axis=1 so it can be batched to
            # [batch_size, num_choices, seq_len].
            while len(choice_list) < max_num_choices:
                choice_list.append('无效答案')  # placeholder: "invalid answer"

            for choice in choice_list:
                tokens_c = tokenizer.tokenize(choice.lower())
                _truncate_seq_tuple(tokens_t, tokens_q, tokens_c,
                                    max_seq_length - 4)

                tokens_c = tokens_q + ["[SEP]"] + tokens_c
                tokens_t_list.append(tokens_t)
                tokens_c_list.append(tokens_c)

            new_data = tokenizer(tokens_t_list,
                                 text_pair=tokens_c_list,
                                 is_split_into_words=True)

            # Pad each new example on axis=2 of
            # [batch_size, num_choices, seq_len], because the length of each
            # choice could be different.
            input_ids = Pad(axis=0, pad_val=tokenizer.pad_token_id)(
                new_data["input_ids"])
            token_type_ids = Pad(axis=0, pad_val=tokenizer.pad_token_type_id)(
                new_data["token_type_ids"])

            # Final shape of input_ids: [batch_size, num_choices, seq_len]
            result["input_ids"].append(input_ids)
            result["token_type_ids"].append(token_type_ids)
            if not do_predict:
                result["labels"].append([label])
            if (idx + 1) % 1000 == 0:
                print(idx + 1, "samples have been processed.")
        return result

    paddle.set_device(args.device)
    set_seed(args)

    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
    model = AutoModelForMultipleChoice.from_pretrained(
        args.model_name_or_path, num_choices=max_num_choices)
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    train_ds, dev_ds, test_ds = load_dataset(
        "clue", "c3", split=["train", "validation", "test"])

    if args.do_train:
        args.batch_size = int(args.batch_size /
                              args.gradient_accumulation_steps)
        column_names = train_ds.column_names
        train_ds = train_ds.map(preprocess_function,
                                batched=True,
                                batch_size=len(train_ds),
                                num_proc=1,
                                remove_columns=column_names)
        batchify_fn = lambda samples, fn=Dict({
            'input_ids': Pad(axis=1, pad_val=tokenizer.pad_token_id),  # input
            'token_type_ids': Pad(axis=1, pad_val=tokenizer.pad_token_type_id),  # segment
            'labels': Stack(dtype="int64")  # label
        }): fn(samples)
        train_batch_sampler = paddle.io.DistributedBatchSampler(
            train_ds, batch_size=args.batch_size, shuffle=True)
        train_data_loader = paddle.io.DataLoader(
            dataset=train_ds,
            batch_sampler=train_batch_sampler,
            collate_fn=batchify_fn,
            num_workers=0,
            return_list=True)
        dev_ds = dev_ds.map(preprocess_function,
                            batched=True,
                            batch_size=len(dev_ds),
                            remove_columns=column_names,
                            num_proc=1)
        dev_batch_sampler = paddle.io.BatchSampler(
            dev_ds, batch_size=args.eval_batch_size, shuffle=False)
        dev_data_loader = paddle.io.DataLoader(dataset=dev_ds,
                                               batch_sampler=dev_batch_sampler,
                                               collate_fn=batchify_fn,
                                               return_list=True)

        num_training_steps = int(
            len(train_data_loader) * args.num_train_epochs /
            args.gradient_accumulation_steps)
        lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                             num_training_steps, 0)
        # Generate parameter names needed to perform weight decay.
        # All bias and LayerNorm parameters are excluded.
        decay_params = [
            p.name for n, p in model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        ]
        grad_clip = paddle.nn.ClipGradByGlobalNorm(args.max_grad_norm)
        optimizer = paddle.optimizer.AdamW(
            learning_rate=lr_scheduler,
            parameters=model.parameters(),
            weight_decay=args.weight_decay,
            apply_decay_param_fun=lambda x: x in decay_params,
            grad_clip=grad_clip)
        loss_fct = paddle.nn.loss.CrossEntropyLoss()
        metric = paddle.metric.Accuracy()

        model.train()
        global_step = 0
        best_acc = 0.0
        tic_train = time.time()
        for epoch in range(args.num_train_epochs):
            for step, batch in enumerate(train_data_loader):
                input_ids, segment_ids, label = batch
                logits = model(input_ids=input_ids, token_type_ids=segment_ids)
                loss = loss_fct(logits, label)
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    global_step += 1
                    optimizer.step()
                    lr_scheduler.step()
                    optimizer.clear_grad()
                    if global_step % args.logging_steps == 0:
                        print(
                            "global step %d/%d, epoch: %d, batch: %d, rank_id: %s, loss: %f, lr: %.10f, speed: %.4f step/s"
                            % (global_step, num_training_steps, epoch,
                               step + 1, paddle.distributed.get_rank(), loss,
                               optimizer.get_lr(),
                               args.logging_steps / (time.time() - tic_train)))
                        tic_train = time.time()
            tic_eval = time.time()
            acc = evaluate(model, loss_fct, dev_data_loader, metric)
            print("eval acc: %.5f, eval done total : %s s" %
                  (acc, time.time() - tic_eval))
            if paddle.distributed.get_rank() == 0 and acc > best_acc:
                best_acc = acc
                model_to_save = model._layers if isinstance(
                    model, paddle.DataParallel) else model
                if not os.path.exists(args.output_dir):
                    os.makedirs(args.output_dir)
                model_to_save.save_pretrained(args.output_dir)
                tokenizer.save_pretrained(args.output_dir)
        print("best_acc: ", best_acc)

    if args.do_predict:
        column_names = test_ds.column_names
        test_ds = test_ds.map(partial(preprocess_function, do_predict=True),
                              batched=True,
                              batch_size=len(test_ds),
                              remove_columns=column_names,
                              num_proc=1)
        # Several samples have more than four choices, so predict with
        # batch_size 1.
        test_batch_sampler = paddle.io.BatchSampler(test_ds,
                                                    batch_size=1,
                                                    shuffle=False)
        batchify_fn = lambda samples, fn=Dict({
            'input_ids': Pad(axis=1, pad_val=tokenizer.pad_token_id),  # input
            'token_type_ids': Pad(axis=1, pad_val=tokenizer.pad_token_type_id),  # segment
        }): fn(samples)
        test_data_loader = paddle.io.DataLoader(
            dataset=test_ds,
            batch_sampler=test_batch_sampler,
            collate_fn=batchify_fn,
            return_list=True)

        if not os.path.exists(args.output_dir):
            os.makedirs(args.output_dir)
        result = {}
        idx = 0
        with open(os.path.join(args.output_dir, "c311_predict.json"),
                  'w') as f:
            for step, batch in enumerate(test_data_loader):
                input_ids, segment_ids = batch
                with paddle.no_grad():
                    logits = model(input_ids, segment_ids)
                preds = paddle.argmax(logits, axis=1).numpy().tolist()
                for pred in preds:
                    result[str(idx)] = pred
                    j = json.dumps({"id": idx, "label": pred})
                    f.write(j + "\n")
                    idx += 1
def run(args):
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()
    rank = paddle.distributed.get_rank()

    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)

    if args.version_2_with_negative:
        train_examples = load_dataset('squad_v2', split='train')
        dev_examples = load_dataset('squad_v2', split='validation')
    else:
        train_examples = load_dataset('squad', split='train')
        dev_examples = load_dataset('squad', split='validation')

    set_seed(args)
    if rank == 0:
        if os.path.exists(args.model_name_or_path):
            print("init checkpoint from %s" % args.model_name_or_path)

    model = model_class.from_pretrained(args.model_name_or_path)
    column_names = train_examples.column_names
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    if args.do_train:
        train_ds = train_examples.map(partial(prepare_train_features,
                                              tokenizer=tokenizer,
                                              args=args),
                                      batched=True,
                                      remove_columns=column_names,
                                      num_proc=4)
        train_batch_sampler = paddle.io.DistributedBatchSampler(
            train_ds, batch_size=args.batch_size, shuffle=True)
        train_batchify_fn = lambda samples, fn=Dict({
            "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id),
            "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id),
            "attention_mask": Pad(axis=0, pad_val=0),
            "start_positions": Stack(dtype="int64"),
            "end_positions": Stack(dtype="int64")
        }): fn(samples)

        train_data_loader = DataLoader(dataset=train_ds,
                                       batch_sampler=train_batch_sampler,
                                       collate_fn=train_batchify_fn,
                                       return_list=True)

        num_training_steps = args.max_steps if args.max_steps > 0 else len(
            train_data_loader) * args.num_train_epochs
        num_train_epochs = math.ceil(num_training_steps /
                                     len(train_data_loader))

        lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                             num_training_steps,
                                             args.warmup_proportion)

        # Generate parameter names needed to perform weight decay.
        # All bias and LayerNorm parameters are excluded.
        decay_params = [
            p.name for n, p in model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        ]
        optimizer = paddle.optimizer.AdamW(
            learning_rate=lr_scheduler,
            epsilon=args.adam_epsilon,
            parameters=model.parameters(),
            weight_decay=args.weight_decay,
            apply_decay_param_fun=lambda x: x in decay_params)
        criterion = CrossEntropyLossForSQuAD()

        global_step = 0
        tic_train = time.time()
        for epoch in range(num_train_epochs):
            for step, batch in enumerate(train_data_loader):
                global_step += 1
                input_ids, token_type_ids, attention_mask, start_positions, end_positions = batch
                logits = model(input_ids=input_ids,
                               token_type_ids=token_type_ids,
                               attention_mask=attention_mask)
                loss = criterion(logits, (start_positions, end_positions))

                if global_step % args.logging_steps == 0:
                    print(
                        "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                        % (global_step, epoch + 1, step + 1, loss,
                           args.logging_steps / (time.time() - tic_train)))
                    tic_train = time.time()

                loss.backward()
                optimizer.step()
                lr_scheduler.step()
                optimizer.clear_grad()

                if global_step % args.save_steps == 0 or global_step == num_training_steps:
                    if rank == 0:
                        output_dir = os.path.join(args.output_dir,
                                                  "model_%d" % global_step)
                        if not os.path.exists(output_dir):
                            os.makedirs(output_dir)
                        # Need a better way to get the inner model of DataParallel.
                        model_to_save = model._layers if isinstance(
                            model, paddle.DataParallel) else model
                        model_to_save.save_pretrained(output_dir)
                        tokenizer.save_pretrained(output_dir)
                        print('Saving checkpoint to:', output_dir)
                if global_step == num_training_steps:
                    break

    if args.do_predict and rank == 0:
        dev_ds = dev_examples.map(partial(prepare_validation_features,
                                          tokenizer=tokenizer,
                                          args=args),
                                  batched=True,
                                  remove_columns=column_names,
                                  num_proc=4)
        dev_batch_sampler = paddle.io.BatchSampler(dev_ds,
                                                   batch_size=args.batch_size,
                                                   shuffle=False)
        dev_batchify_fn = lambda samples, fn=Dict({
            "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id),
            "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id),
            "attention_mask": Pad(axis=0, pad_val=0)
        }): fn(samples)
        dev_data_loader = DataLoader(dataset=dev_ds,
                                     batch_sampler=dev_batch_sampler,
                                     collate_fn=dev_batchify_fn,
                                     return_list=True)

        evaluate(model, dev_data_loader, dev_examples, args)
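# CrossEntropyLossForSQuAD is defined elsewhere in this repo. As a rough
# sketch of what such a span-extraction loss typically computes (an
# assumption, not the repo's actual implementation): average the
# cross-entropy over the start- and end-position logits.
import paddle
import paddle.nn.functional as F


class SquadSpanLossSketch(paddle.nn.Layer):
    """Average cross-entropy over start and end logits (illustrative only)."""

    def forward(self, logits, labels):
        start_logits, end_logits = logits            # each [batch, seq_len]
        start_positions, end_positions = labels      # each [batch]
        start_loss = F.cross_entropy(start_logits, start_positions)
        end_loss = F.cross_entropy(end_logits, end_positions)
        return (start_loss + end_loss) / 2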
def create_train_dataloader(args):
    '''
    Create the dataset, tokenizer and dataloaders used for training.
    input:
        args: arguments provided by the config file
    return:
        train_data_loader: data loader for the training set
        valid_data_loader: data loader for the validation set
    '''
    # Load the dataset.
    train_ds, valid_ds = load_dataset('TEDTalk',
                                      splits=('train', 'dev'),
                                      lazy=False)
    label_list = train_ds.label_list
    label_num = len(label_list)
    # no_entity_id = label_num - 1
    no_entity_id = 0
    print(label_list)

    # Build the dataloaders.
    model_name_or_path = args.model_name_or_path
    tokenizer = ElectraTokenizer.from_pretrained(model_name_or_path)

    trans_func = partial(tokenize_and_align_labels,
                         tokenizer=tokenizer,
                         no_entity_id=no_entity_id,
                         max_seq_len=args.max_seq_length)
    train_ds = train_ds.map(trans_func)

    batchify_fn = lambda samples, fn=Dict({
        'input_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype='int32'),  # input
        'token_type_ids': Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype='int32'),  # segment
        'seq_len': Stack(dtype='int64'),  # seq_len
        'labels': Pad(axis=0, pad_val=args.ignore_label, dtype='int64')  # label
    }): fn(samples)

    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_ds, batch_size=args.batch_size, shuffle=True, drop_last=True)

    train_data_loader = DataLoader(dataset=train_ds,
                                   collate_fn=batchify_fn,
                                   num_workers=0,
                                   batch_sampler=train_batch_sampler,
                                   return_list=True)

    valid_ds = valid_ds.map(trans_func)
    valid_data_loader = DataLoader(dataset=valid_ds,
                                   collate_fn=batchify_fn,
                                   num_workers=0,
                                   batch_size=args.batch_size,
                                   return_list=True)

    return train_data_loader, valid_data_loader
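# A hypothetical smoke test for create_train_dataloader (parse_args and the
# args fields it provides are assumed, not shown in this file): pull a single
# batch to sanity-check shapes before launching a full training run.
def _smoke_test_dataloaders():
    args = parse_args()  # assumed CLI helper defined elsewhere in this script
    train_loader, valid_loader = create_train_dataloader(args)
    for input_ids, token_type_ids, seq_len, labels in train_loader:
        # One batch is enough to check shapes and dtypes.
        print(input_ids.shape, seq_len.shape, labels.shape)
        break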