def from_local_dir():
    print('From_local_dir:--------------------------')
    # From local dir path
    tokenizer = AutoTokenizer.from_pretrained('saved_model/my_bert')
    print(tokenizer("Welcome to use PaddlePaddle and PaddleNLP!"))
    tokenizer = AutoTokenizer.from_pretrained('saved_model/my_bart')
    print(tokenizer("Welcome to use PaddlePaddle and PaddleNLP!"))
    tokenizer = AutoTokenizer.from_pretrained('saved_model/my_bigbird')
    print(tokenizer("Welcome to use PaddlePaddle and PaddleNLP!"))
def initialize(self, args):
    """`initialize` is called only once when the model is being loaded.
    Implementing `initialize` is optional. This function allows the model
    to initialize any state associated with this model.

    Parameters
    ----------
    args : dict
        Both keys and values are strings. The dictionary keys and values are:
        * model_config: A JSON string containing the model configuration
        * model_instance_kind: A string containing model instance kind
        * model_instance_device_id: A string containing model instance device ID
        * model_repository: Model repository path
        * model_version: Model version
        * model_name: Model name
    """
    self.tokenizer = AutoTokenizer.from_pretrained("ernie-3.0-medium-zh",
                                                   use_faster=True)
    # You must parse model_config. JSON string is not parsed here.
    self.model_config = json.loads(args['model_config'])
    print("model_config:", self.model_config)

    self.input_names = []
    for input_config in self.model_config["input"]:
        self.input_names.append(input_config["name"])
    print("input:", self.input_names)

    self.output_names = []
    self.output_dtype = []
    for output_config in self.model_config["output"]:
        self.output_names.append(output_config["name"])
        dtype = pb_utils.triton_string_to_numpy(output_config["data_type"])
        self.output_dtype.append(dtype)
    print("output:", self.output_names)
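# A hedged companion sketch for the Triton python-backend tokenizer model above:
# the matching `execute` method is NOT part of the original snippet. The text
# input layout, max_seq_len=128, pad_to_max_seq_len, and the assumption that
# self.output_names maps to (input_ids, token_type_ids) in that order are all
# assumptions; numpy is assumed imported as np. The pb_utils calls used here
# (get_input_tensor_by_name, Tensor, InferenceResponse) are standard Triton
# python_backend API.
def execute(self, requests):
    responses = []
    for request in requests:
        # Incoming texts usually arrive as a numpy object array of bytes.
        data = pb_utils.get_input_tensor_by_name(
            request, self.input_names[0]).as_numpy()
        texts = [
            t[0].decode("utf-8") if isinstance(t[0], bytes) else str(t[0])
            for t in data
        ]
        input_ids, token_type_ids = [], []
        for text in texts:
            enc = self.tokenizer(text, max_seq_len=128, pad_to_max_seq_len=True)
            input_ids.append(enc["input_ids"])
            token_type_ids.append(enc["token_type_ids"])
        batch = {"input_ids": input_ids, "token_type_ids": token_type_ids}
        out_tensors = [
            pb_utils.Tensor(name, np.array(batch[key], dtype=dtype))
            for name, key, dtype in zip(self.output_names,
                                        ["input_ids", "token_type_ids"],
                                        self.output_dtype)
        ]
        responses.append(pb_utils.InferenceResponse(output_tensors=out_tensors))
    return responses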
def __init__(self, args):
    if not isinstance(args.device, six.string_types):
        print(">>> [InferBackend] The type of device must be string, but the type you set is: ",
              type(args.device))
        exit(0)
    args.device = args.device.lower()
    if args.device not in ['cpu', 'gpu']:
        print(">>> [InferBackend] The device must be cpu or gpu, but your device is set to:",
              args.device)
        exit(0)

    self.task_name = args.task_name
    self.tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path,
                                                   use_faster=True)
    if args.task_name == 'seq_cls':
        self.label_names = []
        self.preprocess = self.seq_cls_preprocess
        self.postprocess = self.seq_cls_postprocess
        self.printer = seq_cls_print_ret
    elif args.task_name == 'token_cls':
        self.label_names = [
            'O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'
        ]
        self.preprocess = self.token_cls_preprocess
        self.postprocess = self.token_cls_postprocess
        self.printer = token_cls_print_ret
    else:
        print("[ErniePredictor]: task_name only support seq_cls and token_cls now.")
        exit(0)

    self.max_seq_length = args.max_seq_length

    if args.device == 'cpu':
        args.use_fp16 = False
        args.set_dynamic_shape = False
        args.shape_info_file = None
        args.batch_size = 32
    if args.device == 'gpu':
        args.num_threads = cpu_count()
        args.use_quantize = False

    self.inference_backend = InferBackend(args.model_path,
                                          batch_size=args.batch_size,
                                          device=args.device,
                                          use_fp16=args.use_fp16,
                                          use_quantize=args.use_quantize,
                                          set_dynamic_shape=args.set_dynamic_shape,
                                          shape_info_file=args.shape_info_file,
                                          num_threads=args.num_threads)
    if args.set_dynamic_shape:
        # If set_dynamic_shape is turned on, all required dynamic shapes will be
        # automatically set according to the batch_size and max_seq_length.
        self.set_dynamic_shape(args.max_seq_length, args.batch_size)
        exit(0)
def do_eval(args):
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args)

    args.task_name = args.task_name.lower()
    metric_class = METRIC_CLASSES[args.task_name]

    dev_ds = load_dataset('clue', args.task_name, splits='dev')
    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)

    trans_func = partial(convert_example,
                         label_list=dev_ds.label_list,
                         tokenizer=tokenizer,
                         max_seq_length=args.max_seq_length)
    dev_ds = dev_ds.map(trans_func, lazy=True)
    dev_batch_sampler = paddle.io.BatchSampler(dev_ds,
                                               batch_size=args.batch_size,
                                               shuffle=False)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
        Stack(dtype="int64" if dev_ds.label_list else "float32")  # label
    ): fn(samples)

    dev_data_loader = DataLoader(dataset=dev_ds,
                                 batch_sampler=dev_batch_sampler,
                                 collate_fn=batchify_fn,
                                 num_workers=0,
                                 return_list=True)

    num_classes = 1 if dev_ds.label_list is None else len(dev_ds.label_list)
    model = AutoModelForSequenceClassification.from_pretrained(
        args.model_name_or_path, num_classes=num_classes)
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    metric = metric_class()
    model.eval()
    metric.reset()
    for batch in dev_data_loader:
        input_ids, segment_ids, labels = batch
        logits = model(input_ids, segment_ids)
        correct = metric.compute(logits, labels)
        metric.update(correct)
    res = metric.accumulate()
    print("acc: %s" % res)
def from_community_models():
    print('From_community_models:-------------------')
    # From community-contributed pretrained models
    tokenizer = AutoTokenizer.from_pretrained(
        'yingyibiao/bert-base-uncased-sst-2-finetuned')
    print(tokenizer("Welcome to use PaddlePaddle and PaddleNLP!"))
    tokenizer.save_pretrained('saved_tokenizer/community_bert_auto')

    tokenizer = BertTokenizer.from_pretrained(
        'yingyibiao/bert-base-uncased-sst-2-finetuned')
    print(tokenizer("Welcome to use PaddlePaddle and PaddleNLP!"))
    tokenizer.save_pretrained('saved_tokenizer/community_bert')

    # Community-contributed models without init_class
    tokenizer = AutoTokenizer.from_pretrained(
        'junnyu/ckiplab-bert-base-chinese-ner')
    print(tokenizer("Welcome to use PaddlePaddle and PaddleNLP!"))
    tokenizer = BertTokenizer.from_pretrained(
        'junnyu/ckiplab-bert-base-chinese-ner')
    print(tokenizer("Welcome to use PaddlePaddle and PaddleNLP!"))
def do_predict(args):
    paddle.set_device(args.device)
    args.task_name = args.task_name.lower()

    train_ds, test_ds = load_dataset('clue',
                                     args.task_name,
                                     splits=('train', 'test'))
    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)

    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         label_list=train_ds.label_list,
                         max_seq_length=args.max_seq_length,
                         is_test=True)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
    ): fn(samples)

    test_ds = test_ds.map(trans_func, lazy=True)
    test_batch_sampler = paddle.io.BatchSampler(test_ds,
                                                batch_size=args.batch_size,
                                                shuffle=False)
    test_data_loader = DataLoader(dataset=test_ds,
                                  batch_sampler=test_batch_sampler,
                                  collate_fn=batchify_fn,
                                  num_workers=0,
                                  return_list=True)

    num_classes = 1 if train_ds.label_list is None else len(train_ds.label_list)
    model = AutoModelForSequenceClassification.from_pretrained(
        args.model_name_or_path, num_classes=num_classes)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    if args.task_name == 'ocnli':
        args.task_name = 'ocnli_50k'
    f = open(os.path.join(args.output_dir, args.task_name + "_predict.json"), 'w')

    for step, batch in enumerate(test_data_loader):
        input_ids, segment_ids = batch
        with paddle.no_grad():
            logits = model(input_ids, segment_ids)
        preds = paddle.argmax(logits, axis=1)
        for idx, pred in enumerate(preds):
            j = json.dumps({"id": idx, "label": train_ds.label_list[pred]})
            f.write(j + "\n")
def init_op(self):
    from paddlenlp.transformers import AutoTokenizer
    self.tokenizer = AutoTokenizer.from_pretrained("ernie-3.0-medium-zh",
                                                   use_faster=True)
    # The label names of NER models trained on different datasets may differ.
    self.label_names = [
        'O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'
    ]
    # Output nodes may differ from model to model. You can see the output node
    # name in the conf.prototxt file of serving_server.
    self.fetch_names = [
        "linear_113.tmp_1",
    ]
def from_built_in_models():
    print('From_built_in_models:------------------')
    # From built-in pretrained models
    tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
    print(tokenizer("Welcome to use PaddlePaddle and PaddleNLP!"))
    tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
    print(tokenizer("Welcome to use PaddlePaddle and PaddleNLP!"))

    tokenizer = AutoTokenizer.from_pretrained('ernie-ctm')
    print(tokenizer("Welcome to use PaddlePaddle and PaddleNLP!"))

    tokenizer = AutoTokenizer.from_pretrained('plato-mini')
    print(tokenizer("Welcome to use PaddlePaddle and PaddleNLP!"))
    tokenizer = UnifiedTransformerTokenizer.from_pretrained('plato-mini')
    print(tokenizer("Welcome to use PaddlePaddle and PaddleNLP!"))

    tokenizer = AutoTokenizer.from_pretrained('bigbird-base-uncased')
    print(tokenizer("Welcome to use PaddlePaddle and PaddleNLP!"))
    tokenizer = BigBirdTokenizer.from_pretrained('bigbird-base-uncased')
    print(tokenizer("Welcome to use PaddlePaddle and PaddleNLP!"))
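# A minimal usage sketch (assumption, not part of the original scripts): the
# three AutoTokenizer demo functions shown above can be driven from a single
# entry point when they live in the same module with the required imports
# (AutoTokenizer, BertTokenizer, UnifiedTransformerTokenizer, BigBirdTokenizer
# from paddlenlp.transformers) in scope.
if __name__ == "__main__":
    from_built_in_models()     # built-in names such as 'bert-base-cased'
    from_local_dir()           # tokenizers saved earlier with save_pretrained()
    from_community_models()    # community models from the PaddleNLP model hub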
def seg(args):
    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path,
                                              do_lower_case=True)
    seg_file(
        os.path.join(args.output_dir, args.data_split + ".txt.tmp"),
        tokenizer,
        args.max_len,
    )
    seg_file(
        os.path.join(args.output_dir, args.data_split + "_box.txt.tmp"),
        tokenizer,
        args.max_len,
    )
    seg_file(
        os.path.join(args.output_dir, args.data_split + "_image.txt.tmp"),
        tokenizer,
        args.max_len,
    )
def __init__(self, args):
    if not isinstance(args.device, six.string_types):
        print(">>> [InferBackend] The type of device must be string, but the type you set is: ",
              type(args.device))
        exit(0)
    if args.device not in ['cpu', 'gpu']:
        print(">>> [InferBackend] The device must be cpu or gpu, but your device is set to:",
              args.device)
        exit(0)

    self._tokenizer = AutoTokenizer.from_pretrained("ernie-3.0-base-zh",
                                                    use_faster=True)
    self._position_prob = args.position_prob
    self._max_seq_len = args.max_seq_len
    self._schema_tree = None
    self.set_schema(args.schema)

    if args.device == 'cpu':
        args.use_fp16 = False
    self.inference_backend = InferBackend(args.model_path_prefix,
                                          device=args.device,
                                          use_fp16=args.use_fp16)
def __init__(self, args):
    self.task_name = args.task_name
    self.tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path,
                                                   use_faster=True)
    if args.task_name == 'seq_cls':
        self.label_names = []
        self.preprocess = self.seq_cls_preprocess
        self.postprocess = self.seq_cls_postprocess
        self.printer = seq_cls_print_ret
    elif args.task_name == 'token_cls':
        self.label_names = [
            'O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'
        ]
        self.preprocess = self.token_cls_preprocess
        self.postprocess = self.token_cls_postprocess
        self.printer = token_cls_print_ret
    else:
        print("[ErniePredictor]: task_name only support seq_cls and token_cls now.")
        exit(0)

    self.max_seq_length = args.max_seq_length
    self.inference_backend = InferBackend(args.model_path, args.use_fp16)
def do_eval():
    tokenizer = AutoTokenizer.from_pretrained(args.model_path)
    model = UIE.from_pretrained(args.model_path)

    test_ds = load_dataset(reader,
                           data_path=args.test_path,
                           max_seq_len=args.max_seq_len,
                           lazy=False)
    test_ds = test_ds.map(
        partial(convert_example, tokenizer=tokenizer, max_seq_len=args.max_seq_len))

    test_batch_sampler = paddle.io.BatchSampler(dataset=test_ds,
                                                batch_size=args.batch_size,
                                                shuffle=False)
    test_data_loader = paddle.io.DataLoader(dataset=test_ds,
                                            batch_sampler=test_batch_sampler,
                                            return_list=True)

    metric = SpanEvaluator()
    precision, recall, f1 = evaluate(model, metric, test_data_loader)
    logger.info("Evaluation precision: %.5f, recall: %.5f, F1: %.5f" %
                (precision, recall, f1))
def do_train():
    parser = PdArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    paddle.set_device(training_args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, world_size: {training_args.world_size}, "
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )

    # Detecting last checkpoint.
    last_checkpoint = None
    if os.path.isdir(training_args.output_dir) and training_args.do_train \
            and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome.")
        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch.")

    # set_seed(args)
    data_args.dataset = data_args.dataset.strip()
    if data_args.dataset not in ALL_DATASETS:
        raise ValueError("Not found dataset {}".format(data_args.dataset))

    # Use yaml config to rewrite all args.
    config = ALL_DATASETS[data_args.dataset]
    for args in (model_args, data_args, training_args):
        for arg in vars(args):
            if arg in config.keys():
                setattr(args, arg, config[arg])
    training_args.per_device_train_batch_size = config["batch_size"]
    training_args.per_device_eval_batch_size = config["batch_size"]

    dataset_config = data_args.dataset.split(" ")
    all_ds = load_dataset(
        dataset_config[0],
        None if len(dataset_config) <= 1 else dataset_config[1],
    )

    label_list = getattr(all_ds['train'], "label_list", None)
    data_args.label_list = label_list
    data_args.ignore_label = -100
    data_args.no_entity_id = len(data_args.label_list) - 1

    num_classes = 1 if all_ds["train"].label_list is None else len(
        all_ds['train'].label_list)

    # Define tokenizer, model, loss function.
    tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)
    model = AutoModelForTokenClassification.from_pretrained(
        model_args.model_name_or_path, num_classes=num_classes)

    class criterion(nn.Layer):

        def __init__(self):
            super(criterion, self).__init__()
            self.loss_fn = paddle.nn.loss.CrossEntropyLoss(
                ignore_index=data_args.ignore_label)

        def forward(self, *args, **kwargs):
            return paddle.mean(self.loss_fn(*args, **kwargs))

    loss_fct = criterion()

    # Define dataset pre-process function
    trans_fn = partial(ner_trans_fn, tokenizer=tokenizer, args=data_args)

    # Define data collector
    batchify_fn = ner_collator(tokenizer, data_args)

    # Dataset pre-process
    train_dataset = all_ds["train"].map(trans_fn)
    eval_dataset = all_ds["dev"].map(trans_fn)
    test_dataset = all_ds["test"].map(trans_fn)

    # Define the metrics of tasks.
    # Metrics
    metric = load_metric("seqeval")

    def compute_metrics(p):
        predictions, labels = p
        predictions = np.argmax(predictions, axis=2)

        # Remove ignored index (special tokens)
        true_predictions = [[
            label_list[p] for (p, l) in zip(prediction, label) if l != -100
        ] for prediction, label in zip(predictions, labels)]
        true_labels = [[
            label_list[l] for (p, l) in zip(prediction, label) if l != -100
        ] for prediction, label in zip(predictions, labels)]

        results = metric.compute(predictions=true_predictions,
                                 references=true_labels)
        return {
            "precision": results["overall_precision"],
            "recall": results["overall_recall"],
            "f1": results["overall_f1"],
            "accuracy": results["overall_accuracy"],
        }

    trainer = Trainer(
        model=model,
        criterion=loss_fct,
        args=training_args,
        data_collator=batchify_fn,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    # Log model and data config
    trainer.print_config(model_args, "Model")
    trainer.print_config(data_args, "Data")

    checkpoint = None
    if training_args.resume_from_checkpoint is not None:
        checkpoint = training_args.resume_from_checkpoint
    elif last_checkpoint is not None:
        checkpoint = last_checkpoint

    # Training
    train_result = trainer.train(resume_from_checkpoint=checkpoint)
    metrics = train_result.metrics
    trainer.save_model()  # Saves the tokenizer too for easy upload
    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()

    # Evaluate and test the model
    eval_metrics = trainer.evaluate()
    trainer.log_metrics("eval", eval_metrics)

    test_ret = trainer.predict(test_dataset)
    trainer.log_metrics("test", test_ret.metrics)
    if test_ret.label_ids is None:
        paddle.save(
            test_ret.predictions,
            os.path.join(training_args.output_dir, "test_results.pdtensor"),
        )

    # Export the inference model
    input_spec = [
        paddle.static.InputSpec(shape=[None, None], dtype="int64"),  # input_ids
        paddle.static.InputSpec(shape=[None, None], dtype="int64")   # segment_ids
    ]
    trainer.export_model(input_spec=input_spec,
                         load_best_model=True,
                         output_dir=model_args.export_model_dir)
def main():
    parser = PdArgumentParser(
        (ModelArguments, DataArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    paddle.set_device(training_args.device)

    data_args.dataset = data_args.dataset.strip()
    if data_args.dataset not in ALL_DATASETS:
        raise ValueError("Not found dataset {}".format(data_args.dataset))

    if data_args.dataset in ALL_DATASETS:
        # If you customize hyper-parameters in the yaml config, it overwrites all args.
        config = ALL_DATASETS[data_args.dataset]
        for args in (model_args, data_args, training_args):
            for arg in vars(args):
                if arg in config.keys():
                    setattr(args, arg, config[arg])
        training_args.per_device_train_batch_size = config["batch_size"]
        training_args.per_device_eval_batch_size = config["batch_size"]

    # Log model and data config
    training_args.print_config(model_args, "Model")
    training_args.print_config(data_args, "Data")

    dataset_config = data_args.dataset.split(" ")
    raw_datasets = load_dataset(
        dataset_config[0],
        None if len(dataset_config) <= 1 else dataset_config[1],
    )

    label_list = raw_datasets['train'].features['ner_tags'].feature.names
    data_args.label_list = label_list
    data_args.ignore_label = -100
    data_args.no_entity_id = 0

    num_classes = 1 if label_list is None else len(label_list)

    # Define tokenizer, model, loss function.
    tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)
    model = AutoModelForTokenClassification.from_pretrained(
        model_args.model_name_or_path, num_classes=num_classes)

    class criterion(nn.Layer):

        def __init__(self):
            super(criterion, self).__init__()
            self.loss_fn = paddle.nn.loss.CrossEntropyLoss(
                ignore_index=data_args.ignore_label)

        def forward(self, *args, **kwargs):
            return paddle.mean(self.loss_fn(*args, **kwargs))

    loss_fct = criterion()

    # Define dataset pre-process function
    trans_fn = partial(ner_trans_fn, tokenizer=tokenizer, args=data_args)

    # Define data collector
    data_collator = DataCollatorForTokenClassification(
        tokenizer, label_pad_token_id=data_args.ignore_label)

    column_names = raw_datasets["train"].column_names

    # Dataset pre-process
    train_dataset = raw_datasets["train"].map(trans_fn,
                                              remove_columns=column_names)
    train_dataset.label_list = label_list
    eval_dataset = raw_datasets["test"].map(trans_fn,
                                            remove_columns=column_names)

    trainer = Trainer(model=model,
                      criterion=loss_fct,
                      args=training_args,
                      data_collator=data_collator,
                      train_dataset=train_dataset,
                      eval_dataset=eval_dataset,
                      tokenizer=tokenizer)

    output_dir = os.path.join(model_args.model_name_or_path, "compress")
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    compress_config = CompressConfig(quantization_config=PTQConfig(
        algo_list=['hist', 'mse'], batch_size_list=[4, 8, 16]))

    trainer.compress(data_args.dataset,
                     output_dir,
                     pruning=True,
                     quantization=True,
                     compress_config=compress_config)
def run(args):
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    raw_datasets = load_dataset(args.task_name)
    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)

    train_ds = raw_datasets['train']
    column_names = train_ds.column_names
    label_list = train_ds.features['ner_tags'].feature.names
    label_num = len(label_list)

    batchify_fn = DataCollatorForTokenClassification(tokenizer=tokenizer)

    # Define the model network and its loss
    model = AutoModelForTokenClassification.from_pretrained(
        args.model_name_or_path, num_classes=label_num)
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    def tokenize_and_align_labels(examples, no_entity_id=0):
        tokenized_inputs = tokenizer(
            examples['tokens'],
            max_seq_len=args.max_seq_length,
            # We use this argument because the texts in our dataset are lists
            # of words (with a label for each word).
            is_split_into_words=True,
            return_length=True)
        labels = []
        for i, label in enumerate(examples['ner_tags']):
            label_ids = label
            if len(tokenized_inputs['input_ids'][i]) - 2 < len(label_ids):
                label_ids = label_ids[:len(tokenized_inputs['input_ids'][i]) - 2]
            label_ids = [no_entity_id] + label_ids + [no_entity_id]
            label_ids += [no_entity_id] * (
                len(tokenized_inputs['input_ids'][i]) - len(label_ids))
            labels.append(label_ids)
        tokenized_inputs["labels"] = labels
        return tokenized_inputs

    test_ds = raw_datasets['test']
    test_ds = test_ds.select(range(len(test_ds) - 1))
    test_ds = test_ds.map(tokenize_and_align_labels,
                          batched=True,
                          remove_columns=column_names)
    test_data_loader = DataLoader(dataset=test_ds,
                                  collate_fn=batchify_fn,
                                  num_workers=0,
                                  batch_size=args.batch_size,
                                  return_list=True)

    if args.do_train:
        train_ds = train_ds.select(range(len(train_ds) - 1))
        train_ds = train_ds.map(tokenize_and_align_labels,
                                batched=True,
                                remove_columns=column_names)
        train_batch_sampler = paddle.io.DistributedBatchSampler(
            train_ds, batch_size=args.batch_size, shuffle=True, drop_last=True)
        train_data_loader = DataLoader(dataset=train_ds,
                                       collate_fn=batchify_fn,
                                       num_workers=0,
                                       batch_sampler=train_batch_sampler,
                                       return_list=True)

        num_training_steps = args.max_steps if args.max_steps > 0 else len(
            train_data_loader) * args.num_train_epochs

        lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                             num_training_steps,
                                             args.warmup_steps)

        # Generate parameter names needed to perform weight decay.
        # All bias and LayerNorm parameters are excluded.
        decay_params = [
            p.name for n, p in model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        ]
        optimizer = paddle.optimizer.AdamW(
            learning_rate=lr_scheduler,
            epsilon=args.adam_epsilon,
            parameters=model.parameters(),
            weight_decay=args.weight_decay,
            apply_decay_param_fun=lambda x: x in decay_params)

        loss_fct = paddle.nn.loss.CrossEntropyLoss()
        metric = ChunkEvaluator(label_list=label_list)

        global_step = 0
        best_f1 = 0.0
        last_step = args.num_train_epochs * len(train_data_loader)
        tic_train = time.time()
        for epoch in range(args.num_train_epochs):
            for step, batch in enumerate(train_data_loader):
                global_step += 1
                logits = model(batch['input_ids'], batch['token_type_ids'])
                loss = loss_fct(logits, batch['labels'])
                avg_loss = paddle.mean(loss)
                if global_step % args.logging_steps == 0:
                    print(
                        "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
                        % (global_step, epoch, step, avg_loss,
                           args.logging_steps / (time.time() - tic_train)))
                    tic_train = time.time()
                avg_loss.backward()
                optimizer.step()
                lr_scheduler.step()
                optimizer.clear_grad()
                if global_step % args.save_steps == 0 or global_step == num_training_steps:
                    if paddle.distributed.get_rank() == 0:
                        f1 = evaluate(model, loss_fct, metric, test_data_loader,
                                      label_num, "test")
                        if f1 > best_f1:
                            best_f1 = f1
                            output_dir = args.output_dir
                            if not os.path.exists(output_dir):
                                os.makedirs(output_dir)
                            # Need a better way to get the inner model of DataParallel.
                            model_to_save = model._layers if isinstance(
                                model, paddle.DataParallel) else model
                            model_to_save.save_pretrained(output_dir)
                            tokenizer.save_pretrained(output_dir)
                if global_step >= num_training_steps:
                    print("best_f1: ", best_f1)
                    return
        print("best_f1: ", best_f1)

    if args.do_eval:
        eval_data_loader = DataLoader(dataset=test_ds,
                                      collate_fn=batchify_fn,
                                      num_workers=0,
                                      batch_size=args.batch_size,
                                      return_list=True)

        # Define the model network and its loss
        model = AutoModelForTokenClassification.from_pretrained(
            args.model_name_or_path, num_classes=label_num)
        loss_fct = paddle.nn.loss.CrossEntropyLoss()
        metric = ChunkEvaluator(label_list=label_list)

        model.eval()
        metric.reset()
        for step, batch in enumerate(eval_data_loader):
            logits = model(batch["input_ids"], batch["token_type_ids"])
            loss = loss_fct(logits, batch["labels"])
            avg_loss = paddle.mean(loss)
            preds = logits.argmax(axis=2)
            num_infer_chunks, num_label_chunks, num_correct_chunks = metric.compute(
                batch["length"], preds, batch["labels"])
            metric.update(num_infer_chunks.numpy(), num_label_chunks.numpy(),
                          num_correct_chunks.numpy())
        precision, recall, f1_score = metric.accumulate()
        print("eval loss: %f, precision: %f, recall: %f, f1: %f" %
              (avg_loss, precision, recall, f1_score))
def main():
    parser = PdArgumentParser(
        (ModelArguments, DataArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    paddle.set_device(training_args.device)

    data_args.dataset = data_args.dataset.strip()

    if data_args.dataset in ALL_DATASETS:
        # If you customize hyper-parameters in the yaml config, it overwrites all args.
        config = ALL_DATASETS[data_args.dataset]
        logger.info("Over-writing training config by yaml config!")
        for args in (model_args, data_args, training_args):
            for arg in vars(args):
                if arg in config.keys():
                    setattr(args, arg, config[arg])
        training_args.per_device_train_batch_size = config["batch_size"]
        training_args.per_device_eval_batch_size = config["batch_size"]

    # Log model and data config
    training_args.print_config(model_args, "Model")
    training_args.print_config(data_args, "Data")

    dataset_config = data_args.dataset.split(" ")
    raw_datasets = load_dataset(
        dataset_config[0],
        None if len(dataset_config) <= 1 else dataset_config[1],
        splits=("train", "dev", "test"))

    data_args.label_list = getattr(raw_datasets['train'], "label_list", None)
    num_classes = 1 if raw_datasets["train"].label_list is None else len(
        raw_datasets['train'].label_list)

    criterion = paddle.nn.CrossEntropyLoss()

    # Define tokenizer, model, loss function.
    tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_args.model_name_or_path, num_classes=num_classes)

    # Define dataset pre-process function
    if "clue" in data_args.dataset:
        trans_fn = partial(clue_trans_fn, tokenizer=tokenizer, args=data_args)
    else:
        trans_fn = partial(seq_trans_fn, tokenizer=tokenizer, args=data_args)

    # Define data collector
    data_collator = DataCollatorWithPadding(tokenizer)

    train_dataset = raw_datasets["train"].map(trans_fn)
    eval_dataset = raw_datasets["dev"].map(trans_fn)

    trainer = Trainer(model=model,
                      args=training_args,
                      data_collator=data_collator,
                      train_dataset=train_dataset,
                      eval_dataset=eval_dataset,
                      tokenizer=tokenizer,
                      criterion=criterion)

    output_dir = os.path.join(model_args.model_name_or_path, "compress")
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    compress_config = CompressConfig(quantization_config=PTQConfig(
        algo_list=['hist', 'mse'], batch_size_list=[4, 8, 16]))

    trainer.compress(data_args.dataset,
                     output_dir,
                     pruning=True,
                     quantization=True,
                     compress_config=compress_config)
def do_train(args):
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args)

    args.task_name = args.task_name.lower()
    metric_class = METRIC_CLASSES[args.task_name]

    train_ds, dev_ds = load_dataset('clue',
                                    args.task_name,
                                    splits=('train', 'dev'))
    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)

    trans_func = partial(convert_example,
                         label_list=train_ds.label_list,
                         tokenizer=tokenizer,
                         max_seq_length=args.max_seq_length)

    train_ds = train_ds.map(trans_func, lazy=True)
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_ds, batch_size=args.batch_size, shuffle=True)
    dev_ds = dev_ds.map(trans_func, lazy=True)
    dev_batch_sampler = paddle.io.BatchSampler(dev_ds,
                                               batch_size=args.batch_size,
                                               shuffle=False)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id),  # segment
        Stack(dtype="int64" if train_ds.label_list else "float32")  # label
    ): fn(samples)

    train_data_loader = DataLoader(dataset=train_ds,
                                   batch_sampler=train_batch_sampler,
                                   collate_fn=batchify_fn,
                                   num_workers=0,
                                   return_list=True)
    dev_data_loader = DataLoader(dataset=dev_ds,
                                 batch_sampler=dev_batch_sampler,
                                 collate_fn=batchify_fn,
                                 num_workers=0,
                                 return_list=True)

    num_classes = 1 if train_ds.label_list is None else len(train_ds.label_list)
    model = AutoModelForSequenceClassification.from_pretrained(
        args.model_name_or_path, num_classes=num_classes)
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    if args.max_steps > 0:
        num_training_steps = args.max_steps
        num_train_epochs = math.ceil(num_training_steps / len(train_data_loader))
    else:
        num_training_steps = len(train_data_loader) * args.num_train_epochs
        num_train_epochs = args.num_train_epochs

    warmup = args.warmup_steps if args.warmup_steps > 0 else args.warmup_proportion
    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
                                         warmup)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        beta1=0.9,
        beta2=0.999,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params,
        grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm))

    loss_fct = paddle.nn.loss.CrossEntropyLoss(
    ) if train_ds.label_list else paddle.nn.loss.MSELoss()

    metric = metric_class()

    best_acc = 0.0
    global_step = 0
    tic_train = time.time()
    for epoch in range(num_train_epochs):
        for step, batch in enumerate(train_data_loader):
            global_step += 1
            input_ids, segment_ids, labels = batch
            logits = model(input_ids, segment_ids)
            loss = loss_fct(logits, labels)
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            if global_step % args.logging_steps == 0:
                print(
                    "global step %d/%d, epoch: %d, batch: %d, rank_id: %s, loss: %f, lr: %.10f, speed: %.4f step/s"
                    % (global_step, num_training_steps, epoch, step,
                       paddle.distributed.get_rank(), loss, optimizer.get_lr(),
                       args.logging_steps / (time.time() - tic_train)))
                tic_train = time.time()
            if global_step % args.save_steps == 0 or global_step == num_training_steps:
                tic_eval = time.time()
                acc = evaluate(model, loss_fct, metric, dev_data_loader)
                print("eval done total : %s s" % (time.time() - tic_eval))
                if acc > best_acc:
                    best_acc = acc
                    output_dir = args.output_dir
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    # Need a better way to get the inner model of DataParallel.
                    model_to_save = model._layers if isinstance(
                        model, paddle.DataParallel) else model
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
            if global_step >= num_training_steps:
                print("best_acc: ", best_acc)
                return
    print("best_acc: ", best_acc)
def do_train():
    paddle.set_device(args.device)
    rank = paddle.distributed.get_rank()
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args.seed)

    train_ds = load_dataset(read_text_pair,
                            data_path=args.train_set_file,
                            is_test=False,
                            lazy=False)

    model_name_or_path = 'rocketqa-zh-dureader-query-encoder'
    pretrained_model = AutoModel.from_pretrained(
        model_name_or_path,
        hidden_dropout_prob=args.dropout,
        attention_probs_dropout_prob=args.dropout)
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         max_seq_length=args.max_seq_length)

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"),  # query_input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"),  # query_segment
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"),  # title_input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"),  # title_segment
    ): [data for data in fn(samples)]

    train_data_loader = create_dataloader(train_ds,
                                          mode='train',
                                          batch_size=args.batch_size,
                                          batchify_fn=batchify_fn,
                                          trans_fn=trans_func)

    model = SimCSE(pretrained_model,
                   margin=args.margin,
                   scale=args.scale,
                   output_emb_size=args.output_emb_size)

    if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt):
        state_dict = paddle.load(args.init_from_ckpt)
        model.set_dict(state_dict)
        print("warmup from: {}".format(args.init_from_ckpt))

    model = paddle.DataParallel(model)

    num_training_steps = args.max_steps if args.max_steps > 0 else len(
        train_data_loader) * args.epochs

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
                                         args.warmup_proportion)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    global_step = 0
    tic_train = time.time()
    for epoch in range(1, args.epochs + 1):
        for step, batch in enumerate(train_data_loader, start=1):
            query_input_ids, query_token_type_ids, title_input_ids, title_token_type_ids = batch
            # With a small probability, use the query itself as its positive pair.
            if random.random() < 0.2:
                title_input_ids, title_token_type_ids = query_input_ids, query_token_type_ids
            query_input_ids, query_token_type_ids = word_repetition(
                query_input_ids, query_token_type_ids, args.dup_rate)
            title_input_ids, title_token_type_ids = word_repetition(
                title_input_ids, title_token_type_ids, args.dup_rate)

            loss, kl_loss = model(query_input_ids=query_input_ids,
                                  title_input_ids=title_input_ids,
                                  query_token_type_ids=query_token_type_ids,
                                  title_token_type_ids=title_token_type_ids)
            loss = loss + kl_loss * args.rdrop_coef

            global_step += 1
            if global_step % 10 == 0 and rank == 0:
                print(
                    "global step %d, epoch: %d, batch: %d, loss: %.5f, speed: %.2f step/s"
                    % (global_step, epoch, step, loss,
                       10 / (time.time() - tic_train)))
                tic_train = time.time()

            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()

            if global_step % args.save_steps == 0 and rank == 0:
                save_dir = os.path.join(args.save_dir, "model_%d" % global_step)
                if not os.path.exists(save_dir):
                    os.makedirs(save_dir)
                save_param_path = os.path.join(save_dir, 'model_state.pdparams')
                paddle.save(model.state_dict(), save_param_path)
                tokenizer.save_pretrained(save_dir)

            if args.max_steps > 0 and global_step >= args.max_steps:
                return

    save_dir = os.path.join(args.save_dir, "model_%d" % global_step)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    save_param_path = os.path.join(save_dir, 'model_state.pdparams')
    paddle.save(model.state_dict(), save_param_path)
    tokenizer.save_pretrained(save_dir)
# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument("--params_path", type=str, required=True,
                    default='./checkpoint/model_50/model_state.pdparams',
                    help="The path to model parameters to be loaded.")
parser.add_argument("--output_path", type=str, default='./output',
                    help="The path of model parameter in static graph to be saved.")
args = parser.parse_args()
# yapf: enable

if __name__ == "__main__":
    # If you want to use the ernie-1.0 model, please uncomment the following code.
    output_emb_size = 256
    pretrained_model = AutoModel.from_pretrained("ernie-1.0")
    tokenizer = AutoTokenizer.from_pretrained('ernie-1.0')

    model = SimCSE(pretrained_model, output_emb_size=output_emb_size)

    if args.params_path and os.path.isfile(args.params_path):
        state_dict = paddle.load(args.params_path)
        model.set_dict(state_dict)
        print("Loaded parameters from %s" % args.params_path)

    model.eval()

    # Convert to static graph with specific input description
    model = paddle.jit.to_static(
        model,
        input_spec=[
            paddle.static.InputSpec(shape=[None, None], dtype="int64"),  # input_ids
            paddle.static.InputSpec(shape=[None, None],
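# The export snippet above is cut off inside the second InputSpec. A hedged
# sketch of how such a static-graph export typically finishes; the
# token_type_ids spec and the "inference" file prefix below are assumptions,
# not taken from the original script.
model = paddle.jit.to_static(
    model,
    input_spec=[
        paddle.static.InputSpec(shape=[None, None], dtype="int64"),  # input_ids
        paddle.static.InputSpec(shape=[None, None], dtype="int64"),  # token_type_ids
    ])
# Save the converted model; paddle.jit.save writes the *.pdmodel/*.pdiparams files.
save_path = os.path.join(args.output_path, "inference")
paddle.jit.save(model, save_path)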
def run(args):
    max_seq_length = args.max_seq_length
    max_num_choices = 4

    def preprocess_function(examples, do_predict=False):

        def _truncate_seq_tuple(tokens_a, tokens_b, tokens_c, max_length):
            """Truncates a sequence tuple in place to the maximum length."""
            # This is a simple heuristic which will always truncate the longer
            # sequence one token at a time. This makes more sense than
            # truncating an equal percent of tokens from each, since if one
            # sequence is very short then each token that's truncated likely
            # contains more information than a longer sequence.
            while True:
                total_length = len(tokens_a) + len(tokens_b) + len(tokens_c)
                if total_length <= max_length:
                    break
                if len(tokens_a) >= len(tokens_b) and len(tokens_a) >= len(tokens_c):
                    tokens_a.pop()
                elif len(tokens_b) >= len(tokens_a) and len(tokens_b) >= len(tokens_c):
                    tokens_b.pop()
                else:
                    tokens_c.pop()

        num_examples = len(examples.data["question"])
        if do_predict:
            result = {"input_ids": [], "token_type_ids": []}
        else:
            result = {"input_ids": [], "token_type_ids": [], "labels": []}
        for idx in range(num_examples):
            text = '\n'.join(examples.data["context"][idx]).lower()
            question = examples.data["question"][idx].lower()
            choice_list = examples.data["choice"][idx]
            choice_list = [choice.lower() for choice in choice_list]
            if not do_predict:
                answer = examples.data["answer"][idx].lower()
                label = choice_list.index(answer)

            tokens_t = tokenizer.tokenize(text)
            tokens_q = tokenizer.tokenize(question)

            tokens_t_list = []
            tokens_c_list = []
            # Pad each new example for axis=1, [batch_size, num_choices, seq_len]
            while len(choice_list) < max_num_choices:
                choice_list.append('无效答案')
            for choice in choice_list:
                tokens_c = tokenizer.tokenize(choice.lower())
                _truncate_seq_tuple(tokens_t, tokens_q, tokens_c,
                                    max_seq_length - 4)
                tokens_c = tokens_q + ["[SEP]"] + tokens_c
                tokens_t_list.append(tokens_t)
                tokens_c_list.append(tokens_c)
            new_data = tokenizer(tokens_t_list,
                                 text_pair=tokens_c_list,
                                 is_split_into_words=True)
            # Pad each new example for axis=2 of [batch_size, num_choices, seq_len],
            # because length of each choice could be different.
            input_ids = Pad(axis=0, pad_val=tokenizer.pad_token_id)(
                new_data["input_ids"])
            token_type_ids = Pad(axis=0, pad_val=tokenizer.pad_token_id)(
                new_data["token_type_ids"])

            # Final shape of input_ids: [batch_size, num_choices, seq_len]
            result["input_ids"].append(input_ids)
            result["token_type_ids"].append(token_type_ids)
            if not do_predict:
                result["labels"].append([label])
            if (idx + 1) % 1000 == 0:
                print(idx + 1, "samples have been processed.")
        return result

    paddle.set_device(args.device)
    set_seed(args)

    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
    model = AutoModelForMultipleChoice.from_pretrained(
        args.model_name_or_path, num_choices=max_num_choices)
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    train_ds, dev_ds, test_ds = load_dataset(
        "clue", "c3", split=["train", "validation", "test"])

    if args.do_train:
        args.batch_size = int(args.batch_size / args.gradient_accumulation_steps)
        column_names = train_ds.column_names
        train_ds = train_ds.map(preprocess_function,
                                batched=True,
                                batch_size=len(train_ds),
                                num_proc=1,
                                remove_columns=column_names)
        batchify_fn = lambda samples, fn=Dict({
            'input_ids': Pad(axis=1, pad_val=tokenizer.pad_token_id),  # input
            'token_type_ids': Pad(axis=1, pad_val=tokenizer.pad_token_type_id),  # segment
            'labels': Stack(dtype="int64")  # label
        }): fn(samples)
        train_batch_sampler = paddle.io.DistributedBatchSampler(
            train_ds, batch_size=args.batch_size, shuffle=True)
        train_data_loader = paddle.io.DataLoader(
            dataset=train_ds,
            batch_sampler=train_batch_sampler,
            collate_fn=batchify_fn,
            num_workers=0,
            return_list=True)
        dev_ds = dev_ds.map(preprocess_function,
                            batched=True,
                            batch_size=len(dev_ds),
                            remove_columns=column_names,
                            num_proc=1)
        dev_batch_sampler = paddle.io.BatchSampler(
            dev_ds, batch_size=args.eval_batch_size, shuffle=False)
        dev_data_loader = paddle.io.DataLoader(dataset=dev_ds,
                                               batch_sampler=dev_batch_sampler,
                                               collate_fn=batchify_fn,
                                               return_list=True)
        num_training_steps = int(
            len(train_data_loader) * args.num_train_epochs /
            args.gradient_accumulation_steps)
        lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                             num_training_steps, 0)

        # Generate parameter names needed to perform weight decay.
        # All bias and LayerNorm parameters are excluded.
        decay_params = [
            p.name for n, p in model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        ]
        grad_clip = paddle.nn.ClipGradByGlobalNorm(args.max_grad_norm)
        optimizer = paddle.optimizer.AdamW(
            learning_rate=lr_scheduler,
            parameters=model.parameters(),
            weight_decay=args.weight_decay,
            apply_decay_param_fun=lambda x: x in decay_params,
            grad_clip=grad_clip)
        loss_fct = paddle.nn.loss.CrossEntropyLoss()
        metric = paddle.metric.Accuracy()

        model.train()
        global_step = 0
        best_acc = 0.0
        tic_train = time.time()
        for epoch in range(args.num_train_epochs):
            for step, batch in enumerate(train_data_loader):
                input_ids, segment_ids, label = batch
                logits = model(input_ids=input_ids, token_type_ids=segment_ids)
                loss = loss_fct(logits, label)
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    global_step += 1
                    optimizer.step()
                    lr_scheduler.step()
                    optimizer.clear_grad()
                    if global_step % args.logging_steps == 0:
                        print(
                            "global step %d/%d, epoch: %d, batch: %d, rank_id: %s, loss: %f, lr: %.10f, speed: %.4f step/s"
                            % (global_step, num_training_steps, epoch, step + 1,
                               paddle.distributed.get_rank(), loss,
                               optimizer.get_lr(),
                               args.logging_steps / (time.time() - tic_train)))
                        tic_train = time.time()
            tic_eval = time.time()
            acc = evaluate(model, loss_fct, dev_data_loader, metric)
            print("eval acc: %.5f, eval done total : %s s" %
                  (acc, time.time() - tic_eval))
            if paddle.distributed.get_rank() == 0 and acc > best_acc:
                best_acc = acc
                model_to_save = model._layers if isinstance(
                    model, paddle.DataParallel) else model
                if not os.path.exists(args.output_dir):
                    os.makedirs(args.output_dir)
                model_to_save.save_pretrained(args.output_dir)
                tokenizer.save_pretrained(args.output_dir)
        print("best_acc: ", best_acc)

    if args.do_predict:
        column_names = test_ds.column_names
        test_ds = test_ds.map(partial(preprocess_function, do_predict=True),
                              batched=True,
                              batch_size=len(test_ds),
                              remove_columns=column_names,
                              num_proc=1)
        # Several samples have more than four choices.
        test_batch_sampler = paddle.io.BatchSampler(test_ds,
                                                    batch_size=1,
                                                    shuffle=False)
        batchify_fn = lambda samples, fn=Dict({
            'input_ids': Pad(axis=1, pad_val=tokenizer.pad_token_id),  # input
            'token_type_ids': Pad(axis=1, pad_val=tokenizer.pad_token_type_id),  # segment
        }): fn(samples)
        test_data_loader = paddle.io.DataLoader(
            dataset=test_ds,
            batch_sampler=test_batch_sampler,
            collate_fn=batchify_fn,
            return_list=True)

        if not os.path.exists(args.output_dir):
            os.makedirs(args.output_dir)
        f = open(os.path.join(args.output_dir, "c311_predict.json"), 'w')
        result = {}
        idx = 0
        for step, batch in enumerate(test_data_loader):
            input_ids, segment_ids = batch
            with paddle.no_grad():
                logits = model(input_ids, segment_ids)
            preds = paddle.argmax(logits, axis=1).numpy().tolist()
            for pred in preds:
                result[str(idx)] = pred
                j = json.dumps({"id": idx, "label": pred})
                f.write(j + "\n")
                idx += 1
def do_train():
    paddle.set_device(args.device)
    rank = paddle.distributed.get_rank()
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args.seed)

    resource_file_urls = MODEL_MAP[args.model]['resource_file_urls']
    logger.info("Downloading resource files...")
    for key, val in resource_file_urls.items():
        file_path = os.path.join(args.model, key)
        if not os.path.exists(file_path):
            get_path_from_url(val, args.model)

    tokenizer = AutoTokenizer.from_pretrained(args.model)
    model = UIE.from_pretrained(args.model)

    train_ds = load_dataset(reader,
                            data_path=args.train_path,
                            max_seq_len=args.max_seq_len,
                            lazy=False)
    dev_ds = load_dataset(reader,
                          data_path=args.dev_path,
                          max_seq_len=args.max_seq_len,
                          lazy=False)
    train_ds = train_ds.map(
        partial(convert_example, tokenizer=tokenizer, max_seq_len=args.max_seq_len))
    dev_ds = dev_ds.map(
        partial(convert_example, tokenizer=tokenizer, max_seq_len=args.max_seq_len))

    train_batch_sampler = paddle.io.BatchSampler(dataset=train_ds,
                                                 batch_size=args.batch_size,
                                                 shuffle=True)
    train_data_loader = paddle.io.DataLoader(dataset=train_ds,
                                             batch_sampler=train_batch_sampler,
                                             return_list=True)
    dev_batch_sampler = paddle.io.BatchSampler(dataset=dev_ds,
                                               batch_size=args.batch_size,
                                               shuffle=False)
    dev_data_loader = paddle.io.DataLoader(dataset=dev_ds,
                                           batch_sampler=dev_batch_sampler,
                                           return_list=True)

    if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt):
        state_dict = paddle.load(args.init_from_ckpt)
        model.set_dict(state_dict)

    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    optimizer = paddle.optimizer.AdamW(learning_rate=args.learning_rate,
                                       parameters=model.parameters())

    criterion = paddle.nn.BCELoss()
    metric = SpanEvaluator()

    loss_list = []
    global_step = 0
    best_step = 0
    best_f1 = 0
    tic_train = time.time()
    for epoch in range(1, args.num_epochs + 1):
        for batch in train_data_loader:
            input_ids, token_type_ids, att_mask, pos_ids, start_ids, end_ids = batch
            start_prob, end_prob = model(input_ids, token_type_ids, att_mask,
                                         pos_ids)
            start_ids = paddle.cast(start_ids, 'float32')
            end_ids = paddle.cast(end_ids, 'float32')
            loss_start = criterion(start_prob, start_ids)
            loss_end = criterion(end_prob, end_ids)
            loss = (loss_start + loss_end) / 2.0
            loss.backward()
            optimizer.step()
            optimizer.clear_grad()
            loss_list.append(float(loss))

            global_step += 1
            if global_step % args.logging_steps == 0 and rank == 0:
                time_diff = time.time() - tic_train
                loss_avg = sum(loss_list) / len(loss_list)
                logger.info(
                    "global step %d, epoch: %d, loss: %.5f, speed: %.2f step/s"
                    % (global_step, epoch, loss_avg,
                       args.logging_steps / time_diff))
                tic_train = time.time()

            if global_step % args.valid_steps == 0 and rank == 0:
                save_dir = os.path.join(args.save_dir, "model_%d" % global_step)
                if not os.path.exists(save_dir):
                    os.makedirs(save_dir)
                model_to_save = model._layers if isinstance(
                    model, paddle.DataParallel) else model
                model_to_save.save_pretrained(save_dir)
                tokenizer.save_pretrained(save_dir)

                precision, recall, f1 = evaluate(model, metric, dev_data_loader)
                logger.info(
                    "Evaluation precision: %.5f, recall: %.5f, F1: %.5f" %
                    (precision, recall, f1))
                if f1 > best_f1:
                    logger.info(
                        f"best F1 performance has been updated: {best_f1:.5f} --> {f1:.5f}"
                    )
                    best_f1 = f1
                    save_dir = os.path.join(args.save_dir, "model_best")
                    model_to_save = model._layers if isinstance(
                        model, paddle.DataParallel) else model
                    model_to_save.save_pretrained(save_dir)
                    tokenizer.save_pretrained(save_dir)
                tic_train = time.time()
def do_train():
    parser = PdArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    paddle.set_device(training_args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, world_size: {training_args.world_size}, "
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )

    # Detecting last checkpoint.
    last_checkpoint = None
    if os.path.isdir(training_args.output_dir) and training_args.do_train \
            and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome.")
        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch.")

    # set_seed(args)
    data_args.dataset = data_args.dataset.strip()
    if data_args.dataset not in ALL_DATASETS:
        raise ValueError("Not found dataset {}".format(data_args.dataset))

    # Use yaml config to rewrite all args.
    config = ALL_DATASETS[data_args.dataset]
    for args in (model_args, data_args, training_args):
        for arg in vars(args):
            if arg in config.keys():
                setattr(args, arg, config[arg])
    training_args.per_device_train_batch_size = config["batch_size"]
    training_args.per_device_eval_batch_size = config["batch_size"]

    dataset_config = data_args.dataset.split(" ")
    raw_datasets = load_dataset(
        dataset_config[0],
        None if len(dataset_config) <= 1 else dataset_config[1],
        cache_dir=model_args.cache_dir)

    label_list = getattr(raw_datasets['train'], "label_list", None)
    data_args.label_list = label_list

    # Define tokenizer, model, loss function.
    tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)
    model = AutoModelForQuestionAnswering.from_pretrained(
        model_args.model_name_or_path)
    loss_fct = CrossEntropyLossForSQuAD()

    train_dataset = raw_datasets["train"]
    eval_examples = raw_datasets["validation"]
    predict_examples = raw_datasets["test"]
    column_names = raw_datasets["train"].column_names

    # Dataset pre-process
    train_dataset = train_dataset.map(
        partial(prepare_train_features, tokenizer=tokenizer, args=data_args),
        batched=True,
        num_proc=4,
        remove_columns=column_names,
        load_from_cache_file=not data_args.overwrite_cache,
        desc="Running tokenizer on train dataset",
    )
    eval_dataset = eval_examples.map(
        partial(prepare_validation_features, tokenizer=tokenizer, args=data_args),
        batched=True,
        num_proc=4,
        remove_columns=column_names,
        load_from_cache_file=not data_args.overwrite_cache,
        desc="Running tokenizer on validation dataset",
    )
    predict_dataset = predict_examples.map(
        partial(prepare_validation_features, tokenizer=tokenizer, args=data_args),
        batched=True,
        num_proc=4,
        remove_columns=column_names,
        load_from_cache_file=not data_args.overwrite_cache,
        desc="Running tokenizer on prediction dataset",
    )

    # Define data collector
    data_collator = qa_collator(tokenizer, data_args)

    # Post-processing:
    def post_processing_function(examples, features, predictions, stage="eval"):
        # Post-processing: we match the start logits and end logits to answers
        # in the original context.
        predictions, all_nbest_json, scores_diff_json = compute_prediction(
            examples=examples,
            features=features,
            predictions=predictions,
            n_best_size=data_args.n_best_size,
            max_answer_length=data_args.max_answer_length,
            null_score_diff_threshold=data_args.null_score_diff_threshold,
        )
        # Format the result to the format the metric expects.
        formatted_predictions = [{
            "id": k,
            "prediction_text": v
        } for k, v in predictions.items()]
        references = [{
            "id": ex["id"],
            "answers": ex["answers"]
        } for ex in examples]
        return EvalPrediction(predictions=formatted_predictions,
                              label_ids=references)

    # Define the metrics of tasks.
    # Metrics
    metric = load_metric("squad")

    def compute_metrics(p: EvalPrediction):
        return metric.compute(predictions=p.predictions,
                              references=p.label_ids)

    trainer = QuestionAnsweringTrainer(
        model=model,
        criterion=loss_fct,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        eval_examples=eval_examples,
        data_collator=data_collator,
        post_process_function=post_processing_function,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    # Log model and data config
    trainer.print_config(model_args, "Model")
    trainer.print_config(data_args, "Data")

    checkpoint = None
    if training_args.resume_from_checkpoint is not None:
        checkpoint = training_args.resume_from_checkpoint
    elif last_checkpoint is not None:
        checkpoint = last_checkpoint

    # Training
    train_result = trainer.train(resume_from_checkpoint=checkpoint)
    metrics = train_result.metrics
    trainer.save_model()  # Saves the tokenizer too for easy upload
    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()

    # Evaluate and test the model
    eval_metrics = trainer.evaluate()
    trainer.log_metrics("eval", eval_metrics)

    test_ret = trainer.predict(predict_dataset, predict_examples)
    trainer.log_metrics("predict", test_ret.metrics)
    if test_ret.label_ids is None:
        paddle.save(
            test_ret.predictions,
            os.path.join(training_args.output_dir, "test_results.pdtensor"),
        )

    # Export the inference model
    input_spec = [
        paddle.static.InputSpec(shape=[None, None], dtype="int64"),  # input_ids
        paddle.static.InputSpec(shape=[None, None], dtype="int64")   # segment_ids
    ]
    trainer.export_model(input_spec=input_spec,
                         load_best_model=True,
                         output_dir=model_args.export_model_dir)
def do_train():
    parser = PdArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    paddle.set_device(training_args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, world_size: {training_args.world_size}, "
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )

    # Detecting last checkpoint.
    last_checkpoint = None
    if os.path.isdir(training_args.output_dir) and training_args.do_train \
            and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome.")
        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch.")

    # set_seed(args)
    data_args.dataset = data_args.dataset.strip()
    if data_args.dataset not in ALL_DATASETS:
        raise ValueError("Not found dataset {}".format(data_args.dataset))

    # Use yaml config to rewrite all args.
    config = ALL_DATASETS[data_args.dataset]
    for args in (model_args, data_args, training_args):
        for arg in vars(args):
            if arg in config.keys():
                setattr(args, arg, config[arg])
    training_args.per_device_train_batch_size = config["batch_size"]
    training_args.per_device_eval_batch_size = config["batch_size"]

    dataset_config = data_args.dataset.split(" ")
    raw_datasets = load_dataset(
        dataset_config[0],
        None if len(dataset_config) <= 1 else dataset_config[1],
    )

    data_args.label_list = getattr(raw_datasets['train'], "label_list", None)
    num_classes = 1 if raw_datasets["train"].label_list is None else len(
        raw_datasets['train'].label_list)

    # Define tokenizer, model, loss function.
    tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_args.model_name_or_path, num_classes=num_classes)
    loss_fct = nn.loss.CrossEntropyLoss(
    ) if data_args.label_list else nn.loss.MSELoss()

    # Define dataset pre-process function
    if "clue" in data_args.dataset:
        trans_fn = partial(clue_trans_fn, tokenizer=tokenizer, args=data_args)
    else:
        trans_fn = partial(seq_trans_fn, tokenizer=tokenizer, args=data_args)

    # Define data collector
    batchify_fn = defaut_collator(tokenizer, data_args)

    # Dataset pre-process
    train_dataset = raw_datasets["train"].map(trans_fn)
    eval_dataset = raw_datasets["dev"].map(trans_fn)
    test_dataset = raw_datasets["test"].map(trans_fn)

    # Define the metrics of tasks.
    def compute_metrics(p):
        preds = p.predictions[0] if isinstance(p.predictions,
                                               tuple) else p.predictions
        preds = paddle.to_tensor(preds)
        label = paddle.to_tensor(p.label_ids)

        probs = F.softmax(preds, axis=1)
        metric = Accuracy()
        metric.reset()
        result = metric.compute(preds, label)
        metric.update(result)
        accu = metric.accumulate()
        metric.reset()
        return {"accuracy": accu}

    trainer = Trainer(
        model=model,
        criterion=loss_fct,
        args=training_args,
        data_collator=batchify_fn,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    # Log model and data config
    trainer.print_config(model_args, "Model")
    trainer.print_config(data_args, "Data")

    checkpoint = None
    if training_args.resume_from_checkpoint is not None:
        checkpoint = training_args.resume_from_checkpoint
    elif last_checkpoint is not None:
        checkpoint = last_checkpoint

    # Training
    train_result = trainer.train(resume_from_checkpoint=checkpoint)
    metrics = train_result.metrics
    trainer.save_model()  # Saves the tokenizer too for easy upload
    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()

    # Evaluate and test the model
    eval_metrics = trainer.evaluate()
    trainer.log_metrics("eval", eval_metrics)

    test_ret = trainer.predict(test_dataset)
    trainer.log_metrics("test", test_ret.metrics)
    if test_ret.label_ids is None:
        paddle.save(
            test_ret.predictions,
            os.path.join(training_args.output_dir, "test_results.pdtensor"),
        )

    # Export the inference model
    input_spec = [
        paddle.static.InputSpec(shape=[None, None], dtype="int64"),  # input_ids
        paddle.static.InputSpec(shape=[None, None], dtype="int64")   # segment_ids
    ]
    trainer.export_model(input_spec=input_spec,
                         load_best_model=True,
                         output_dir=model_args.export_model_dir)
parser.add_argument("--hnsw_m", default=100, type=int, help="Max number of connections (M) per node when building the HNSW ANN index.") parser.add_argument("--hnsw_ef", default=100, type=int, help="Size of the dynamic candidate list (ef) used to build and query the HNSW ANN index.") parser.add_argument("--hnsw_max_elements", default=1000000, type=int, help="Maximum number of embeddings the HNSW ANN index can hold.") parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to train model, defaults to gpu.") args = parser.parse_args() # yapf: enable if __name__ == "__main__": paddle.set_device(args.device) rank = paddle.distributed.get_rank() if paddle.distributed.get_world_size() > 1: paddle.distributed.init_parallel_env() model_name_or_path = 'rocketqa-zh-dureader-query-encoder' tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) trans_func = partial(convert_example_test, tokenizer=tokenizer, max_seq_length=args.max_seq_length) batchify_fn = lambda samples, fn=Tuple( Pad(axis=0, pad_val=tokenizer.pad_token_id), # text_input Pad(axis=0, pad_val=tokenizer.pad_token_type_id), # text_segment ): [data for data in fn(samples)] pretrained_model = AutoModel.from_pretrained(model_name_or_path) model = SimCSE(pretrained_model, output_emb_size=args.output_emb_size) model = paddle.DataParallel(model)
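The --hnsw_* flags above parameterize an approximate nearest neighbor index. A minimal sketch of how such an index is typically built with the hnswlib package (corpus_embeddings and query_embeddings are placeholder numpy arrays, not outputs of this script, and the dimension is illustrative):

import numpy as np
import hnswlib

dim = 256
corpus_embeddings = np.random.rand(1000, dim).astype("float32")  # placeholder corpus vectors
query_embeddings = np.random.rand(8, dim).astype("float32")      # placeholder query vectors

index = hnswlib.Index(space="ip", dim=dim)      # inner-product similarity
index.init_index(max_elements=1000000,          # --hnsw_max_elements: index capacity
                 ef_construction=100,           # --hnsw_ef: candidate list size while building
                 M=100)                         # --hnsw_m: max graph connections per node
index.add_items(corpus_embeddings)
index.set_ef(100)                               # candidate list size while querying
neighbors, distances = index.knn_query(query_embeddings, k=10)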
def main(): parser = PdArgumentParser( (ModelArguments, DataArguments, TrainingArguments)) model_args, data_args, training_args = parser.parse_args_into_dataclasses() # Log model and data config training_args.print_config(model_args, "Model") training_args.print_config(data_args, "Data") paddle.set_device(training_args.device) # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, world_size: {training_args.world_size}, " + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) # Detecting last checkpoint. last_checkpoint = None if os.path.isdir( training_args.output_dir ) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is None and len(os.listdir( training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome.") elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) data_args.dataset = data_args.dataset.strip() dataset_config = data_args.dataset.split(" ") print(dataset_config) raw_datasets = load_dataset( dataset_config[0], name=None if len(dataset_config) <= 1 else dataset_config[1], splits=('train', 'dev')) data_args.label_list = getattr(raw_datasets['train'], "label_list", None) num_classes = 1 if raw_datasets["train"].label_list == None else len( raw_datasets['train'].label_list) # Define tokenizer, model, loss function. tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path) model = AutoModelForSequenceClassification.from_pretrained( model_args.model_name_or_path, num_classes=num_classes) criterion = nn.loss.CrossEntropyLoss( ) if data_args.label_list else nn.loss.MSELoss() # Define dataset pre-process function trans_fn = partial(clue_trans_fn, tokenizer=tokenizer, args=data_args) # Define data collector data_collator = DataCollatorWithPadding(tokenizer) # Dataset pre-process if training_args.do_train: train_dataset = raw_datasets["train"].map(trans_fn) if training_args.do_eval: eval_dataset = raw_datasets["dev"].map(trans_fn) if training_args.do_predict: test_dataset = raw_datasets["test"].map(trans_fn) # Define the metrics of tasks. 
def compute_metrics(p): preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions preds = paddle.to_tensor(preds) label = paddle.to_tensor(p.label_ids) probs = F.softmax(preds, axis=1) metric = Accuracy() metric.reset() result = metric.compute(preds, label) metric.update(result) accu = metric.accumulate() metric.reset() return {"accuracy": accu} trainer = Trainer( model=model, criterion=criterion, args=training_args, data_collator=data_collator, train_dataset=train_dataset if training_args.do_train else None, eval_dataset=eval_dataset if training_args.do_eval else None, tokenizer=tokenizer, compute_metrics=compute_metrics, ) checkpoint = None if training_args.resume_from_checkpoint is not None: checkpoint = training_args.resume_from_checkpoint elif last_checkpoint is not None: checkpoint = last_checkpoint # Training if training_args.do_train: train_result = trainer.train(resume_from_checkpoint=checkpoint) metrics = train_result.metrics trainer.save_model() trainer.log_metrics("train", metrics) trainer.save_metrics("train", metrics) trainer.save_state() # Evaluate and tests model if training_args.do_eval: eval_metrics = trainer.evaluate() trainer.log_metrics("eval", eval_metrics) if training_args.do_predict: test_ret = trainer.predict(test_dataset) trainer.log_metrics("test", test_ret.metrics) if test_ret.label_ids is None: paddle.save( test_ret.predictions, os.path.join(training_args.output_dir, "test_results.pdtensor"), ) # export inference model if training_args.do_export: # You can also load from certain checkpoint # trainer.load_state_dict_from_checkpoint("/path/to/checkpoint/") input_spec = [ paddle.static.InputSpec(shape=[None, None], dtype="int64"), # input_ids paddle.static.InputSpec(shape=[None, None], dtype="int64") # segment_ids ] if model_args.export_model_dir is None: model_args.export_model_dir = os.path.join( training_args.output_dir, "export") paddlenlp.transformers.export_model(model=trainer.model, input_spec=input_spec, path=model_args.export_model_dir)
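Once paddlenlp.transformers.export_model has written the static graph, it can be loaded back with the Paddle inference API. A rough sketch, assuming the export produced model.pdmodel / model.pdiparams under the export directory and that the inputs are already padded to equal lengths:

import numpy as np
import paddle.inference as paddle_infer

config = paddle_infer.Config("export/model.pdmodel", "export/model.pdiparams")
predictor = paddle_infer.create_predictor(config)

input_ids = np.array([[1, 647, 986, 2]], dtype="int64")  # placeholder token ids
token_type_ids = np.zeros_like(input_ids)

input_names = predictor.get_input_names()
predictor.get_input_handle(input_names[0]).copy_from_cpu(input_ids)
predictor.get_input_handle(input_names[1]).copy_from_cpu(token_type_ids)
predictor.run()

output_name = predictor.get_output_names()[0]
logits = predictor.get_output_handle(output_name).copy_to_cpu()
print(logits.shape)  # [batch_size, num_classes]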
def do_train(args): paddle.set_device(args.device) if paddle.distributed.get_world_size() > 1: paddle.distributed.init_parallel_env() set_seed(args) tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path) trans_func = partial(convert_example, tokenizer=tokenizer, max_seq_length=args.max_seq_length) if args.task_type == "cross-lingual-transfer": train_ds = load_dataset("xnli", "en", splits="train") train_ds = train_ds.map(trans_func, lazy=True) elif args.task_type == "translate-train-all": all_train_ds = [] for language in all_languages: train_ds = load_dataset("xnli", language, splits="train") all_train_ds.append(train_ds.map(trans_func, lazy=True)) train_ds = XnliDataset(all_train_ds) train_batch_sampler = DistributedBatchSampler(train_ds, batch_size=args.batch_size, shuffle=True) batchify_fn = lambda samples, fn=Tuple( Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64" ), # input_ids Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64" ), # position_ids Pad(axis=0, pad_val=0, dtype="int64"), # attention_mask Stack(dtype="int64") # labels ): fn(samples) train_data_loader = DataLoader(dataset=train_ds, batch_sampler=train_batch_sampler, collate_fn=batchify_fn, num_workers=0, return_list=True) num_classes = 3 model = AutoModelForSequenceClassification.from_pretrained( args.model_name_or_path, num_classes=num_classes, dropout=args.dropout) n_layers = model.ernie_m.config['num_hidden_layers'] if paddle.distributed.get_world_size() > 1: model = paddle.DataParallel(model) if args.max_steps > 0: num_training_steps = args.max_steps num_train_epochs = math.ceil(num_training_steps / len(train_data_loader)) else: num_training_steps = len(train_data_loader) * args.num_train_epochs num_train_epochs = args.num_train_epochs warmup = args.warmup_steps if args.warmup_steps > 0 else args.warmup_proportion lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, warmup) # Generate parameter names needed to perform weight decay. # All bias and LayerNorm parameters are excluded. 
decay_params = [ p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ] # Construct dict name_dict = dict() for n, p in model.named_parameters(): name_dict[p.name] = n optimizer = AdamWDL(learning_rate=lr_scheduler, beta1=0.9, beta2=0.999, epsilon=args.adam_epsilon, parameters=model.parameters(), weight_decay=args.weight_decay, n_layers=n_layers, layerwise_decay=args.layerwise_decay, apply_decay_param_fun=lambda x: x in decay_params, name_dict=name_dict) loss_fct = nn.CrossEntropyLoss() if args.use_amp: scaler = paddle.amp.GradScaler(init_loss_scaling=args.scale_loss) metric = Accuracy() global_step = 0 tic_train = time.time() for epoch in range(num_train_epochs): for step, batch in enumerate(train_data_loader): global_step += 1 input_ids, position_ids, attention_mask, labels = batch with paddle.amp.auto_cast( args.use_amp, custom_white_list=["layer_norm", "softmax", "gelu"]): logits = model(input_ids, position_ids, attention_mask) loss = loss_fct(logits, labels) if args.use_amp: scaled_loss = scaler.scale(loss) scaled_loss.backward() scaler.minimize(optimizer, scaled_loss) else: loss.backward() optimizer.step() lr_scheduler.step() optimizer.clear_grad() if global_step % args.logging_steps == 0: print( "global step %d/%d, epoch: %d, batch: %d, rank_id: %s, loss: %f, lr: %.10f, speed: %.4f step/s" % (global_step, num_training_steps, epoch, step, paddle.distributed.get_rank(), loss, optimizer.get_lr(), args.logging_steps / (time.time() - tic_train))) tic_train = time.time() if global_step % args.save_steps == 0 or global_step == num_training_steps: for language in all_languages: tic_eval = time.time() test_data_loader = get_test_dataloader( args, language, batchify_fn, trans_func) evaluate(model, loss_fct, metric, test_data_loader, language) print("eval done total : %s s" % (time.time() - tic_eval)) if paddle.distributed.get_rank() == 0: output_dir = os.path.join( args.output_dir, "ernie_m_ft_model_%d.pdparams" % (global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) # Need better way to get inner model of DataParallel model_to_save = model._layers if isinstance( model, paddle.DataParallel) else model model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) if global_step >= num_training_steps: break if global_step >= num_training_steps: break if paddle.distributed.get_rank() == 0: output_dir = os.path.join( args.output_dir, "ernie_m_final_model_%d.pdparams" % global_step) if not os.path.exists(output_dir): os.makedirs(output_dir) # Need better way to get inner model of DataParallel model_to_save = model._layers if isinstance( model, paddle.DataParallel) else model model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir)
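The AdamWDL optimizer configured earlier in this function applies layer-wise learning-rate decay: parameters in lower encoder layers get a smaller learning rate than those near the task head. A simplified illustration of the scaling rule, not the exact AdamWDL internals; the depth convention used here is an assumption:

def layerwise_lr(base_lr, layerwise_decay, n_layers, depth):
    # depth: 0 for embeddings, 1..n_layers for encoder layers, n_layers + 1 for the head (assumed)
    return base_lr * layerwise_decay ** (n_layers + 1 - depth)

for depth in range(0, 14):
    print(depth, layerwise_lr(5e-5, 0.8, 12, depth))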
def main(): parser = PdArgumentParser( (ModelArguments, DataArguments, TrainingArguments)) model_args, data_args, training_args = parser.parse_args_into_dataclasses() paddle.set_device(training_args.device) data_args.dataset = data_args.dataset.strip() if data_args.dataset in ALL_DATASETS: # if you custom you hyper-parameters in yaml config, it will overwrite all args. config = ALL_DATASETS[data_args.dataset] for args in (model_args, data_args, training_args): for arg in vars(args): if arg in config.keys(): setattr(args, arg, config[arg]) training_args.per_device_train_batch_size = config["batch_size"] training_args.per_device_eval_batch_size = config["batch_size"] # Log model and data config training_args.print_config(model_args, "Model") training_args.print_config(data_args, "Data") dataset_config = data_args.dataset.split(" ") raw_datasets = load_dataset( dataset_config[0], None if len(dataset_config) <= 1 else dataset_config[1], cache_dir=model_args.cache_dir) label_list = getattr(raw_datasets['train'], "label_list", None) data_args.label_list = label_list # Define tokenizer, model, loss function. tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path) model = AutoModelForQuestionAnswering.from_pretrained( model_args.model_name_or_path) loss_fct = CrossEntropyLossForSQuAD() # Preprocessing the datasets. # Preprocessing is slighlty different for training and evaluation. column_names = raw_datasets["train"].column_names column_names = raw_datasets["validation"].column_names train_dataset = raw_datasets["train"] # Create train feature from dataset with training_args.main_process_first( desc="train dataset map pre-processing"): # Dataset pre-process train_dataset = train_dataset.map( partial( prepare_train_features, tokenizer=tokenizer, args=data_args), batched=True, num_proc=4, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, desc="Running tokenizer on train dataset", ) eval_examples = raw_datasets["validation"] with training_args.main_process_first( desc="evaluate dataset map pre-processing"): eval_dataset = eval_examples.map( partial( prepare_validation_features, tokenizer=tokenizer, args=data_args), batched=True, num_proc=4, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, desc="Running tokenizer on validation dataset", ) # Define data collector data_collator = DataCollatorWithPadding(tokenizer) # Post-processing: def post_processing_function(examples, features, predictions, stage="eval"): # Post-processing: we match the start logits and end logits to answers in the original context. 
predictions, all_nbest_json, scores_diff_json = compute_prediction( examples=examples, features=features, predictions=predictions, n_best_size=data_args.n_best_size, max_answer_length=data_args.max_answer_length, null_score_diff_threshold=data_args.null_score_diff_threshold, ) references = [{ "id": ex["id"], "answers": ex["answers"] } for ex in examples] return EvalPrediction(predictions=predictions, label_ids=references) trainer = QuestionAnsweringTrainer( model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset, eval_examples=eval_examples, data_collator=data_collator, post_process_function=post_processing_function, tokenizer=tokenizer) output_dir = os.path.join(model_args.model_name_or_path, "compress") if not os.path.exists(output_dir): os.makedirs(output_dir) prune = True compress_config = CompressConfig(quantization_config=PTQConfig( algo_list=['hist', 'mse'], batch_size_list=[4, 8, 16])) trainer.compress( data_args.dataset, output_dir, pruning=prune, quantization=True, compress_config=compress_config)
def run(args): if args.do_train: assert args.batch_size % args.gradient_accumulation_steps == 0, \ "Please make sure argmument `batch_size` must be divisible by `gradient_accumulation_steps`." paddle.set_device(args.device) set_seed(args) max_seq_length = args.max_seq_length max_num_choices = 10 def preprocess_function(examples, do_predict=False): SPIECE_UNDERLINE = '▁' def _is_chinese_char(cp): if ((cp >= 0x4E00 and cp <= 0x9FFF) or # (cp >= 0x3400 and cp <= 0x4DBF) or # (cp >= 0x20000 and cp <= 0x2A6DF) or # (cp >= 0x2A700 and cp <= 0x2B73F) or # (cp >= 0x2B740 and cp <= 0x2B81F) or # (cp >= 0x2B820 and cp <= 0x2CEAF) or (cp >= 0xF900 and cp <= 0xFAFF) or # (cp >= 0x2F800 and cp <= 0x2FA1F)): # return True return False def is_fuhao(c): if c == '。' or c == ',' or c == '!' or c == '?' or c == ';' or c == '、' or c == ':' or c == '(' or c == ')' \ or c == '-' or c == '~' or c == '「' or c == '《' or c == '》' or c == ',' or c == '」' or c == '"' or c == '“' or c == '”' \ or c == '$' or c == '『' or c == '』' or c == '—' or c == ';' or c == '。' or c == '(' or c == ')' or c == '-' or c == '~' or c == '。' \ or c == '‘' or c == '’': return True return False def _tokenize_chinese_chars(text): """Adds whitespace around any CJK character.""" output = [] is_blank = False for index, char in enumerate(text): cp = ord(char) if is_blank: output.append(char) if context[index - 12:index + 1].startswith("#idiom"): is_blank = False output.append(SPIECE_UNDERLINE) else: if text[index:index + 6] == "#idiom": is_blank = True if len(output) > 0 and output[-1] != SPIECE_UNDERLINE: output.append(SPIECE_UNDERLINE) output.append(char) elif _is_chinese_char(cp) or is_fuhao(char): if len(output) > 0 and output[-1] != SPIECE_UNDERLINE: output.append(SPIECE_UNDERLINE) output.append(char) output.append(SPIECE_UNDERLINE) else: output.append(char) return "".join(output) def is_whitespace(c): if c == " " or c == "\t" or c == "\r" or c == "\n" or ord( c) == 0x202F or c == SPIECE_UNDERLINE: return True return False def add_tokens_for_around(tokens, pos, num_tokens): num_l = num_tokens // 2 num_r = num_tokens - num_l if pos >= num_l and (len(tokens) - 1 - pos) >= num_r: tokens_l = tokens[pos - num_l:pos] tokens_r = tokens[pos + 1:pos + 1 + num_r] elif pos <= num_l: tokens_l = tokens[:pos] right_len = num_tokens - len(tokens_l) tokens_r = tokens[pos + 1:pos + 1 + right_len] elif (len(tokens) - 1 - pos) <= num_r: tokens_r = tokens[pos + 1:] left_len = num_tokens - len(tokens_r) tokens_l = tokens[pos - left_len:pos] else: raise ValueError('impossible') return tokens_l, tokens_r max_tokens_for_doc = max_seq_length - 3 num_tokens = max_tokens_for_doc - 5 num_examples = len(examples.data["candidates"]) if do_predict: result = {"input_ids": [], "token_type_ids": [], "example_ids": []} else: result = { "input_ids": [], "token_type_ids": [], "labels": [], "example_ids": [] } for idx in range(num_examples): candidate = 0 options = examples.data['candidates'][idx] # Each content may have several sentences. for context in examples.data['content'][idx]: context = context.replace("“", "\"").replace("”", "\"").replace("——", "--"). 
\ replace("—", "-").replace("―", "-").replace("…", "...").replace("‘", "\'").replace("’", "\'") context = _tokenize_chinese_chars(context) paragraph_text = context.strip() doc_tokens = [] prev_is_whitespace = True for c in paragraph_text: if is_whitespace(c): prev_is_whitespace = True else: if prev_is_whitespace: doc_tokens.append(c) else: doc_tokens[-1] += c prev_is_whitespace = False all_doc_tokens = [] for (i, token) in enumerate(doc_tokens): if '#idiom' in token: sub_tokens = [str(token)] else: sub_tokens = tokenizer.tokenize(token) for sub_token in sub_tokens: all_doc_tokens.append(sub_token) tags = [blank for blank in doc_tokens if '#idiom' in blank] # Each sentence may have several tags for tag_index, tag in enumerate(tags): pos = all_doc_tokens.index(tag) tmp_l, tmp_r = add_tokens_for_around( all_doc_tokens, pos, num_tokens) num_l = len(tmp_l) num_r = len(tmp_r) tokens_l = [] for token in tmp_l: if '#idiom' in token and token != tag: # Mask tag which is not considered in this new sample. # Each idiom has four words, so 4 mask tokens are used. tokens_l.extend(['[MASK]'] * 4) else: tokens_l.append(token) tokens_l = tokens_l[-num_l:] del tmp_l tokens_r = [] for token in tmp_r: if '#idiom' in token and token != tag: tokens_r.extend(['[MASK]'] * 4) else: tokens_r.append(token) tokens_r = tokens_r[:num_r] del tmp_r tokens_list = [] # Each tag has ten choices, and the shape of each new # example is [num_choices, seq_len] for i, elem in enumerate(options): option = tokenizer.tokenize(elem) tokens = option + ['[SEP]'] + tokens_l + ['[unused1]' ] + tokens_r tokens_list.append(tokens) new_data = tokenizer(tokens_list, is_split_into_words=True) # Final shape of input_ids: [batch_size, num_choices, seq_len] result["input_ids"].append(new_data["input_ids"]) result["token_type_ids"].append(new_data["token_type_ids"]) result["example_ids"].append(idx) if not do_predict: label = examples.data["answers"][idx]["candidate_id"][ candidate] result["labels"].append(label) candidate += 1 if (idx + 1) % 10000 == 0: logger.info("%d samples have been processed." 
% (idx + 1)) return result if paddle.distributed.get_world_size() > 1: paddle.distributed.init_parallel_env() model = AutoModelForMultipleChoice.from_pretrained( args.model_name_or_path, num_choices=max_num_choices) tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path) if paddle.distributed.get_world_size() > 1: model = paddle.DataParallel(model) train_ds, dev_ds, test_ds = load_dataset( "clue", "chid", split=["train", "validation", "test"]) if args.do_train: args.batch_size = int(args.batch_size / args.gradient_accumulation_steps) column_names = train_ds.column_names with main_process_first(desc="train dataset map pre-processing"): train_ds = train_ds.map( partial(preprocess_function), batched=True, batch_size=len(train_ds), num_proc=args.num_proc, remove_columns=column_names, load_from_cache_file=not args.overwrite_cache, desc="Running tokenizer on train dataset") batchify_fn = lambda samples, fn=Dict( { 'input_ids': Pad(axis=1, pad_val=tokenizer.pad_token_id ), # input 'token_type_ids': Pad( axis=1, pad_val=tokenizer.pad_token_type_id), # segment 'labels': Stack(dtype="int64"), # label 'example_ids': Stack(dtype="int64"), # example id }): fn(samples) train_batch_sampler = paddle.io.DistributedBatchSampler( train_ds, batch_size=args.batch_size, shuffle=True) train_data_loader = paddle.io.DataLoader( dataset=train_ds, batch_sampler=train_batch_sampler, collate_fn=batchify_fn, num_workers=0, return_list=True) with main_process_first(desc="evaluate dataset map pre-processing"): dev_ds = dev_ds.map(partial(preprocess_function), batched=True, batch_size=len(dev_ds), remove_columns=column_names, num_proc=args.num_proc, load_from_cache_file=args.overwrite_cache, desc="Running tokenizer on validation dataset") dev_batch_sampler = paddle.io.BatchSampler( dev_ds, batch_size=args.eval_batch_size, shuffle=False) dev_data_loader = paddle.io.DataLoader(dataset=dev_ds, batch_sampler=dev_batch_sampler, collate_fn=batchify_fn, return_list=True) num_training_steps = int( args.max_steps / args.gradient_accumulation_steps) if args.max_steps >= 0 else int( len(train_data_loader) * args.num_train_epochs / args.gradient_accumulation_steps) warmup = args.warmup_steps if args.warmup_steps > 0 else args.warmup_proportion lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, warmup) # Generate parameter names needed to perform weight decay. # All bias and LayerNorm parameters are excluded. 
decay_params = [ p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ] grad_clip = paddle.nn.ClipGradByGlobalNorm(args.max_grad_norm) optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, parameters=model.parameters(), weight_decay=args.weight_decay, apply_decay_param_fun=lambda x: x in decay_params, grad_clip=grad_clip) loss_fct = nn.CrossEntropyLoss() model.train() global_step = 0 best_acc = 0.0 tic_train = time.time() for epoch in range(args.num_train_epochs): for step, batch in enumerate(train_data_loader): input_ids, segment_ids, labels, example_ids = batch logits = model(input_ids=input_ids, token_type_ids=segment_ids) loss = loss_fct(logits, labels) if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: global_step += 1 optimizer.step() lr_scheduler.step() optimizer.clear_grad() if global_step % args.logging_steps == 0: logger.info( "global step %d/%d, epoch: %d, batch: %d, loss: %.5f, speed: %.2f step/s" % (global_step, num_training_steps, epoch, step + 1, loss, args.logging_steps / (time.time() - tic_train))) tic_train = time.time() if global_step >= num_training_steps: logger.info("best_result: %.2f" % (best_acc * 100)) return tic_eval = time.time() acc = evaluate(model, dev_data_loader) logger.info("eval acc: %.5f, eval done total : %s s" % (acc, time.time() - tic_eval)) if paddle.distributed.get_rank() == 0 and acc > best_acc: best_acc = acc if args.save_best_model: model_to_save = model._layers if isinstance( model, paddle.DataParallel) else model if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) logger.info("best_result: %.2f" % (best_acc * 100)) if args.do_predict: column_names = test_ds.column_names test_ds = test_ds.map(partial(preprocess_function, do_predict=True), batched=True, batch_size=len(test_ds), remove_columns=column_names, num_proc=args.num_proc) test_batch_sampler = paddle.io.BatchSampler( test_ds, batch_size=args.eval_batch_size, shuffle=False) batchify_fn = lambda samples, fn=Dict({ 'input_ids': Pad(axis=1, pad_val=tokenizer.pad_token_id), # input 'token_type_ids': Pad(axis=1, pad_val=tokenizer.pad_token_type_id), # segment 'example_ids': Stack(dtype="int64"), # example id }): fn(samples) test_data_loader = paddle.io.DataLoader( dataset=test_ds, batch_sampler=test_batch_sampler, collate_fn=batchify_fn, return_list=True) result = {} idx = 623377 preds = evaluate(model, test_data_loader, do_predict=True) for pred in preds: result["#idiom" + str(idx) + "#"] = pred idx += 1 if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) with open(os.path.join(args.output_dir, 'chid11_predict.json'), "w") as writer: json.dump(result, writer, indent=2)
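The training loop above divides the loss by gradient_accumulation_steps and only steps the optimizer every N micro-batches. Stripped of the task specifics, the pattern looks roughly like this, with a tiny stand-in model and data in place of the script's objects:

import paddle
import paddle.nn as nn

# Stand-in model/data so the snippet runs on its own; the real script uses the
# multiple-choice model and dataloader built above.
model = nn.Linear(8, 2)
loss_fct = nn.CrossEntropyLoss()
lr_scheduler = paddle.optimizer.lr.LinearWarmup(learning_rate=1e-3, warmup_steps=2, start_lr=0.0, end_lr=1e-3)
optimizer = paddle.optimizer.AdamW(learning_rate=lr_scheduler, parameters=model.parameters())
loader = [(paddle.randn([4, 8]), paddle.randint(0, 2, [4])) for _ in range(8)]

accum_steps = 4
for step, (inputs, labels) in enumerate(loader):
    loss = loss_fct(model(inputs), labels) / accum_steps  # scale so the summed gradients average out
    loss.backward()                                       # gradients accumulate across micro-batches
    if (step + 1) % accum_steps == 0:
        optimizer.step()
        lr_scheduler.step()
        optimizer.clear_grad()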
def run(args): paddle.set_device(args.device) if paddle.distributed.get_world_size() > 1: paddle.distributed.init_parallel_env() rank = paddle.distributed.get_rank() tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path) set_seed(args) train_examples, dev_examples, test_examples = load_dataset( 'cmrc2018', split=["train", "validation", "test"]) column_names = train_examples.column_names if rank == 0: if os.path.exists(args.model_name_or_path): print("init checkpoint from %s" % args.model_name_or_path) model = AutoModelForQuestionAnswering.from_pretrained( args.model_name_or_path) if paddle.distributed.get_world_size() > 1: model = paddle.DataParallel(model) def prepare_train_features(examples): # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results # in one example possible giving several features when a context is long, each of those features having a # context that overlaps a bit the context of the previous feature. # NOTE: Almost the same functionality as HuggingFace's prepare_train_features function. The main difference is # that HugggingFace uses ArrowTable as basic data structure, while we use list of dictionary instead. contexts = examples['context'] questions = examples['question'] tokenized_examples = tokenizer(questions, contexts, stride=args.doc_stride, max_seq_len=args.max_seq_length) # Since one example might give us several features if it has a long context, we need a map from a feature to # its corresponding example. This key gives us just that. sample_mapping = tokenized_examples.pop("overflow_to_sample") # The offset mappings will give us a map from token to character position in the original context. This will # help us compute the start_positions and end_positions. offset_mapping = tokenized_examples.pop("offset_mapping") # Let's label those examples! tokenized_examples["start_positions"] = [] tokenized_examples["end_positions"] = [] for i, offsets in enumerate(offset_mapping): # We will label impossible answers with the index of the CLS token. input_ids = tokenized_examples["input_ids"][i] cls_index = input_ids.index(tokenizer.cls_token_id) # Grab the sequence corresponding to that example (to know what is the context and what is the question). sequence_ids = tokenized_examples['token_type_ids'][i] # One example can give several spans, this is the index of the example containing this span of text. sample_index = sample_mapping[i] answers = examples['answers'][sample_index] # If no answers are given, set the cls_index as answer. if len(answers["answer_start"]) == 0: tokenized_examples["start_positions"].append(cls_index) tokenized_examples["end_positions"].append(cls_index) else: # Start/end character index of the answer in the text. start_char = answers["answer_start"][0] end_char = start_char + len(answers["text"][0]) # Start token index of the current span in the text. token_start_index = 0 while sequence_ids[token_start_index] != 1: token_start_index += 1 # End token index of the current span in the text. token_end_index = len(input_ids) - 1 while sequence_ids[token_end_index] != 1: token_end_index -= 1 token_end_index -= 1 # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index). 
if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char): tokenized_examples["start_positions"].append(cls_index) tokenized_examples["end_positions"].append(cls_index) else: # Otherwise move the token_start_index and token_end_index to the two ends of the answer. # Note: we could go after the last offset if the answer is the last word (edge case). while token_start_index < len(offsets) and offsets[ token_start_index][0] <= start_char: token_start_index += 1 tokenized_examples["start_positions"].append( token_start_index - 1) while offsets[token_end_index][1] >= end_char: token_end_index -= 1 tokenized_examples["end_positions"].append( token_end_index + 1) return tokenized_examples def prepare_validation_features(examples): # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results # in one example possible giving several features when a context is long, each of those features having a # context that overlaps a bit the context of the previous feature. #NOTE: Almost the same functionality as HuggingFace's prepare_train_features function. The main difference is # that HuggingFace uses ArrowTable as basic data structure, while we use list of dictionary instead. contexts = examples['context'] questions = examples['question'] tokenized_examples = tokenizer(questions, contexts, stride=args.doc_stride, max_seq_len=args.max_seq_length, return_attention_mask=True) # Since one example might give us several features if it has a long context, we need a map from a feature to # its corresponding example. This key gives us just that. sample_mapping = tokenized_examples.pop("overflow_to_sample") # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the # corresponding example_id and we will store the offset mappings. tokenized_examples["example_id"] = [] for i in range(len(tokenized_examples["input_ids"])): # Grab the sequence corresponding to that example (to know what is the context and what is the question). sequence_ids = tokenized_examples['token_type_ids'][i] context_index = 1 # One example can give several spans, this is the index of the example containing this span of text. sample_index = sample_mapping[i] tokenized_examples["example_id"].append( examples["id"][sample_index]) # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token # position is part of the context or not. 
tokenized_examples["offset_mapping"][i] = [ (o if sequence_ids[k] == context_index else None) for k, o in enumerate(tokenized_examples["offset_mapping"][i]) ] return tokenized_examples if args.do_train: args.batch_size = int(args.batch_size / args.gradient_accumulation_steps) train_ds = train_examples.map(prepare_train_features, batched=True, remove_columns=column_names, num_proc=1) train_batch_sampler = paddle.io.DistributedBatchSampler( train_ds, batch_size=args.batch_size, shuffle=True) train_batchify_fn = lambda samples, fn=Dict( { "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id), "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id), "start_positions": Stack(dtype="int64"), "end_positions": Stack(dtype="int64") }): fn(samples) train_data_loader = DataLoader(dataset=train_ds, batch_sampler=train_batch_sampler, collate_fn=train_batchify_fn, return_list=True) dev_ds = dev_examples.map(prepare_validation_features, batched=True, remove_columns=column_names, num_proc=1) dev_batch_sampler = paddle.io.BatchSampler( dev_ds, batch_size=args.eval_batch_size, shuffle=False) dev_batchify_fn = lambda samples, fn=Dict({ "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id), "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id) }): fn(samples) dev_data_loader = DataLoader(dataset=dev_ds, batch_sampler=dev_batch_sampler, collate_fn=dev_batchify_fn, return_list=True) num_training_steps = int( args.max_steps / args.gradient_accumulation_steps) if args.max_steps > 0 else int( len(train_data_loader) * args.num_train_epochs / args.gradient_accumulation_steps) lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, args.warmup_proportion) # Generate parameter names needed to perform weight decay. # All bias and LayerNorm parameters are excluded. 
decay_params = [ p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ] optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, epsilon=args.adam_epsilon, parameters=model.parameters(), weight_decay=args.weight_decay, apply_decay_param_fun=lambda x: x in decay_params) criterion = CrossEntropyLossForSQuAD() global_step = 0 tic_train = time.time() for epoch in range(args.num_train_epochs): for step, batch in enumerate(train_data_loader): input_ids, token_type_ids, start_positions, end_positions = batch logits = model(input_ids=input_ids, token_type_ids=token_type_ids) loss = criterion(logits, (start_positions, end_positions)) if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: global_step += 1 optimizer.step() lr_scheduler.step() optimizer.clear_grad() if global_step % args.logging_steps == 0: print( "global step %d/%d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s" % (global_step, num_training_steps, epoch, step + 1, loss, args.logging_steps / (time.time() - tic_train))) tic_train = time.time() if global_step % args.save_steps == 0 or global_step == num_training_steps: if rank == 0: output_dir = os.path.join(args.output_dir, "model_%d" % global_step) if not os.path.exists(output_dir): os.makedirs(output_dir) # need better way to get inner model of DataParallel model_to_save = model._layers if isinstance( model, paddle.DataParallel) else model model_to_save.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) print('Saving checkpoint to:', output_dir) if global_step == num_training_steps: break evaluate(model, dev_examples, dev_data_loader, args) if args.do_predict and rank == 0: test_ds = test_examples.map(prepare_validation_features, batched=True, remove_columns=column_names, num_proc=1) test_batch_sampler = paddle.io.BatchSampler( test_ds, batch_size=args.eval_batch_size, shuffle=False) test_batchify_fn = lambda samples, fn=Dict({ "input_ids": Pad(axis=0, pad_val=tokenizer.pad_token_id), "token_type_ids": Pad(axis=0, pad_val=tokenizer.pad_token_type_id) }): fn(samples) test_data_loader = DataLoader(dataset=test_ds, batch_sampler=test_batch_sampler, collate_fn=test_batchify_fn, return_list=True) evaluate(model, test_examples, test_data_loader, args, do_eval=False)
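The prepare_train_features helper earlier in this script walks the offset mapping to find the tokens that bracket the answer's character span. A toy, self-contained version of that search (the offsets and answer span are invented for the example):

# Per-token (char_start, char_end) offsets; index 0 plays the role of [CLS], index 5 of [SEP].
offsets = [(0, 0), (0, 2), (2, 5), (5, 9), (9, 12), (0, 0)]
start_char, end_char = 5, 9        # the answer covers characters [5, 9)

token_start = 1                    # first context token
while token_start < len(offsets) and offsets[token_start][0] <= start_char:
    token_start += 1
token_start -= 1                   # last token whose span starts at or before start_char

token_end = 4                      # last context token
while offsets[token_end][1] >= end_char:
    token_end -= 1
token_end += 1                     # first token whose span ends at or after end_char

print(token_start, token_end)      # -> 3 3 (the answer is exactly token 3)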
def do_train(): paddle.set_device(args.device) rank = paddle.distributed.get_rank() if paddle.distributed.get_world_size() > 1: paddle.distributed.init_parallel_env() set_seed(args.seed) train_ds, dev_ds, test_ds = load_dataset(args.dataset, splits=["train", "dev", "test"]) model = AutoModelForSequenceClassification.from_pretrained( 'ernie-1.0', num_classes=len(train_ds.label_list)) tokenizer = AutoTokenizer.from_pretrained('ernie-1.0') trans_func = partial(convert_example, tokenizer=tokenizer, max_seq_length=args.max_seq_length, is_pair=args.dataset == "xnli_cn") batchify_fn = lambda samples, fn=Tuple( Pad(axis=0, pad_val=tokenizer.pad_token_id), # input Pad(axis=0, pad_val=tokenizer.pad_token_type_id), # segment Stack(dtype="int64") # label ): [data for data in fn(samples)] train_data_loader = create_dataloader(train_ds, mode='train', batch_size=args.batch_size, batchify_fn=batchify_fn, trans_fn=trans_func) dev_data_loader = create_dataloader(dev_ds, mode='dev', batch_size=args.batch_size, batchify_fn=batchify_fn, trans_fn=trans_func) if args.init_from_ckpt and os.path.isfile(args.init_from_ckpt): state_dict = paddle.load(args.init_from_ckpt) model.set_dict(state_dict) model = paddle.DataParallel(model) num_training_steps = len(train_data_loader) * args.epochs lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, args.warmup_proportion) # Generate parameter names needed to perform weight decay. # All bias and LayerNorm parameters are excluded. decay_params = [ p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ] optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, parameters=model.parameters(), weight_decay=args.weight_decay, apply_decay_param_fun=lambda x: x in decay_params) criterion = paddle.nn.loss.CrossEntropyLoss() metric = paddle.metric.Accuracy() if args.use_amp: scaler = paddle.amp.GradScaler(init_loss_scaling=args.scale_loss) global_step = 0 tic_train = time.time() for epoch in range(1, args.epochs + 1): for step, batch in enumerate(train_data_loader, start=1): input_ids, token_type_ids, labels = batch with paddle.amp.auto_cast( args.use_amp, custom_white_list=["layer_norm", "softmax", "gelu"], ): logits = model(input_ids, token_type_ids) loss = criterion(logits, labels) probs = F.softmax(logits, axis=1) correct = metric.compute(probs, labels) metric.update(correct) acc = metric.accumulate() if args.use_amp: scaler.scale(loss).backward() scaler.minimize(optimizer, loss) else: loss.backward() optimizer.step() lr_scheduler.step() optimizer.clear_grad() global_step += 1 if global_step % args.logging_steps == 0 and rank == 0: time_diff = time.time() - tic_train print( "global step %d, epoch: %d, batch: %d, loss: %.5f, accuracy: %.5f, speed: %.2f step/s" % (global_step, epoch, step, loss, acc, args.logging_steps / time_diff)) tic_train = time.time() if global_step % args.valid_steps == 0 and rank == 0: evaluate(model, criterion, metric, dev_data_loader) tic_train = time.time() if global_step % args.save_steps == 0 and rank == 0: save_dir = os.path.join(args.save_dir, "model_%d" % global_step) if not os.path.exists(save_dir): os.makedirs(save_dir) model._layers.save_pretrained(save_dir) tokenizer.save_pretrained(save_dir) tic_train = time.time()
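The mixed-precision path above wraps the forward pass in paddle.amp.auto_cast and routes the backward pass through a GradScaler. A minimal sketch of that pattern with a tiny stand-in model and data (the real script uses the ERNIE classifier and dataloader built above; fp16 gains require a GPU):

import paddle
import paddle.nn as nn

model = nn.Linear(8, 2)
criterion = nn.CrossEntropyLoss()
optimizer = paddle.optimizer.AdamW(learning_rate=1e-3, parameters=model.parameters())
loader = [(paddle.randn([4, 8]), paddle.randint(0, 2, [4])) for _ in range(4)]

scaler = paddle.amp.GradScaler(init_loss_scaling=2**7)
for inputs, labels in loader:
    with paddle.amp.auto_cast(custom_white_list=["layer_norm", "softmax", "gelu"]):
        loss = criterion(model(inputs), labels)
    scaled = scaler.scale(loss)         # scale the loss to avoid fp16 gradient underflow
    scaled.backward()
    scaler.minimize(optimizer, scaled)  # unscales gradients, then runs optimizer.step()
    optimizer.clear_grad()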