import os
from functools import partial

import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.metric import Accuracy

# Import paths below may vary slightly across PaddleNLP versions.
from paddlenlp.datasets import load_dataset
from paddlenlp.trainer import (PdArgumentParser, Trainer, TrainingArguments,
                               get_last_checkpoint)
from paddlenlp.transformers import (AutoModelForSequenceClassification,
                                    AutoTokenizer)
from paddlenlp.utils.log import logger

# Task-specific helpers assumed to live in a local utils module.
from utils import (ALL_DATASETS, DataTrainingArguments, ModelArguments,
                   clue_trans_fn, defaut_collator, seq_trans_fn)


def do_train():
    parser = PdArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    paddle.set_device(training_args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    # Log a short summary on each process.
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, "
        f"world_size: {training_args.world_size}, "
        f"distributed training: {bool(training_args.local_rank != -1)}, "
        f"16-bits training: {training_args.fp16}")

    # Detect the last checkpoint.
    last_checkpoint = None
    if (os.path.isdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(
                os.listdir(training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome.")
        elif (last_checkpoint is not None
              and training_args.resume_from_checkpoint is None):
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. "
                "To avoid this behavior, change the `--output_dir` or add "
                "`--overwrite_output_dir` to train from scratch.")

    # set_seed(args)
    data_args.dataset = data_args.dataset.strip()
    if data_args.dataset not in ALL_DATASETS:
        raise ValueError("Dataset {} not found".format(data_args.dataset))

    # Use the yaml config to overwrite matching args.
    config = ALL_DATASETS[data_args.dataset]
    for args in (model_args, data_args, training_args):
        for arg in vars(args):
            if arg in config.keys():
                setattr(args, arg, config[arg])
    training_args.per_device_train_batch_size = config["batch_size"]
    training_args.per_device_eval_batch_size = config["batch_size"]

    dataset_config = data_args.dataset.split(" ")
    raw_datasets = load_dataset(
        dataset_config[0],
        None if len(dataset_config) <= 1 else dataset_config[1],
    )

    data_args.label_list = getattr(raw_datasets["train"], "label_list", None)
    # Regression tasks have no label list and use a single output.
    num_classes = (1 if raw_datasets["train"].label_list is None else
                   len(raw_datasets["train"].label_list))

    # Define the tokenizer, model, and loss function.
    tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_args.model_name_or_path, num_classes=num_classes)
    loss_fct = (nn.loss.CrossEntropyLoss()
                if data_args.label_list else nn.loss.MSELoss())

    # Define the dataset pre-processing function.
    if "clue" in data_args.dataset:
        trans_fn = partial(clue_trans_fn, tokenizer=tokenizer, args=data_args)
    else:
        trans_fn = partial(seq_trans_fn, tokenizer=tokenizer, args=data_args)

    # Define the data collator.
    batchify_fn = defaut_collator(tokenizer, data_args)

    # Pre-process the datasets.
    train_dataset = raw_datasets["train"].map(trans_fn)
    eval_dataset = raw_datasets["dev"].map(trans_fn)
    test_dataset = raw_datasets["test"].map(trans_fn)

    # Define the metrics of the task.
    def compute_metrics(p):
        preds = p.predictions[0] if isinstance(p.predictions,
                                               tuple) else p.predictions
        preds = paddle.to_tensor(preds)
        label = paddle.to_tensor(p.label_ids)

        # Softmax probabilities are computed but not consumed by Accuracy,
        # which works directly on the logits.
        probs = F.softmax(preds, axis=1)
        metric = Accuracy()
        metric.reset()
        result = metric.compute(preds, label)
        metric.update(result)
        accu = metric.accumulate()
        metric.reset()
        return {"accuracy": accu}

    trainer = Trainer(
        model=model,
        criterion=loss_fct,
        args=training_args,
        data_collator=batchify_fn,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    # Log the model and data config.
    trainer.print_config(model_args, "Model")
    trainer.print_config(data_args, "Data")

    checkpoint = None
    if training_args.resume_from_checkpoint is not None:
        checkpoint = training_args.resume_from_checkpoint
    elif last_checkpoint is not None:
        checkpoint = last_checkpoint

    # Training
    train_result = trainer.train(resume_from_checkpoint=checkpoint)
    metrics = train_result.metrics
    trainer.save_model()  # Saves the tokenizer too for easy upload
    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()

    # Evaluate and test the model.
    eval_metrics = trainer.evaluate()
    trainer.log_metrics("eval", eval_metrics)

    test_ret = trainer.predict(test_dataset)
    trainer.log_metrics("test", test_ret.metrics)
    if test_ret.label_ids is None:
        paddle.save(
            test_ret.predictions,
            os.path.join(training_args.output_dir, "test_results.pdtensor"),
        )

    # Export the inference model.
    input_spec = [
        paddle.static.InputSpec(shape=[None, None],
                                dtype="int64"),  # input_ids
        paddle.static.InputSpec(shape=[None, None],
                                dtype="int64"),  # segment_ids
    ]
    trainer.export_model(input_spec=input_spec,
                         load_best_model=True,
                         output_dir=model_args.export_model_dir)
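
# --- Illustrative sketch, not part of the original script ---
# A minimal example of consuming the exported inference model above. It
# assumes `trainer.export_model` wrote `model.pdmodel` / `model.pdiparams`
# under `export_model_dir` with a "model" prefix; adjust the prefix if your
# PaddleNLP version uses a different one.
def load_exported_model(export_model_dir, prefix="model"):
    # paddle.jit.load takes the shared path prefix of the saved program and
    # parameters and returns a callable static-graph layer.
    inference_model = paddle.jit.load(os.path.join(export_model_dir, prefix))
    inference_model.eval()
    # The layer is called with the tensors declared in `input_spec`:
    # logits = inference_model(input_ids, segment_ids)
    return inference_model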
import os
from functools import partial

import numpy as np
import paddle
import paddle.nn as nn

# Import paths below may vary slightly across PaddleNLP versions; the
# seqeval metric is assumed to come from HuggingFace `datasets`.
from datasets import load_metric
from paddlenlp.datasets import load_dataset
from paddlenlp.trainer import (PdArgumentParser, Trainer, TrainingArguments,
                               get_last_checkpoint)
from paddlenlp.transformers import (AutoModelForTokenClassification,
                                    AutoTokenizer)
from paddlenlp.utils.log import logger

# Task-specific helpers assumed to live in a local utils module.
from utils import (ALL_DATASETS, DataTrainingArguments, ModelArguments,
                   ner_collator, ner_trans_fn)


def do_train():
    parser = PdArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    paddle.set_device(training_args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    # Log a short summary on each process.
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, "
        f"world_size: {training_args.world_size}, "
        f"distributed training: {bool(training_args.local_rank != -1)}, "
        f"16-bits training: {training_args.fp16}")

    # Detect the last checkpoint.
    last_checkpoint = None
    if (os.path.isdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(
                os.listdir(training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome.")
        elif (last_checkpoint is not None
              and training_args.resume_from_checkpoint is None):
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. "
                "To avoid this behavior, change the `--output_dir` or add "
                "`--overwrite_output_dir` to train from scratch.")

    # set_seed(args)
    data_args.dataset = data_args.dataset.strip()
    if data_args.dataset not in ALL_DATASETS:
        raise ValueError("Dataset {} not found".format(data_args.dataset))

    # Use the yaml config to overwrite matching args.
    config = ALL_DATASETS[data_args.dataset]
    for args in (model_args, data_args, training_args):
        for arg in vars(args):
            if arg in config.keys():
                setattr(args, arg, config[arg])
    training_args.per_device_train_batch_size = config["batch_size"]
    training_args.per_device_eval_batch_size = config["batch_size"]

    dataset_config = data_args.dataset.split(" ")
    all_ds = load_dataset(
        dataset_config[0],
        None if len(dataset_config) <= 1 else dataset_config[1],
    )

    label_list = getattr(all_ds["train"], "label_list", None)
    data_args.label_list = label_list
    data_args.ignore_label = -100  # CrossEntropyLoss skips these positions
    data_args.no_entity_id = len(data_args.label_list) - 1

    num_classes = (1 if all_ds["train"].label_list is None else
                   len(all_ds["train"].label_list))

    # Define the tokenizer, model, and loss function.
    tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)
    model = AutoModelForTokenClassification.from_pretrained(
        model_args.model_name_or_path, num_classes=num_classes)

    class criterion(nn.Layer):

        def __init__(self):
            super(criterion, self).__init__()
            self.loss_fn = paddle.nn.loss.CrossEntropyLoss(
                ignore_index=data_args.ignore_label)

        def forward(self, *args, **kwargs):
            return paddle.mean(self.loss_fn(*args, **kwargs))

    loss_fct = criterion()

    # Define the dataset pre-processing function.
    trans_fn = partial(ner_trans_fn, tokenizer=tokenizer, args=data_args)

    # Define the data collator.
    batchify_fn = ner_collator(tokenizer, data_args)

    # Pre-process the datasets.
    train_dataset = all_ds["train"].map(trans_fn)
    eval_dataset = all_ds["dev"].map(trans_fn)
    test_dataset = all_ds["test"].map(trans_fn)

    # Define the metrics of the task.
    # Metrics
    metric = load_metric("seqeval")

    def compute_metrics(p):
        predictions, labels = p
        predictions = np.argmax(predictions, axis=2)

        # Remove the ignored index (special tokens) before scoring.
        true_predictions = [[
            label_list[p] for (p, l) in zip(prediction, label) if l != -100
        ] for prediction, label in zip(predictions, labels)]
        true_labels = [[
            label_list[l] for (p, l) in zip(prediction, label) if l != -100
        ] for prediction, label in zip(predictions, labels)]

        results = metric.compute(predictions=true_predictions,
                                 references=true_labels)
        return {
            "precision": results["overall_precision"],
            "recall": results["overall_recall"],
            "f1": results["overall_f1"],
            "accuracy": results["overall_accuracy"],
        }

    trainer = Trainer(
        model=model,
        criterion=loss_fct,
        args=training_args,
        data_collator=batchify_fn,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    # Log the model and data config.
    trainer.print_config(model_args, "Model")
    trainer.print_config(data_args, "Data")

    checkpoint = None
    if training_args.resume_from_checkpoint is not None:
        checkpoint = training_args.resume_from_checkpoint
    elif last_checkpoint is not None:
        checkpoint = last_checkpoint

    # Training
    train_result = trainer.train(resume_from_checkpoint=checkpoint)
    metrics = train_result.metrics
    trainer.save_model()  # Saves the tokenizer too for easy upload
    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()

    # Evaluate and test the model.
    eval_metrics = trainer.evaluate()
    trainer.log_metrics("eval", eval_metrics)

    test_ret = trainer.predict(test_dataset)
    trainer.log_metrics("test", test_ret.metrics)
    if test_ret.label_ids is None:
        paddle.save(
            test_ret.predictions,
            os.path.join(training_args.output_dir, "test_results.pdtensor"),
        )

    # Export the inference model.
    input_spec = [
        paddle.static.InputSpec(shape=[None, None],
                                dtype="int64"),  # input_ids
        paddle.static.InputSpec(shape=[None, None],
                                dtype="int64"),  # segment_ids
    ]
    trainer.export_model(input_spec=input_spec,
                         load_best_model=True,
                         output_dir=model_args.export_model_dir)
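
# --- Illustrative sketch, not part of the original script ---
# A toy walk-through of the -100 masking inside compute_metrics. The tag
# set and arrays below are hypothetical; in the script they come from
# `label_list` and the trainer's eval predictions.
def _demo_label_alignment():
    toy_label_list = ["B-PER", "I-PER", "O"]  # hypothetical tag set
    # Logits for one sequence of four sub-tokens, three classes each.
    logits = np.array([[[0.1, 0.2, 0.7], [0.8, 0.1, 0.1], [0.1, 0.8, 0.1],
                        [0.2, 0.2, 0.6]]])
    # -100 marks special tokens ([CLS]/[SEP]) that the metric must skip.
    labels = np.array([[-100, 0, 1, -100]])

    preds = np.argmax(logits, axis=2)
    true_predictions = [[
        toy_label_list[p] for (p, l) in zip(prediction, label) if l != -100
    ] for prediction, label in zip(preds, labels)]
    true_labels = [[
        toy_label_list[l] for (p, l) in zip(prediction, label) if l != -100
    ] for prediction, label in zip(preds, labels)]

    # Both resolve to [["B-PER", "I-PER"]]: only real tokens are scored.
    return true_predictions, true_labels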