def test_auto_config(self): config = AutoConfig.from_pretrained(self.model_name) self.assertIsInstance(config, MarianConfig)
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TFTrainingArguments)) model_args, data_args, training_args = parser.parse_args_into_dataclasses() if ( os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir ): raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome." ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO, ) logger.info( "n_replicas: %s, distributed training: %s, 16-bits training: %s", training_args.n_replicas, bool(training_args.n_replicas > 1), training_args.fp16, ) logger.info("Training/evaluation parameters %s", training_args) # Prepare Question-Answering task # Load pretrained model and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast, ) with training_args.strategy.scope(): model = TFAutoModelForQuestionAnswering.from_pretrained( model_args.model_name_or_path, from_pt=bool(".bin" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, ) # Get datasets if data_args.use_tfds: if data_args.version_2_with_negative: logger.warn("tensorflow_datasets does not handle version 2 of SQuAD. Switch to version 1 automatically") try: import tensorflow_datasets as tfds except ImportError: raise ImportError("If not data_dir is specified, tensorflow_datasets needs to be installed.") tfds_examples = tfds.load("squad", data_dir=data_args.data_dir) train_examples = ( SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=False) if training_args.do_train else None ) eval_examples = ( SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=True) if training_args.do_eval else None ) else: processor = SquadV2Processor() if data_args.version_2_with_negative else SquadV1Processor() train_examples = processor.get_train_examples(data_args.data_dir) if training_args.do_train else None eval_examples = processor.get_dev_examples(data_args.data_dir) if training_args.do_eval else None train_dataset = ( squad_convert_examples_to_features( examples=train_examples, tokenizer=tokenizer, max_seq_length=data_args.max_seq_length, doc_stride=data_args.doc_stride, max_query_length=data_args.max_query_length, is_training=True, return_dataset="tf", ) if training_args.do_train else None ) train_dataset = train_dataset.apply(tf.data.experimental.assert_cardinality(len(train_examples))) eval_dataset = ( squad_convert_examples_to_features( examples=eval_examples, tokenizer=tokenizer, max_seq_length=data_args.max_seq_length, doc_stride=data_args.doc_stride, max_query_length=data_args.max_query_length, is_training=False, return_dataset="tf", ) if training_args.do_eval else None ) eval_dataset = eval_dataset.apply(tf.data.experimental.assert_cardinality(len(eval_examples))) # Initialize our Trainer trainer = TFTrainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset,) # Training if training_args.do_train: trainer.train() trainer.save_model() tokenizer.save_pretrained(training_args.output_dir)
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TrainingArguments)) model_args, data_args, training_args = parser.parse_args_into_dataclasses() if (os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir): raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome." ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN, ) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", training_args.local_rank, training_args.device, training_args.n_gpu, bool(training_args.local_rank != -1), training_args.fp16, ) # Set the verbosity to info of the Transformers logger (on main process only): if is_main_process(training_args.local_rank): transformers.utils.logging.set_verbosity_info() transformers.utils.logging.enable_default_handler() transformers.utils.logging.enable_explicit_format() logger.info("Training/evaluation parameters %s", training_args) # Set seed set_seed(training_args.seed) try: processor = processors[data_args.task_name]() label_list = processor.get_labels() num_labels = len(label_list) except KeyError: raise ValueError("Task not found: %s" % (data_args.task_name)) # Load pretrained model and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, num_labels=num_labels, finetuning_task=data_args.task_name, cache_dir=model_args.cache_dir, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, ) model = AutoModelForMultipleChoice.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, ) # Get datasets train_dataset = (MultipleChoiceDataset( data_dir=data_args.data_dir, tokenizer=tokenizer, task=data_args.task_name, max_seq_length=data_args.max_seq_length, overwrite_cache=data_args.overwrite_cache, mode=Split.train, ) if training_args.do_train else None) eval_dataset = (MultipleChoiceDataset( data_dir=data_args.data_dir, tokenizer=tokenizer, task=data_args.task_name, max_seq_length=data_args.max_seq_length, overwrite_cache=data_args.overwrite_cache, mode=Split.dev, ) if training_args.do_eval else None) def compute_metrics(p: EvalPrediction) -> Dict: preds = np.argmax(p.predictions, axis=1) return {"acc": simple_accuracy(preds, p.label_ids)} # Data collator data_collator = DataCollatorWithPadding( tokenizer, pad_to_multiple_of=8) if training_args.fp16 else None # Initialize our Trainer trainer = Trainer( model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset, compute_metrics=compute_metrics, data_collator=data_collator, ) # Training if training_args.do_train: trainer.train(model_path=model_args.model_name_or_path if os.path. isdir(model_args.model_name_or_path) else None) trainer.save_model() # For convenience, we also re-save the tokenizer to the same directory, # so that you can share your model easily on huggingface.co/models =) if trainer.is_world_master(): tokenizer.save_pretrained(training_args.output_dir) # Evaluation results = {} if training_args.do_eval: logger.info("*** Evaluate ***") result = trainer.evaluate() output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt") if trainer.is_world_master(): with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key, value in result.items(): logger.info(" %s = %s", key, value) writer.write("%s = %s\n" % (key, value)) results.update(result) return results
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() # Detecting last checkpoint. last_checkpoint = None if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome." ) elif last_checkpoint is not None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN) # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) # Set the verbosity to info of the Transformers logger (on main process only): if is_main_process(training_args.local_rank): transformers.utils.logging.set_verbosity_info() transformers.utils.logging.enable_default_handler() transformers.utils.logging.enable_explicit_format() logger.info(f"Training/evaluation parameters {training_args}") # Set seed before initializing model. set_seed(training_args.seed) # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ # (the dataset will be downloaded automatically from the datasets Hub). # # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called # 'text' is found. You can easily tweak this behavior (see below). # # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. if data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name) else: data_files = {} if data_args.train_file is not None: data_files["train"] = data_args.train_file if data_args.validation_file is not None: data_files["validation"] = data_args.validation_file if data_args.test_file is not None: data_files["test"] = data_args.test_file extension = data_args.train_file.split(".")[-1] datasets = load_dataset(extension, data_files=data_files) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. if training_args.do_train: column_names = datasets["train"].column_names features = datasets["train"].features else: column_names = datasets["validation"].column_names features = datasets["validation"].features text_column_name = "tokens" if "tokens" in column_names else column_names[0] label_column_name = ( f"{data_args.task_name}_tags" if f"{data_args.task_name}_tags" in column_names else column_names[1] ) # In the event the labels are not a `Sequence[ClassLabel]`, we will need to go through the dataset to get the # unique labels. def get_label_list(labels): unique_labels = set() for label in labels: unique_labels = unique_labels | set(label) label_list = list(unique_labels) label_list.sort() return label_list if isinstance(features[label_column_name].feature, ClassLabel): label_list = features[label_column_name].feature.names # No need to convert the labels since they are already ints. label_to_id = {i: i for i in range(len(label_list))} else: label_list = get_label_list(datasets["train"][label_column_name]) label_to_id = {l: i for i, l in enumerate(label_list)} num_labels = len(label_list) # Load pretrained model and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, num_labels=num_labels, finetuning_task=data_args.task_name, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=True, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) model = AutoModelForTokenClassification.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) # Tokenizer check: this script requires a fast tokenizer. if not isinstance(tokenizer, PreTrainedTokenizerFast): raise ValueError( "This example script only works for models that have a fast tokenizer. Checkout the big table of models " "at https://huggingface.co/transformers/index.html#bigtable to find the model types that meet this " "requirement" ) # Preprocessing the dataset # Padding strategy padding = "max_length" if data_args.pad_to_max_length else False # Tokenize all texts and align the labels with them. def tokenize_and_align_labels(examples): tokenized_inputs = tokenizer( examples[text_column_name], padding=padding, truncation=True, # We use this argument because the texts in our dataset are lists of words (with a label for each word). is_split_into_words=True, ) labels = [] for i, label in enumerate(examples[label_column_name]): word_ids = tokenized_inputs.word_ids(batch_index=i) previous_word_idx = None label_ids = [] for word_idx in word_ids: # Special tokens have a word id that is None. We set the label to -100 so they are automatically # ignored in the loss function. if word_idx is None: label_ids.append(-100) # We set the label for the first token of each word. elif word_idx != previous_word_idx: label_ids.append(label_to_id[label[word_idx]]) # For the other tokens in a word, we set the label to either the current label or -100, depending on # the label_all_tokens flag. else: label_ids.append(label_to_id[label[word_idx]] if data_args.label_all_tokens else -100) previous_word_idx = word_idx labels.append(label_ids) tokenized_inputs["labels"] = labels return tokenized_inputs if training_args.do_train: if "train" not in datasets: raise ValueError("--do_train requires a train dataset") train_dataset = datasets["train"] if data_args.max_train_samples is not None: train_dataset = train_dataset.select(range(data_args.max_train_samples)) train_dataset = train_dataset.map( tokenize_and_align_labels, batched=True, num_proc=data_args.preprocessing_num_workers, load_from_cache_file=not data_args.overwrite_cache, ) if training_args.do_eval: if "validation" not in datasets: raise ValueError("--do_eval requires a validation dataset") eval_dataset = datasets["validation"] if data_args.max_val_samples is not None: eval_dataset = eval_dataset.select(range(data_args.max_val_samples)) eval_dataset = eval_dataset.map( tokenize_and_align_labels, batched=True, num_proc=data_args.preprocessing_num_workers, load_from_cache_file=not data_args.overwrite_cache, ) if training_args.do_predict: if "test" not in datasets: raise ValueError("--do_predict requires a test dataset") test_dataset = datasets["test"] if data_args.max_test_samples is not None: test_dataset = test_dataset.select(range(data_args.max_test_samples)) test_dataset = test_dataset.map( tokenize_and_align_labels, batched=True, num_proc=data_args.preprocessing_num_workers, load_from_cache_file=not data_args.overwrite_cache, ) # Data collator data_collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None) # Metrics metric = load_metric("seqeval") def compute_metrics(p): predictions, labels = p predictions = np.argmax(predictions, axis=2) # Remove ignored index (special tokens) true_predictions = [ [label_list[p] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels) ] true_labels = [ [label_list[l] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels) ] results = metric.compute(predictions=true_predictions, references=true_labels) if data_args.return_entity_level_metrics: # Unpack nested dictionaries final_results = {} for key, value in results.items(): if isinstance(value, dict): for n, v in value.items(): final_results[f"{key}_{n}"] = v else: final_results[key] = value return final_results else: return { "precision": results["overall_precision"], "recall": results["overall_recall"], "f1": results["overall_f1"], "accuracy": results["overall_accuracy"], } # Initialize our Trainer trainer = Trainer( model=model, args=training_args, train_dataset=train_dataset if training_args.do_train else None, eval_dataset=eval_dataset if training_args.do_eval else None, tokenizer=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics, ) # Training if training_args.do_train: if last_checkpoint is not None: checkpoint = last_checkpoint elif os.path.isdir(model_args.model_name_or_path): checkpoint = model_args.model_name_or_path else: checkpoint = None train_result = trainer.train(resume_from_checkpoint=checkpoint) metrics = train_result.metrics trainer.save_model() # Saves the tokenizer too for easy upload max_train_samples = ( data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) ) metrics["train_samples"] = min(max_train_samples, len(train_dataset)) trainer.log_metrics("train", metrics) trainer.save_metrics("train", metrics) trainer.save_state() # Evaluation if training_args.do_eval: logger.info("*** Evaluate ***") metrics = trainer.evaluate() max_val_samples = data_args.max_val_samples if data_args.max_val_samples is not None else len(eval_dataset) metrics["eval_samples"] = min(max_val_samples, len(eval_dataset)) trainer.log_metrics("eval", metrics) trainer.save_metrics("eval", metrics) # Predict if training_args.do_predict: logger.info("*** Predict ***") predictions, labels, metrics = trainer.predict(test_dataset) predictions = np.argmax(predictions, axis=2) # Remove ignored index (special tokens) true_predictions = [ [label_list[p] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels) ] trainer.log_metrics("test", metrics) trainer.save_metrics("test", metrics) # Save predictions output_test_predictions_file = os.path.join(training_args.output_dir, "test_predictions.txt") if trainer.is_world_process_zero(): with open(output_test_predictions_file, "w") as writer: for prediction in true_predictions: writer.write(" ".join(prediction) + "\n")
def main(): parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--data_dir", default=DATA_FOLDER_PATH, type=str, help= "The input data dir. Should contain the .tsv files (or other data files) for the task.", ) parser.add_argument( "--task_name", default=None, type=str, help="agnews, nyt_coarse, 20news_coarse, dbpedia", ) parser.add_argument( "--model_name_or_path", default=None, # roberta-base type=str, required=True, help= "Path to pretrained model or model identifier from huggingface.co/models", ) parser.add_argument( "--train_suffix", default=None, type=str, required=True, help= "Evaluation language. Also train language if `train_language` is set to None.", ) parser.add_argument( "--test_suffix", default=None, type=str, help="Train language if is different of the evaluation language.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written.", ) # Other parameters parser.add_argument( "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name") parser.add_argument( "--tokenizer_name", default="", type=str, help="Pretrained tokenizer name or path if not the same as model_name", ) parser.add_argument( "--cache_dir", default=None, type=str, help= "Where do you want to store the pre-trained models downloaded from s3", ) parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after tokenization. Sequences longer " "than this will be truncated, sequences shorter will be padded.", ) parser.add_argument("--do_train", action="store_true", help="Whether to run training.") parser.add_argument("--do_filter", action="store_true", help="Whether to do filtering.") parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the test set.") parser.add_argument( "--evaluate_during_training", action="store_true", help="Rul evaluation during training at each logging step.") parser.add_argument( "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model.") parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.") parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation.") parser.add_argument( "--gradient_accumulation_steps", type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass.", ) parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--max_steps", default=-1, type=int, help= "If > 0: set total number of training steps to perform. Override num_train_epochs.", ) parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.") parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.") parser.add_argument( "--eval_all_checkpoints", action="store_true", help= "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number", ) parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available") parser.add_argument("--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory") parser.add_argument( "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets") parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") parser.add_argument( "--fp16", action="store_true", help= "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit", ) parser.add_argument( "--fp16_opt_level", type=str, default="O1", help= "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." "See details at https://nvidia.github.io/apex/amp.html", ) parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.") parser.add_argument("--server_port", type=str, default="", help="For distant debugging.") args = parser.parse_args() if (os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir): raise ValueError( "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome." .format(args.output_dir)) # Setup distant debugging if needed if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() # Setup CUDA, GPU & distributed training if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count() else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend="nccl") args.n_gpu = 1 args.device = device # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN, ) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16, ) # Set seed set_seed(args) # Prepare XNLI task if args.task_name not in processors: raise ValueError("Task not found: %s" % (args.task_name)) processor = processors[args.task_name](task=args.task_name, train_suffix=args.train_suffix, test_suffix=args.test_suffix) args.output_mode = output_modes[args.task_name] label_list = processor.get_labels() num_labels = len(label_list) # Load pretrained model and tokenizer if args.local_rank not in [-1, 0]: torch.distributed.barrier( ) # Make sure only the first process in distributed training will download model & vocab config = AutoConfig.from_pretrained( args.config_name if args.config_name else args.model_name_or_path, num_labels=num_labels, finetuning_task=args.task_name, cache_dir=args.cache_dir, ) args.model_type = config.model_type tokenizer = AutoTokenizer.from_pretrained( args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case, cache_dir=args.cache_dir, ) model = AutoModelForSequenceClassification.from_pretrained( args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config, cache_dir=args.cache_dir, ) if args.local_rank == 0: torch.distributed.barrier( ) # Make sure only the first process in distributed training will download model & vocab model.to(args.device) logger.info("Training/evaluation parameters %s", args) # Training if args.do_train: train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False, filter_flag=args.do_filter, model=model) global_step, tr_loss = train(args, train_dataset, model, tokenizer) logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained() if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): logger.info("Saving model checkpoint to %s", args.output_dir) # Save a trained model, configuration and tokenizer using `save_pretrained()`. # They can then be reloaded using `from_pretrained()` model_to_save = (model.module if hasattr(model, "module") else model ) # Take care of distributed/parallel training model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) # Good practice: save your training arguments together with the trained model torch.save(args, os.path.join(args.output_dir, "training_args.bin")) # Load a trained model and vocabulary that you have fine-tuned model = AutoModelForSequenceClassification.from_pretrained( args.output_dir) tokenizer = AutoTokenizer.from_pretrained(args.output_dir) model.to(args.device) # Evaluation results = {} if args.do_eval and args.local_rank in [-1, 0]: checkpoints = [args.output_dir] if args.eval_all_checkpoints: checkpoints = list( os.path.dirname(c) for c in sorted( glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))) logging.getLogger("transformers.modeling_utils").setLevel( logging.WARN) # Reduce logging logger.info("Evaluate the following checkpoints: %s", checkpoints) for checkpoint in checkpoints: global_step = checkpoint.split( "-")[-1] if len(checkpoints) > 1 else "" prefix = checkpoint.split( "/")[-1] if checkpoint.find("checkpoint") != -1 else "" model = AutoModelForSequenceClassification.from_pretrained( checkpoint) model.to(args.device) result = evaluate(args, model, tokenizer, prefix=prefix) result = dict( (k + "_{}".format(global_step), v) for k, v in result.items()) results.update(result) return results
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) log_level = training_args.get_process_log_level() logger.setLevel(log_level) datasets.utils.logging.set_verbosity(log_level) transformers.utils.logging.set_verbosity(log_level) transformers.utils.logging.enable_default_handler() transformers.utils.logging.enable_explicit_format() # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") # Detecting last checkpoint. last_checkpoint = None if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome." ) elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) # Set seed before initializing model. set_seed(training_args.seed) # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ # (the dataset will be downloaded automatically from the datasets Hub). # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called # 'text' is found. You can easily tweak this behavior (see below). # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. if data_args.train_file is not None or data_args.validation_file is not None: data_files = {} if data_args.train_file is not None: data_files["train"] = data_args.train_file if data_args.validation_file is not None: data_files["validation"] = data_args.validation_file extension = data_args.train_file.split(".")[-1] raw_datasets = load_dataset( extension, data_files=data_files, cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, ) else: # Downloading and loading the swag dataset from the hub. raw_datasets = load_dataset( "swag", "regular", cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, ) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. # Load pretrained model and tokenizer # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) model = AutoModelForMultipleChoice.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) # When using your own dataset or a different dataset from swag, you will probably need to change this. ending_names = [f"ending{i}" for i in range(4)] context_name = "sent1" question_header_name = "sent2" if data_args.max_seq_length is None: max_seq_length = tokenizer.model_max_length if max_seq_length > 1024: logger.warning( f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " "Picking 1024 instead. You can change that default value by passing --max_seq_length xxx." ) max_seq_length = 1024 else: if data_args.max_seq_length > tokenizer.model_max_length: logger.warning( f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." ) max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) # Preprocessing the datasets. def preprocess_function(examples): first_sentences = [[context] * 4 for context in examples[context_name]] question_headers = examples[question_header_name] second_sentences = [ [f"{header} {examples[end][i]}" for end in ending_names] for i, header in enumerate(question_headers) ] # Flatten out first_sentences = list(chain(*first_sentences)) second_sentences = list(chain(*second_sentences)) # Tokenize tokenized_examples = tokenizer( first_sentences, second_sentences, truncation=True, max_length=max_seq_length, padding="max_length" if data_args.pad_to_max_length else False, ) # Un-flatten return {k: [v[i : i + 4] for i in range(0, len(v), 4)] for k, v in tokenized_examples.items()} if training_args.do_train: if "train" not in raw_datasets: raise ValueError("--do_train requires a train dataset") train_dataset = raw_datasets["train"] if data_args.max_train_samples is not None: max_train_samples = min(len(train_dataset), data_args.max_train_samples) train_dataset = train_dataset.select(range(max_train_samples)) with training_args.main_process_first(desc="train dataset map pre-processing"): train_dataset = train_dataset.map( preprocess_function, batched=True, num_proc=data_args.preprocessing_num_workers, load_from_cache_file=not data_args.overwrite_cache, ) if training_args.do_eval: if "validation" not in raw_datasets: raise ValueError("--do_eval requires a validation dataset") eval_dataset = raw_datasets["validation"] if data_args.max_eval_samples is not None: max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) eval_dataset = eval_dataset.select(range(max_eval_samples)) with training_args.main_process_first(desc="validation dataset map pre-processing"): eval_dataset = eval_dataset.map( preprocess_function, batched=True, num_proc=data_args.preprocessing_num_workers, load_from_cache_file=not data_args.overwrite_cache, ) # Data collator data_collator = ( default_data_collator if data_args.pad_to_max_length else DataCollatorForMultipleChoice(tokenizer=tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None) ) # Metric def compute_metrics(eval_predictions): predictions, label_ids = eval_predictions preds = np.argmax(predictions, axis=1) return {"accuracy": (preds == label_ids).astype(np.float32).mean().item()} # Initialize our Trainer trainer = Trainer( model=model, args=training_args, train_dataset=train_dataset if training_args.do_train else None, eval_dataset=eval_dataset if training_args.do_eval else None, tokenizer=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics, ) # Training if training_args.do_train: checkpoint = None if training_args.resume_from_checkpoint is not None: checkpoint = training_args.resume_from_checkpoint elif last_checkpoint is not None: checkpoint = last_checkpoint train_result = trainer.train(resume_from_checkpoint=checkpoint) trainer.save_model() # Saves the tokenizer too for easy upload metrics = train_result.metrics max_train_samples = ( data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) ) metrics["train_samples"] = min(max_train_samples, len(train_dataset)) trainer.log_metrics("train", metrics) trainer.save_metrics("train", metrics) trainer.save_state() # Evaluation if training_args.do_eval: logger.info("*** Evaluate ***") metrics = trainer.evaluate() max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) trainer.log_metrics("eval", metrics) trainer.save_metrics("eval", metrics) kwargs = dict( finetuned_from=model_args.model_name_or_path, tasks="multiple-choice", dataset_tags="swag", dataset_args="regular", dataset="SWAG", language="en", ) if training_args.push_to_hub: trainer.push_to_hub(**kwargs) else: trainer.create_model_card(**kwargs)
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TrainingArguments)) model_args, data_args, training_args = parser.parse_args_into_dataclasses() if data_args.eval_data_file is None and training_args.do_eval: raise ValueError( "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file " "or remove the --do_eval argument.") if (os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir): raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome." ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN, ) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", training_args.local_rank, training_args.device, training_args.n_gpu, bool(training_args.local_rank != -1), training_args.fp16, ) logger.info("Training/evaluation parameters %s", training_args) # Set seed set_seed(training_args.seed) # Load pretrained model and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. if model_args.config_name: config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir) elif model_args.model_name_or_path: config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir) else: config = CONFIG_MAPPING[model_args.model_type]() logger.warning( "You are instantiating a new config instance from scratch.") if model_args.tokenizer_name: tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name, cache_dir=model_args.cache_dir) elif model_args.model_name_or_path: tokenizer = AutoTokenizer.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir) else: raise ValueError( "You are instantiating a new tokenizer from scratch. This is not supported, but you can do it from another script, save it," "and load it from here, using --tokenizer_name") if model_args.model_name_or_path: model = AutoModelWithLMHead.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, ) else: logger.info("Training new model from scratch") model = AutoModelWithLMHead.from_config(config) model.resize_token_embeddings(len(tokenizer)) if config.model_type in ["bert", "roberta", "distilbert", "camembert" ] and not data_args.mlm: raise ValueError( "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the" "--mlm flag (masked language modeling).") if data_args.block_size <= 0: data_args.block_size = tokenizer.max_len # Our input block size will be the max possible for the model else: data_args.block_size = min(data_args.block_size, tokenizer.max_len) # Get datasets train_dataset = (get_dataset( data_args, tokenizer=tokenizer, cache_dir=model_args.cache_dir) if training_args.do_train else None) eval_dataset = (get_dataset(data_args, tokenizer=tokenizer, evaluate=True, cache_dir=model_args.cache_dir) if training_args.do_eval else None) if config.model_type == "xlnet": data_collator = DataCollatorForPermutationLanguageModeling( tokenizer=tokenizer, plm_probability=data_args.plm_probability, max_span_length=data_args.max_span_length, ) else: if data_args.mlm and data_args.whole_word_mask: data_collator = DataCollatorForWholeWordMask( tokenizer=tokenizer, mlm_probability=data_args.mlm_probability) else: data_collator = DataCollatorForLanguageModeling( tokenizer=tokenizer, mlm=data_args.mlm, mlm_probability=data_args.mlm_probability) # Initialize our Trainer trainer = Trainer( model=model, args=training_args, data_collator=data_collator, train_dataset=train_dataset, eval_dataset=eval_dataset, prediction_loss_only=True, ) # Training if training_args.do_train: model_path = (model_args.model_name_or_path if model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path) else None) trainer.train(model_path=model_path) trainer.save_model() # For convenience, we also re-save the tokenizer to the same directory, # so that you can share your model easily on huggingface.co/models =) if trainer.is_world_master(): tokenizer.save_pretrained(training_args.output_dir) # Evaluation results = {} if training_args.do_eval: logger.info("*** Evaluate ***") eval_output = trainer.evaluate() perplexity = math.exp(eval_output["eval_loss"]) result = {"perplexity": perplexity} output_eval_file = os.path.join(training_args.output_dir, "eval_results_lm.txt") if trainer.is_world_master(): with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) results.update(result) return results
def Difficulty_Evaluation(args, train_dataset): """ 用来对数据集进行难度进行划分,将teacher的f1分数用作难度衡量 :param args: 划分成 n 个subset :param train_dataset: 全部数据集 """ logger.info("开始DE难度评估") if args.local_rank in [-1, 0]: tb_writer = SummaryWriter() train_sampler_total = RandomSampler( train_dataset) if args.local_rank == -1 else DistributedSampler( train_dataset) total_train_dataloader = DataLoader(train_dataset, sampler=train_sampler_total, batch_size=args.train_batch_size) subset_quantity = args.div_subset args.model_type = args.model_type.lower() config = AutoConfig.from_pretrained( args.config_name if args.config_name else args.diff_model_name_or_path, cache_dir=args.cache_dir if args.cache_dir else None, # 输出中间状态 output_hidden_states=True, ) tokenizer = AutoTokenizer.from_pretrained( args.tokenizer_name if args.tokenizer_name else args.diff_model_name_or_path, do_lower_case=args.do_lower_case, cache_dir=args.cache_dir if args.cache_dir else None, use_fast=False, # SquadDataset is not compatible with Fast tokenizers which have a smarter overflow handeling ) phi_model = AutoModelForQuestionAnswering.from_pretrained( args.diff_model_name_or_path, from_tf=bool(".ckpt" in args.diff_model_name_or_path), config=config, cache_dir=args.cache_dir if args.cache_dir else None, # output_hidden_states = True, ) phi_model.to(args.device) phi_model.eval() def Phi(Phi_batch): # 可以直接输入结果 phi_model.eval() with torch.no_grad(): inputs = { "input_ids": Phi_batch[0], "attention_mask": Phi_batch[1], "token_type_ids": Phi_batch[2], } if args.model_type in [ "xlm", "roberta", "distilbert", "camembert", "bart", "longformer" ]: del inputs["token_type_ids"] outputs = phi_model(**inputs) # print(type(outputs)) # 只看压缩情况 # print(len(outputs.hidden_states)) #13 # print(outputs.hidden_states[0].shape) # 48 384 768 #embedding # print(outputs.hidden_states[-1].shape) # 48 384 768 return outputs.hidden_states[0].to( args.device), outputs.hidden_states[-1].to(args.device) # 48 384 difficult_result = [] # notice 划分方法 method = "line" criterion = "wd" logger.info("划分方法 " + method + " " + criterion) for batch in tqdm(total_train_dataloader): phi_model.eval() batch = tuple(t.to(args.device) for t in batch) embedding, output = Phi(batch) difficult_result.append(cal_diff(embedding, output, method, criterion)) difficult_result = np.array(difficult_result) difficult_result_max = max(difficult_result) difficult_result_min = min(difficult_result) gap = difficult_result_max - difficult_result_min # subset_id = [] # for i in range(subset_quantity): # subset_id.append([]) # for i, batch in enumerate(total_train_dataloader): # if difficult_result[i] == difficult_result_max: # subset_id[-1].append(i) # continue # level = int(subset_quantity * (difficult_result[i] - difficult_result_min) / gap) # # subset_id[level].append(batch) # subset_id[level].append(i) subset = [] for i in range(subset_quantity): subset.append([]) for i, batch in enumerate(total_train_dataloader): if difficult_result[i] == difficult_result_max: subset[-1].append(batch) continue level = int(subset_quantity * (difficult_result[i] - difficult_result_min) / gap) subset[level].append(batch) logger.info("难度评估已完成") return subset
def main(): parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--model_type", default=None, type=str, required=True, help="Model type selected in the list: " + ", ".join(MODEL_TYPES), ) parser.add_argument( "--model_name_or_path", default=None, type=str, required=True, help= "Path to pretrained model or model identifier from huggingface.co/models", ) parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model checkpoints and predictions will be written.", ) # CL_parameters parser.add_argument( "--div_subset", default=3, type=int, help="划分子集的数量", ) parser.add_argument( "--cuda_num", default=3, type=int, help="cuda用哪个", ) parser.add_argument( "--diff_model_name_or_path", default=None, type=str, required=True, help="用于难度衡量的模型,最好是经过训练的", ) # Other parameters parser.add_argument( "--data_dir", default=None, type=str, help="The input data dir. Should contain the .json files for the task." + "If no data dir or train/predict files are specified, will run with tensorflow_datasets.", ) parser.add_argument( "--train_file", default=None, type=str, help= "The input training file. If a data dir is specified, will look for the file there" + "If no data dir or train/predict files are specified, will run with tensorflow_datasets.", ) parser.add_argument( "--predict_file", default=None, type=str, help= "The input evaluation file. If a data dir is specified, will look for the file there" + "If no data dir or train/predict files are specified, will run with tensorflow_datasets.", ) parser.add_argument( "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name") parser.add_argument( "--tokenizer_name", default="", type=str, help="Pretrained tokenizer name or path if not the same as model_name", ) parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from huggingface.co", ) parser.add_argument( "--version_2_with_negative", action="store_true", help= "If true, the SQuAD examples contain some that do not have an answer.", ) parser.add_argument( "--null_score_diff_threshold", type=float, default=0.0, help= "If null_score - best_non_null is greater than the threshold predict null.", ) parser.add_argument( "--max_seq_length", default=384, type=int, help= "The maximum total input sequence length after WordPiece tokenization. Sequences " "longer than this will be truncated, and sequences shorter than this will be padded.", ) parser.add_argument( "--doc_stride", default=128, type=int, help= "When splitting up a long document into chunks, how much stride to take between chunks.", ) parser.add_argument( "--max_query_length", default=64, type=int, help= "The maximum number of tokens for the question. Questions longer than this will " "be truncated to this length.", ) parser.add_argument("--do_train", action="store_true", help="Whether to run training.") parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.") parser.add_argument( "--evaluate_during_training", action="store_true", help="Run evaluation during training at each logging step.") parser.add_argument( "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model.") parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.") parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument( "--gradient_accumulation_steps", type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass.", ) parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--max_steps", default=-1, type=int, help= "If > 0: set total number of training steps to perform. Override num_train_epochs.", ) parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") parser.add_argument( "--n_best_size", default=20, type=int, help= "The total number of n-best predictions to generate in the nbest_predictions.json output file.", ) parser.add_argument( "--max_answer_length", default=30, type=int, help= "The maximum length of an answer that can be generated. This is needed because the start " "and end predictions are not conditioned on one another.", ) parser.add_argument( "--verbose_logging", action="store_true", help= "If true, all of the warnings related to data processing will be printed. " "A number of warnings are expected for a normal SQuAD evaluation.", ) parser.add_argument( "--lang_id", default=0, type=int, help= "language id of input for language-specific xlm models (see tokenization_xlm.PRETRAINED_INIT_CONFIGURATION)", ) parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.") parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.") parser.add_argument( "--eval_all_checkpoints", action="store_true", help= "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number", ) parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available") parser.add_argument("--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory") parser.add_argument( "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets") parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument( "--fp16", action="store_true", help= "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit", ) parser.add_argument( "--fp16_opt_level", type=str, default="O1", help= "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." "See details at https://nvidia.github.io/apex/amp.html", ) parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.") parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.") parser.add_argument( "--threads", type=int, default=1, help="multiple threads for converting example to features") args = parser.parse_args() if args.doc_stride >= args.max_seq_length - args.max_query_length: logger.warning( "WARNING - You've set a doc stride which may be superior to the document length in some " "examples. This could result in errors when building features from the examples. Please reduce the doc " "stride or increase the maximum length to ensure the features are correctly built." ) if (os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir): raise ValueError( "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome." .format(args.output_dir)) # Setup distant debugging if needed if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() # Setup CUDA, GPU & distributed training if args.local_rank == -1 or args.no_cuda: # notice GPU 编号 # device = torch.device("cuda:{}".format(args.cuda_num) if torch.cuda.is_available() and not args.no_cuda else "cpu") device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count() else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend="nccl") args.n_gpu = 1 args.device = device # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN, ) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16, ) # Set the verbosity to info of the Transformers logger (on main process only): # if is_main_process(args.local_rank): # transformers.utils.logging.set_verbosity_info() # transformers.utils.logging.enable_default_handler() # transformers.utils.logging.enable_explicit_format() # Set seed set_seed(args) # set_seed(time.time()) # Load pretrained model and tokenizer if args.local_rank not in [-1, 0]: # Make sure only the first process in distributed training will download model & vocab torch.distributed.barrier() args.model_type = args.model_type.lower() config = AutoConfig.from_pretrained( args.config_name if args.config_name else args.model_name_or_path, cache_dir=args.cache_dir if args.cache_dir else None, # notice 输出中间状态 output_hidden_states=True, ) tokenizer = AutoTokenizer.from_pretrained( args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case, cache_dir=args.cache_dir if args.cache_dir else None, use_fast= False, # SquadDataset is not compatible with Fast tokenizers which have a smarter overflow handeling ) # train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False) # Difficulty_Evaluation(args, train_dataset, model, tokenizer) model = AutoModelForQuestionAnswering.from_pretrained( args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config, cache_dir=args.cache_dir if args.cache_dir else None, ) # Difficulty_Evaluation(args, train_dataset, None, tokenizer) if args.local_rank == 0: # Make sure only the first process in distributed training will download model & vocab torch.distributed.barrier() model.to(args.device) print(model) logger.info("Training/evaluation parameters %s", args) # Before we do anything with models, we want to ensure that we get fp16 execution of torch.einsum if args.fp16 is set. # Otherwise it'll default to "promote" mode, and we'll get fp32 operations. Note that running `--fp16_opt_level="O2"` will # remove the need for this code, but it is still valid. if args.fp16: try: import apex apex.amp.register_half_function(torch, "einsum") except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) # Training if args.do_train: train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False) global_step, tr_loss = train(args, train_dataset, model, tokenizer) logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) # Save the trained model and the tokenizer if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): print("/n/n 真的真的在训练!!!/n/n") logger.info("Saving model checkpoint to %s", args.output_dir) # Save a trained model, configuration and tokenizer using `save_pretrained()`. # They can then be reloaded using `from_pretrained()` # Take care of distributed/parallel training model_to_save = model.module if hasattr(model, "module") else model model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) # Good practice: save your training arguments together with the trained model torch.save(args, os.path.join(args.output_dir, "training_args.bin")) # Load a trained model and vocabulary that you have fine-tuned model = AutoModelForQuestionAnswering.from_pretrained( args.output_dir) # , force_download=True) # SquadDataset is not compatible with Fast tokenizers which have a smarter overflow handeling # So we use use_fast=False here for now until Fast-tokenizer-compatible-examples are out tokenizer = AutoTokenizer.from_pretrained( args.output_dir, do_lower_case=args.do_lower_case, use_fast=False) model.to(args.device) # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory results = {} if args.do_eval and args.local_rank in [-1, 0]: if args.do_train: logger.info( "Loading checkpoints saved during training for evaluation") checkpoints = [args.output_dir] if args.eval_all_checkpoints: checkpoints = list( os.path.dirname(c) for c in sorted( glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))) else: logger.info("Loading checkpoint %s for evaluation", args.model_name_or_path) checkpoints = [args.model_name_or_path] logger.info("Evaluate the following checkpoints: %s", checkpoints) for checkpoint in checkpoints: # Reload the model global_step = checkpoint.split( "-")[-1] if len(checkpoints) > 1 else "" model = AutoModelForQuestionAnswering.from_pretrained( checkpoint) # , force_download=True) model.to(args.device) # Evaluate result = evaluate(args, model, tokenizer, prefix=global_step) result = dict( (k + ("_{}".format(global_step) if global_step else ""), v) for k, v in result.items()) results.update(result) logger.info("Results: {}".format(results)) return results
def main(argv=None, experiment_name="", run_id=0, csv_filename="gpt2_parity_results.csv"): result = {} from transformers import __version__ as transformers_version if version.parse(transformers_version) < version.parse( "3.1.0"): # past_key_values name does not exist in 3.0.2 or older raise RuntimeError("This tool requires transformers 3.1.0 or later.") args = parse_arguments(argv) setup_logger(args.verbose) if not experiment_name: import sys experiment_name = " ".join(argv if argv else sys.argv[1:]) if args.tolerance == 0: args.tolerance = DEFAULT_TOLERANCE[args.precision] logger.info(f"Arguments:{args}") cache_dir = args.cache_dir output_dir = args.output if not args.output.endswith( ".onnx") else os.path.dirname(args.output) prepare_environment(cache_dir, output_dir, args.use_gpu) if args.precision != Precision.FLOAT32: assert args.optimize_onnx, "fp16/int8 requires --optimize_onnx" if args.precision == Precision.FLOAT16: assert args.use_gpu, "fp16 requires --use_gpu" if args.precision == Precision.INT8: assert not args.use_gpu, "quantization only supports CPU" if args.use_external_data_format: assert not args.output.endswith( '.onnx' ), "output shall be a directory for --use_external_data_format" model_class = MODEL_CLASSES[args.model_class][0] use_padding = MODEL_CLASSES[args.model_class][2] if args.model_class == "GPT2LMHeadModel_BeamSearchStep": model_type = "beam_search_step" elif args.model_class == "GPT2LMHeadModel_ConfigurableOneStepSearch": model_type = "configurable_one_step_search" else: model_type = "default" gpt2helper = Gpt2HelperFactory.create_helper(model_type) gpt2tester = Gpt2TesterFactory.create_tester(model_type) config = AutoConfig.from_pretrained(args.model_name_or_path, cache_dir=cache_dir) if model_type == 'beam_search_step': model = model_class.from_pretrained(args.model_name_or_path, config=config, batch_size=1, beam_size=args.beam_size, cache_dir=cache_dir) elif model_type == 'configurable_one_step_search': model = model_class.from_pretrained( args.model_name_or_path, config=config, batch_size=1, beam_size=args.beam_size, ignore_eos=args.ignore_eos, temperature=args.temperature, repetition_penalty=args.repetition_penalty, excluded_token_ids=args.excluded_token_ids, length_penalty=args.length_penalty, do_sample=args.do_sample, do_sample_top_p=args.do_sample_top_p, do_sample_top_k=args.do_sample_top_k, cache_dir=cache_dir) else: model = model_class.from_pretrained(args.model_name_or_path, config=config, cache_dir=cache_dir) device = torch.device("cuda:0" if args.use_gpu else "cpu") model.eval().to(device) if (not args.use_external_data_format) and (config.n_layer > 24): logger.info(f"Try --use_external_data_format when model size > 2GB") onnx_model_paths = gpt2helper.get_onnx_paths( output_dir, args.model_name_or_path, args.model_class, new_folder=args.use_external_data_format, remove_existing=[ "fp32", "fp16", "int8" ]) # Do not remove raw model to save time in parity test raw_onnx_model = onnx_model_paths["raw"] if os.path.exists(raw_onnx_model): logger.warning( f"Skip exporting ONNX model since it existed: {raw_onnx_model}") else: logger.info(f"Exporting ONNX model to {raw_onnx_model}") gpt2helper.export_onnx(model, device, raw_onnx_model, args.verbose, args.use_external_data_format, has_position_ids=use_padding, has_attention_mask=use_padding) fp16_params = {"keep_io_types": args.keep_io_types} if args.io_block_list: fp16_params["keep_io_types"] = args.io_block_list if args.node_block_list: fp16_params["node_block_list"] = args.node_block_list if args.op_block_list: fp16_params["op_block_list"] = args.op_block_list if args.force_fp16_initializers: fp16_params["force_fp16_initializers"] = args.force_fp16_initializers is_io_float16 = (args.precision == Precision.FLOAT16 and not args.keep_io_types) if args.optimize_onnx or args.precision != Precision.FLOAT32: output_path = onnx_model_paths[str(args.precision) if args. precision != Precision.INT8 else 'fp32'] logger.info(f"Optimizing model to {output_path}") gpt2helper.optimize_onnx(raw_onnx_model, output_path, args.precision == Precision.FLOAT16, model.config.num_attention_heads, model.config.hidden_size, args.use_external_data_format, **fp16_params) else: output_path = raw_onnx_model if args.precision == Precision.INT8: logger.info("quantizing model...") QuantizeHelper.quantize_onnx_model(output_path, onnx_model_paths['int8'], args.use_external_data_format) model = QuantizeHelper.quantize_torch_model(model) logger.info("finished quantizing model") output_path = onnx_model_paths['int8'] if args.output.endswith( '.onnx' ) and output_path != args.output and not args.use_external_data_format: import shutil shutil.move(output_path, args.output) output_path = args.output logger.info(f"Output path: {output_path}") model_size_in_MB = int( get_onnx_model_size(output_path, args.use_external_data_format) / 1024 / 1024) session = create_onnxruntime_session(output_path, args.use_gpu, enable_all_optimization=True, verbose=args.verbose) if args.model_class == "GPT2LMHeadModel" and session is not None: parity_result = gpt2helper.test_parity( session, model, device, is_io_float16, rtol=args.tolerance, atol=args.tolerance, model_class=args.model_class, has_position_ids=use_padding, has_attention_mask=use_padding, test_cases_per_run=args.test_cases, total_runs=args.test_runs, verbose=args.verbose) latency = gpt2helper.test_performance(session, model, device, is_io_float16, total_runs=100, use_io_binding=True, model_class=args.model_class, has_position_ids=use_padding, has_attention_mask=use_padding, batch_size=8, sequence_length=1, past_sequence_length=32) if args.precision == Precision.FLOAT16: logger.info(f"fp16 conversion parameters:{fp16_params}") # Write results to file import csv from onnxruntime import __version__ as ort_version latency_name = get_latency_name() csv_file_existed = os.path.exists(csv_filename) with open(csv_filename, mode="a", newline='') as csv_file: column_names = [ "experiment", "run_id", "model_name", "model_class", "gpu", "precision", "optimizer", "test_cases", "runs", "keep_io_types", "io_block_list", "op_block_list", "node_block_list", "force_fp16_initializers", "ORT_TRANSFORMER_OPTIONS", "ORT_CUDA_GEMM_OPTIONS", "onnxruntime", latency_name, "top1_match_rate", "onnx_size_in_MB", "diff_50_percentile", "diff_90_percentile", "diff_95_percentile", "diff_99_percentile", "diff_pass_rate", "nan_rate", "top1_match_rate_per_run" ] csv_writer = csv.DictWriter(csv_file, fieldnames=column_names) if not csv_file_existed: csv_writer.writeheader() row = { "experiment": experiment_name, "run_id": run_id, "model_name": args.model_name_or_path, "model_class": args.model_class, "gpu": args.use_gpu, "precision": args.precision, "optimizer": args.optimize_onnx, "test_cases": args.test_cases, "runs": args.test_runs, "keep_io_types": args.keep_io_types, "io_block_list": args.io_block_list, "op_block_list": args.op_block_list, "node_block_list": args.node_block_list, "force_fp16_initializers": args.force_fp16_initializers, "ORT_TRANSFORMER_OPTIONS": os.getenv('ORT_TRANSFORMER_OPTIONS'), "ORT_CUDA_GEMM_OPTIONS": os.getenv('ORT_CUDA_GEMM_OPTIONS'), "onnxruntime": ort_version, latency_name: f"{latency:.2f}", "diff_50_percentile": parity_result["max_diff_percentile_50"], "diff_90_percentile": parity_result["max_diff_percentile_90"], "diff_95_percentile": parity_result["max_diff_percentile_95"], "diff_99_percentile": parity_result["max_diff_percentile_99"], "diff_pass_rate": parity_result["diff_pass_rate"], "nan_rate": parity_result["nan_rate"], "top1_match_rate": parity_result["top1_match_rate"], "top1_match_rate_per_run": parity_result["top1_match_rate_per_run"], "onnx_size_in_MB": "{}".format(model_size_in_MB), } logger.info(f"result: {row}") result.update(row) csv_writer.writerow(row) if args.input_test_file: test_inputs = [] # Each line of test file is a JSON string like: # {"input_ids": [[14698, 257, 1310, 13688, 319, 326]]} with open(args.input_test_file) as read_f: for _, line in enumerate(read_f): line = line.rstrip() data = json.loads(line) input_ids = torch.from_numpy( numpy.asarray(data["input_ids"], dtype=numpy.int64)).to(device) if use_padding: if "attention_mask" in data: numpy_float = numpy.float16 if is_io_float16 else numpy.float32 attention_mask = torch.from_numpy( numpy.asarray(data["attention_mask"], dtype=numpy_float)).to(device) else: padding = -1 attention_mask = (input_ids != padding).type( torch.float16 if is_io_float16 else torch.float32) input_ids.masked_fill_(input_ids == padding, 0) if "position_ids" in data: position_ids = torch.from_numpy( numpy.asarray(data["position_ids"], dtype=numpy.int64)).to(device) else: position_ids = (attention_mask.long().cumsum(-1) - 1) position_ids.masked_fill_(position_ids < 0, 0) inputs = { "input_ids": input_ids, "position_ids": position_ids, "attention_mask": attention_mask } else: inputs = {"input_ids": input_ids} if model_type == "beam_search_step" or model_type == "configurable_one_step_search": beam_select_idx = torch.zeros([1, input_ids.shape[0]]).long() input_log_probs = torch.zeros([input_ids.shape[0], 1]) input_unfinished_sents = torch.ones( [input_ids.shape[0], 1], dtype=torch.bool) inputs.update({ "beam_select_idx": beam_select_idx, "input_log_probs": input_log_probs, "input_unfinished_sents": input_unfinished_sents, }) test_inputs.append(inputs) gpt2tester.test_generation(session, model, device, test_inputs, precision=args.precision, model_class=args.model_class, top_k=20, top_k_no_order=True, max_steps=24, max_inputs=0, verbose=args.verbose, save_test_data=3, save_test_data_dir=Path(output_path).parent) logger.info(f"Done. Output model: {output_path}") return result
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() # Detecting last checkpoint. last_checkpoint = None if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome." ) elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN) # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) # Set the verbosity to info of the Transformers logger (on main process only): if is_main_process(training_args.local_rank): transformers.utils.logging.set_verbosity_info() transformers.utils.logging.enable_default_handler() transformers.utils.logging.enable_explicit_format() logger.info(f"Training/evaluation parameters {training_args}") # Set seed before initializing model. set_seed(training_args.seed) # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below) # or specify a GLUE benchmark task (the dataset will be downloaded automatically from the datasets Hub). # # For CSV/JSON files, this script will use as labels the column called 'label' and as pair of sentences the # sentences in columns called 'sentence1' and 'sentence2' if such column exists or the first two columns not named # label if at least two columns are provided. # # If the CSVs/JSONs contain only one non-label column, the script does single sentence classification on this # single column. You can easily tweak this behavior (see below) # # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. if data_args.task_name is not None: # Downloading and loading a dataset from the hub. datasets = load_dataset("glue", data_args.task_name, cache_dir=model_args.cache_dir) elif data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir) else: # Loading a dataset from your local files. # CSV/JSON training and evaluation files are needed. data_files = {"train": data_args.train_file, "validation": data_args.validation_file} # Get the test dataset: you can provide your own CSV/JSON test file (see below) # when you use `do_predict` without specifying a GLUE benchmark task. if training_args.do_predict: if data_args.test_file is not None: train_extension = data_args.train_file.split(".")[-1] test_extension = data_args.test_file.split(".")[-1] assert ( test_extension == train_extension ), "`test_file` should have the same extension (csv or json) as `train_file`." data_files["test"] = data_args.test_file else: raise ValueError("Need either a GLUE task or a test file for `do_predict`.") for key in data_files.keys(): logger.info(f"load a local file for {key}: {data_files[key]}") if data_args.train_file.endswith(".csv"): # Loading a dataset from local csv files datasets = load_dataset("csv", data_files=data_files, cache_dir=model_args.cache_dir) else: # Loading a dataset from local json files datasets = load_dataset("json", data_files=data_files, cache_dir=model_args.cache_dir) # See more about loading any type of standard or custom dataset at # https://huggingface.co/docs/datasets/loading_datasets.html. # Labels if data_args.task_name is not None: is_regression = data_args.task_name == "stsb" if not is_regression: label_list = datasets["train"].features["label"].names num_labels = len(label_list) else: num_labels = 1 else: # Trying to have good defaults here, don't hesitate to tweak to your needs. is_regression = datasets["train"].features["label"].dtype in ["float32", "float64"] if is_regression: num_labels = 1 else: # A useful fast method: # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique label_list = datasets["train"].unique("label") label_list.sort() # Let's sort it for determinism num_labels = len(label_list) # Load pretrained model and tokenizer # # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, num_labels=num_labels, finetuning_task=data_args.task_name, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) model = AutoModelForSequenceClassification.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) # Preprocessing the datasets if data_args.task_name is not None: sentence1_key, sentence2_key = task_to_keys[data_args.task_name] else: # Again, we try to have some nice defaults but don't hesitate to tweak to your use case. non_label_column_names = [name for name in datasets["train"].column_names if name != "label"] if "sentence1" in non_label_column_names and "sentence2" in non_label_column_names: sentence1_key, sentence2_key = "sentence1", "sentence2" else: if len(non_label_column_names) >= 2: sentence1_key, sentence2_key = non_label_column_names[:2] else: sentence1_key, sentence2_key = non_label_column_names[0], None # Padding strategy if data_args.pad_to_max_length: padding = "max_length" else: # We will pad later, dynamically at batch creation, to the max sequence length in each batch padding = False # Some models have set the order of the labels to use, so let's make sure we do use it. label_to_id = None if ( model.config.label2id != PretrainedConfig(num_labels=num_labels).label2id and data_args.task_name is not None and not is_regression ): # Some have all caps in their config, some don't. label_name_to_id = {k.lower(): v for k, v in model.config.label2id.items()} if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)): label_to_id = {i: int(label_name_to_id[label_list[i]]) for i in range(num_labels)} else: logger.warning( "Your model seems to have been trained with labels, but they don't match the dataset: ", f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}." "\nIgnoring the model labels as a result.", ) elif data_args.task_name is None and not is_regression: label_to_id = {v: i for i, v in enumerate(label_list)} if data_args.max_seq_length > tokenizer.model_max_length: logger.warning( f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." ) max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) def preprocess_function(examples): # Tokenize the texts args = ( (examples[sentence1_key],) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key]) ) result = tokenizer(*args, padding=padding, max_length=max_seq_length, truncation=True) # Map labels to IDs (not necessary for GLUE tasks) if label_to_id is not None and "label" in examples: result["label"] = [(label_to_id[l] if l != -1 else -1) for l in examples["label"]] return result datasets = datasets.map(preprocess_function, batched=True, load_from_cache_file=not data_args.overwrite_cache) if training_args.do_train: if "train" not in datasets: raise ValueError("--do_train requires a train dataset") train_dataset = datasets["train"] if data_args.max_train_samples is not None: train_dataset = train_dataset.select(range(data_args.max_train_samples)) if training_args.do_eval: if "validation" not in datasets and "validation_matched" not in datasets: raise ValueError("--do_eval requires a validation dataset") eval_dataset = datasets["validation_matched" if data_args.task_name == "mnli" else "validation"] if data_args.max_eval_samples is not None: eval_dataset = eval_dataset.select(range(data_args.max_eval_samples)) if training_args.do_predict or data_args.task_name is not None or data_args.test_file is not None: if "test" not in datasets and "test_matched" not in datasets: raise ValueError("--do_predict requires a test dataset") predict_dataset = datasets["test_matched" if data_args.task_name == "mnli" else "test"] if data_args.max_predict_samples is not None: predict_dataset = predict_dataset.select(range(data_args.max_predict_samples)) # Log a few random samples from the training set: if training_args.do_train: for index in random.sample(range(len(train_dataset)), 3): logger.info(f"Sample {index} of the training set: {train_dataset[index]}.") # Get the metric function if data_args.task_name is not None: metric = load_metric("glue", data_args.task_name) else: metric = load_metric("accuracy") # You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a # predictions and label_ids field) and has to return a dictionary string to float. def compute_metrics(p: EvalPrediction): preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1) if data_args.task_name is not None: result = metric.compute(predictions=preds, references=p.label_ids) if len(result) > 1: result["combined_score"] = np.mean(list(result.values())).item() return result elif is_regression: return {"mse": ((preds - p.label_ids) ** 2).mean().item()} else: return {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item()} # Data collator will default to DataCollatorWithPadding, so we change it if we already did the padding. if data_args.pad_to_max_length: data_collator = default_data_collator elif training_args.fp16: data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8) else: data_collator = None # Initialize our Trainer trainer = Trainer( model=model, args=training_args, train_dataset=train_dataset if training_args.do_train else None, eval_dataset=eval_dataset if training_args.do_eval else None, compute_metrics=compute_metrics, tokenizer=tokenizer, data_collator=data_collator, ) # Training if training_args.do_train: checkpoint = None if training_args.resume_from_checkpoint is not None: checkpoint = training_args.resume_from_checkpoint elif last_checkpoint is not None: checkpoint = last_checkpoint train_result = trainer.train(resume_from_checkpoint=checkpoint) metrics = train_result.metrics max_train_samples = ( data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) ) metrics["train_samples"] = min(max_train_samples, len(train_dataset)) trainer.save_model() # Saves the tokenizer too for easy upload trainer.log_metrics("train", metrics) trainer.save_metrics("train", metrics) trainer.save_state() # Evaluation if training_args.do_eval: logger.info("*** Evaluate ***") # Loop to handle MNLI double evaluation (matched, mis-matched) tasks = [data_args.task_name] eval_datasets = [eval_dataset] if data_args.task_name == "mnli": tasks.append("mnli-mm") eval_datasets.append(datasets["validation_mismatched"]) for eval_dataset, task in zip(eval_datasets, tasks): metrics = trainer.evaluate(eval_dataset=eval_dataset) max_eval_samples = ( data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) ) metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) trainer.log_metrics("eval", metrics) trainer.save_metrics("eval", metrics) if training_args.do_predict: logger.info("*** Predict ***") # Loop to handle MNLI double evaluation (matched, mis-matched) tasks = [data_args.task_name] predict_datasets = [predict_dataset] if data_args.task_name == "mnli": tasks.append("mnli-mm") predict_datasets.append(datasets["test_mismatched"]) for predict_dataset, task in zip(predict_datasets, tasks): # Removing the `label` columns because it contains -1 and Trainer won't like that. predict_dataset.remove_columns_("label") predictions = trainer.predict(predict_dataset, metric_key_prefix="predict").predictions predictions = np.squeeze(predictions) if is_regression else np.argmax(predictions, axis=1) output_predict_file = os.path.join(training_args.output_dir, f"predict_results_{task}.txt") if trainer.is_world_process_zero(): with open(output_predict_file, "w") as writer: logger.info(f"***** Predict results {task} *****") writer.write("index\tprediction\n") for index, item in enumerate(predictions): if is_regression: writer.write(f"{index}\t{item:3.3f}\n") else: item = label_list[item] writer.write(f"{index}\t{item}\n") if training_args.push_to_hub: kwargs = {"finetuned_from": model_args.model_name_or_path, "tags": "text-classification"} if data_args.task_name is not None: kwargs["language"] = "en" kwargs["dataset_tags"] = "glue" kwargs["dataset_args"] = data_args.task_name kwargs["dataset"] = f"GLUE {data_args.task_name.upper()}" trainer.push_to_hub(**kwargs)
def main(): parser = argparse.ArgumentParser() # Required parameters parser.add_argument("--train_data_file", default=None, type=str, required=True, help="The input training data file (a text file).") parser.add_argument( "--output_dir", type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written.", ) parser.add_argument( "--model_type", type=str, required=True, help="The model architecture to be trained or fine-tuned.", ) # Other parameters parser.add_argument( "--eval_data_file", default=None, type=str, help= "An optional input evaluation data file to evaluate the perplexity on (a text file).", ) parser.add_argument( "--line_by_line", action="store_true", help= "Whether distinct lines of text in the dataset are to be handled as distinct sequences.", ) parser.add_argument( "--should_continue", action="store_true", help="Whether to continue from latest checkpoint in output_dir") parser.add_argument( "--model_name_or_path", default=None, type=str, help= "The model checkpoint for weights initialization. Leave None if you want to train a model from scratch.", ) parser.add_argument( "--mlm", action="store_true", help= "Train with masked-language modeling loss instead of language modeling." ) parser.add_argument( "--mlm_probability", type=float, default=0.15, help="Ratio of tokens to mask for masked language modeling loss") parser.add_argument( "--config_name", default=None, type=str, help= "Optional pretrained config name or path if not the same as model_name_or_path. If both are None, initialize a new config.", ) parser.add_argument( "--tokenizer_name", default=None, type=str, help= "Optional pretrained tokenizer name or path if not the same as model_name_or_path. If both are None, initialize a new tokenizer.", ) parser.add_argument( "--cache_dir", default=None, type=str, help= "Optional directory to store the pre-trained models downloaded from s3 (instead of the default one)", ) parser.add_argument( "--block_size", default=-1, type=int, help="Optional input sequence length after tokenization." "The training dataset will be truncated in block of this size for training." "Default to the model max input length for single sentence inputs (take into account special tokens).", ) parser.add_argument("--do_train", action="store_true", help="Whether to run training.") parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.") parser.add_argument( "--evaluate_during_training", action="store_true", help="Run evaluation during training at each logging step.") parser.add_argument("--per_gpu_train_batch_size", default=4, type=int, help="Batch size per GPU/CPU for training.") parser.add_argument("--per_gpu_eval_batch_size", default=4, type=int, help="Batch size per GPU/CPU for evaluation.") parser.add_argument( "--gradient_accumulation_steps", type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass.", ) parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument("--num_train_epochs", default=1.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--warmup_ratio", type=float, default=0.1) parser.add_argument( "--max_steps", default=-1, type=int, help= "If > 0: set total number of training steps to perform. Override num_train_epochs.", ) parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.") parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.") parser.add_argument( "--save_total_limit", type=int, default=None, help= "Limit the total amount of checkpoints, delete the older checkpoints in the output_dir, does not delete by default", ) parser.add_argument( "--eval_all_checkpoints", action="store_true", help= "Evaluate all checkpoints starting with the same prefix as model_name_or_path ending and ending with step number", ) parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available") parser.add_argument("--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory") parser.add_argument( "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets") parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") parser.add_argument( "--fp16", action="store_true", help= "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit", ) parser.add_argument( "--fp16_opt_level", type=str, default="O1", help= "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." "See details at https://nvidia.github.io/apex/amp.html", ) parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.") parser.add_argument("--server_port", type=str, default="", help="For distant debugging.") args = parser.parse_args() if args.model_type in ["bert", "roberta", "distilbert", "camembert" ] and not args.mlm: raise ValueError( "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the --mlm " "flag (masked language modeling).") if args.eval_data_file is None and args.do_eval: raise ValueError( "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file " "or remove the --do_eval argument.") if args.should_continue: sorted_checkpoints = _sorted_checkpoints(args) if len(sorted_checkpoints) == 0: raise ValueError( "Used --should_continue but no checkpoint was found in --output_dir." ) else: args.model_name_or_path = sorted_checkpoints[-1] if (os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir and not args.should_continue): raise ValueError( "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome." .format(args.output_dir)) # Setup distant debugging if needed if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() # Setup CUDA, GPU & distributed training if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count() else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend="nccl") args.n_gpu = 1 args.device = device # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN, ) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16, ) # Set seed set_seed(args) # Load pretrained model and tokenizer if args.local_rank not in [-1, 0]: torch.distributed.barrier( ) # Barrier to make sure only the first process in distributed training download model & vocab if args.config_name: config = AutoConfig.from_pretrained(args.config_name, cache_dir=args.cache_dir) elif args.model_name_or_path: config = AutoConfig.from_pretrained(args.model_name_or_path, cache_dir=args.cache_dir) else: # When we release a pip version exposing CONFIG_MAPPING, # we can do `config = CONFIG_MAPPING[args.model_type]()`. raise ValueError( "You are instantiating a new config instance from scratch. This is not supported, but you can do it from another script, save it," "and load it from here, using --config_name") if args.tokenizer_name: tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, cache_dir=args.cache_dir) elif args.model_name_or_path: tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, cache_dir=args.cache_dir) else: raise ValueError( "You are instantiating a new tokenizer from scratch. This is not supported, but you can do it from another script, save it," "and load it from here, using --tokenizer_name") if args.block_size <= 0: args.block_size = tokenizer.max_len # Our input block size will be the max possible for the model else: args.block_size = min(args.block_size, tokenizer.max_len) if args.model_name_or_path: model = AutoModelWithLMHead.from_pretrained( args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config, cache_dir=args.cache_dir, ) else: logger.info("Training new model from scratch") model = AutoModelWithLMHead.from_config(config) model.to(args.device) if args.local_rank == 0: torch.distributed.barrier( ) # End of barrier to make sure only the first process in distributed training download model & vocab logger.info("Training/evaluation parameters %s", args) # Training if args.do_train: if args.local_rank not in [-1, 0]: torch.distributed.barrier( ) # Barrier to make sure only the first process in distributed training process the dataset, and the others will use the cache train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False) if args.local_rank == 0: torch.distributed.barrier() global_step, tr_loss = train(args, train_dataset, model, tokenizer) logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) # Saving best-practices: if you use save_pretrained for the model and tokenizer, you can reload them using from_pretrained() if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): # Create output directory if needed if args.local_rank in [-1, 0]: os.makedirs(args.output_dir, exist_ok=True) logger.info("Saving model checkpoint to %s", args.output_dir) # Save a trained model, configuration and tokenizer using `save_pretrained()`. # They can then be reloaded using `from_pretrained()` model_to_save = (model.module if hasattr(model, "module") else model ) # Take care of distributed/parallel training model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) # Good practice: save your training arguments together with the trained model torch.save(args, os.path.join(args.output_dir, "training_args.bin")) # Load a trained model and vocabulary that you have fine-tuned model = AutoModelWithLMHead.from_pretrained(args.output_dir) tokenizer = AutoTokenizer.from_pretrained(args.output_dir) model.to(args.device) # Evaluation results = {} if args.do_eval and args.local_rank in [-1, 0]: checkpoints = [args.output_dir] if args.eval_all_checkpoints: checkpoints = list( os.path.dirname(c) for c in sorted( glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))) logging.getLogger("transformers.modeling_utils").setLevel( logging.WARN) # Reduce logging logger.info("Evaluate the following checkpoints: %s", checkpoints) for checkpoint in checkpoints: global_step = checkpoint.split( "-")[-1] if len(checkpoints) > 1 else "" prefix = checkpoint.split( "/")[-1] if checkpoint.find("checkpoint") != -1 else "" model = AutoModelWithLMHead.from_pretrained(checkpoint) model.to(args.device) result = evaluate(args, model, tokenizer, prefix=prefix) result = dict( (k + "_{}".format(global_step), v) for k, v in result.items()) results.update(result) return results
def train(args): logger.debug('training function') set_seed(args) num_labels = args.num_labels output_mode = args.output_mode logger.debug('num_labels: {}'.format(num_labels)) logger.debug('output_mode: {}'.format(output_mode)) # Load pretrained model and tokenizer config = AutoConfig.from_pretrained( args.base_model_name, num_labels=num_labels, finetuning_task=args.base_job_name, ) tokenizer = AutoTokenizer.from_pretrained(args.base_model_name) model = AutoModelForSequenceClassification.from_pretrained( args.base_model_name, config=config) # from transformers import BertForSequenceClassification, AdamW, BertConfig # # Load BertForSequenceClassification, the pretrained BERT model with a single # # linear classification layer on top. # model = BertForSequenceClassification.from_pretrained( # "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab. # num_labels = 2, # The number of output labels--2 for binary classification. # # You can increase this for multi-class tasks. # output_attentions = False, # Whether the model returns attentions weights. # output_hidden_states = False, # Whether the model returns all hidden-states. # ) from transformers import AdamW # Note: AdamW is a class from the huggingface library (as opposed to pytorch) # I believe the 'W' stands for 'Weight Decay fix" optimizer = AdamW( model.parameters(), lr=2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5 eps=1e-8 # args.adam_epsilon - default is 1e-8. ) logger.debug('train set: {}'.format(args.train)) logger.debug('test set: {}'.format(args.test)) train_dataset, test_dataset = prepare_data_and_tokenize(args, tokenizer) # FIXME: this need to go into the SageMaker parts # If there's a GPU available... if torch.cuda.is_available(): # Tell pytorch to run this model on the GPU. model.cuda() # Tell PyTorch to use the GPU. device = torch.device("cuda") print('There are %d GPU(s) available.' % torch.cuda.device_count()) print('We will use the GPU:', torch.cuda.get_device_name(0)) # If not... else: print('No GPU available, using the CPU instead.') device = torch.device("cpu") from torch.utils.data import DataLoader, RandomSampler, SequentialSampler # The DataLoader needs to know our batch size for training, so we specify it # here. For fine-tuning BERT on a specific task, the authors recommend a batch # size of 16 or 32. batch_size = args.batch_size # Create the DataLoaders for our training and validation sets. # We'll take training samples in random order. train_dataloader = DataLoader( train_dataset, # The training samples. sampler=RandomSampler(train_dataset), # Select batches randomly batch_size=batch_size # Trains with this batch size. ) # For validation the order doesn't matter, so we'll just read them sequentially. test_dataloader = DataLoader( test_dataset, # The validation samples. sampler=SequentialSampler( test_dataset), # Pull out batches sequentially. batch_size=batch_size # Evaluate with this batch size. ) # We'll store a number of quantities such as training and validation loss, # validation accuracy, and timings. training_stats = [] # Measure the total training time for the whole run. total_t0 = time.time() epochs = args.epochs from transformers import get_linear_schedule_with_warmup # Total number of training steps is [number of batches] x [number of epochs]. # (Note that this is not the same as the number of training samples). total_steps = len(train_dataloader) * epochs # Create the learning rate scheduler. scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=0, # Default value in run_glue.py num_training_steps=total_steps) # For each epoch... for epoch_i in range(0, epochs): # ======================================== # Training # ======================================== # Perform one full pass over the training set. print("") print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs)) print('Training...') # Measure how long the training epoch takes. t0 = time.time() # Reset the total loss for this epoch. total_train_loss = 0 # Put the model into training mode. Don't be mislead--the call to # `train` just changes the *mode*, it doesn't *perform* the training. # `dropout` and `batchnorm` layers behave differently during training # vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch) model.train() # For each batch of training data... for step, batch in enumerate(train_dataloader): # Progress update every 40 batches. if step % 40 == 0 and not step == 0: # Calculate elapsed time in minutes. elapsed = format_time(time.time() - t0) # Report progress. print(' Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format( step, len(train_dataloader), elapsed)) # Unpack this training batch from our dataloader. # # As we unpack the batch, we'll also copy each tensor to the GPU using the # `to` method. # # `batch` contains three pytorch tensors: # [0]: input ids # [1]: attention masks # [2]: labels #logger.debug('dataset: {}'.format(train_dataset[5])) b_input_ids = batch[0].to(device) b_input_mask = batch[1].to(device) b_labels = batch[2].to(device) # Always clear any previously calculated gradients before performing a # backward pass. PyTorch doesn't do this automatically because # accumulating the gradients is "convenient while training RNNs". # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch) model.zero_grad() # Perform a forward pass (evaluate the model on this training batch). # The documentation for this `model` function is here: # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification # It returns different numbers of parameters depending on what arguments # arge given and what flags are set. For our useage here, it returns # the loss (because we provided labels) and the "logits"--the model # outputs prior to activation. loss, logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels) # Accumulate the training loss over all of the batches so that we can # calculate the average loss at the end. `loss` is a Tensor containing a # single value; the `.item()` function just returns the Python value # from the tensor. total_train_loss += loss.item() # Perform a backward pass to calculate the gradients. loss.backward() # Clip the norm of the gradients to 1.0. # This is to help prevent the "exploding gradients" problem. torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) # Update parameters and take a step using the computed gradient. # The optimizer dictates the "update rule"--how the parameters are # modified based on their gradients, the learning rate, etc. optimizer.step() # Update the learning rate. scheduler.step() # Calculate the average loss over all of the batches. avg_train_loss = total_train_loss / len(train_dataloader) # Measure how long this epoch took. training_time = format_time(time.time() - t0) print("") print(" Average training loss: {0:.2f}".format(avg_train_loss)) print(" Training epoch took: {:}".format(training_time)) # ======================================== # Validation # ======================================== # After the completion of each training epoch, measure our performance on # our validation set. print("") print("Running Validation...") t0 = time.time() # Put the model in evaluation mode--the dropout layers behave differently # during evaluation. model.eval() # Tracking variables total_eval_accuracy = 0 total_eval_loss = 0 nb_eval_steps = 0 # Evaluate data for one epoch for batch in test_dataloader: # Unpack this training batch from our dataloader. # # As we unpack the batch, we'll also copy each tensor to the GPU using # the `to` method. # # `batch` contains three pytorch tensors: # [0]: input ids # [1]: attention masks # [2]: labels b_input_ids = batch[0].to(device) b_input_mask = batch[1].to(device) b_labels = batch[2].to(device) # Tell pytorch not to bother with constructing the compute graph during # the forward pass, since this is only needed for backprop (training). with torch.no_grad(): # Forward pass, calculate logit predictions. # token_type_ids is the same as the "segment ids", which # differentiates sentence 1 and 2 in 2-sentence tasks. # The documentation for this `model` function is here: # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification # Get the "logits" output by the model. The "logits" are the output # values prior to applying an activation function like the softmax. (loss, logits) = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels) # Accumulate the validation loss. total_eval_loss += loss.item() # Move logits and labels to CPU logits = logits.detach().cpu().numpy() label_ids = b_labels.to('cpu').numpy() # Calculate the accuracy for this batch of test sentences, and # accumulate it over all batches. total_eval_accuracy += flat_accuracy(logits, label_ids) # Report the final accuracy for this validation run. avg_val_accuracy = total_eval_accuracy / len(test_dataloader) print(" Accuracy: {0:.2f}".format(avg_val_accuracy)) # Calculate the average loss over all of the batches. avg_val_loss = total_eval_loss / len(test_dataset) # Measure how long the validation run took. validation_time = format_time(time.time() - t0) print(" Validation Loss: {0:.2f}".format(avg_val_loss)) print(" Validation took: {:}".format(validation_time)) # Record all statistics from this epoch. training_stats.append({ 'epoch': epoch_i + 1, 'Training Loss': avg_train_loss, 'Valid. Loss': avg_val_loss, 'Valid. Accur.': avg_val_accuracy, 'Training Time': training_time, 'Validation Time': validation_time }) print("") print("Training complete!") print("Total training took {:} (h:mm:ss)".format( format_time(time.time() - total_t0))) return (model, tokenizer)
# -*- coding: utf-8 -*- # @Time : 2021/1/29 15:34 # @Author : Jclian91 # @File : ernie_model_predict.py # @Place : Yangpu, Shanghai import json import torch import numpy as np from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification from params import * from model_train import convert_text_to_ids, seq_padding device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") config = AutoConfig.from_pretrained("../ernie-1.0", num_labels=num_labels, hidden_dropout_prob=hidden_dropout_prob) model = AutoModelForSequenceClassification.from_pretrained("../ernie-1.0", config=config) model.to(device) state_dict = torch.load('{}_ernie_cls.pth'.format(dataset)) model.load_state_dict(state_dict) text = "F_A-18超级大黄蜂燃烧器黄昏!" text = "曾经在车迷圈子中盛传一句话,“传统品牌要是发力新能源,把特斯拉打得渣儿都不剩”。现在实际情况是,不仅仅是传统汽车品牌的纯电动车被打得节节败退,甚至Model 3(参数|图片)还强力侵蚀着本属于传统豪华品牌的汽油车型市场。而就在前天,华晨宝马官方宣布旗下纯电动中型SUV-iX3官方降价7万元,在此之前订购的用户也可以联系经销商退还差价。那么和特斯拉对比显得更有诚意的做法,能不能打败订单已经爆表的Model Y呢?" tokenizer = AutoTokenizer.from_pretrained("../ernie-1.0") input_ids, token_type_ids = convert_text_to_ids(tokenizer, [text], max_sequence_length) # print(input_ids, token_type_ids) input_ids = seq_padding(tokenizer, input_ids)
parser.add_argument('--dataset', type=str, required=True) parser.add_argument('--model', type=str, required=True) parser.add_argument('--rationale-selection', type=str, required=True) parser.add_argument('--output', type=str, required=True) args = parser.parse_args() corpus = {doc['doc_id']: doc for doc in jsonlines.open(args.corpus)} dataset = jsonlines.open(args.dataset) rationale_selection = jsonlines.open(args.rationale_selection) output = jsonlines.open(args.output, 'w') device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') print(f'Using device "{device}"') tokenizer = AutoTokenizer.from_pretrained(args.model) config = AutoConfig.from_pretrained(args.model, num_labels=3) model = AutoModelForSequenceClassification.from_pretrained( args.model, config=config).eval().to(device) LABELS = ['CONTRADICT', 'NOT_ENOUGH_INFO', 'SUPPORT'] def encode(sentences, claims): encoded_dict = tokenizer.batch_encode_plus(zip(sentences, claims), pad_to_max_length=True, return_tensors='pt') if encoded_dict['input_ids'].size(1) > 512: encoded_dict = tokenizer.batch_encode_plus( zip(sentences, claims), max_length=512, pad_to_max_length=True,
TEMP = "temp/" OUTPUT = "modelcpu/" parser = argparse.ArgumentParser() parser.add_argument('-c', '--config_file', default='config/bert_verify_config.json', help='model config file') # parser.add_argument( # '--local_rank', default=0, # help='used for distributed parallel') args = parser.parse_args() with open(args.config_file) as fin: config = json.load(fin, object_hook=lambda d: SimpleNamespace(**d)) bert_config = AutoConfig.from_pretrained(config.bert_model_path, cache_dir=TEMP) # main(args.config_file) WRAPPED_MODEL = AutoModelWithLMHead.from_pretrained( config.bert_model_path, from_tf=False, config=bert_config, cache_dir=TEMP, ) tokenizer = BertTokenizer.from_pretrained(config.bert_model_path) WRAPPED_MODEL.to('cpu') xm.save(WRAPPED_MODEL.state_dict(), os.path.join(OUTPUT, 'pytorch_model.bin')) # WRAPPED_MODEL.save_pretrained(OUTPUT) print('DONE')
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TFTrainingArguments)) model_args, data_args, training_args = parser.parse_args_into_dataclasses() if ( os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir ): raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome." ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO, ) logger.warning( "device: %s, n_gpu: %s, 16-bits training: %s", training_args.device, training_args.n_gpu, training_args.fp16, ) logger.info("Training/evaluation parameters %s", training_args) # Set seed set_seed(training_args.seed) try: processor = processors[data_args.task_name]() label_list = processor.get_labels() num_labels = len(label_list) except KeyError: raise ValueError("Task not found: %s" % (data_args.task_name)) # Load pretrained model and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, num_labels=num_labels, finetuning_task=data_args.task_name, cache_dir=model_args.cache_dir, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, ) with training_args.strategy.scope(): model = TFAutoModelForMultipleChoice.from_pretrained( model_args.model_name_or_path, from_pt=bool(".bin" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, ) # Get datasets train_dataset = ( TFMultipleChoiceDataset( data_dir=data_args.data_dir, tokenizer=tokenizer, task=data_args.task_name, max_seq_length=data_args.max_seq_length, overwrite_cache=data_args.overwrite_cache, mode=Split.train, ) if training_args.do_train else None ) eval_dataset = ( TFMultipleChoiceDataset( data_dir=data_args.data_dir, tokenizer=tokenizer, task=data_args.task_name, max_seq_length=data_args.max_seq_length, overwrite_cache=data_args.overwrite_cache, mode=Split.dev, ) if training_args.do_eval else None ) def compute_metrics(p: EvalPrediction) -> Dict: preds = np.argmax(p.predictions, axis=1) return {"acc": simple_accuracy(preds, p.label_ids)} # Initialize our Trainer trainer = TFTrainer( model=model, args=training_args, train_dataset=train_dataset.get_dataset() if train_dataset else None, eval_dataset=eval_dataset.get_dataset() if eval_dataset else None, compute_metrics=compute_metrics, ) # Training if training_args.do_train: trainer.train() trainer.save_model() tokenizer.save_pretrained(training_args.output_dir) # Evaluation results = {} if training_args.do_eval: logger.info("*** Evaluate ***") result = trainer.evaluate() output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key, value in result.items(): logger.info(" %s = %s", key, value) writer.write("%s = %s\n" % (key, value)) results.update(result) return results
def train(cfg): results_path = os.path.join('./results', cfg['train_id_name']) if not os.path.exists(results_path): os.mkdir(results_path) os.environ['WANDB_PROJECT'] = 'KLUE_PROJECT' os.environ['WANDB_LOG_MODEL'] = 'true' MODEL_NAME = cfg['model_name'] if MODEL_NAME == 'monologg/kobert': tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert') else: tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) with open('../input/data/train/ner_tags.pickle', 'rb') as f: ner_tokens = pickle.load(f) special_tokens_dict = {'additional_special_tokens': ner_tokens} tokenizer.add_special_tokens(special_tokens_dict) train_dataset = load_data(cfg['train_data_path']) dev_dataset = load_data(cfg['valid_data_path']) train_label = train_dataset['label'].values dev_label = dev_dataset['label'].values tokenized_train = tokenized_dataset(train_dataset, tokenizer) tokenized_dev = tokenized_dataset(dev_dataset, tokenizer) RE_train_dataset = RE_Dataset(tokenized_train, train_label) RE_dev_dataset = RE_Dataset(tokenized_dev, dev_label) device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') model_config = AutoConfig.from_pretrained(MODEL_NAME) model_config.num_labels = 42 # model_config.vocab_size += len(ner_tokens) model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config=model_config) model.resize_token_embeddings(len(tokenizer)) model.parameters model.to(device) training_configs = cfg['train_args'] training_args = TrainingArguments( output_dir=results_path, save_total_limit=training_configs['save_total_limit'], save_steps=training_configs['save_steps'], num_train_epochs=training_configs['num_train_epochs'], learning_rate=training_configs['learning_rate'], per_device_train_batch_size=training_configs['per_device_train_batch_size'], per_device_eval_batch_size=training_configs['per_device_eval_batch_size'], warmup_steps=training_configs['warmup_steps'], weight_decay=training_configs['weight_decay'], logging_dir=training_configs['logging_dir'], logging_steps=training_configs['logging_steps'], evaluation_strategy=training_configs['evaluation_strategy'], load_best_model_at_end=True, ) trainer = Trainer( model=model, args=training_args, train_dataset=RE_train_dataset, eval_dataset=RE_dev_dataset, compute_metrics=compute_metrics, callbacks=[transformers.EarlyStoppingCallback(early_stopping_patience=cfg['early_stopping_patience'],),] ) transformers.integrations.WandbCallback() print('Start Training.') trainer.train() print('Fininshed Training.')
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) # Detecting last checkpoint. last_checkpoint = None if os.path.isdir( training_args.output_dir ) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is None and len(os.listdir( training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome.") elif last_checkpoint is not None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) logger.setLevel(logging.INFO if is_main_process(training_args.local_rank ) else logging.WARN) # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) # Set the verbosity to info of the Transformers logger (on main process only): if is_main_process(training_args.local_rank): transformers.utils.logging.set_verbosity_info() logger.info("Training/evaluation parameters %s", training_args) # Set seed before initializing model. set_seed(training_args.seed) # 1. First, let's load the dataset raw_datasets = DatasetDict() raw_datasets["train"] = load_dataset(data_args.dataset_name, data_args.dataset_config_name, split=data_args.train_split_name) raw_datasets["eval"] = load_dataset(data_args.dataset_name, data_args.dataset_config_name, split=data_args.eval_split_name) if data_args.audio_column_name not in raw_datasets["train"].column_names: raise ValueError( f"--audio_column_name '{data_args.audio_column_name}' not found in dataset '{data_args.dataset_name}'. " "Make sure to set `--audio_column_name` to the correct audio column - one of " f"{', '.join(raw_datasets['train'].column_names)}.") if data_args.text_column_name not in raw_datasets["train"].column_names: raise ValueError( f"--text_column_name {data_args.text_column_name} not found in dataset '{data_args.dataset_name}'. " "Make sure to set `--text_column_name` to the correct text column - one of " f"{', '.join(raw_datasets['train'].column_names)}.") # prepare dataset if data_args.max_train_samples is not None: raw_datasets["train"] = raw_datasets["train"].select( range(data_args.max_train_samples)) if data_args.max_eval_samples is not None: raw_datasets["eval"] = raw_datasets["eval"].select( range(data_args.max_eval_samples)) # 2. We remove some special characters from the datasets # that make training complicated and do not help in transcribing the speech # E.g. characters, such as `,` and `.` do not really have an acoustic characteristic # that could be easily picked up by the model chars_to_ignore_regex = (f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else None) def remove_special_characters(batch): if chars_to_ignore_regex is not None: batch["target_text"] = re.sub( chars_to_ignore_regex, "", batch[data_args.text_column_name]).lower() + " " else: batch["target_text"] = batch[ data_args.text_column_name].lower() + " " return batch with training_args.main_process_first( desc="dataset map special characters removal"): raw_datasets = raw_datasets.map( remove_special_characters, remove_columns=[data_args.text_column_name], desc="remove special characters from datasets", ) # 3. Next, we create the vocabulary of the model by extracting all unique characters from # the training and evaluation datasets # We need to make sure that only first rank saves vocabulary # make sure all processes wait until vocab is created vocab_file = os.path.join(training_args.output_dir, "vocab.json") with training_args.main_process_first(): if training_args.overwrite_output_dir and os.path.isfile(vocab_file): os.remove(vocab_file) with training_args.main_process_first( desc="dataset map vocabulary creation"): if not os.path.isfile(vocab_file): os.makedirs(training_args.output_dir, exist_ok=True) vocab_dict = create_vocabulary_from_data(raw_datasets) # save vocab dict to be loaded into tokenizer with open(vocab_file, "w") as file: json.dump(vocab_dict, file) # 4. Now we can instantiate the configuration, feature extractor, tokenizer and model # Note for distributed training, the .from_pretrained methods guarantee that only # one local process can concurrently download model & vocab. # load config config = AutoConfig.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_auth_token=data_args.use_auth_token) # tokenizer is defined by `tokenizer_class` if present in config else by `model_type` config_for_tokenizer = config if config.tokenizer_class is not None else None tokenizer_type = config.model_type if config.tokenizer_class is None else None # load feature_extractor, tokenizer and create processor tokenizer = AutoTokenizer.from_pretrained( training_args.output_dir, config=config_for_tokenizer, tokenizer_type=tokenizer_type, unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|", use_auth_token=data_args.use_auth_token, ) feature_extractor = AutoFeatureExtractor.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_auth_token=data_args.use_auth_token) processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer) # adapt config config.update({ "feat_proj_dropout": model_args.feat_proj_dropout, "attention_dropout": model_args.attention_dropout, "hidden_dropout": model_args.hidden_dropout, "final_dropout": model_args.final_dropout, "mask_time_prob": model_args.mask_time_prob, "gradient_checkpointing": training_args.gradient_checkpointing, "layerdrop": model_args.layerdrop, "ctc_loss_reduction": model_args.ctc_loss_reduction, "pad_token_id": processor.tokenizer.pad_token_id, "vocab_size": len(processor.tokenizer), "activation_dropout": model_args.activation_dropout, }) # create model model = AutoModelForCTC.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir, config=config, use_auth_token=data_args.use_auth_token, ) # freeze encoder if model_args.freeze_feature_extractor: model.freeze_feature_extractor() # 5. Now we preprocess the datasets including loading the audio, resampling and normalization # Thankfully, `datasets` takes care of automatically loading and resampling the audio, # so that we just need to set the correct target sampling rate and normalize the input # via the `feature_extractor` # make sure that dataset decodes audio with correct sampling rate raw_datasets = raw_datasets.cast_column( data_args.audio_column_name, datasets.features.Audio(sampling_rate=feature_extractor.sampling_rate)) # derive max & min input length for sample rate & max duration max_input_length = data_args.max_duration_in_seconds * processor.feature_extractor.sampling_rate min_input_length = data_args.min_duration_in_seconds * processor.feature_extractor.sampling_rate # Preprocessing the datasets. # We need to read the audio files as arrays and tokenize the targets. def prepare_dataset(batch): # load audio sample = batch[data_args.audio_column_name] batch["input_values"] = processor( sample["array"], sampling_rate=sample["sampling_rate"], truncate=True, max_length=max_input_length).input_values[0] batch["input_length"] = len(batch["input_values"]) # Setup the processor for targets with processor.as_target_processor(): batch["labels"] = processor(batch["target_text"]).input_ids return batch with training_args.main_process_first(desc="dataset map preprocessing"): vectorized_datasets = raw_datasets.map( prepare_dataset, remove_columns=raw_datasets["train"].column_names, num_proc=data_args.preprocessing_num_workers, desc="preprocess datasets", ) if min_input_length > 0.0: # filter data that is shorter than min_input_length vectorized_datasets = vectorized_datasets.filter( lambda x: x > min_input_length, num_proc=data_args.preprocessing_num_workers, input_columns=["input_length"], ) vectorized_datasets = vectorized_datasets.remove_columns( "input_length") # 6. Next, we can prepare the training. # Let's use word error rate (WER) as our evaluation metric, # instantiate a data collator and the trainer # Define Metric during training wer_metric = load_metric("wer") # for large datasets it is advised to run the preprocessing on a # single machine first with ``args.preprocessing_only`` since there will mostly likely # be a timeout when running the script in distributed mode. # In a second step ``args.preprocessing_only`` can then be set to `False` to load the # cached dataset if data_args.preprocessing_only: logger.info( f"Data preprocessing finished. Files cached at {vectorized_datasets.cache_files}" ) return def compute_metrics(pred): pred_logits = pred.predictions pred_ids = np.argmax(pred_logits, axis=-1) pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id pred_str = processor.batch_decode(pred_ids) # we do not want to group tokens when computing the metrics label_str = processor.batch_decode(pred.label_ids, group_tokens=False) wer = wer_metric.compute(predictions=pred_str, references=label_str) return {"wer": wer} # Instantiate custom data collator data_collator = DataCollatorCTCWithPadding(processor=processor) # Initialize Trainer trainer = Trainer( model=model, data_collator=data_collator, args=training_args, compute_metrics=compute_metrics, train_dataset=vectorized_datasets["train"] if training_args.do_train else None, eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None, tokenizer=processor.feature_extractor, ) # 7. Finally, we can start training # Training if training_args.do_train: # use last checkpoint if exist if last_checkpoint is not None: checkpoint = last_checkpoint elif os.path.isdir(model_args.model_name_or_path): checkpoint = model_args.model_name_or_path else: checkpoint = None # Save the feature_extractor and the tokenizer if is_main_process(training_args.local_rank): processor.save_pretrained(training_args.output_dir) train_result = trainer.train(resume_from_checkpoint=checkpoint) trainer.save_model() metrics = train_result.metrics max_train_samples = (data_args.max_train_samples if data_args.max_train_samples is not None else len(vectorized_datasets["train"])) metrics["train_samples"] = min(max_train_samples, len(vectorized_datasets["train"])) trainer.log_metrics("train", metrics) trainer.save_metrics("train", metrics) trainer.save_state() # Evaluation results = {} if training_args.do_eval: logger.info("*** Evaluate ***") metrics = trainer.evaluate() max_eval_samples = (data_args.max_eval_samples if data_args.max_eval_samples is not None else len( vectorized_datasets["eval"])) metrics["eval_samples"] = min(max_eval_samples, len(vectorized_datasets["eval"])) trainer.log_metrics("eval", metrics) trainer.save_metrics("eval", metrics) # Write model card and (optionally) push to hub config_name = data_args.dataset_config_name if data_args.dataset_config_name is not None else "na" kwargs = { "finetuned_from": model_args.model_name_or_path, "tasks": "speech-recognition", "tags": ["automatic-speech-recognition", data_args.dataset_name], "dataset_args": f"Config: {config_name}, Training split: {data_args.train_split_name}, Eval split: {data_args.eval_split_name}", "dataset": f"{data_args.dataset_name.upper()} - {config_name.upper()}", } if "common_voice" in data_args.dataset_name: kwargs["language"] = config_name if training_args.push_to_hub: trainer.push_to_hub(**kwargs) else: trainer.create_model_card(**kwargs) return results
def __init__( self, labels: T.List[str], model_path: str, cache_dir: str, output_dir: T.Optional[str] = None, num_train_epochs: T.Optional[int] = None, train_file: T.Optional[str] = None, dev_file: T.Optional[str] = None, ): """. Args: labels: list of valid labels used in the dataset model_path: name of model being used or filepath to where the model is stored model_path_tokenizer: name or path of tokenizer being used. cache_dir: directory where cache & output are kept. num_train_epochs: obvious. Why? To make unit testing faster. train_file: Required if we want to do .train() dev_file:: Required if we want to do .train_and_evaluate() """ self.cache_dir = cache_dir self.model_path = model_path self.output_dir = output_dir self.num_train_epochs = num_train_epochs self.num_splits = 5 # self.num_train_epochs = 1 # self.num_splits = 3 self.labels = labels self.num_labels = len(labels) self.task_name = "classification" self.label_map = {label: i for i, label in enumerate(self.labels)} self.label_map_reverse = { i: label for i, label in enumerate(self.labels) } self.average_metrics = {} self.config = AutoConfig.from_pretrained( self.model_path, num_labels=self.num_labels, finetuning_task=self.task_name, cache_dir=self.cache_dir, ) self.tokenizer = AutoTokenizer.from_pretrained( self.model_path, cache_dir=self.cache_dir, ) self.model = AutoModelForSequenceClassification.from_pretrained( self.model_path, from_tf=False, config=self.config, cache_dir=self.cache_dir, ) if train_file is not None: self.train_dataset = self.make_dataset( train_file, Settings.CONTENT_COL, Settings.LABEL_COL, ) self.train_dataset_path = train_file else: self.train_dataset = None if dev_file is not None: self.eval_dataset = self.make_dataset( dev_file, Settings.CONTENT_COL, Settings.LABEL_COL, ) else: self.eval_dataset = None
def main(): # Parse the arguments args = parse_args() # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. accelerator = Accelerator() # Make one log on every process with the configuration for debugging. logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO, ) logger.info(accelerator.state) # Setup logging, we only want one process per machine to log things on the screen. # accelerator.is_local_main_process is only True for one process per machine. logger.setLevel(logging.INFO if accelerator.is_local_main_process else logging.ERROR) if accelerator.is_local_main_process: datasets.utils.logging.set_verbosity_warning() transformers.utils.logging.set_verbosity_info() else: datasets.utils.logging.set_verbosity_error() transformers.utils.logging.set_verbosity_error() # If passed along, set the training seed now. if args.seed is not None: set_seed(args.seed) # Handle the repository creation if accelerator.is_main_process: if args.push_to_hub: if args.hub_model_id is None: repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token) else: repo_name = args.hub_model_id repo = Repository(args.output_dir, clone_from=repo_name) elif args.output_dir is not None: os.makedirs(args.output_dir, exist_ok=True) accelerator.wait_for_everyone() # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ # (the dataset will be downloaded automatically from the datasets Hub). # # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called # 'text' is found. You can easily tweak this behavior (see below). # # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. if args.dataset_name is not None: # Downloading and loading a dataset from the hub. raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name) else: data_files = {} if args.train_file is not None: data_files["train"] = args.train_file if args.validation_file is not None: data_files["validation"] = args.validation_file extension = args.train_file.split(".")[-1] raw_datasets = load_dataset(extension, data_files=data_files) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. # Load pretrained model and tokenizer # # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. if args.config_name: config = AutoConfig.from_pretrained(args.model_name_or_path) elif args.model_name_or_path: config = AutoConfig.from_pretrained(args.model_name_or_path) else: config = CONFIG_MAPPING[args.model_type]() logger.warning("You are instantiating a new config instance from scratch.") if args.tokenizer_name: tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, use_fast=not args.use_slow_tokenizer) elif args.model_name_or_path: tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, use_fast=not args.use_slow_tokenizer) else: raise ValueError( "You are instantiating a new tokenizer from scratch. This is not supported by this script." "You can do it from another script, save it, and load it from here, using --tokenizer_name." ) if args.model_name_or_path: model = AutoModelForSeq2SeqLM.from_pretrained( args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config, ) else: logger.info("Training new model from scratch") model = AutoModelForSeq2SeqLM.from_config(config) model.resize_token_embeddings(len(tokenizer)) # Set decoder_start_token_id if model.config.decoder_start_token_id is None and isinstance(tokenizer, (MBartTokenizer, MBartTokenizerFast)): assert ( args.target_lang is not None and args.source_lang is not None ), "mBart requires --target_lang and --source_lang" if isinstance(tokenizer, MBartTokenizer): model.config.decoder_start_token_id = tokenizer.lang_code_to_id[args.target_lang] else: model.config.decoder_start_token_id = tokenizer.convert_tokens_to_ids(args.target_lang) if model.config.decoder_start_token_id is None: raise ValueError("Make sure that `config.decoder_start_token_id` is correctly defined") prefix = args.source_prefix if args.source_prefix is not None else "" # Preprocessing the datasets. # First we tokenize all the texts. column_names = raw_datasets["train"].column_names # For translation we set the codes of our source and target languages (only useful for mBART, the others will # ignore those attributes). if isinstance(tokenizer, (MBartTokenizer, MBartTokenizerFast)): if args.source_lang is not None: tokenizer.src_lang = args.source_lang if args.target_lang is not None: tokenizer.tgt_lang = args.target_lang # Get the language codes for input/target. source_lang = args.source_lang.split("_")[0] target_lang = args.target_lang.split("_")[0] padding = "max_length" if args.pad_to_max_length else False # Temporarily set max_target_length for training. max_target_length = args.max_target_length padding = "max_length" if args.pad_to_max_length else False def preprocess_function(examples): inputs = [ex[source_lang] for ex in examples["translation"]] targets = [ex[target_lang] for ex in examples["translation"]] inputs = [prefix + inp for inp in inputs] model_inputs = tokenizer(inputs, max_length=args.max_source_length, padding=padding, truncation=True) # Setup the tokenizer for targets with tokenizer.as_target_tokenizer(): labels = tokenizer(targets, max_length=max_target_length, padding=padding, truncation=True) # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore # padding in the loss. if padding == "max_length" and args.ignore_pad_token_for_loss: labels["input_ids"] = [ [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"] ] model_inputs["labels"] = labels["input_ids"] return model_inputs with accelerator.main_process_first(): processed_datasets = raw_datasets.map( preprocess_function, batched=True, num_proc=args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not args.overwrite_cache, desc="Running tokenizer on dataset", ) train_dataset = processed_datasets["train"] eval_dataset = processed_datasets["validation"] # Log a few random samples from the training set: for index in random.sample(range(len(train_dataset)), 3): logger.info(f"Sample {index} of the training set: {train_dataset[index]}.") # DataLoaders creation: label_pad_token_id = -100 if args.ignore_pad_token_for_loss else tokenizer.pad_token_id if args.pad_to_max_length: # If padding was already done ot max length, we use the default data collator that will just convert everything # to tensors. data_collator = default_data_collator else: # Otherwise, `DataCollatorWithPadding` will apply dynamic padding for us (by padding to the maximum length of # the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple # of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta). data_collator = DataCollatorForSeq2Seq( tokenizer, model=model, label_pad_token_id=label_pad_token_id, pad_to_multiple_of=8 if accelerator.use_fp16 else None, ) train_dataloader = DataLoader( train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size ) eval_dataloader = DataLoader(eval_dataset, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size) # Optimizer # Split weights in two groups, one with weight decay and the other not. no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], "weight_decay": args.weight_decay, }, { "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0, }, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate) # Prepare everything with our `accelerator`. model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare( model, optimizer, train_dataloader, eval_dataloader ) # Note -> the training dataloader needs to be prepared before we grab his length below (cause its length will be # shorter in multiprocess) # Scheduler and math around the number of training steps. num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) if args.max_train_steps is None: args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch else: args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) lr_scheduler = get_scheduler( name=args.lr_scheduler_type, optimizer=optimizer, num_warmup_steps=args.num_warmup_steps, num_training_steps=args.max_train_steps, ) metric = load_metric("sacrebleu") def postprocess_text(preds, labels): preds = [pred.strip() for pred in preds] labels = [[label.strip()] for label in labels] return preds, labels # Train! total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps logger.info("***** Running training *****") logger.info(f" Num examples = {len(train_dataset)}") logger.info(f" Num Epochs = {args.num_train_epochs}") logger.info(f" Instantaneous batch size per device = {args.per_device_train_batch_size}") logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}") logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") logger.info(f" Total optimization steps = {args.max_train_steps}") # Only show the progress bar once on each machine. progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process) completed_steps = 0 for epoch in range(args.num_train_epochs): model.train() for step, batch in enumerate(train_dataloader): outputs = model(**batch) loss = outputs.loss loss = loss / args.gradient_accumulation_steps accelerator.backward(loss) if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1: optimizer.step() lr_scheduler.step() optimizer.zero_grad() progress_bar.update(1) completed_steps += 1 if completed_steps >= args.max_train_steps: break model.eval() if args.val_max_target_length is None: args.val_max_target_length = args.max_target_length gen_kwargs = { "max_length": args.val_max_target_length if args is not None else config.max_length, "num_beams": args.num_beams, } for step, batch in enumerate(eval_dataloader): with torch.no_grad(): generated_tokens = accelerator.unwrap_model(model).generate( batch["input_ids"], attention_mask=batch["attention_mask"], **gen_kwargs, ) generated_tokens = accelerator.pad_across_processes( generated_tokens, dim=1, pad_index=tokenizer.pad_token_id ) labels = batch["labels"] if not args.pad_to_max_length: # If we did not pad to max length, we need to pad the labels too labels = accelerator.pad_across_processes(batch["labels"], dim=1, pad_index=tokenizer.pad_token_id) generated_tokens = accelerator.gather(generated_tokens).cpu().numpy() labels = accelerator.gather(labels).cpu().numpy() if args.ignore_pad_token_for_loss: # Replace -100 in the labels as we can't decode them. labels = np.where(labels != -100, labels, tokenizer.pad_token_id) decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels) metric.add_batch(predictions=decoded_preds, references=decoded_labels) eval_metric = metric.compute() logger.info({"bleu": eval_metric["score"]}) if args.push_to_hub and epoch < args.num_train_epochs - 1: accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save) if accelerator.is_main_process: tokenizer.save_pretrained(args.output_dir) repo.push_to_hub( commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True ) if args.output_dir is not None: accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save) if accelerator.is_main_process: tokenizer.save_pretrained(args.output_dir) if args.push_to_hub: repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True)
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) # Detecting last checkpoint. last_checkpoint = None if os.path.isdir( training_args.output_dir ) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is None and len(os.listdir( training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome.") elif last_checkpoint is not None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) logger.setLevel(logging.INFO if is_main_process(training_args.local_rank ) else logging.WARN) # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) # Set the verbosity to info of the Transformers logger (on main process only): if is_main_process(training_args.local_rank): transformers.utils.logging.set_verbosity_info() transformers.utils.logging.enable_default_handler() transformers.utils.logging.enable_explicit_format() logger.info("Training/evaluation parameters %s", training_args) # Set seed before initializing model. set_seed(training_args.seed) # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ # (the dataset will be downloaded automatically from the datasets Hub). # # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called # 'text' is found. You can easily tweak this behavior (see below). # # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. if data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name) if "validation" not in datasets.keys(): datasets["validation"] = load_dataset( data_args.dataset_name, data_args.dataset_config_name, split=f"train[:{data_args.validation_split_percentage}%]", ) datasets["train"] = load_dataset( data_args.dataset_name, data_args.dataset_config_name, split=f"train[{data_args.validation_split_percentage}%:]", ) else: data_files = {} if data_args.train_file is not None: data_files["train"] = data_args.train_file if data_args.validation_file is not None: data_files["validation"] = data_args.validation_file extension = (data_args.train_file.split(".")[-1] if data_args.train_file is not None else data_args.validation_file.split(".")[-1]) if extension == "txt": extension = "text" datasets = load_dataset(extension, data_files=data_files) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. # Load pretrained model and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config_kwargs = { "cache_dir": model_args.cache_dir, "revision": model_args.model_revision, "use_auth_token": True if model_args.use_auth_token else None, } if model_args.config_name: config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs) elif model_args.model_name_or_path: config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs) else: config = CONFIG_MAPPING[model_args.model_type]() logger.warning( "You are instantiating a new config instance from scratch.") tokenizer_kwargs = { "cache_dir": model_args.cache_dir, "use_fast": model_args.use_fast_tokenizer, "revision": model_args.model_revision, "use_auth_token": True if model_args.use_auth_token else None, } if model_args.tokenizer_name: tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs) elif model_args.model_name_or_path: tokenizer = AutoTokenizer.from_pretrained( model_args.model_name_or_path, **tokenizer_kwargs) else: raise ValueError( "You are instantiating a new tokenizer from scratch. This is not supported by this script." "You can do it from another script, save it, and load it from here, using --tokenizer_name." ) if model_args.model_name_or_path: model = AutoModelForCausalLM.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) else: logger.info("Training new model from scratch") model = AutoModelForCausalLM.from_config(config) model.resize_token_embeddings(len(tokenizer)) # Preprocessing the datasets. # First we tokenize all the texts. if training_args.do_train: column_names = datasets["train"].column_names else: column_names = datasets["validation"].column_names text_column_name = "text" if "text" in column_names else column_names[0] def tokenize_function(examples): return tokenizer(examples[text_column_name]) tokenized_datasets = datasets.map( tokenize_function, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, ) if data_args.block_size is None: block_size = tokenizer.model_max_length if block_size > 1024: logger.warn( f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " "Picking 1024 instead. You can change that default value by passing --block_size xxx." ) block_size = 1024 else: if data_args.block_size > tokenizer.model_max_length: logger.warn( f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model" f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}." ) block_size = min(data_args.block_size, tokenizer.model_max_length) # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size. def group_texts(examples): # Concatenate all texts. concatenated_examples = { k: sum(examples[k], []) for k in examples.keys() } total_length = len(concatenated_examples[list(examples.keys())[0]]) # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can # customize this part to your needs. total_length = (total_length // block_size) * block_size # Split by chunks of max_len. result = { k: [t[i:i + block_size] for i in range(0, total_length, block_size)] for k, t in concatenated_examples.items() } result["labels"] = result["input_ids"].copy() return result # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower # to preprocess. # # To speed up this part, we use multiprocessing. See the documentation of the map method for more information: # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map lm_datasets = tokenized_datasets.map( group_texts, batched=True, num_proc=data_args.preprocessing_num_workers, load_from_cache_file=not data_args.overwrite_cache, ) # Initialize our Trainer trainer = Trainer( model=model, args=training_args, train_dataset=lm_datasets["train"] if training_args.do_train else None, eval_dataset=lm_datasets["validation"] if training_args.do_eval else None, tokenizer=tokenizer, # Data collator will default to DataCollatorWithPadding, so we change it. data_collator=default_data_collator, ) train_dataset = lm_datasets["train"] eval_dataset = lm_datasets["validation"] if training_args.tune: def eval_func_for_lpot(model_tuned): trainer = Trainer( model=model_tuned, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset, tokenizer=tokenizer, data_collator=default_data_collator, ) eval_output = trainer.evaluate(eval_dataset=eval_dataset) perplexity = math.exp(eval_output["eval_loss"]) results = {"perplexity":perplexity,"eval_loss":eval_output["eval_loss"],\ "eval_samples_per_second":eval_output['eval_samples_per_second']} clm_task_metrics_keys = ["perplexity"] for key in clm_task_metrics_keys: if key in results.keys(): logger.info("Finally Eval {}:{}".format(key, results[key])) if key == "perplexity": perplexity = results[key] break return 100 - perplexity from lpot.experimental import Quantization, common quantizer = Quantization("./conf.yaml") quantizer.model = common.Model(model) quantizer.calib_dataloader = common.DataLoader( eval_dataset, batch_size=training_args.eval_batch_size, collate_fn=default_data_collator_lpot) quantizer.eval_func = eval_func_for_lpot q_model = quantizer() q_model.save(training_args.tuned_checkpoint) exit(0) if training_args.accuracy_only: if training_args.int8: from lpot.utils.pytorch import load new_model = load( os.path.abspath( os.path.expanduser(training_args.tuned_checkpoint)), model) else: new_model = model trainer = Trainer( model=new_model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset, tokenizer=tokenizer, data_collator=default_data_collator, ) eval_output = trainer.evaluate(eval_dataset=eval_dataset) perplexity = math.exp(eval_output["eval_loss"]) results = {"perplexity":perplexity,"eval_loss":eval_output["eval_loss"],\ "eval_samples_per_second":eval_output['eval_samples_per_second']} clm_task_metrics_keys = ["perplexity"] for key in clm_task_metrics_keys: if key in results.keys(): acc = results[key] break print("Accuracy: %.5f" % acc) print('Throughput: %.3f samples/sec' % (results["eval_samples_per_second"])) print('Latency: %.3f ms' % (1 * 1000 / results["eval_samples_per_second"])) print('Batch size = %d' % training_args.per_device_eval_batch_size) exit(0) if training_args.benchmark: if training_args.int8: from lpot.utils.pytorch import load new_model = load( os.path.abspath( os.path.expanduser(training_args.tuned_checkpoint)), model) else: new_model = model trainer = Trainer( model=new_model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset, tokenizer=tokenizer, data_collator=default_data_collator, ) eval_output = trainer.evaluate(eval_dataset=eval_dataset, iters=training_args.iters, warmup_iter=training_args.warmup_iter) perplexity = math.exp(eval_output["eval_loss"]) results = {"perplexity":perplexity,"eval_loss":eval_output["eval_loss"],\ "eval_samples_per_second":eval_output['eval_samples_per_second']} clm_task_metrics_keys = ["perplexity"] for key in clm_task_metrics_keys: if key in results.keys(): acc = results[key] break print("Accuracy: %.5f" % acc) print('Throughput: %.3f samples/sec' % (results["eval_samples_per_second"])) print('Latency: %.3f ms' % (1 * 1000 / results["eval_samples_per_second"])) print('Batch size = %d' % training_args.per_device_eval_batch_size) exit(0) # Training if training_args.do_train: if last_checkpoint is not None: checkpoint = last_checkpoint elif model_args.model_name_or_path is not None and os.path.isdir( model_args.model_name_or_path): checkpoint = model_args.model_name_or_path else: checkpoint = None train_result = trainer.train(resume_from_checkpoint=checkpoint) trainer.save_model() # Saves the tokenizer too for easy upload metrics = train_result.metrics trainer.log_metrics("train", metrics) trainer.save_metrics("train", metrics) trainer.save_state() # Evaluation results = {} if training_args.do_eval: logger.info("*** Evaluate ***") eval_output = trainer.evaluate() perplexity = math.exp(eval_output["eval_loss"]) results["perplexity"] = perplexity trainer.log_metrics("eval", results) trainer.save_metrics("eval", results) return results
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() if ( os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir ): raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty." "Use --overwrite_output_dir to overcome." ) # Make one log on every process with the configuration for debugging. logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO, ) # Setup logging, we only want one process per machine to log things on the screen. logger.setLevel(logging.INFO if jax.process_index() == 0 else logging.ERROR) if jax.process_index() == 0: datasets.utils.logging.set_verbosity_warning() transformers.utils.logging.set_verbosity_info() else: datasets.utils.logging.set_verbosity_error() transformers.utils.logging.set_verbosity_error() # Set the verbosity to info of the Transformers logger (on main process only): logger.info(f"Training/evaluation parameters {training_args}") # Handle the repository creation if training_args.push_to_hub: if training_args.hub_model_id is None: repo_name = get_full_repo_name( Path(training_args.output_dir).absolute().name, token=training_args.hub_token ) else: repo_name = training_args.hub_model_id repo = Repository(training_args.output_dir, clone_from=repo_name) # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below) # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ # (the dataset will be downloaded automatically from the datasets Hub). # # For CSV/JSON files this script will use the first column for the full texts and the second column for the # summaries (unless you specify column names for this with the `text_column` and `summary_column` arguments). # if data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. dataset = load_dataset( data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, keep_in_memory=False ) else: data_files = {} if data_args.train_file is not None: data_files["train"] = data_args.train_file extension = data_args.train_file.split(".")[-1] if data_args.validation_file is not None: data_files["validation"] = data_args.validation_file extension = data_args.validation_file.split(".")[-1] if data_args.test_file is not None: data_files["test"] = data_args.test_file extension = data_args.test_file.split(".")[-1] dataset = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. # Load pretrained model and tokenizer if model_args.config_name: config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir) elif model_args.model_name_or_path: config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir) else: config = CONFIG_MAPPING[model_args.model_type]() logger.warning("You are instantiating a new config instance from scratch.") if model_args.tokenizer_name: tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer ) elif model_args.model_name_or_path: tokenizer = AutoTokenizer.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer ) else: raise ValueError( "You are instantiating a new tokenizer from scratch. This is not supported by this script." "You can do it from another script, save it, and load it from here, using --tokenizer_name." ) if model_args.model_name_or_path: model = FlaxAutoModelForSeq2SeqLM.from_pretrained( model_args.model_name_or_path, config=config, seed=training_args.seed, dtype=getattr(jnp, model_args.dtype) ) else: model = FlaxAutoModelForSeq2SeqLM.from_config( config, seed=training_args.seed, dtype=getattr(jnp, model_args.dtype) ) if model.config.decoder_start_token_id is None: raise ValueError("Make sure that `config.decoder_start_token_id` is correctly defined") prefix = data_args.source_prefix if data_args.source_prefix is not None else "" # Preprocessing the datasets. # We need to tokenize inputs and targets. if training_args.do_train: column_names = dataset["train"].column_names elif training_args.do_eval: column_names = dataset["validation"].column_names elif training_args.do_predict: column_names = dataset["test"].column_names else: logger.info("There is nothing to do. Please pass `do_train`, `do_eval` and/or `do_predict`.") return # Get the column names for input/target. dataset_columns = summarization_name_mapping.get(data_args.dataset_name, None) if data_args.text_column is None: text_column = dataset_columns[0] if dataset_columns is not None else column_names[0] else: text_column = data_args.text_column if text_column not in column_names: raise ValueError( f"--text_column' value '{data_args.text_column}' needs to be one of: {', '.join(column_names)}" ) if data_args.summary_column is None: summary_column = dataset_columns[1] if dataset_columns is not None else column_names[1] else: summary_column = data_args.summary_column if summary_column not in column_names: raise ValueError( f"--summary_column' value '{data_args.summary_column}' needs to be one of: {', '.join(column_names)}" ) # Temporarily set max_target_length for training. max_target_length = data_args.max_target_length # In Flax, for seq2seq models we need to pass `decoder_input_ids` # as the Flax models don't accept `labels`, we need to prepare the decoder_input_ids here # for that dynamically import the `shift_tokens_right` function from the model file model_module = __import__(model.__module__, fromlist=["shift_tokens_tight"]) shift_tokens_right_fn = getattr(model_module, "shift_tokens_right") # Setting padding="max_length" as we need fixed length inputs for jitted functions def preprocess_function(examples): inputs = examples[text_column] targets = examples[summary_column] inputs = [prefix + inp for inp in inputs] model_inputs = tokenizer( inputs, max_length=data_args.max_source_length, padding="max_length", truncation=True, return_tensors="np" ) # Setup the tokenizer for targets with tokenizer.as_target_tokenizer(): labels = tokenizer( targets, max_length=max_target_length, padding="max_length", truncation=True, return_tensors="np" ) model_inputs["labels"] = labels["input_ids"] decoder_input_ids = shift_tokens_right_fn( jnp.array(labels["input_ids"]), config.pad_token_id, config.decoder_start_token_id ) model_inputs["decoder_input_ids"] = np.asarray(decoder_input_ids) # We need decoder_attention_mask so we can ignore pad tokens from loss model_inputs["decoder_attention_mask"] = labels["attention_mask"] return model_inputs if training_args.do_train: if "train" not in dataset: raise ValueError("--do_train requires a train dataset") train_dataset = dataset["train"] if data_args.max_train_samples is not None: train_dataset = train_dataset.select(range(data_args.max_train_samples)) train_dataset = train_dataset.map( preprocess_function, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, desc="Running tokenizer on train dataset", ) if training_args.do_eval: max_target_length = data_args.val_max_target_length if "validation" not in dataset: raise ValueError("--do_eval requires a validation dataset") eval_dataset = dataset["validation"] if data_args.max_eval_samples is not None: eval_dataset = eval_dataset.select(range(data_args.max_eval_samples)) eval_dataset = eval_dataset.map( preprocess_function, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, desc="Running tokenizer on validation dataset", ) if training_args.do_predict: max_target_length = data_args.val_max_target_length if "test" not in dataset: raise ValueError("--do_predict requires a test dataset") predict_dataset = dataset["test"] if data_args.max_predict_samples is not None: predict_dataset = predict_dataset.select(range(data_args.max_predict_samples)) predict_dataset = predict_dataset.map( preprocess_function, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, desc="Running tokenizer on prediction dataset", ) # Metric metric = load_metric("rouge") def postprocess_text(preds, labels): preds = [pred.strip() for pred in preds] labels = [label.strip() for label in labels] # rougeLSum expects newline after each sentence preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds] labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels] return preds, labels def compute_metrics(preds, labels): decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True) decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) # Some simple post-processing decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels) result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True) # Extract a few results from ROUGE result = {key: value.mid.fmeasure * 100 for key, value in result.items()} prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds] result["gen_len"] = np.mean(prediction_lens) result = {k: round(v, 4) for k, v in result.items()} return result # Enable tensorboard only on the master node has_tensorboard = is_tensorboard_available() if has_tensorboard and jax.process_index() == 0: try: from flax.metrics.tensorboard import SummaryWriter summary_writer = SummaryWriter(log_dir=Path(training_args.output_dir)) except ImportError as ie: has_tensorboard = False logger.warning( f"Unable to display metrics through TensorBoard because some package are not installed: {ie}" ) else: logger.warning( "Unable to display metrics through TensorBoard because the package is not installed: " "Please run pip install tensorboard to enable." ) # Initialize our training rng = jax.random.PRNGKey(training_args.seed) rng, dropout_rng = jax.random.split(rng) # Store some constant num_epochs = int(training_args.num_train_epochs) train_batch_size = int(training_args.per_device_train_batch_size) * jax.device_count() eval_batch_size = int(training_args.per_device_eval_batch_size) * jax.device_count() steps_per_epoch = len(train_dataset) // train_batch_size total_train_steps = steps_per_epoch * num_epochs # Create learning rate schedule linear_decay_lr_schedule_fn = create_learning_rate_fn( len(train_dataset), train_batch_size, training_args.num_train_epochs, training_args.warmup_steps, training_args.learning_rate, ) # We use Optax's "masking" functionality to not apply weight decay # to bias and LayerNorm scale parameters. decay_mask_fn returns a # mask boolean with the same structure as the parameters. # The mask is True for parameters that should be decayed. # Note that this mask is specifically adapted for FlaxBart. # For FlaxT5, one should correct the layer norm parameter naming # accordingly - see `run_t5_mlm_flax.py` e.g. def decay_mask_fn(params): flat_params = traverse_util.flatten_dict(params) layer_norm_params = [ (name, "scale") for name in ["self_attn_layer_norm", "layernorm_embedding", "final_layer_norm"] ] flat_mask = {path: (path[-1] != "bias" and path[-2:] not in layer_norm_params) for path in flat_params} return traverse_util.unflatten_dict(flat_mask) # create adam optimizer adamw = optax.adamw( learning_rate=linear_decay_lr_schedule_fn, b1=training_args.adam_beta1, b2=training_args.adam_beta2, eps=training_args.adam_epsilon, weight_decay=training_args.weight_decay, mask=decay_mask_fn, ) # Setup train state state = TrainState.create(apply_fn=model.__call__, params=model.params, tx=adamw, dropout_rng=dropout_rng) # label smoothed cross entropy def loss_fn(logits, labels, padding_mask, label_smoothing_factor=0.0): """ The label smoothing implementation is adapted from Flax's official example: https://github.com/google/flax/blob/87a211135c6a377c8f29048a1cac3840e38b9da4/examples/wmt/train.py#L104 """ vocab_size = logits.shape[-1] confidence = 1.0 - label_smoothing_factor low_confidence = (1.0 - confidence) / (vocab_size - 1) normalizing_constant = -( confidence * jnp.log(confidence) + (vocab_size - 1) * low_confidence * jnp.log(low_confidence + 1e-20) ) soft_labels = onehot(labels, vocab_size, on_value=confidence, off_value=low_confidence) loss = optax.softmax_cross_entropy(logits, soft_labels) loss = loss - normalizing_constant # ignore padded tokens from loss loss = loss * padding_mask loss = loss.sum() / padding_mask.sum() return loss # Define gradient update step fn def train_step(state, batch, label_smoothing_factor=0.0): dropout_rng, new_dropout_rng = jax.random.split(state.dropout_rng) def compute_loss(params): labels = batch.pop("labels") logits = state.apply_fn(**batch, params=params, dropout_rng=dropout_rng, train=True)[0] loss = loss_fn(logits, labels, batch["decoder_attention_mask"], label_smoothing_factor) return loss grad_fn = jax.value_and_grad(compute_loss) loss, grad = grad_fn(state.params) grad = jax.lax.pmean(grad, "batch") new_state = state.apply_gradients(grads=grad, dropout_rng=new_dropout_rng) metrics = {"loss": loss, "learning_rate": linear_decay_lr_schedule_fn(state.step)} metrics = jax.lax.pmean(metrics, axis_name="batch") return new_state, metrics # Define eval fn def eval_step(params, batch, label_smoothing_factor=0.0): labels = batch.pop("labels") logits = model(**batch, params=params, train=False)[0] loss = loss_fn(logits, labels, batch["decoder_attention_mask"], label_smoothing_factor) # summarize metrics metrics = {"loss": loss} metrics = jax.lax.pmean(metrics, axis_name="batch") return metrics # Define generation function max_length = ( data_args.val_max_target_length if data_args.val_max_target_length is not None else model.config.max_length ) num_beams = data_args.num_beams if data_args.num_beams is not None else model.config.num_beams gen_kwargs = {"max_length": max_length, "num_beams": num_beams} def generate_step(params, batch): model.params = params output_ids = model.generate(batch["input_ids"], attention_mask=batch["attention_mask"], **gen_kwargs) return output_ids.sequences # Create parallel version of the train and eval step p_train_step = jax.pmap( partial(train_step, label_smoothing_factor=training_args.label_smoothing_factor), "batch", donate_argnums=(0,) ) p_eval_step = jax.pmap(partial(eval_step, label_smoothing_factor=training_args.label_smoothing_factor), "batch") p_generate_step = jax.pmap(generate_step, "batch") # Replicate the train state on each device state = state.replicate() logger.info("***** Running training *****") logger.info(f" Num examples = {len(train_dataset)}") logger.info(f" Num Epochs = {num_epochs}") logger.info(f" Instantaneous batch size per device = {training_args.per_device_train_batch_size}") logger.info(f" Total train batch size (w. parallel & distributed) = {train_batch_size}") logger.info(f" Total optimization steps = {total_train_steps}") train_time = 0 epochs = tqdm(range(num_epochs), desc=f"Epoch ... (1/{num_epochs})", position=0) for epoch in epochs: # ======================== Training ================================ train_start = time.time() # Create sampling rng rng, input_rng = jax.random.split(rng) train_metrics = [] # Generate an epoch by shuffling sampling indices from the train dataset train_loader = data_loader(input_rng, train_dataset, train_batch_size, shuffle=True) steps_per_epoch = len(train_dataset) // train_batch_size # train for _ in tqdm(range(steps_per_epoch), desc="Training...", position=1, leave=False): batch = next(train_loader) state, train_metric = p_train_step(state, batch) train_metrics.append(train_metric) train_time += time.time() - train_start train_metric = unreplicate(train_metric) epochs.write( f"Epoch... ({epoch + 1}/{num_epochs} | Loss: {train_metric['loss']}, Learning Rate: {train_metric['learning_rate']})" ) # ======================== Evaluating ============================== eval_metrics = [] eval_preds = [] eval_labels = [] eval_loader = data_loader(input_rng, eval_dataset, eval_batch_size) eval_steps = len(eval_dataset) // eval_batch_size for _ in tqdm(range(eval_steps), desc="Evaluating...", position=2, leave=False): # Model forward batch = next(eval_loader) labels = batch["labels"] metrics = p_eval_step(state.params, batch) eval_metrics.append(metrics) # generation if data_args.predict_with_generate: generated_ids = p_generate_step(state.params, batch) eval_preds.extend(jax.device_get(generated_ids.reshape(-1, gen_kwargs["max_length"]))) eval_labels.extend(jax.device_get(labels.reshape(-1, labels.shape[-1]))) # normalize eval metrics eval_metrics = get_metrics(eval_metrics) eval_metrics = jax.tree_map(jnp.mean, eval_metrics) # compute ROUGE metrics rouge_desc = "" if data_args.predict_with_generate: rouge_metrics = compute_metrics(eval_preds, eval_labels) eval_metrics.update(rouge_metrics) rouge_desc = " ".join([f"Eval {key}: {value} |" for key, value in rouge_metrics.items()]) # Print metrics and update progress bar desc = f"Epoch... ({epoch + 1}/{num_epochs} | Eval Loss: {eval_metrics['loss']} | {rouge_desc})" epochs.write(desc) epochs.desc = desc # Save metrics if has_tensorboard and jax.process_index() == 0: cur_step = epoch * (len(train_dataset) // train_batch_size) write_metric(summary_writer, train_metrics, eval_metrics, train_time, cur_step) # save checkpoint after each epoch and push checkpoint to the hub if jax.process_index() == 0: params = jax.device_get(jax.tree_map(lambda x: x[0], state.params)) model.save_pretrained(training_args.output_dir, params=params) tokenizer.save_pretrained(training_args.output_dir) if training_args.push_to_hub: repo.push_to_hub(commit_message=f"Saving weights and logs of epoch {epoch}", blocking=False) # ======================== Prediction loop ============================== if training_args.do_predict: logger.info("*** Predict ***") pred_metrics = [] pred_generations = [] pred_labels = [] pred_loader = data_loader(input_rng, predict_dataset, eval_batch_size) pred_steps = len(predict_dataset) // eval_batch_size for _ in tqdm(range(pred_steps), desc="Predicting...", position=2, leave=False): # Model forward batch = next(pred_loader) labels = batch["labels"] metrics = p_eval_step(state.params, batch) pred_metrics.append(metrics) # generation if data_args.predict_with_generate: generated_ids = p_generate_step(state.params, batch) pred_generations.extend(jax.device_get(generated_ids.reshape(-1, gen_kwargs["max_length"]))) pred_labels.extend(jax.device_get(labels.reshape(-1, labels.shape[-1]))) # normalize prediction metrics pred_metrics = get_metrics(pred_metrics) pred_metrics = jax.tree_map(jnp.mean, pred_metrics) # compute ROUGE metrics rouge_desc = "" if data_args.predict_with_generate: rouge_metrics = compute_metrics(pred_generations, pred_labels) pred_metrics.update(rouge_metrics) rouge_desc = " ".join([f"Predict {key}: {value} |" for key, value in rouge_metrics.items()]) # Print metrics desc = f"Predict Loss: {pred_metrics['loss']} | {rouge_desc})" logger.info(desc) # save final metrics in json if jax.process_index() == 0: rouge_metrics = {f"test_{metric_name}": value for metric_name, value in rouge_metrics.items()} path = os.path.join(training_args.output_dir, "test_results.json") with open(path, "w") as f: json.dump(rouge_metrics, f, indent=4, sort_keys=True)
set_seed(config_dict["seed"]) path = config_dict["model_path"] method = config_dict["method"] output_path = config_dict["output_file"].format( model_name=config_dict["model_name"], dataset_name=config_dict["dataset"], experiment_name=config_dict["experiment_name"], datetime=datetime.datetime.utcnow().strftime("%Y%m%dT%H%M%S"), method=method, ) tokenizer = AutoTokenizer.from_pretrained(config_dict["model_name"], ) config = AutoConfig.from_pretrained(path) model = SeqClassModel.from_pretrained(path, config=config, params_dict=config_dict) labels = [model.config.id2label[0], model.config.id2label[1]] logger.info("labels_ids: " + str(labels)) model_args = ModelArguments(model_name_or_path=config_dict["model_name"]) data_args = datasets[config_dict["dataset"]] data_config = dict( data_dir=data_args.data_dir, tokenizer=tokenizer, labels=labels,
def main(): # region Argument parsing # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TFTrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() output_dir = Path(training_args.output_dir) output_dir.mkdir(parents=True, exist_ok=True) # endregion # region Checkpoints # Detecting last checkpoint. checkpoint = None if len(os.listdir(training_args.output_dir)) > 0 and not training_args.overwrite_output_dir: if (output_dir / CONFIG_NAME).is_file() and (output_dir / TF2_WEIGHTS_NAME).is_file(): checkpoint = output_dir logger.info( f"Checkpoint detected, resuming training from checkpoint in {training_args.output_dir}. To avoid this" " behavior, change the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) else: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to continue regardless." ) # endregion # region Logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) logger.setLevel(logging.INFO) logger.info(f"Training/evaluation parameters {training_args}") # endregion # region Loading data # For CSV/JSON files, this script will use the 'label' field as the label and the 'sentence1' and optionally # 'sentence2' fields as inputs if they exist. If not, the first two fields not named label are used if at least two # columns are provided. Note that the term 'sentence' can be slightly misleading, as they often contain more than # a single grammatical sentence, when the task requires it. # # If the CSVs/JSONs contain only one non-label column, the script does single sentence classification on this # single column. You can easily tweak this behavior (see below) # # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. data_files = {"train": data_args.train_file, "validation": data_args.validation_file, "test": data_args.test_file} data_files = {key: file for key, file in data_files.items() if file is not None} for key in data_files.keys(): logger.info(f"Loading a local file for {key}: {data_files[key]}") if data_args.input_file_extension == "csv": # Loading a dataset from local csv files datasets = load_dataset( "csv", data_files=data_files, cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, ) else: # Loading a dataset from local json files datasets = load_dataset("json", data_files=data_files, cache_dir=model_args.cache_dir) # See more about loading any type of standard or custom dataset at # https://huggingface.co/docs/datasets/loading_datasets.html. # endregion # region Label preprocessing # If you've passed us a training set, we try to infer your labels from it if "train" in datasets: # By default we assume that if your label column looks like a float then you're doing regression, # and if not then you're doing classification. This is something you may want to change! is_regression = datasets["train"].features["label"].dtype in ["float32", "float64"] if is_regression: num_labels = 1 else: # A useful fast method: # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique label_list = datasets["train"].unique("label") label_list.sort() # Let's sort it for determinism num_labels = len(label_list) # If you haven't passed a training set, we read label info from the saved model (this happens later) else: num_labels = None label_list = None is_regression = None # endregion # region Load model config and tokenizer if checkpoint is not None: config_path = training_args.output_dir elif model_args.config_name: config_path = model_args.config_name else: config_path = model_args.model_name_or_path if num_labels is not None: config = AutoConfig.from_pretrained( config_path, num_labels=num_labels, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) else: config = AutoConfig.from_pretrained( config_path, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) # endregion # region Dataset preprocessing # Again, we try to have some nice defaults but don't hesitate to tweak to your use case. column_names = {col for cols in datasets.column_names.values() for col in cols} non_label_column_names = [name for name in column_names if name != "label"] if "sentence1" in non_label_column_names and "sentence2" in non_label_column_names: sentence1_key, sentence2_key = "sentence1", "sentence2" elif "sentence1" in non_label_column_names: sentence1_key, sentence2_key = "sentence1", None else: if len(non_label_column_names) >= 2: sentence1_key, sentence2_key = non_label_column_names[:2] else: sentence1_key, sentence2_key = non_label_column_names[0], None if data_args.max_seq_length > tokenizer.model_max_length: logger.warning( f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." ) max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) # Ensure that our labels match the model's, if it has some pre-specified if "train" in datasets: if not is_regression and config.label2id != PretrainedConfig(num_labels=num_labels).label2id: label_name_to_id = config.label2id if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)): label_to_id = label_name_to_id # Use the model's labels else: logger.warning( "Your model seems to have been trained with labels, but they don't match the dataset: ", f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}." "\nIgnoring the model labels as a result.", ) label_to_id = {v: i for i, v in enumerate(label_list)} elif not is_regression: label_to_id = {v: i for i, v in enumerate(label_list)} else: label_to_id = None # Now we've established our label2id, let's overwrite the model config with it. config.label2id = label_to_id if config.label2id is not None: config.id2label = {id: label for label, id in label_to_id.items()} else: config.id2label = None else: label_to_id = config.label2id # Just load the data from the model if "validation" in datasets and config.label2id is not None: validation_label_list = datasets["validation"].unique("label") for val_label in validation_label_list: assert val_label in label_to_id, f"Label {val_label} is in the validation set but not the training set!" def preprocess_function(examples): # Tokenize the texts args = ( (examples[sentence1_key],) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key]) ) result = tokenizer(*args, max_length=max_seq_length, truncation=True) # Map labels to IDs if config.label2id is not None and "label" in examples: result["label"] = [(config.label2id[l] if l != -1 else -1) for l in examples["label"]] return result datasets = datasets.map(preprocess_function, batched=True, load_from_cache_file=not data_args.overwrite_cache) if data_args.pad_to_max_length: data_collator = DefaultDataCollator(return_tensors="tf") else: data_collator = DataCollatorWithPadding(tokenizer, return_tensors="tf") # endregion with training_args.strategy.scope(): # region Load pretrained model # Set seed before initializing model set_seed(training_args.seed) # # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. if checkpoint is None: model_path = model_args.model_name_or_path else: model_path = checkpoint model = TFAutoModelForSequenceClassification.from_pretrained( model_path, config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) # endregion # region Optimizer, loss and compilation optimizer = tf.keras.optimizers.Adam( learning_rate=training_args.learning_rate, beta_1=training_args.adam_beta1, beta_2=training_args.adam_beta2, epsilon=training_args.adam_epsilon, clipnorm=training_args.max_grad_norm, ) if is_regression: loss_fn = tf.keras.losses.MeanSquaredError() metrics = [] else: loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) metrics = ["accuracy"] model.compile(optimizer=optimizer, loss=loss_fn, metrics=metrics) # endregion # region Convert data to a tf.data.Dataset tf_data = dict() max_samples = { "train": data_args.max_train_samples, "validation": data_args.max_val_samples, "test": data_args.max_test_samples, } for key in ("train", "validation", "test"): if key not in datasets: tf_data[key] = None continue if key in ("train", "validation"): assert "label" in datasets[key].features, f"Missing labels from {key} data!" if key == "train": shuffle = True batch_size = training_args.per_device_train_batch_size drop_remainder = True # Saves us worrying about scaling gradients for the last batch else: shuffle = False batch_size = training_args.per_device_eval_batch_size drop_remainder = False samples_limit = max_samples[key] dataset = datasets[key] if samples_limit is not None: dataset = dataset.select(range(samples_limit)) data = dataset.to_tf_dataset( columns=[col for col in dataset.column_names if col not in set(non_label_column_names + ["label"])], shuffle=shuffle, batch_size=batch_size, collate_fn=data_collator, drop_remainder=drop_remainder, # `label_cols` is needed for user-defined losses, such as in this example label_cols="label" if "label" in dataset.column_names else None, ) tf_data[key] = data # endregion # region Training and validation if tf_data["train"] is not None: callbacks = [SavePretrainedCallback(output_dir=training_args.output_dir)] model.fit( tf_data["train"], validation_data=tf_data["validation"], epochs=int(training_args.num_train_epochs), callbacks=callbacks, ) elif tf_data["validation"] is not None: # If there's a validation dataset but no training set, just evaluate the metrics logger.info("Computing metrics on validation data...") if is_regression: loss = model.evaluate(tf_data["validation"]) logger.info(f"Loss: {loss:.5f}") else: loss, accuracy = model.evaluate(tf_data["validation"]) logger.info(f"Loss: {loss:.5f}, Accuracy: {accuracy * 100:.4f}%") # endregion # region Prediction if tf_data["test"] is not None: logger.info("Doing predictions on test dataset...") predictions = model.predict(tf_data["test"])["logits"] predicted_class = np.squeeze(predictions) if is_regression else np.argmax(predictions, axis=1) output_test_file = os.path.join(training_args.output_dir, "test_results.txt") with open(output_test_file, "w") as writer: writer.write("index\tprediction\n") for index, item in enumerate(predicted_class): if is_regression: writer.write(f"{index}\t{item:3.3f}\n") else: item = config.id2label[item] writer.write(f"{index}\t{item}\n") logger.info(f"Wrote predictions to {output_test_file}!") # endregion # region Prediction losses # This section is outside the scope() because it's very quick to compute, but behaves badly inside it if "test" in datasets and "label" in datasets["test"].features: print("Computing prediction loss on test labels...") labels = datasets["test"]["label"] loss = float(loss_fn(labels, predictions).numpy()) print(f"Test loss: {loss:.4f}")
def main(): parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--train_file", default=None, type=str, required=True, help="SWAG csv for training. E.g., train.csv" ) parser.add_argument( "--predict_file", default=None, type=str, required=True, help="SWAG csv for predictions. E.g., val.csv or test.csv", ) parser.add_argument( "--model_name_or_path", default=None, type=str, required=True, help="Path to pretrained model or model identifier from huggingface.co/models", ) parser.add_argument( "--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints and predictions will be written.", ) # Other parameters parser.add_argument( "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name" ) parser.add_argument( "--tokenizer_name", default="", type=str, help="Pretrained tokenizer name or path if not the same as model_name", ) parser.add_argument( "--max_seq_length", default=384, type=int, help="The maximum total input sequence length after tokenization. Sequences " "longer than this will be truncated, and sequences shorter than this will be padded.", ) parser.add_argument("--do_train", action="store_true", help="Whether to run training.") parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.") parser.add_argument( "--evaluate_during_training", action="store_true", help="Rul evaluation during training at each logging step." ) parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.") parser.add_argument( "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation." ) parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument( "--gradient_accumulation_steps", type=int, default=1, help="Number of updates steps to accumulate before performing a backward/update pass.", ) parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument( "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform." ) parser.add_argument( "--max_steps", default=-1, type=int, help="If > 0: set total number of training steps to perform. Override num_train_epochs.", ) parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.") parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.") parser.add_argument( "--eval_all_checkpoints", action="store_true", help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number", ) parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available") parser.add_argument( "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory" ) parser.add_argument( "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets" ) parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument( "--fp16", action="store_true", help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit", ) parser.add_argument( "--fp16_opt_level", type=str, default="O1", help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." "See details at https://nvidia.github.io/apex/amp.html", ) parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.") parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.") args = parser.parse_args() if ( os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir ): raise ValueError( "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format( args.output_dir ) ) # Setup distant debugging if needed if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() # Setup CUDA, GPU & distributed training if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count() else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend="nccl") args.n_gpu = 1 args.device = device # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN, ) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16, ) # Set the verbosity to info of the Transformers logger (on main process only): if is_main_process(args.local_rank): transformers.utils.logging.set_verbosity_info() transformers.utils.logging.enable_default_handler() transformers.utils.logging.enable_explicit_format() # Set seed set_seed(args) # Load pretrained model and tokenizer if args.local_rank not in [-1, 0]: torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab config = AutoConfig.from_pretrained(args.config_name if args.config_name else args.model_name_or_path) tokenizer = AutoTokenizer.from_pretrained( args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, ) model = AutoModelForMultipleChoice.from_pretrained( args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config ) if args.local_rank == 0: torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab model.to(args.device) logger.info("Training/evaluation parameters %s", args) # Training if args.do_train: train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False) global_step, tr_loss = train(args, train_dataset, model, tokenizer) logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) # Save the trained model and the tokenizer if args.local_rank == -1 or torch.distributed.get_rank() == 0: logger.info("Saving model checkpoint to %s", args.output_dir) # Save a trained model, configuration and tokenizer using `save_pretrained()`. # They can then be reloaded using `from_pretrained()` model_to_save = ( model.module if hasattr(model, "module") else model ) # Take care of distributed/parallel training model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) # Good practice: save your training arguments together with the trained model torch.save(args, os.path.join(args.output_dir, "training_args.bin")) # Load a trained model and vocabulary that you have fine-tuned model = AutoModelForMultipleChoice.from_pretrained(args.output_dir) tokenizer = AutoTokenizer.from_pretrained(args.output_dir) model.to(args.device) # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory results = {} if args.do_eval and args.local_rank in [-1, 0]: if args.do_train: checkpoints = [args.output_dir] else: # if do_train is False and do_eval is true, load model directly from pretrained. checkpoints = [args.model_name_or_path] if args.eval_all_checkpoints: checkpoints = list( os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True)) ) logger.info("Evaluate the following checkpoints: %s", checkpoints) for checkpoint in checkpoints: # Reload the model global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else "" model = AutoModelForMultipleChoice.from_pretrained(checkpoint) tokenizer = AutoTokenizer.from_pretrained(checkpoint) model.to(args.device) # Evaluate result = evaluate(args, model, tokenizer, prefix=global_step) result = dict((k + ("_{}".format(global_step) if global_step else ""), v) for k, v in result.items()) results.update(result) logger.info("Results: {}".format(results)) return results
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TrainingArguments)) model_args, data_args, training_args = parser.parse_args_into_dataclasses() # Setup distant debugging if needed if data_args.server_ip and data_args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(data_args.server_ip, data_args.server_port), redirect_output=True) ptvsd.wait_for_attach() # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) log_level = training_args.get_process_log_level() logger.setLevel(log_level) datasets.utils.logging.set_verbosity(log_level) transformers.utils.logging.set_verbosity(log_level) transformers.utils.logging.enable_default_handler() transformers.utils.logging.enable_explicit_format() # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") # Detecting last checkpoint. last_checkpoint = None if os.path.isdir( training_args.output_dir ) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is None and len(os.listdir( training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome.") elif last_checkpoint is not None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) # Set seed before initializing model. set_seed(training_args.seed) # In distributed training, the load_dataset function guarantees that only one local process can concurrently # download the dataset. # Downloading and loading xnli dataset from the hub. if training_args.do_train: if model_args.train_language is None: train_dataset = load_dataset("xnli", model_args.language, split="train", cache_dir=model_args.cache_dir) else: train_dataset = load_dataset("xnli", model_args.train_language, split="train", cache_dir=model_args.cache_dir) label_list = train_dataset.features["label"].names if training_args.do_eval: eval_dataset = load_dataset("xnli", model_args.language, split="validation", cache_dir=model_args.cache_dir) label_list = eval_dataset.features["label"].names if training_args.do_predict: predict_dataset = load_dataset("xnli", model_args.language, split="test", cache_dir=model_args.cache_dir) label_list = predict_dataset.features["label"].names # Labels num_labels = len(label_list) # Load pretrained model and tokenizer # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, num_labels=num_labels, finetuning_task="xnli", cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, do_lower_case=model_args.do_lower_case, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) model = AutoModelForSequenceClassification.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) # Preprocessing the datasets # Padding strategy if data_args.pad_to_max_length: padding = "max_length" else: # We will pad later, dynamically at batch creation, to the max sequence length in each batch padding = False def preprocess_function(examples): # Tokenize the texts return tokenizer( examples["premise"], examples["hypothesis"], padding=padding, max_length=data_args.max_seq_length, truncation=True, ) if training_args.do_train: if data_args.max_train_samples is not None: train_dataset = train_dataset.select( range(data_args.max_train_samples)) with training_args.main_process_first( desc="train dataset map pre-processing"): train_dataset = train_dataset.map( preprocess_function, batched=True, load_from_cache_file=not data_args.overwrite_cache, desc="Running tokenizer on train dataset", ) # Log a few random samples from the training set: for index in random.sample(range(len(train_dataset)), 3): logger.info( f"Sample {index} of the training set: {train_dataset[index]}.") if training_args.do_eval: if data_args.max_eval_samples is not None: eval_dataset = eval_dataset.select( range(data_args.max_eval_samples)) with training_args.main_process_first( desc="validation dataset map pre-processing"): eval_dataset = eval_dataset.map( preprocess_function, batched=True, load_from_cache_file=not data_args.overwrite_cache, desc="Running tokenizer on validation dataset", ) if training_args.do_predict: if data_args.max_predict_samples is not None: predict_dataset = predict_dataset.select( range(data_args.max_predict_samples)) with training_args.main_process_first( desc="prediction dataset map pre-processing"): predict_dataset = predict_dataset.map( preprocess_function, batched=True, load_from_cache_file=not data_args.overwrite_cache, desc="Running tokenizer on prediction dataset", ) # Get the metric function metric = load_metric("xnli") # You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a # predictions and label_ids field) and has to return a dictionary string to float. def compute_metrics(p: EvalPrediction): preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions preds = np.argmax(preds, axis=1) return metric.compute(predictions=preds, references=p.label_ids) # Data collator will default to DataCollatorWithPadding, so we change it if we already did the padding. if data_args.pad_to_max_length: data_collator = default_data_collator elif training_args.fp16: data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8) else: data_collator = None # Initialize our Trainer trainer = Trainer( model=model, args=training_args, train_dataset=train_dataset if training_args.do_train else None, eval_dataset=eval_dataset if training_args.do_eval else None, compute_metrics=compute_metrics, tokenizer=tokenizer, data_collator=data_collator, ) # Training if training_args.do_train: checkpoint = None if training_args.resume_from_checkpoint is not None: checkpoint = training_args.resume_from_checkpoint elif last_checkpoint is not None: checkpoint = last_checkpoint train_result = trainer.train(resume_from_checkpoint=checkpoint) metrics = train_result.metrics max_train_samples = (data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)) metrics["train_samples"] = min(max_train_samples, len(train_dataset)) trainer.save_model() # Saves the tokenizer too for easy upload trainer.log_metrics("train", metrics) trainer.save_metrics("train", metrics) trainer.save_state() # Evaluation if training_args.do_eval: logger.info("*** Evaluate ***") metrics = trainer.evaluate(eval_dataset=eval_dataset) max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len( eval_dataset) metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) trainer.log_metrics("eval", metrics) trainer.save_metrics("eval", metrics) # Prediction if training_args.do_predict: logger.info("*** Predict ***") predictions, labels, metrics = trainer.predict( predict_dataset, metric_key_prefix="predict") max_predict_samples = (data_args.max_predict_samples if data_args.max_predict_samples is not None else len(predict_dataset)) metrics["predict_samples"] = min(max_predict_samples, len(predict_dataset)) trainer.log_metrics("predict", metrics) trainer.save_metrics("predict", metrics) predictions = np.argmax(predictions, axis=1) output_predict_file = os.path.join(training_args.output_dir, "predictions.txt") if trainer.is_world_process_zero(): with open(output_predict_file, "w") as writer: writer.write("index\tprediction\n") for index, item in enumerate(predictions): item = label_list[item] writer.write(f"{index}\t{item}\n")
from generate_data import scispacy_plus_tokenizer from annotations import Entity, Relation from typing import List, Tuple BIOBERT_SEQ_LEN = 128 BILSTM_SEQ_LEN = 512 # =====BioBERT Model for NER====== biobert_ner_labels = get_labels('biobert_ner/dataset_two_ade/labels.txt') biobert_ner_label_map = {i: label for i, label in enumerate(biobert_ner_labels)} num_labels_ner = len(biobert_ner_labels) biobert_ner_config = AutoConfig.from_pretrained( 'biobert_ner/output_two_ade/config.json', num_labels=num_labels_ner, id2label=biobert_ner_label_map, label2id={label: i for i, label in enumerate(biobert_ner_labels)}) biobert_ner_tokenizer = AutoTokenizer.from_pretrained( "dmis-lab/biobert-base-cased-v1.1") biobert_ner_model = AutoModelForTokenClassification.from_pretrained( "biobert_ner/output_two_ade/pytorch_model.bin", config=biobert_ner_config) biobert_ner_training_args = TrainingArguments(output_dir="/tmp", do_predict=True) biobert_ner_trainer = Trainer(model=biobert_ner_model, args=biobert_ner_training_args) label_ent_map = {'DRUG': 'Drug', 'STR': 'Strength',
def train_func(config: Dict[str, Any]): args = config["args"] # Initialize the accelerator. We will let the accelerator handle device # placement for us in this example. accelerator = Accelerator() # Make one log on every process with the configuration for debugging. logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO, ) logger.info(accelerator.state) # Setup logging, we only want one process per machine to log things on # the screen. accelerator.is_local_main_process is only True for one # process per machine. logger.setLevel( logging.INFO if accelerator.is_local_main_process else logging.ERROR) if accelerator.is_local_main_process: datasets.utils.logging.set_verbosity_warning() transformers.utils.logging.set_verbosity_info() else: datasets.utils.logging.set_verbosity_error() transformers.utils.logging.set_verbosity_error() # If passed along, set the training seed now. if args.seed is not None: set_seed(args.seed) # Get the datasets: you can either provide your own CSV/JSON training and # evaluation files (see below) or specify a GLUE benchmark task (the # dataset will be downloaded automatically from the datasets Hub). # For CSV/JSON files, this script will use as labels the column called # 'label' and as pair of sentences the sentences in columns called # 'sentence1' and 'sentence2' if such column exists or the first two # columns not named label if at least two columns are provided. # If the CSVs/JSONs contain only one non-label column, the script does # single sentence classification on this single column. You can easily # tweak this behavior (see below) # In distributed training, the load_dataset function guarantee that only # one local process can concurrently download the dataset. if args.task_name is not None: # Downloading and loading a dataset from the hub. raw_datasets = load_dataset("glue", args.task_name) else: # Loading the dataset from local csv or json file. data_files = {} if args.train_file is not None: data_files["train"] = args.train_file if args.validation_file is not None: data_files["validation"] = args.validation_file extension = (args.train_file if args.train_file is not None else args.valid_file).split(".")[-1] raw_datasets = load_dataset(extension, data_files=data_files) # See more about loading any type of standard or custom dataset at # https://huggingface.co/docs/datasets/loading_datasets.html. # Labels if args.task_name is not None: is_regression = args.task_name == "stsb" if not is_regression: label_list = raw_datasets["train"].features["label"].names num_labels = len(label_list) else: num_labels = 1 else: # Trying to have good defaults here, don't hesitate to tweak to your # needs. is_regression = raw_datasets["train"].features["label"].dtype in [ "float32", "float64" ] if is_regression: num_labels = 1 else: # A useful fast method: # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique # noqa:E501 label_list = raw_datasets["train"].unique("label") label_list.sort() # Let's sort it for determinism num_labels = len(label_list) # Load pretrained model and tokenizer # # In distributed training, the .from_pretrained methods guarantee that # only one local process can concurrently download model & vocab. config = AutoConfig.from_pretrained(args.model_name_or_path, num_labels=num_labels, finetuning_task=args.task_name) tokenizer = AutoTokenizer.from_pretrained( args.model_name_or_path, use_fast=not args.use_slow_tokenizer) model = AutoModelForSequenceClassification.from_pretrained( args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config, ) # Preprocessing the datasets if args.task_name is not None: sentence1_key, sentence2_key = task_to_keys[args.task_name] else: # Again, we try to have some nice defaults but don't hesitate to # tweak to your use case. non_label_column_names = [ name for name in raw_datasets["train"].column_names if name != "label" ] if "sentence1" in non_label_column_names and \ "sentence2" in non_label_column_names: sentence1_key, sentence2_key = "sentence1", "sentence2" else: if len(non_label_column_names) >= 2: sentence1_key, sentence2_key = non_label_column_names[:2] else: sentence1_key, sentence2_key = non_label_column_names[0], None # Some models have set the order of the labels to use, # so let's make sure we do use it. label_to_id = None if (model.config.label2id != PretrainedConfig(num_labels=num_labels).label2id and args.task_name is not None and not is_regression): # Some have all caps in their config, some don't. label_name_to_id = { k.lower(): v for k, v in model.config.label2id.items() } if list(sorted(label_name_to_id.keys())) == list( # noqa:C413 sorted(label_list)): # noqa:C413 logger.info( f"The configuration of the model provided the following label " f"correspondence: {label_name_to_id}. Using it!") label_to_id = { i: label_name_to_id[label_list[i]] for i in range(num_labels) } else: logger.warning( "Your model seems to have been trained with labels, " "but they don't match the dataset: ", f"model labels: {list(sorted(label_name_to_id.keys()))}, " # noqa:C413,E501 f"dataset labels: {list(sorted(label_list))}." # noqa:C413 "\nIgnoring the model labels as a result.", ) elif args.task_name is None: label_to_id = {v: i for i, v in enumerate(label_list)} if label_to_id is not None: model.config.label2id = label_to_id model.config.id2label = { id: label for label, id in config.label2id.items() } padding = "max_length" if args.pad_to_max_length else False def preprocess_function(examples): # Tokenize the texts texts = ((examples[sentence1_key], ) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key])) result = tokenizer(*texts, padding=padding, max_length=args.max_length, truncation=True) if "label" in examples: if label_to_id is not None: # Map labels to IDs (not necessary for GLUE tasks) result["labels"] = \ [label_to_id[l] for l in examples["label"]] # noqa:E741 else: # In all cases, rename the column to labels because the model # will expect that. result["labels"] = examples["label"] return result processed_datasets = raw_datasets.map( preprocess_function, batched=True, remove_columns=raw_datasets["train"].column_names, desc="Running tokenizer on dataset", ) train_dataset = processed_datasets["train"] eval_dataset = processed_datasets["validation_matched" if args.task_name == "mnli" else "validation"] # Log a few random samples from the training set: for index in random.sample(range(len(train_dataset)), 3): logger.info( f"Sample {index} of the training set: {train_dataset[index]}.") # DataLoaders creation: if args.pad_to_max_length: # If padding was already done ot max length, we use the default data # collator that will just convert everything to tensors. data_collator = default_data_collator else: # Otherwise, `DataCollatorWithPadding` will apply dynamic padding for # us (by padding to the maximum length of the samples passed). When # using mixed precision, we add `pad_to_multiple_of=8` to pad all # tensors to multiple of 8s, which will enable the use of Tensor # Cores on NVIDIA hardware with compute capability >= 7.5 (Volta). data_collator = DataCollatorWithPadding( tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None)) train_dataloader = DataLoader(train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size) eval_dataloader = DataLoader(eval_dataset, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size) # Optimizer # Split weights in two groups, one with weight decay and the other not. no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": args.weight_decay, }, { "params": [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0, }, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate) # Prepare everything with our `accelerator`. model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare( model, optimizer, train_dataloader, eval_dataloader) # Note -> the training dataloader needs to be prepared before we grab # his length below (cause its length will be shorter in multiprocess) # Scheduler and math around the number of training steps. num_update_steps_per_epoch = math.ceil( len(train_dataloader) / args.gradient_accumulation_steps) if args.max_train_steps is None: args.max_train_steps = args.num_train_epochs * \ num_update_steps_per_epoch else: args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) lr_scheduler = get_scheduler( name=args.lr_scheduler_type, optimizer=optimizer, num_warmup_steps=args.num_warmup_steps, num_training_steps=args.max_train_steps, ) # Get the metric function if args.task_name is not None: metric = load_metric("glue", args.task_name) else: metric = load_metric("accuracy") # Train! total_batch_size = \ args.per_device_train_batch_size * \ accelerator.num_processes * \ args.gradient_accumulation_steps logger.info("***** Running training *****") logger.info(f" Num examples = {len(train_dataset)}") logger.info(f" Num Epochs = {args.num_train_epochs}") logger.info(f" Instantaneous batch size per device =" f" {args.per_device_train_batch_size}") logger.info( f" Total train batch size (w. parallel, distributed & accumulation) " f"= {total_batch_size}") logger.info( f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") logger.info(f" Total optimization steps = {args.max_train_steps}") # Only show the progress bar once on each machine. progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process) completed_steps = 0 for epoch in range(args.num_train_epochs): model.train() for step, batch in enumerate(train_dataloader): outputs = model(**batch) loss = outputs.loss loss = loss / args.gradient_accumulation_steps accelerator.backward(loss) if step % args.gradient_accumulation_steps == 0 or step == len( train_dataloader) - 1: optimizer.step() lr_scheduler.step() optimizer.zero_grad() progress_bar.update(1) completed_steps += 1 if completed_steps >= args.max_train_steps: break model.eval() for step, batch in enumerate(eval_dataloader): outputs = model(**batch) predictions = outputs.logits.argmax( dim=-1) if not is_regression else outputs.logits.squeeze() metric.add_batch( predictions=accelerator.gather(predictions), references=accelerator.gather(batch["labels"]), ) eval_metric = metric.compute() logger.info(f"epoch {epoch}: {eval_metric}") if args.output_dir is not None: accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save) if args.task_name == "mnli": # Final evaluation on mismatched validation set eval_dataset = processed_datasets["validation_mismatched"] eval_dataloader = DataLoader( eval_dataset, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size) eval_dataloader = accelerator.prepare(eval_dataloader) model.eval() for step, batch in enumerate(eval_dataloader): outputs = model(**batch) predictions = outputs.logits.argmax(dim=-1) metric.add_batch( predictions=accelerator.gather(predictions), references=accelerator.gather(batch["labels"]), ) eval_metric = metric.compute() logger.info(f"mnli-mm: {eval_metric}")
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser((ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() # Detecting last checkpoint. last_checkpoint = None if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome." ) elif last_checkpoint is not None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN) # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) # Set the verbosity to info of the Transformers logger (on main process only): if is_main_process(training_args.local_rank): transformers.utils.logging.set_verbosity_info() logger.info("Training/evaluation parameters %s", training_args) # Set seed before initializing model. set_seed(training_args.seed) # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below) # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ # (the dataset will be downloaded automatically from the datasets Hub). # # For CSV/JSON files in the summarization task, this script will use the first column for the full texts and the # second column for the summaries (unless you specify column names for this with the `text_column` and # `summary_column` arguments). # For translation, only JSON files are supported, with one field named "translation" containing two keys for the # source and target languages (unless you adapt what follows). # # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. if data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name) else: data_files = {} if data_args.train_file is not None: data_files["train"] = data_args.train_file extension = data_args.train_file.split(".")[-1] if data_args.validation_file is not None: data_files["validation"] = data_args.validation_file extension = data_args.validation_file.split(".")[-1] if data_args.test_file is not None: data_files["test"] = data_args.test_file extension = data_args.test_file.split(".")[-1] datasets = load_dataset(extension, data_files=data_files) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. # Load pretrained model and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) model = AutoModelForSeq2SeqLM.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) # Set decoder_start_token_id if model.config.decoder_start_token_id is None and isinstance(tokenizer, (MBartTokenizer, MBartTokenizerFast)): assert ( data_args.target_lang is not None and data_args.source_lang is not None ), "mBart requires --target_lang and --source_lang" if isinstance(tokenizer, MBartTokenizer): model.config.decoder_start_token_id = tokenizer.lang_code_to_id[data_args.target_lang] else: model.config.decoder_start_token_id = tokenizer.convert_tokens_to_ids(data_args.target_lang) if model.config.decoder_start_token_id is None: raise ValueError("Make sure that `config.decoder_start_token_id` is correctly defined") prefix = data_args.source_prefix if data_args.source_prefix is not None else "" # Preprocessing the datasets. # We need to tokenize inputs and targets. if training_args.do_train: column_names = datasets["train"].column_names elif training_args.do_eval: column_names = datasets["validation"].column_names elif training_args.do_predict: column_names = datasets["test"].column_names else: logger.info("There is nothing to do. Please pass `do_train`, `do_eval` and/or `do_predict`.") return # For translation we set the codes of our source and target languages (only useful for mBART, the others will # ignore those attributes). if data_args.task.startswith("translation") or isinstance(tokenizer, (MBartTokenizer, MBartTokenizerFast)): if data_args.source_lang is not None: tokenizer.src_lang = data_args.source_lang if data_args.target_lang is not None: tokenizer.tgt_lang = data_args.target_lang # To serialize preprocess_function below, each of those four variables needs to be defined (even if we won't use # them all). source_lang, target_lang, text_column, summary_column = None, None, None, None if data_args.task.startswith("summarization"): # Get the column names for input/target. dataset_columns = summarization_name_mapping.get(data_args.dataset_name, None) if data_args.text_column is None: text_column = dataset_columns[0] if dataset_columns is not None else column_names[0] else: text_column = data_args.text_column if data_args.summary_column is None: summary_column = dataset_columns[1] if dataset_columns is not None else column_names[1] else: summary_column = data_args.summary_column else: # Get the language codes for input/target. lang_search = re.match("translation_([a-z]+)_to_([a-z]+)", data_args.task) if data_args.source_lang is not None: source_lang = data_args.source_lang.split("_")[0] else: assert ( lang_search is not None ), "Provide a source language via --source_lang or rename your task 'translation_xx_to_yy'." source_lang = lang_search.groups()[0] if data_args.target_lang is not None: target_lang = data_args.target_lang.split("_")[0] else: assert ( lang_search is not None ), "Provide a target language via --target_lang or rename your task 'translation_xx_to_yy'." target_lang = lang_search.groups()[1] # Temporarily set max_target_length for training. max_target_length = data_args.max_target_length padding = "max_length" if data_args.pad_to_max_length else False if training_args.label_smoothing_factor > 0 and not hasattr(model, "prepare_decoder_input_ids_from_labels"): logger.warn( "label_smoothing is enabled but the `prepare_decoder_input_ids_from_labels` method is not defined for" f"`{model.__class__.__name__}`. This will lead to loss being calculated twice and will take up more memory" ) def preprocess_function(examples): if data_args.task.startswith("translation"): inputs = [ex[source_lang] for ex in examples["translation"]] targets = [ex[target_lang] for ex in examples["translation"]] else: inputs = examples[text_column] targets = examples[summary_column] inputs = [prefix + inp for inp in inputs] model_inputs = tokenizer(inputs, max_length=data_args.max_source_length, padding=padding, truncation=True) # Setup the tokenizer for targets with tokenizer.as_target_tokenizer(): labels = tokenizer(targets, max_length=max_target_length, padding=padding, truncation=True) # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore # padding in the loss. if padding == "max_length" and data_args.ignore_pad_token_for_loss: labels["input_ids"] = [ [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"] ] model_inputs["labels"] = labels["input_ids"] return model_inputs if training_args.do_train: train_dataset = datasets["train"] if "train" not in datasets: raise ValueError("--do_train requires a train dataset") if data_args.max_train_samples is not None: train_dataset = train_dataset.select(range(data_args.max_train_samples)) train_dataset = train_dataset.map( preprocess_function, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, ) if training_args.do_eval: max_target_length = data_args.val_max_target_length if "validation" not in datasets: raise ValueError("--do_eval requires a validation dataset") eval_dataset = datasets["validation"] if data_args.max_val_samples is not None: eval_dataset = eval_dataset.select(range(data_args.max_val_samples)) eval_dataset = eval_dataset.map( preprocess_function, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, ) if training_args.do_predict: max_target_length = data_args.val_max_target_length if "test" not in datasets: raise ValueError("--do_predict requires a test dataset") test_dataset = datasets["test"] if data_args.max_test_samples is not None: test_dataset = test_dataset.select(range(data_args.max_test_samples)) test_dataset = test_dataset.map( preprocess_function, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, ) # Data collator label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id if data_args.pad_to_max_length: data_collator = default_data_collator else: data_collator = DataCollatorForSeq2Seq( tokenizer, model=model, label_pad_token_id=label_pad_token_id, pad_to_multiple_of=8 if training_args.fp16 else None, ) # Metric metric_name = "rouge" if data_args.task.startswith("summarization") else "sacrebleu" metric = load_metric(metric_name) def postprocess_text(preds, labels): preds = [pred.strip() for pred in preds] labels = [label.strip() for label in labels] # rougeLSum expects newline after each sentence if metric_name == "rouge": preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds] labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels] else: # sacrebleu labels = [[label] for label in labels] return preds, labels def compute_metrics(eval_preds): preds, labels = eval_preds if isinstance(preds, tuple): preds = preds[0] decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True) if data_args.ignore_pad_token_for_loss: # Replace -100 in the labels as we can't decode them. labels = np.where(labels != -100, labels, tokenizer.pad_token_id) decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) # Some simple post-processing decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels) if metric_name == "rouge": result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True) # Extract a few results from ROUGE result = {key: value.mid.fmeasure * 100 for key, value in result.items()} else: result = metric.compute(predictions=decoded_preds, references=decoded_labels) result = {"bleu": result["score"]} prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds] result["gen_len"] = np.mean(prediction_lens) result = {k: round(v, 4) for k, v in result.items()} return result # Initialize our Trainer trainer = Seq2SeqTrainer( model=model, args=training_args, train_dataset=train_dataset if training_args.do_train else None, eval_dataset=eval_dataset if training_args.do_eval else None, tokenizer=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics if training_args.predict_with_generate else None, ) all_metrics = {} # Training if training_args.do_train: if last_checkpoint is not None: checkpoint = last_checkpoint elif os.path.isdir(model_args.model_name_or_path): checkpoint = model_args.model_name_or_path else: checkpoint = None train_result = trainer.train(resume_from_checkpoint=checkpoint) trainer.save_model() # Saves the tokenizer too for easy upload metrics = train_result.metrics max_train_samples = ( data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) ) metrics["train_samples"] = min(max_train_samples, len(train_dataset)) if trainer.is_world_process_zero(): logger.info("***** train metrics *****") for key in sorted(metrics.keys()): logger.info(f" {key} = {metrics[key]}") save_json(metrics, os.path.join(training_args.output_dir, "train_results.json")) all_metrics.update(metrics) # Need to save the state, since Trainer.save_model saves only the tokenizer with the model trainer.state.save_to_json(os.path.join(training_args.output_dir, "trainer_state.json")) # Evaluation results = {} if training_args.do_eval: logger.info("*** Evaluate ***") metrics = trainer.evaluate( max_length=data_args.val_max_target_length, num_beams=data_args.num_beams, metric_key_prefix="val" ) metrics = {k: round(v, 4) for k, v in metrics.items()} max_val_samples = data_args.max_val_samples if data_args.max_val_samples is not None else len(eval_dataset) metrics["val_samples"] = min(max_val_samples, len(eval_dataset)) if trainer.is_world_process_zero(): logger.info("***** val metrics *****") for key in sorted(metrics.keys()): logger.info(f" {key} = {metrics[key]}") save_json(metrics, os.path.join(training_args.output_dir, "val_results.json")) all_metrics.update(metrics) if training_args.do_predict: logger.info("*** Test ***") test_results = trainer.predict( test_dataset, metric_key_prefix="test", max_length=data_args.val_max_target_length, num_beams=data_args.num_beams, ) metrics = test_results.metrics max_test_samples = data_args.max_test_samples if data_args.max_test_samples is not None else len(test_dataset) metrics["test_samples"] = min(max_test_samples, len(test_dataset)) metrics = {k: round(v, 4) for k, v in metrics.items()} if trainer.is_world_process_zero(): logger.info("***** test metrics *****") for key in sorted(metrics.keys()): logger.info(f" {key} = {metrics[key]}") save_json(metrics, os.path.join(training_args.output_dir, "test_results.json")) all_metrics.update(metrics) if training_args.predict_with_generate: test_preds = tokenizer.batch_decode( test_results.predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True ) test_preds = [pred.strip() for pred in test_preds] output_test_preds_file = os.path.join(training_args.output_dir, "test_preds_seq2seq.txt") with open(output_test_preds_file, "w") as writer: writer.write("\n".join(test_preds)) if trainer.is_world_process_zero(): save_json(all_metrics, os.path.join(training_args.output_dir, "all_results.json")) return results