def main(): ### Dataset processing classes in main due to hugging face custom dataset map def prepare_train_features(examples): # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results # in one example possible giving several features when a context is long, each of those features having a # context that overlaps a bit the context of the previous feature. tokenized_examples = tokenizer( examples[ question_column_name if pad_on_right else context_column_name], examples[ context_column_name if pad_on_right else question_column_name], truncation="only_second" if pad_on_right else "only_first", max_length=data_args.max_seq_length, stride=data_args.doc_stride, return_overflowing_tokens=True, return_offsets_mapping=True, padding="max_length" if data_args.pad_to_max_length else False, ) # Since one example might give us several features if it has a long context, we need a map from a feature to # its corresponding example. This key gives us just that. sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") # The offset mappings will give us a map from token to character position in the original context. This will # help us compute the start_positions and end_positions. offset_mapping = tokenized_examples.pop("offset_mapping") # Let's label those examples! tokenized_examples["start_positions"] = [] tokenized_examples["end_positions"] = [] for i, offsets in enumerate(offset_mapping): # We will label impossible answers with the index of the CLS token. input_ids = tokenized_examples["input_ids"][i] cls_index = input_ids.index(tokenizer.cls_token_id) # Grab the sequence corresponding to that example (to know what is the context and what is the question). sequence_ids = tokenized_examples.sequence_ids(i) # One example can give several spans, this is the index of the example containing this span of text. sample_index = sample_mapping[i] answers = examples[answer_column_name][sample_index] # If no answers are given, set the cls_index as answer. if len(answers["answer_start"]) == 0: tokenized_examples["start_positions"].append(cls_index) tokenized_examples["end_positions"].append(cls_index) else: # Start/end character index of the answer in the text. start_char = answers["answer_start"][0] end_char = start_char + len(answers["text"][0]) # Start token index of the current span in the text. token_start_index = 0 while sequence_ids[token_start_index] != (1 if pad_on_right else 0): token_start_index += 1 # End token index of the current span in the text. token_end_index = len(input_ids) - 1 while sequence_ids[token_end_index] != (1 if pad_on_right else 0): token_end_index -= 1 # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index). if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char): tokenized_examples["start_positions"].append(cls_index) tokenized_examples["end_positions"].append(cls_index) else: # Otherwise move the token_start_index and token_end_index to the two ends of the answer. # Note: we could go after the last offset if the answer is the last word (edge case). while (token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char): token_start_index += 1 tokenized_examples["start_positions"].append( token_start_index - 1) while offsets[token_end_index][1] >= end_char: token_end_index -= 1 tokenized_examples["end_positions"].append( token_end_index + 1) return tokenized_examples def compute_metrics(p: EvalPrediction): return metric.compute(predictions=p.predictions, references=p.label_ids) # Post-processing: def post_processing_function(examples, features, predictions): predictions = postprocess_qa_predictions( examples=examples, features=features, predictions=predictions, version_2_with_negative=data_args.version_2_with_negative, n_best_size=data_args.n_best_size, max_answer_length=data_args.max_answer_length, null_score_diff_threshold=data_args.null_score_diff_threshold, output_dir=training_args.output_dir, is_world_process_zero=trainer.is_world_process_zero(), ) if data_args.version_2_with_negative: formatted_predictions = [{ "id": k, "prediction_text": v, "no_answer_probability": 0.0 } for k, v in predictions.items()] else: formatted_predictions = [{ "id": k, "prediction_text": v } for k, v in predictions.items()] references = [{ "id": ex["id"], "answers": ex[answer_column_name] } for ex in datasets["validation"]] return EvalPrediction(predictions=formatted_predictions, label_ids=references) # Validation preprocessing def prepare_validation_features(examples): # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results # in one example possible giving several features when a context is long, each of those features having a # context that overlaps a bit the context of the previous feature. tokenized_examples = tokenizer( examples[ question_column_name if pad_on_right else context_column_name], examples[ context_column_name if pad_on_right else question_column_name], truncation="only_second" if pad_on_right else "only_first", max_length=data_args.max_seq_length, stride=data_args.doc_stride, return_overflowing_tokens=True, return_offsets_mapping=True, padding="max_length" if data_args.pad_to_max_length else False, ) # Since one example might give us several features if it has a long context, we need a map from a feature to # its corresponding example. This key gives us just that. sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the # corresponding example_id and we will store the offset mappings. tokenized_examples["example_id"] = [] for i in range(len(tokenized_examples["input_ids"])): # Grab the sequence corresponding to that example (to know what is the context and what is the question). sequence_ids = tokenized_examples.sequence_ids(i) context_index = 1 if pad_on_right else 0 # One example can give several spans, this is the index of the example containing this span of text. sample_index = sample_mapping[i] tokenized_examples["example_id"].append( examples["id"][sample_index]) # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token # position is part of the context or not. tokenized_examples["offset_mapping"][i] = [ (o if sequence_ids[k] == context_index else None) for k, o in enumerate(tokenized_examples["offset_mapping"][i]) ] return tokenized_examples transformers.utils.logging.set_verbosity_info() parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) if (os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir): raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty." "Use --overwrite_output_dir to overcome.") logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", ) logger.setLevel(logging.INFO if is_main_process(training_args.local_rank ) else logging.WARN) logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) if is_main_process(training_args.local_rank): transformers.utils.logging.set_verbosity_info() logger.info("Training/evaluation parameters %s", training_args) set_seed(training_args.seed) if data_args.dataset_name is not None: datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name) else: data_files = {} if data_args.train_file is not None: data_files["train"] = data_args.train_file if data_args.validation_file is not None: data_files["validation"] = data_args.validation_file extension = data_args.train_file.split(".")[-1] datasets = load_dataset(extension, data_files=data_files, field="data") config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=True, ) model = AutoModelForQuestionAnswering.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, ) if data_args.layers_to_keep > 0: logger.info("Dropping %s model layers", data_args.layers_to_keep) model = dropLayers(model, data_args.layers_to_keep) model_parameters = filter(lambda p: p.requires_grad, model.parameters()) params = sum([np.prod(p.size()) for p in model_parameters]) logger.info("Model has %s parameters", params) # Tokenizer check: this script requires a fast tokenizer. if not isinstance(tokenizer, PreTrainedTokenizerFast): raise ValueError( "This example script only works for models that have a fast tokenizer. Checkout the big table of models " "at https://huggingface.co/transformers/index.html#bigtable to find the model types that meet this " "requirement") if training_args.do_train: column_names = datasets["train"].column_names else: column_names = datasets["validation"].column_names question_column_name = "question" if "question" in column_names else column_names[ 0] context_column_name = "context" if "context" in column_names else column_names[ 1] answer_column_name = "answers" if "answers" in column_names else column_names[ 2] pad_on_right = tokenizer.padding_side == "right" if training_args.do_train: train_dataset = datasets["train"].map( prepare_train_features, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, ) if training_args.do_eval: validation_dataset = datasets["validation"].map( prepare_validation_features, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, ) data_collator = (default_data_collator if data_args.pad_to_max_length else DataCollatorWithPadding(tokenizer)) current_dir = os.path.sep.join( os.path.join(__file__).split(os.path.sep)[:-1]) metric = load_metric( os.path.join(current_dir, "squad_v2_local") if data_args. version_2_with_negative else "squad") #################################################################################### # Start SparseML Integration #################################################################################### optim = load_optimizer(model, TrainingArguments) steps_per_epoch = math.ceil( len(datasets["train"]) / (training_args.per_device_train_batch_size * training_args._n_gpu)) manager = ScheduledModifierManager.from_yaml(data_args.nm_prune_config) optim = ScheduledOptimizer(optim, model, manager, steps_per_epoch=steps_per_epoch, loggers=None) #################################################################################### # End SparseML Integration #################################################################################### # Initialize our Trainer trainer = QuestionAnsweringTrainer( model=model, args=training_args, train_dataset=train_dataset if training_args.do_train else None, eval_dataset=validation_dataset if training_args.do_eval else None, eval_examples=datasets["validation"] if training_args.do_eval else None, tokenizer=tokenizer, data_collator=data_collator, post_process_function=post_processing_function, compute_metrics=compute_metrics, optimizers=(optim, None), ) # Training if training_args.do_train: trainer.train(model_path=model_args.model_name_or_path if os.path. isdir(model_args.model_name_or_path) else None) trainer.save_model() # Saves the tokenizer too for easy upload # Evaluation results = {} if training_args.do_eval: logger.info("*** Evaluate ***") results = trainer.evaluate() output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt") if trainer.is_world_process_zero(): with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key, value in results.items(): logger.info(f" {key} = {value}") writer.write(f"{key} = {value}\n") #################################################################################### # Start SparseML Integration #################################################################################### if data_args.do_onnx_export: logger.info("*** Export to ONNX ***") print("Exporting onnx model") os.environ["TOKENIZERS_PARALLELISM"] = "false" exporter = ModuleExporter(model, output_dir='onnx-export') sample_batch = convert_example_to_features( datasets["validation"][0], tokenizer, data_args.max_seq_length, data_args.doc_stride, data_args.max_query_length, ) exporter.export_onnx(sample_batch=sample_batch)
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) # Detecting last checkpoint. last_checkpoint = None if os.path.isdir( training_args.output_dir ) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is None and len(os.listdir( training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome.") elif last_checkpoint is not None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) logger.setLevel(logging.INFO if is_main_process(training_args.local_rank ) else logging.WARN) # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) # Set the verbosity to info of the Transformers logger (on main process only): if is_main_process(training_args.local_rank): transformers.utils.logging.set_verbosity_info() # Set seed before initializing model. set_seed(training_args.seed) # Get the datasets: train_dataset = datasets.load_dataset("dialect_speech_corpus", split="train+dev", cache_dir=model_args.cache_dir) eval_dataset = datasets.load_dataset("dialect_speech_corpus", split="test", cache_dir=model_args.cache_dir) feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16_000, padding_value=0.0, do_normalize=True, return_attention_mask=True) processor = CustomWav2Vec2Processor(feature_extractor=feature_extractor) model = Wav2Vec2KlaamModelMean.from_pretrained( "facebook/wav2vec2-large-xlsr-53", attention_dropout=0.01, hidden_dropout=0.01, feat_proj_dropout=0.0, mask_time_prob=0.05, layerdrop=0.01, gradient_checkpointing=True, ) if model_args.freeze_feature_extractor: model.freeze_feature_extractor() if data_args.max_train_samples is not None: train_dataset = train_dataset.select(range( data_args.max_train_samples)) if data_args.max_val_samples is not None: eval_dataset = eval_dataset.select(range(data_args.max_val_samples)) # Preprocessing the datasets. # We need to read the aduio files as arrays and tokenize the targets. def speech_file_to_array_fn(batch): start = 0 stop = 20 srate = 16_000 speech_array, sampling_rate = sf.read(batch["file"], start=start * srate, stop=stop * srate) batch["speech"] = librosa.resample(np.asarray(speech_array), sampling_rate, srate) batch["sampling_rate"] = srate batch["parent"] = batch["label"] return batch train_dataset = train_dataset.map( speech_file_to_array_fn, remove_columns=train_dataset.column_names, num_proc=data_args.preprocessing_num_workers, ) eval_dataset = eval_dataset.map( speech_file_to_array_fn, remove_columns=eval_dataset.column_names, num_proc=data_args.preprocessing_num_workers, ) def prepare_dataset(batch): # check that all files have the correct sampling rate assert ( len(set(batch["sampling_rate"])) == 1 ), f"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}." batch["input_values"] = processor( batch["speech"], sampling_rate=batch["sampling_rate"][0]).input_values batch["labels"] = batch["parent"] return batch train_dataset = train_dataset.map( prepare_dataset, remove_columns=train_dataset.column_names, batch_size=training_args.per_device_train_batch_size, batched=True, num_proc=data_args.preprocessing_num_workers, ) eval_dataset = eval_dataset.map( prepare_dataset, remove_columns=eval_dataset.column_names, batch_size=training_args.per_device_train_batch_size, batched=True, num_proc=data_args.preprocessing_num_workers, ) from sklearn.metrics import classification_report, confusion_matrix def compute_metrics(pred): label_idx = [0, 1, 2, 3, 4] label_names = ['EGY', 'NOR', 'GLF', 'LAV', 'MSA'] labels = pred.label_ids.argmax(-1) preds = pred.predictions.argmax(-1) acc = accuracy_score(labels, preds) f1 = f1_score(labels, preds, average='macro') report = classification_report(y_true=labels, y_pred=preds, labels=label_idx, target_names=label_names) matrix = confusion_matrix(y_true=labels, y_pred=preds) print(report) print(matrix) wandb.log({ "conf_mat": wandb.plot.confusion_matrix(probs=None, y_true=labels, preds=preds, class_names=label_names) }) wandb.log({ "precision_recall": wandb.plot.pr_curve(y_true=labels, y_probas=pred.predictions, labels=label_names) }) return {"accuracy": acc, "f1_score": f1} wandb.init(name=training_args.output_dir, config=training_args) # Data collator data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True) # Initialize our Trainer trainer = CTCTrainer( model=model, data_collator=data_collator, args=training_args, compute_metrics=compute_metrics, train_dataset=train_dataset if training_args.do_train else None, eval_dataset=eval_dataset if training_args.do_eval else None, tokenizer=processor.feature_extractor, ) # Training if training_args.do_train: if last_checkpoint is not None: checkpoint = last_checkpoint elif os.path.isdir(model_args.model_name_or_path): checkpoint = model_args.model_name_or_path else: checkpoint = None train_result = trainer.train(resume_from_checkpoint=checkpoint) trainer.save_model() # save the feature_extractor and the tokenizer if is_main_process(training_args.local_rank): processor.save_pretrained(training_args.output_dir) metrics = train_result.metrics max_train_samples = (data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)) metrics["train_samples"] = min(max_train_samples, len(train_dataset)) trainer.log_metrics("train", metrics) trainer.save_metrics("train", metrics) trainer.save_state() # Evaluation results = {} if training_args.do_eval: logger.info("*** Evaluate ***") metrics = trainer.evaluate() max_val_samples = data_args.max_val_samples if data_args.max_val_samples is not None else len( eval_dataset) metrics["eval_samples"] = min(max_val_samples, len(eval_dataset)) trainer.log_metrics("eval", metrics) trainer.save_metrics("eval", metrics) return results
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) if (os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir): raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome.") # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO if is_main_process(training_args.local_rank) else logging.WARN, ) # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) # Set the verbosity to info of the Transformers logger (on main process only): if is_main_process(training_args.local_rank): transformers.utils.logging.set_verbosity_info() transformers.utils.logging.enable_default_handler() transformers.utils.logging.enable_explicit_format() logger.info(f"Training/evaluation parameters {training_args}") # Set seed before initializing model. set_seed(training_args.seed) # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below) # or specify a GLUE benchmark task (the dataset will be downloaded automatically from the datasets Hub). # # For CSV/JSON files, this script will use as labels the column called 'label' and as pair of sentences the # sentences in columns called 'sentence1' and 'sentence2' if such column exists or the first two columns not named # label if at least two columns are provided. # # If the CSVs/JSONs contain only one non-label column, the script does single sentence classification on this # single column. You can easily tweak this behavior (see below) # # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. if data_args.task_name is not None: # Downloading and loading a dataset from the hub. datasets = load_dataset("glue", data_args.task_name) elif data_args.train_file.endswith(".csv"): # Loading a dataset from local csv files datasets = load_dataset("csv", data_files={ "train": data_args.train_file, "validation": data_args.validation_file }) else: # Loading a dataset from local json files datasets = load_dataset("json", data_files={ "train": data_args.train_file, "validation": data_args.validation_file }) # See more about loading any type of standard or custom dataset at # https://huggingface.co/docs/datasets/loading_datasets.html. # Labels if data_args.task_name is not None: is_regression = data_args.task_name == "stsb" if not is_regression: label_list = datasets["train"].features["label"].names num_labels = len(label_list) else: num_labels = 1 else: # Trying to have good defaults here, don't hesitate to tweak to your needs. is_regression = datasets["train"].features["label"].dtype in [ "float32", "float64" ] if is_regression: num_labels = 1 else: # A useful fast method: # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique label_list = datasets["train"].unique("label") label_list.sort() # Let's sort it for determinism num_labels = len(label_list) # Load pretrained model and tokenizer # # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, num_labels=num_labels, finetuning_task=data_args.task_name, cache_dir=model_args.cache_dir, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer, ) model = AutoModelForSequenceClassification.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, ) # Preprocessing the datasets if data_args.task_name is not None: sentence1_key, sentence2_key = task_to_keys[data_args.task_name] else: # Again, we try to have some nice defaults but don't hesitate to tweak to your use case. non_label_column_names = [ name for name in datasets["train"].column_names if name != "label" ] if "sentence1" in non_label_column_names and "sentence2" in non_label_column_names: sentence1_key, sentence2_key = "sentence1", "sentence2" else: if len(non_label_column_names) >= 2: sentence1_key, sentence2_key = non_label_column_names[:2] else: sentence1_key, sentence2_key = non_label_column_names[0], None # Padding strategy if data_args.pad_to_max_length: padding = "max_length" max_length = data_args.max_seq_length else: # We will pad later, dynamically at batch creation, to the max sequence length in each batch padding = False max_length = None # Some models have set the order of the labels to use, so let's make sure we do use it. label_to_id = None if (model.config.label2id != PretrainedConfig(num_labels=num_labels).label2id and data_args.task_name is not None and is_regression): # Some have all caps in their config, some don't. label_name_to_id = { k.lower(): v for k, v in model.config.label2id.items() } if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)): label_to_id = { i: label_name_to_id[label_list[i]] for i in range(num_labels) } else: logger.warn( "Your model seems to have been trained with labels, but they don't match the dataset: ", f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}." "\nIgnoring the model labels as a result.", ) elif data_args.task_name is None: label_to_id = {v: i for i, v in enumerate(label_list)} def preprocess_function(examples): # Tokenize the texts args = ((examples[sentence1_key], ) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key])) result = tokenizer(*args, padding=padding, max_length=max_length, truncation=True) # Map labels to IDs (not necessary for GLUE tasks) if label_to_id is not None and "label" in examples: result["label"] = [label_to_id[l] for l in examples["label"]] return result datasets = datasets.map(preprocess_function, batched=True, load_from_cache_file=not data_args.overwrite_cache) train_dataset = datasets["train"] eval_dataset = datasets["validation_matched" if data_args.task_name == "mnli" else "validation"] if data_args.task_name is not None: test_dataset = datasets["test_matched" if data_args.task_name == "mnli" else "test"] # Log a few random samples from the training set: for index in random.sample(range(len(train_dataset)), 3): logger.info( f"Sample {index} of the training set: {train_dataset[index]}.") # Get the metric function if data_args.task_name is not None: metric = load_metric("glue", data_args.task_name) # TODO: When datasets metrics include regular accuracy, make an else here and remove special branch from # compute_metrics # You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a # predictions and label_ids field) and has to return a dictionary string to float. def compute_metrics(p: EvalPrediction): preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1) if data_args.task_name is not None: result = metric.compute(predictions=preds, references=p.label_ids) if len(result) > 1: result["combined_score"] = np.mean(list( result.values())).item() return result elif is_regression: return {"mse": ((preds - p.label_ids)**2).mean().item()} else: return { "accuracy": (preds == p.label_ids).astype(np.float32).mean().item() } # Initialize our Trainer trainer = Trainer( model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset if training_args.do_eval else None, compute_metrics=compute_metrics, tokenizer=tokenizer, # Data collator will default to DataCollatorWithPadding, so we change it if we already did the padding. data_collator=default_data_collator if data_args.pad_to_max_length else None, ) # Training if training_args.do_train: train_result = trainer.train( model_path=model_args.model_name_or_path if os.path. isdir(model_args.model_name_or_path) else None) metrics = train_result.metrics trainer.save_model() # Saves the tokenizer too for easy upload output_train_file = os.path.join(training_args.output_dir, "train_results.txt") if trainer.is_world_process_zero(): with open(output_train_file, "w") as writer: logger.info("***** Train results *****") for key, value in sorted(metrics.items()): logger.info(f" {key} = {value}") writer.write(f"{key} = {value}\n") # Need to save the state, since Trainer.save_model saves only the tokenizer with the model trainer.state.save_to_json( os.path.join(training_args.output_dir, "trainer_state.json")) # Evaluation eval_results = {} if training_args.do_eval: logger.info("*** Evaluate ***") # Loop to handle MNLI double evaluation (matched, mis-matched) tasks = [data_args.task_name] eval_datasets = [eval_dataset] if data_args.task_name == "mnli": tasks.append("mnli-mm") eval_datasets.append(datasets["validation_mismatched"]) for eval_dataset, task in zip(eval_datasets, tasks): eval_result = trainer.evaluate(eval_dataset=eval_dataset) output_eval_file = os.path.join(training_args.output_dir, f"eval_results_{task}.txt") if trainer.is_world_process_zero(): with open(output_eval_file, "w") as writer: logger.info(f"***** Eval results {task} *****") for key, value in sorted(eval_result.items()): logger.info(f" {key} = {value}") writer.write(f"{key} = {value}\n") eval_results.update(eval_result) if training_args.do_predict: logger.info("*** Test ***") # Loop to handle MNLI double evaluation (matched, mis-matched) tasks = [data_args.task_name] test_datasets = [test_dataset] if data_args.task_name == "mnli": tasks.append("mnli-mm") test_datasets.append(datasets["test_mismatched"]) for test_dataset, task in zip(test_datasets, tasks): # Removing the `label` columns because it contains -1 and Trainer won't like that. test_dataset.remove_columns_("label") predictions = trainer.predict( test_dataset=test_dataset).predictions predictions = np.squeeze( predictions) if is_regression else np.argmax(predictions, axis=1) output_test_file = os.path.join(training_args.output_dir, f"test_results_{task}.txt") if trainer.is_world_process_zero(): with open(output_test_file, "w") as writer: logger.info(f"***** Test results {task} *****") writer.write("index\tprediction\n") for index, item in enumerate(predictions): if is_regression: writer.write(f"{index}\t{item:3.3f}\n") else: item = label_list[item] writer.write(f"{index}\t{item}\n") return eval_results
def main(): args = parse_args() # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. accelerator = Accelerator() # Make one log on every process with the configuration for debugging. logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO, ) logger.info(accelerator.state) # Setup logging, we only want one process per machine to log things on the screen. # accelerator.is_local_main_process is only True for one process per machine. logger.setLevel( logging.INFO if accelerator.is_local_main_process else logging.ERROR) if accelerator.is_local_main_process: datasets.utils.logging.set_verbosity_warning() transformers.utils.logging.set_verbosity_info() else: datasets.utils.logging.set_verbosity_error() transformers.utils.logging.set_verbosity_error() # If passed along, set the training seed now. if args.seed is not None: set_seed(args.seed) # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) # or just provide the name of one of the public datasets for token classification task available on the hub at https://huggingface.co/datasets/ # (the dataset will be downloaded automatically from the datasets Hub). # # For CSV/JSON files, this script will use the column called 'tokens' or the first column if no column called # 'tokens' is found. You can easily tweak this behavior (see below). # # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. if args.dataset_name is not None: # Downloading and loading a dataset from the hub. raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name) else: data_files = {} if args.train_file is not None: data_files["train"] = args.train_file if args.validation_file is not None: data_files["validation"] = args.validation_file extension = args.train_file.split(".")[-1] raw_datasets = load_dataset(extension, data_files=data_files) # Trim a number of training examples if args.debug: for split in raw_datasets.keys(): raw_datasets[split] = raw_datasets[split].select(range(100)) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. if raw_datasets["train"] is not None: column_names = raw_datasets["train"].column_names features = raw_datasets["train"].features else: column_names = raw_datasets["validation"].column_names features = raw_datasets["validation"].features if args.text_column_name is not None: text_column_name = args.text_column_name elif "tokens" in column_names: text_column_name = "tokens" else: text_column_name = column_names[0] if args.label_column_name is not None: label_column_name = args.label_column_name elif f"{args.task_name}_tags" in column_names: label_column_name = f"{args.task_name}_tags" else: label_column_name = column_names[1] # In the event the labels are not a `Sequence[ClassLabel]`, we will need to go through the dataset to get the # unique labels. def get_label_list(labels): unique_labels = set() for label in labels: unique_labels = unique_labels | set(label) label_list = list(unique_labels) label_list.sort() return label_list if isinstance(features[label_column_name].feature, ClassLabel): label_list = features[label_column_name].feature.names # No need to convert the labels since they are already ints. label_to_id = {i: i for i in range(len(label_list))} else: label_list = get_label_list(raw_datasets["train"][label_column_name]) label_to_id = {l: i for i, l in enumerate(label_list)} num_labels = len(label_list) # Load pretrained model and tokenizer # # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. if args.config_name: config = AutoConfig.from_pretrained(args.config_name, num_labels=num_labels) elif args.model_name_or_path: config = AutoConfig.from_pretrained(args.model_name_or_path, num_labels=num_labels) else: config = CONFIG_MAPPING[args.model_type]() logger.warning( "You are instantiating a new config instance from scratch.") tokenizer_name_or_path = args.tokenizer_name if args.tokenizer_name else args.model_name_or_path if not tokenizer_name_or_path: raise ValueError( "You are instantiating a new tokenizer from scratch. This is not supported by this script." "You can do it from another script, save it, and load it from here, using --tokenizer_name." ) if config.model_type in {"gpt2", "roberta"}: tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, use_fast=True, add_prefix_space=True) else: tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, use_fast=True) if args.model_name_or_path: model = AutoModelForTokenClassification.from_pretrained( args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config, ) else: logger.info("Training new model from scratch") model = AutoModelForTokenClassification.from_config(config) model.resize_token_embeddings(len(tokenizer)) # Preprocessing the datasets. # First we tokenize all the texts. padding = "max_length" if args.pad_to_max_length else False # Tokenize all texts and align the labels with them. def tokenize_and_align_labels(examples): tokenized_inputs = tokenizer( examples[text_column_name], max_length=args.max_length, padding=padding, truncation=True, # We use this argument because the texts in our dataset are lists of words (with a label for each word). is_split_into_words=True, ) labels = [] for i, label in enumerate(examples[label_column_name]): word_ids = tokenized_inputs.word_ids(batch_index=i) previous_word_idx = None label_ids = [] for word_idx in word_ids: # Special tokens have a word id that is None. We set the label to -100 so they are automatically # ignored in the loss function. if word_idx is None: label_ids.append(-100) # We set the label for the first token of each word. elif word_idx != previous_word_idx: label_ids.append(label_to_id[label[word_idx]]) # For the other tokens in a word, we set the label to either the current label or -100, depending on # the label_all_tokens flag. else: label_ids.append(label_to_id[label[word_idx]] if args. label_all_tokens else -100) previous_word_idx = word_idx labels.append(label_ids) tokenized_inputs["labels"] = labels return tokenized_inputs processed_raw_datasets = raw_datasets.map( tokenize_and_align_labels, batched=True, remove_columns=raw_datasets["train"].column_names, desc="Running tokenizer on dataset", ) train_dataset = processed_raw_datasets["train"] eval_dataset = processed_raw_datasets["validation"] # Log a few random samples from the training set: for index in random.sample(range(len(train_dataset)), 3): logger.info( f"Sample {index} of the training set: {train_dataset[index]}.") # DataLoaders creation: if args.pad_to_max_length: # If padding was already done ot max length, we use the default data collator that will just convert everything # to tensors. data_collator = default_data_collator else: # Otherwise, `DataCollatorForTokenClassification` will apply dynamic padding for us (by padding to the maximum length of # the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple # of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta). data_collator = DataCollatorForTokenClassification( tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None)) train_dataloader = DataLoader(train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size) eval_dataloader = DataLoader(eval_dataset, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size) # Optimizer # Split weights in two groups, one with weight decay and the other not. no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": args.weight_decay, }, { "params": [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0, }, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate) # Use the device given by the `accelerator` object. device = accelerator.device model.to(device) # Prepare everything with our `accelerator`. model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare( model, optimizer, train_dataloader, eval_dataloader) # Note -> the training dataloader needs to be prepared before we grab his length below (cause its length will be # shorter in multiprocess) # Scheduler and math around the number of training steps. num_update_steps_per_epoch = math.ceil( len(train_dataloader) / args.gradient_accumulation_steps) if args.max_train_steps is None: args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch else: args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) lr_scheduler = get_scheduler( name=args.lr_scheduler_type, optimizer=optimizer, num_warmup_steps=args.num_warmup_steps, num_training_steps=args.max_train_steps, ) # Metrics metric = load_metric("seqeval") def get_labels(predictions, references): # Transform predictions and references tensos to numpy arrays if device.type == "cpu": y_pred = predictions.detach().clone().numpy() y_true = references.detach().clone().numpy() else: y_pred = predictions.detach().cpu().clone().numpy() y_true = references.detach().cpu().clone().numpy() # Remove ignored index (special tokens) true_predictions = [[ label_list[p] for (p, l) in zip(pred, gold_label) if l != -100 ] for pred, gold_label in zip(y_pred, y_true)] true_labels = [[ label_list[l] for (p, l) in zip(pred, gold_label) if l != -100 ] for pred, gold_label in zip(y_pred, y_true)] return true_predictions, true_labels def compute_metrics(): results = metric.compute() if args.return_entity_level_metrics: # Unpack nested dictionaries final_results = {} for key, value in results.items(): if isinstance(value, dict): for n, v in value.items(): final_results[f"{key}_{n}"] = v else: final_results[key] = value return final_results else: return { "precision": results["overall_precision"], "recall": results["overall_recall"], "f1": results["overall_f1"], "accuracy": results["overall_accuracy"], } # Train! total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps logger.info("***** Running training *****") logger.info(f" Num examples = {len(train_dataset)}") logger.info(f" Num Epochs = {args.num_train_epochs}") logger.info( f" Instantaneous batch size per device = {args.per_device_train_batch_size}" ) logger.info( f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}" ) logger.info( f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") logger.info(f" Total optimization steps = {args.max_train_steps}") # Only show the progress bar once on each machine. progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process) completed_steps = 0 for epoch in range(args.num_train_epochs): model.train() for step, batch in enumerate(train_dataloader): outputs = model(**batch) loss = outputs.loss loss = loss / args.gradient_accumulation_steps accelerator.backward(loss) if step % args.gradient_accumulation_steps == 0 or step == len( train_dataloader) - 1: optimizer.step() lr_scheduler.step() optimizer.zero_grad() progress_bar.update(1) completed_steps += 1 if completed_steps >= args.max_train_steps: break model.eval() for step, batch in enumerate(eval_dataloader): with torch.no_grad(): outputs = model(**batch) predictions = outputs.logits.argmax(dim=-1) labels = batch["labels"] if not args.pad_to_max_length: # necessary to pad predictions and labels for being gathered predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100) labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100) predictions_gathered = accelerator.gather(predictions) labels_gathered = accelerator.gather(labels) preds, refs = get_labels(predictions_gathered, labels_gathered) metric.add_batch( predictions=preds, references=refs, ) # predictions and preferences are expected to be a nested list of labels, not label_ids # eval_metric = metric.compute() eval_metric = compute_metrics() accelerator.print(f"epoch {epoch}:", eval_metric) if args.output_dir is not None: accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) # Detecting last checkpoint. last_checkpoint = None if os.path.isdir( training_args.output_dir ) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is None and len(os.listdir( training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome.") elif last_checkpoint is not None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) logger.setLevel(logging.INFO if is_main_process(training_args.local_rank ) else logging.WARN) # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) # Set the verbosity to info of the Transformers logger (on main process only): if is_main_process(training_args.local_rank): transformers.utils.logging.set_verbosity_info() logger.info("Training/evaluation parameters %s", training_args) # Set seed before initializing model. set_seed(training_args.seed) # 1. First, let's load the dataset raw_datasets = DatasetDict() task_name = data_args.task lang_id = data_args.language if task_name is None: raise ValueError( "Set --task should be set to '<xtreme_s_task>' (e.g. 'fleurs-asr', 'mls', 'covost2', 'minds14') " ) if lang_id is None: raise ValueError( "Set --language should be set to the language id of the sub dataset " "config to be used (e.g. 'pl', 'en.tr', 'fr-FR') or 'all'" " for multi-lingual fine-tuning.") if data_args.language_group is not None: if data_args.task != "fleurs-asr": raise ValueError( "--language_group should only be used with --task=fleurs-asr") if data_args.language != "all": raise ValueError( "--language_group should only be used with --language=all") if data_args.target_column_name is None: target_column_name = TASK_TO_TARGET_COLUMN_NAME[task_name] else: target_column_name = data_args.target_column_name # here we differentiate between tasks with text as the target and classification tasks is_text_target = target_column_name in ("transcription", "translation") config_name = ".".join([task_name.split("-")[0], lang_id]) if training_args.do_train: raw_datasets["train"] = load_dataset( data_args.dataset_name, config_name, split=data_args.train_split_name, use_auth_token=data_args.use_auth_token, cache_dir=model_args.cache_dir, ) if data_args.audio_column_name not in raw_datasets[ "train"].column_names: raise ValueError( f"--audio_column_name '{data_args.audio_column_name}' not found in dataset '{data_args.dataset_name}'." " Make sure to set `--audio_column_name` to the correct audio column - one of" f" {', '.join(raw_datasets['train'].column_names)}.") if target_column_name not in raw_datasets["train"].column_names: raise ValueError( f"--target_column_name {target_column_name} not found in dataset '{data_args.dataset_name}'. " "Make sure to set `--target_column_name` to the correct text column - one of " f"{', '.join(raw_datasets['train'].column_names)}.") if data_args.max_train_samples is not None: raw_datasets["train"] = raw_datasets["train"].select( range(data_args.max_train_samples)) if training_args.do_eval: raw_datasets["eval"] = load_dataset( data_args.dataset_name, config_name, split=data_args.eval_split_name, use_auth_token=data_args.use_auth_token, cache_dir=model_args.cache_dir, ) if data_args.max_eval_samples is not None: raw_datasets["eval"] = raw_datasets["eval"].select( range(data_args.max_eval_samples)) if training_args.do_predict: raw_datasets["predict"] = load_dataset( data_args.dataset_name, config_name, split=data_args.predict_split_name, use_auth_token=data_args.use_auth_token, cache_dir=model_args.cache_dir, ) if data_args.max_predict_samples is not None: raw_datasets["predict"] = raw_datasets["predict"].select( range(data_args.max_predict_samples)) lang_list = next(iter(raw_datasets.values())).features["lang_id"].names if not is_text_target: label_list = next(iter( raw_datasets.values())).features[target_column_name].names num_labels = len(label_list) num_workers = data_args.preprocessing_num_workers lang_group = data_args.language_group if lang_group is not None: with training_args.main_process_first(desc="language group filter"): lang_group_id = next(iter( raw_datasets.values())).features["lang_group_id"].str2int( lang_group) raw_datasets = raw_datasets.filter( lambda lang_group: lang_group == lang_group_id, num_proc=num_workers, input_columns=["lang_group_id"], ) # 2. We remove some special characters from the datasets # that make training complicated and do not help in transcribing the speech # E.g. characters, such as `,` and `.` do not really have an acoustic characteristic # that could be easily picked up by the model chars_to_ignore_regex = (f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else None) def remove_special_characters(batch): if chars_to_ignore_regex is not None: batch["target_text"] = re.sub( chars_to_ignore_regex, "", batch[target_column_name]).lower() + " " else: batch["target_text"] = batch[target_column_name].lower() + " " return batch if is_text_target: with training_args.main_process_first( desc="dataset map special characters removal"): raw_datasets = raw_datasets.map( remove_special_characters, remove_columns=[target_column_name], desc="remove special characters from datasets", ) # save special tokens for tokenizer word_delimiter_token = data_args.word_delimiter_token unk_token = data_args.unk_token pad_token = data_args.pad_token # 3. Next, let's load the config as we might need it to create # the tokenizer config = AutoConfig.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_auth_token=data_args.use_auth_token) if is_text_target: # 4. (Optional, for ASR and translation) If no tokenizer file is defined, # we create the vocabulary of the model by extracting all unique characters from # the training and evaluation datasets # We need to make sure that only first rank saves vocabulary # make sure all processes wait until vocab is created tokenizer_name_or_path = model_args.tokenizer_name_or_path tokenizer_kwargs = {} if tokenizer_name_or_path is None: # save vocab in training output dir tokenizer_name_or_path = training_args.output_dir vocab_file = os.path.join(tokenizer_name_or_path, "vocab.json") with training_args.main_process_first(): if training_args.overwrite_output_dir and os.path.isfile( vocab_file): os.remove(vocab_file) with training_args.main_process_first( desc="dataset map vocabulary creation"): if not os.path.isfile(vocab_file): os.makedirs(tokenizer_name_or_path, exist_ok=True) vocab_dict = create_vocabulary_from_data( raw_datasets, word_delimiter_token=word_delimiter_token, unk_token=unk_token, pad_token=pad_token, ) # save vocab dict to be loaded into tokenizer with open(vocab_file, "w") as file: json.dump(vocab_dict, file) # if tokenizer has just been created # it is defined by `tokenizer_class` if present in config else by `model_type` if not config.is_encoder_decoder: tokenizer_kwargs = { "config": config if config.tokenizer_class is not None else None, "tokenizer_type": config.model_type if config.tokenizer_class is None else None, "unk_token": unk_token, "pad_token": pad_token, "word_delimiter_token": word_delimiter_token, } else: tokenizer_kwargs = {} # 5. Now we can instantiate the feature extractor, tokenizer and model # Note for distributed training, the .from_pretrained methods guarantee that only # one local process can concurrently download model & vocab. # load feature_extractor and tokenizer if is_text_target: tokenizer = AutoTokenizer.from_pretrained( tokenizer_name_or_path, use_auth_token=data_args.use_auth_token, **tokenizer_kwargs, ) feature_extractor = AutoFeatureExtractor.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_auth_token=data_args.use_auth_token) # adapt config # (speech translation requires pre-configured seq2seq models) if task_name != "covost2": config.update({ "feat_proj_dropout": model_args.feat_proj_dropout, "attention_dropout": model_args.attention_dropout, "hidden_dropout": model_args.hidden_dropout, "final_dropout": model_args.final_dropout, "mask_time_prob": model_args.mask_time_prob, "mask_time_length": model_args.mask_time_length, "mask_feature_prob": model_args.mask_feature_prob, "mask_feature_length": model_args.mask_feature_length, "gradient_checkpointing": training_args.gradient_checkpointing, "layerdrop": model_args.layerdrop, "ctc_zero_infinity": model_args.ctc_zero_infinity, "ctc_loss_reduction": model_args.ctc_loss_reduction, "activation_dropout": model_args.activation_dropout, }) if training_args.do_train: if is_text_target: config.pad_token_id = tokenizer.pad_token_id config.vocab_size = len(tokenizer) else: label_to_id = {v: i for i, v in enumerate(label_list)} config.label2id = label_to_id config.id2label = { id: label for label, id in label_to_id.items() } config.num_labels = num_labels # create model if target_column_name == "transcription": model = AutoModelForCTC.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir, config=config, use_auth_token=data_args.use_auth_token, ) elif config.is_encoder_decoder: model = AutoModelForSpeechSeq2Seq.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir, config=config, use_auth_token=data_args.use_auth_token, ) if model.config.decoder_start_token_id is None: raise ValueError( "Make sure that `config.decoder_start_token_id` is correctly defined" ) else: model = AutoModelForAudioClassification.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir, config=config, use_auth_token=data_args.use_auth_token, ) # freeze encoder if model_args.freeze_feature_encoder: model.freeze_feature_encoder() # 6. Now we preprocess the datasets including loading the audio, resampling and normalization # Thankfully, `datasets` takes care of automatically loading and resampling the audio, # so that we just need to set the correct target sampling rate and normalize the input # via the `feature_extractor` # make sure that dataset decodes audio with correct sampling rate dataset_sampling_rate = next(iter(raw_datasets.values())).features[ data_args.audio_column_name].sampling_rate if dataset_sampling_rate != feature_extractor.sampling_rate: raw_datasets = raw_datasets.cast_column( data_args.audio_column_name, datasets.features.Audio( sampling_rate=feature_extractor.sampling_rate)) # derive max & min input length for sample rate & max duration max_input_length = data_args.max_duration_in_seconds * feature_extractor.sampling_rate min_input_length = data_args.min_duration_in_seconds * feature_extractor.sampling_rate audio_column_name = data_args.audio_column_name # `phoneme_language` is only relevant if the model is fine-tuned on phoneme classification phoneme_language = data_args.phoneme_language # Preprocessing the datasets. # We need to read the audio files as arrays and tokenize the targets. def prepare_dataset(batch): # load audio sample = batch[audio_column_name] inputs = feature_extractor(sample["array"], sampling_rate=sample["sampling_rate"]) batch["input_values"] = inputs.input_values[0] batch["length"] = len(batch["input_values"]) # encode targets additional_kwargs = {} if phoneme_language is not None: additional_kwargs["phonemizer_lang"] = phoneme_language if is_text_target: batch["labels"] = tokenizer(batch["target_text"], **additional_kwargs).input_ids else: batch["labels"] = batch[target_column_name] batch["lang"] = batch["lang_id"] return batch with training_args.main_process_first(desc="dataset map preprocessing"): vectorized_datasets = raw_datasets.map( prepare_dataset, remove_columns=next(iter(raw_datasets.values())).column_names, num_proc=num_workers, desc="preprocess datasets", ) if training_args.do_train: def is_audio_in_length_range(length): return length > min_input_length and length < max_input_length # filter data that is shorter than min_input_length vectorized_datasets["train"] = vectorized_datasets["train"].filter( is_audio_in_length_range, num_proc=num_workers, input_columns=["length"], ) # 7. Next, we can prepare for the training step. # Let's use the appropriate XTREME-S evaluation metric, # instantiate a data collator and the trainer # Define evaluation metrics during training, *i.e.* word error rate, character error rate eval_metric = load_metric("xtreme_s", task_name) # for large datasets it is advised to run the preprocessing on a # single machine first with ``args.preprocessing_only`` since there will mostly likely # be a timeout when running the script in distributed mode. # In a second step ``args.preprocessing_only`` can then be set to `False` to load the # cached dataset if data_args.preprocessing_only: logger.info( f"Data preprocessing finished. Files cached at {vectorized_datasets.cache_files}" ) return def asr_logits_argmax(logits, labels): return logits.argmax(dim=-1) def compute_asr_metric(pred): pred.label_ids[pred.label_ids == -100] = tokenizer.pad_token_id pred_str = tokenizer.batch_decode(pred.predictions) # we do not want to group tokens when computing the metrics label_str = tokenizer.batch_decode(pred.label_ids, group_tokens=False) metric = eval_metric.compute(predictions=pred_str, references=label_str) return metric def compute_classification_metric(pred): pred_ids = np.argmax(pred.predictions, axis=1) metric = eval_metric.compute(predictions=pred_ids, references=pred.label_ids) return metric # Now save everything to be able to create a single processor later if is_main_process(training_args.local_rank): # save feature extractor, tokenizer and config feature_extractor.save_pretrained(training_args.output_dir) if is_text_target: tokenizer.save_pretrained(training_args.output_dir) config.save_pretrained(training_args.output_dir) # wait until configs are saved in the main process before loading the processor if training_args.local_rank != -1: torch.distributed.barrier() if is_text_target: processor = AutoProcessor.from_pretrained(training_args.output_dir) else: processor = AutoFeatureExtractor.from_pretrained( training_args.output_dir) # Instantiate custom data collator data_collator = SpeechDataCollatorWithPadding(processor=processor, pad_labels=is_text_target) # Initialize Trainer if target_column_name == "translation": trainer = Seq2SeqTrainer( model=model, data_collator=data_collator, args=training_args, preprocess_logits_for_metrics=asr_logits_argmax if training_args.predict_with_generate else None, compute_metrics=compute_asr_metric if training_args.predict_with_generate else None, train_dataset=vectorized_datasets["train"] if training_args.do_train else None, eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None, tokenizer=feature_extractor, ) else: trainer = Trainer( model=model, data_collator=data_collator, args=training_args, preprocess_logits_for_metrics=asr_logits_argmax if is_text_target else None, compute_metrics=compute_asr_metric if is_text_target else compute_classification_metric, train_dataset=vectorized_datasets["train"] if training_args.do_train else None, eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None, tokenizer=feature_extractor, ) # 8. Finally, we can start training # Training if training_args.do_train: # use last checkpoint if exist if last_checkpoint is not None: checkpoint = last_checkpoint elif os.path.isdir(model_args.model_name_or_path): checkpoint = model_args.model_name_or_path else: checkpoint = None train_result = trainer.train(resume_from_checkpoint=checkpoint) trainer.save_model() metrics = train_result.metrics max_train_samples = (data_args.max_train_samples if data_args.max_train_samples is not None else len(vectorized_datasets["train"])) metrics["train_samples"] = min(max_train_samples, len(vectorized_datasets["train"])) trainer.log_metrics("train", metrics) trainer.save_metrics("train", metrics) trainer.save_state() # Evaluation on the test set results = {} if training_args.do_predict: logger.info( f"*** Evaluating on the `{data_args.predict_split_name}` set ***") if data_args.per_lang_metrics: # separate the `test` dataset into language-specific subsets and compute metrics for each of them metrics = {} average_metrics = defaultdict(list) for lang_id in range(len(lang_list)): lang_name = lang_list[lang_id] with training_args.main_process_first( desc="per-language dataset filter"): lang_dataset = vectorized_datasets["predict"].filter( lambda lang: lang == lang_id, num_proc=num_workers, input_columns=["lang"], ) lang_metrics = trainer.evaluate(lang_dataset) redundant_metrics = [ "eval_runtime", "eval_samples_per_second", "eval_steps_per_second", "eval_epoch" ] for metric_name, value in lang_metrics.items(): average_metrics[metric_name].append(value) if metric_name not in redundant_metrics: metrics[f"{metric_name}_{lang_name}"] = value for metric_name, value in average_metrics.items(): metrics[metric_name] = np.mean(value) else: metrics = trainer.evaluate(vectorized_datasets["predict"]) max_predict_samples = (data_args.max_predict_samples if data_args.max_predict_samples is not None else len(vectorized_datasets["predict"])) metrics["predict_samples"] = min(max_predict_samples, len(vectorized_datasets["predict"])) # make sure that the `predict` metrics end up in the log history for the model card trainer.log(OrderedDict(sorted(metrics.items()))) trainer.log_metrics("predict", metrics) trainer.save_metrics("predict", metrics) # Write model card and (optionally) push to hub kwargs = { "finetuned_from": model_args.model_name_or_path, "tasks": task_name, "tags": [task_name, data_args.dataset_name], "dataset_args": (f"Config: {config_name}, Training split: {data_args.train_split_name}, Eval split:" f" {data_args.eval_split_name}, Predict split: {data_args.predict_split_name}" ), "dataset": f"{data_args.dataset_name.upper()} - {config_name.upper()}", "language": data_args.language, } if training_args.push_to_hub: trainer.push_to_hub(**kwargs) else: trainer.create_model_card(**kwargs) return results
def __init__(self, config) -> None: logger = logging.getLogger(__name__) if config.get("output") is not None: os.makedirs(config.get("output"), exist_ok=True) # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. accelerator = Accelerator(cpu=config["cpu"]) # Make one log on every process with the configuration for debugging. logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO, ) logger.info(accelerator.state) # Setup logging, we only want one process per machine to log things on the screen. # accelerator.is_local_main_process is only True for one process per machine. logger.setLevel(logging.INFO if accelerator. is_local_main_process else logging.ERROR) if accelerator.is_local_main_process: transformers.utils.logging.set_verbosity_info() else: transformers.utils.logging.set_verbosity_error() # If passed along, set the training seed now. if config.get("seed") is not None: set_seed(config.get("seed")) # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) # or just provide the name of one of the public datasets for token classification task available on the hub at https://huggingface.co/datasets/ # (the dataset will be downloaded automatically from the datasets Hub). # # For CSV/JSON files, this script will use the column called 'tokens' or the first column if no column called # 'tokens' is found. You can easily tweak this behavior (see below). # # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. file_format = config.get("file_format") input_dir = Path(config.get("input")) if file_format in ["bio", "bies", "conll"]: train_file = input_dir / "train.txt" dev_file = input_dir / "dev.txt" else: train_file = input_dir / "train.json" dev_file = input_dir / "dev.json" tokenizer = AutoTokenizer.from_pretrained(config.get("model_path"), use_fast=True) train_dataset = NerBertDataset(train_file, tokenizer, config.get("max_length"), file_format=file_format, do_lower=config.get("do_lower_case")) dev_dataset = NerBertDataset(dev_file, tokenizer, config.get("max_length"), file_format=file_format, do_lower=config.get("do_lower_case")) if file_format == "split": dev_contents = dev_dataset.get_contents() dev_offset_lists = dev_dataset.get_offset_lists() # In the event the labels are not a `Sequence[ClassLabel]`, we will need to go through the dataset to get the # unique labels. label_file = Path(config.get("output")) / "label.txt" train_dataset.save_label(label_file) label_list = train_dataset.get_label_list() label_to_id = train_dataset.get_label_to_id() print(label_to_id) num_labels = len(label_list) # Load pretrained model and tokenizer # # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. pretrained_config = AutoConfig.from_pretrained( config.get("model_path"), num_labels=num_labels) model_name = config.get("name").lower() if model_name == "bert_crf": model_func = BertCrf elif model_name == "bert_softmax": model_func = BertSoftmax elif model_name == "bert_lstm_crf": model_func = BertLstmCrf elif model_name == "bert_biaffine": model_func = BertBiaffine elif model_name == "albert_tiny_crf": model_func = AlbertTinyCrf elif model_name == "albert_tiny_softmax": model_func = AlbertTinySoftmax else: raise ValueError pretrained_config.loss_name = config.get("loss_name") model = model_func.from_pretrained(config.get("model_path"), config=pretrained_config) # model.resize_token_embeddings(len(tokenizer)) # Preprocessing the raw_datasets. # First we tokenize all the texts. train_dataloader = NerBertDataLoader( train_dataset, batch_size=config.get("per_device_train_batch_size"), shuffle=True, drop_last=False) dev_dataloader = NerBertDataLoader( dev_dataset, batch_size=config.get("per_device_dev_batch_size"), shuffle=False, drop_last=False) # Optimizer # Split weights in two groups, one with weight decay and the other not. no_decay = ["bias", "LayerNorm.weight", "LayerNorm.bias"] weight_decay = config.get("weight_decay") model_type = config.get("model_type") plm_lr = config.get("plm_lr") not_plm_lr = config.get("not_plm_lr") optimizer_grouped_parameters = [ { "params": [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) and model_type not in n ], "weight_decay": weight_decay, "lr": not_plm_lr }, { "params": [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) and model_type not in n ], "weight_decay": 0.0, "lr": not_plm_lr }, { "params": [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) and model_type in n ], "weight_decay": weight_decay, "lr": plm_lr }, { "params": [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) and model_type in n ], "weight_decay": 0.0, "lr": plm_lr }, ] optimizer = AdamW(optimizer_grouped_parameters, lr=plm_lr) # Use the device given by the `accelerator` object. device = accelerator.device model.to(device) # Prepare everything with our `accelerator`. model, optimizer, train_dataloader, dev_dataloader = accelerator.prepare( model, optimizer, train_dataloader, dev_dataloader) # Note -> the training dataloader needs to be prepared before we grab his length below (cause its length will be # shorter in multiprocess) # Scheduler and math around the number of training steps. num_update_steps_per_epoch = math.ceil( len(train_dataloader) / config.get("gradient_accumulation_steps")) config["max_train_steps"] = config.get( "num_train_epochs") * num_update_steps_per_epoch lr_scheduler = get_scheduler( name=config.get("lr_scheduler_type"), optimizer=optimizer, num_warmup_steps=config.get("num_warmup_steps"), num_training_steps=config.get("max_train_steps"), ) # Train! total_batch_size = config.get( "per_device_train_batch_size" ) * accelerator.num_processes * config.get( "gradient_accumulation_steps") logger.info("***** Running training *****") logger.info(f" Num examples = {len(train_dataset)}") logger.info(f" Num Epochs = {config.get('num_train_epochs')}") logger.info( f" Instantaneous batch size per device = {config.get('per_device_train_batch_size')}" ) logger.info( f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}" ) logger.info( f" Gradient Accumulation steps = {config.get('gradient_accumulation_steps')}" ) logger.info( f" Total optimization steps = {config.get('max_train_steps')}") # Only show the progress bar once on each machine. progress_bar = tqdm(range(config.get("max_train_steps")), disable=not accelerator.is_local_main_process) writer = SummaryWriter(config.get("summary")) completed_steps = 0 best_f1 = 0 for epoch in range(config.get("num_train_epochs")): model.train() train_loss = 0.0 dev_loss = 0.0 for step, batch in enumerate(train_dataloader): inputs = { "input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3], "label_mask": batch[4], "input_len": batch[5] } outputs = model(**inputs) loss = outputs loss = loss / config.get("gradient_accumulation_steps") accelerator.backward(loss) train_loss += loss.item() if step % config.get( "gradient_accumulation_steps") == 0 or step == len( train_dataloader) - 1: optimizer.step() lr_scheduler.step() optimizer.zero_grad() progress_bar.update(1) completed_steps += 1 if completed_steps >= config.get("max_train_steps"): break if step == 10: break model.eval() device_type = device.type decode_type = config.get("decode_type") pred_lists = list() gold_lists = list() for step, batch in enumerate(dev_dataloader): with torch.no_grad(): inputs = { "input_ids": batch[0], "attention_mask": batch[1], "label_mask": batch[4], "input_len": batch[5] } outputs = model(**inputs) inputs = { "input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3], "label_mask": batch[4], "input_len": batch[5] } loss = model(**inputs) dev_loss += loss.item() labels = batch[3] predictions_gathered = accelerator.gather(outputs) labels_gathered = accelerator.gather(labels) preds, golds = get_labels(predictions_gathered, labels_gathered, label_list, masks=batch[6], decode_type=decode_type, device=device_type) pred_lists += preds gold_lists += golds for token, gold in zip(batch[0], golds): print("+++++++++++++++++++++") print(tokenizer.convert_ids_to_tokens(list(token))) print(get_interval(gold)) if step == 5: exit(1) if file_format == "split": new_pred_lists = list() new_gold_lists = list() start_idx = 0 for dev_content, dev_offset_list in zip( dev_contents, dev_offset_lists): end_idx = start_idx + len(dev_offset_list) pred_list = recover(dev_content, pred_lists[start_idx:end_idx], dev_offset_list) gold_list = recover(dev_content, gold_lists[start_idx:end_idx], dev_offset_list) new_pred_lists.append(pred_list) new_gold_lists.append(gold_list) start_idx = end_idx pred_lists = new_pred_lists gold_lists = new_gold_lists accelerator.print(f"\nepoch: {epoch}") f1, table = get_f1(gold_lists, pred_lists, format=file_format) writer.add_scalars("f1", {"dev": round(100 * f1, 2)}, epoch + 1) if file_format != "biaffine": train_loss /= 100.0 dev_loss /= 10.0 else: train_loss *= 10.0 dev_loss *= 10.0 writer.add_scalars('loss', { "train": round(train_loss, 2), "dev": round(dev_loss, 2) }, epoch + 1) if f1 > best_f1: best_f1 = f1 print(table) output_model = config.get("output") output_model_file = os.path.join(output_model, "pytorch_model.bin") torch.save(model.state_dict(), output_model_file) output_config_file = os.path.join(output_model, "config.json") with open(output_config_file, 'w') as f: f.write(model.config.to_json_string())
def main(model_args, data_args, training_args): last_checkpoint = None if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome." ) elif last_checkpoint is not None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN) # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) # Set the verbosity to info of the Transformers logger (on main process only): if is_main_process(training_args.local_rank): transformers.utils.logging.set_verbosity_info() # Set seed before initializing model. set_seed(training_args.seed) # Get the datasets: eval_dataset = datasets.load_dataset(CORPORA_PATH, split="test", cache_dir=model_args.cache_dir) processor = CustomWav2Vec2Processor.from_pretrained(model_args.model_name_or_path) model = Wav2VecClassifierModel.from_pretrained( model_args.model_name_or_path, attention_dropout=0.01, hidden_dropout=0.01, feat_proj_dropout=0.0, mask_time_prob=0.05, layerdrop=0.01, gradient_checkpointing=True, ) if data_args.max_val_samples is not None: max_val_samples = min(data_args.max_val_samples, len(eval_dataset)) eval_dataset = eval_dataset.select(range(max_val_samples)) # Preprocessing the datasets. # We need to read the aduio files as arrays and tokenize the targets. def speech_file_to_array_fn(batch): start = 0 stop = SECONDS_STOP srate = S_RATE speech_array, sampling_rate = torchaudio.load(batch["file"]) speech_array = speech_array[0].numpy()[:stop * sampling_rate] batch["speech"] = librosa.resample(np.asarray(speech_array), sampling_rate, srate) batch["sampling_rate"] = srate batch["parent"] = batch["label"] return batch eval_dataset = eval_dataset.map( speech_file_to_array_fn, remove_columns=eval_dataset.column_names, num_proc=data_args.preprocessing_num_workers ) def prepare_dataset(batch): # check that all files have the correct sampling rate assert ( len(set(batch["sampling_rate"])) == 1 ), f"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}." batch["input_values"] = processor(batch["speech"], sampling_rate=batch["sampling_rate"][0]).input_values batch["labels"] = batch["parent"] return batch eval_dataset = eval_dataset.map( prepare_dataset, remove_columns=eval_dataset.column_names, batch_size=training_args.per_device_train_batch_size, batched=True, num_proc=data_args.preprocessing_num_workers, ) from sklearn.metrics import classification_report, confusion_matrix def map_multi_to_bin(indexes): return np.where(indexes <= 2, 0, 1) def compute_metrics(pred): label_idx = LABEL_IDX label_names = LABEL_NAMES labels = pred.label_ids.argmax(-1) preds = pred.predictions.argmax(-1) labels = map_multi_to_bin(labels) preds = map_multi_to_bin(preds) acc = accuracy_score(labels, preds) f1 = f1_score(labels, preds, average='macro') report = classification_report(y_true=labels, y_pred=preds, labels=label_idx, target_names=label_names) matrix = confusion_matrix(y_true=labels, y_pred=preds) print(report) print(matrix) wandb.log( {"conf_mat": wandb.plot.confusion_matrix(probs=None, y_true=labels, preds=preds, class_names=label_names)}) wandb.log( {"precision_recall": wandb.plot.pr_curve(y_true=labels, y_probas=pred.predictions, labels=label_names)}) return {"accuracy": acc, "f1_score": f1} wandb.init(name=training_args.output_dir, config=training_args) # Data collator data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True) # Initialize our Trainer trainer = CTCTrainer( model=model, data_collator=data_collator, args=training_args, compute_metrics=compute_metrics, train_dataset=None, eval_dataset=eval_dataset if training_args.do_eval else None, tokenizer=processor.feature_extractor, ) # Evaluation results = {} if training_args.do_eval: logger.info("*** Evaluate ***") metrics = trainer.evaluate() max_val_samples = data_args.max_val_samples if data_args.max_val_samples is not None else len(eval_dataset) metrics["eval_samples"] = min(max_val_samples, len(eval_dataset)) trainer.log_metrics("eval", metrics) trainer.save_metrics("eval", metrics) return results
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) log_level = training_args.get_process_log_level() logger.setLevel(log_level) datasets.utils.logging.set_verbosity(log_level) transformers.utils.logging.set_verbosity(log_level) transformers.utils.logging.enable_default_handler() transformers.utils.logging.enable_explicit_format() # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") # Detecting last checkpoint. last_checkpoint = None if os.path.isdir( training_args.output_dir ) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is None and len(os.listdir( training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome.") elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) # Set seed before initializing model. set_seed(training_args.seed) # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below) # or specify a GLUE benchmark task (the dataset will be downloaded automatically from the datasets Hub). # # For JSON files, this script will use the `question` column for the input question and `table` column for the corresponding table. # # If the CSVs/JSONs contain only one non-label column, the script does single sentence classification on this # single column. You can easily tweak this behavior (see below) # # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. if data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. raw_datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir) else: # Loading a dataset from your local files. # CSV/JSON training and evaluation files are needed. data_files = { "train": data_args.train_file, "validation": data_args.validation_file } # Get the test dataset: you can provide your own CSV/JSON test file (see below) # when you use `do_predict` without specifying a GLUE benchmark task. if training_args.do_predict: if data_args.test_file is not None: train_extension = data_args.train_file.split(".")[-1] test_extension = data_args.test_file.split(".")[-1] assert ( test_extension == train_extension ), "`test_file` should have the same extension (csv or json) as `train_file`." data_files["test"] = data_args.test_file else: raise ValueError( "Need either a GLUE task or a test file for `do_predict`.") for key in data_files.keys(): logger.info(f"load a local file for {key}: {data_files[key]}") if data_args.train_file.endswith(".csv"): # Loading a dataset from local csv files raw_datasets = load_dataset("csv", data_files=data_files, cache_dir=model_args.cache_dir) else: # Loading a dataset from local json files raw_datasets = load_dataset("json", data_files=data_files, cache_dir=model_args.cache_dir) # See more about loading any type of standard or custom dataset at # https://huggingface.co/docs/datasets/loading_datasets.html. # Labels label_list = raw_datasets["train"].features["label"].names num_labels = len(label_list) # Load pretrained model and tokenizer # # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, num_labels=num_labels, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) # load tapex tokenizer tokenizer = TapexTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, add_prefix_space=True, ) model = BartForSequenceClassification.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) # Padding strategy if data_args.pad_to_max_length: padding = "max_length" else: # We will pad later, dynamically at batch creation, to the max sequence length in each batch padding = False # Some models have set the order of the labels to use, so let's make sure we do use it. model.config.label2id = {"Refused": 0, "Entailed": 1} model.config.id2label = {0: "Refused", 1: "Entailed"} if data_args.max_seq_length > tokenizer.model_max_length: logger.warning( f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." ) max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) def preprocess_tabfact_function(examples): # Tokenize the texts def _convert_table_text_to_pandas(_table_text): """Runs the structured pandas table object for _table_text. An example _table_text can be: round#clubs remaining\nfirst round#156\n """ _table_content = [ _table_row.split("#") for _table_row in _table_text.strip("\n").split("\n") ] _table_pd = pd.DataFrame.from_records(_table_content[1:], columns=_table_content[0]) return _table_pd questions = examples["statement"] tables = list( map(_convert_table_text_to_pandas, examples["table_text"])) result = tokenizer(tables, questions, padding=padding, max_length=max_seq_length, truncation=True) result["label"] = examples["label"] return result with training_args.main_process_first(desc="dataset map pre-processing"): raw_datasets = raw_datasets.map( preprocess_tabfact_function, batched=True, load_from_cache_file=not data_args.overwrite_cache, desc="Running tokenizer on dataset", ) if training_args.do_train: if "train" not in raw_datasets: raise ValueError("--do_train requires a train dataset") train_dataset = raw_datasets["train"] if data_args.max_train_samples is not None: train_dataset = train_dataset.select( range(data_args.max_train_samples)) if training_args.do_eval: if "validation" not in raw_datasets and "validation_matched" not in raw_datasets: raise ValueError("--do_eval requires a validation dataset") eval_dataset = raw_datasets["validation"] if data_args.max_eval_samples is not None: eval_dataset = eval_dataset.select( range(data_args.max_eval_samples)) if training_args.do_predict or data_args.test_file is not None: if "test" not in raw_datasets and "test_matched" not in raw_datasets: raise ValueError("--do_predict requires a test dataset") predict_dataset = raw_datasets["test"] if data_args.max_predict_samples is not None: predict_dataset = predict_dataset.select( range(data_args.max_predict_samples)) # Log a few random samples from the training set: if training_args.do_train: for index in random.sample(range(len(train_dataset)), 3): logger.info( f"Sample {index} of the training set: {train_dataset[index]}.") # You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a # predictions and label_ids field) and has to return a dictionary string to float. def compute_metrics(p: EvalPrediction): preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions preds = np.argmax(preds, axis=1) return { "accuracy": (preds == p.label_ids).astype(np.float32).mean().item() } # Data collator will default to DataCollatorWithPadding, so we change it if we already did the padding. if data_args.pad_to_max_length: data_collator = default_data_collator elif training_args.fp16: data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8) else: data_collator = None # Initialize our Trainer trainer = Trainer( model=model, args=training_args, train_dataset=train_dataset if training_args.do_train else None, eval_dataset=eval_dataset if training_args.do_eval else None, compute_metrics=compute_metrics, tokenizer=tokenizer, data_collator=data_collator, ) # Training if training_args.do_train: checkpoint = None if training_args.resume_from_checkpoint is not None: checkpoint = training_args.resume_from_checkpoint elif last_checkpoint is not None: checkpoint = last_checkpoint train_result = trainer.train(resume_from_checkpoint=checkpoint) metrics = train_result.metrics max_train_samples = (data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)) metrics["train_samples"] = min(max_train_samples, len(train_dataset)) trainer.save_model() # Saves the tokenizer too for easy upload trainer.log_metrics("train", metrics) trainer.save_metrics("train", metrics) trainer.save_state() # Evaluation if training_args.do_eval: logger.info("*** Evaluate ***") metrics = trainer.evaluate(eval_dataset=eval_dataset) max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len( eval_dataset) metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) trainer.log_metrics("eval", metrics) trainer.save_metrics("eval", metrics) if training_args.do_predict: logger.info("*** Predict ***") # Removing the `label` columns because it contains -1 and Trainer won't like that. predict_dataset = predict_dataset.remove_columns("label") predictions = trainer.predict(predict_dataset, metric_key_prefix="predict").predictions predictions = np.argmax(predictions, axis=1) output_predict_file = os.path.join(training_args.output_dir, "predict_results_tabfact.txt") if trainer.is_world_process_zero(): with open(output_predict_file, "w") as writer: logger.info("***** Predict Results *****") writer.write("index\tprediction\n") for index, item in enumerate(predictions): item = label_list[item] writer.write(f"{index}\t{item}\n") kwargs = { "finetuned_from": model_args.model_name_or_path, "tasks": "text-classification" } if training_args.push_to_hub: trainer.push_to_hub(**kwargs) else: trainer.create_model_card(**kwargs)
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() if ( os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir ): raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty." "Use --overwrite_output_dir to overcome." ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO if is_main_process(training_args.local_rank) else logging.WARN, ) # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) # Set the verbosity to info of the Transformers logger (on main process only): if is_main_process(training_args.local_rank): transformers.utils.logging.set_verbosity_info() logger.info("Training/evaluation parameters %s", training_args) # Set seed before initializing model. set_seed(training_args.seed) # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ # (the dataset will be downloaded automatically from the datasets Hub). # # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called # 'text' is found. You can easily tweak this behavior (see below). # # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. if data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name) else: data_files = {} if data_args.train_file is not None: data_files["train"] = data_args.train_file if data_args.validation_file is not None: data_files["validation"] = data_args.train_file extension = data_args.train_file.split(".")[-1] if extension == "txt": extension = "text" datasets = load_dataset(extension, data_files=data_files) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. # Load pretrained model and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. if model_args.config_name: config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir) elif model_args.model_name_or_path: config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir) else: config = XLNetConfig() logger.warning("You are instantiating a new config instance from scratch.") if model_args.tokenizer_name: tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer ) elif model_args.model_name_or_path: tokenizer = AutoTokenizer.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer ) else: raise ValueError( "You are instantiating a new tokenizer from scratch. This is not supported by this script." "You can do it from another script, save it, and load it from here, using --tokenizer_name." ) if model_args.model_name_or_path: model = XLNetLMHeadModel.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, ) else: logger.info("Training new model from scratch") model = XLNetLMHeadModel.from_config(config) model.resize_token_embeddings(len(tokenizer)) # Preprocessing the datasets. # First we tokenize all the texts. if training_args.do_train: column_names = datasets["train"].column_names else: column_names = datasets["validation"].column_names text_column_name = "text" if "text" in column_names else column_names[0] if data_args.line_by_line: # When using line_by_line, we just tokenize each nonempty line. padding = "max_length" if data_args.pad_to_max_length else False def tokenize_function(examples): # Remove empty lines examples["text"] = [line for line in examples["text"] if len(line) > 0 and not line.isspace()] return tokenizer(examples["text"], padding=padding, truncation=True, max_length=data_args.max_seq_length) tokenized_datasets = datasets.map( tokenize_function, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=[text_column_name], load_from_cache_file=not data_args.overwrite_cache, ) else: # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts. def tokenize_function(examples): return tokenizer(examples[text_column_name]) tokenized_datasets = datasets.map( tokenize_function, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=[text_column_name], load_from_cache_file=not data_args.overwrite_cache, ) if data_args.max_seq_length is None: max_seq_length = tokenizer.model_max_length else: if data_args.max_seq_length > tokenizer.model_max_length: logger.warn( f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." ) max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) # Main data processing function that will concatenate all texts from our dataset and generate chunks of # max_seq_length. def group_texts(examples): # Concatenate all texts. concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} total_length = len(concatenated_examples[list(examples.keys())[0]]) # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can # customize this part to your needs. total_length = (total_length // max_seq_length) * max_seq_length # Split by chunks of max_len. result = { k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)] for k, t in concatenated_examples.items() } return result # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value # might be slower to preprocess. # # To speed up this part, we use multiprocessing. See the documentation of the map method for more information: # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map tokenized_datasets = tokenized_datasets.map( group_texts, batched=True, num_proc=data_args.preprocessing_num_workers, load_from_cache_file=not data_args.overwrite_cache, ) # Data collator data_collator = DataCollatorForPermutationLanguageModeling( tokenizer=tokenizer, plm_probability=data_args.plm_probability, max_span_length=data_args.max_span_length, ) # Initialize our Trainer trainer = Trainer( model=model, args=training_args, train_dataset=tokenized_datasets["train"] if training_args.do_train else None, eval_dataset=tokenized_datasets["validation"] if training_args.do_eval else None, tokenizer=tokenizer, data_collator=data_collator, ) # Training if training_args.do_train: trainer.train( model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None ) trainer.save_model() # Saves the tokenizer too for easy upload # Evaluation results = {} if training_args.do_eval: logger.info("*** Evaluate ***") eval_output = trainer.evaluate() perplexity = math.exp(eval_output["eval_loss"]) results["perplexity"] = perplexity output_eval_file = os.path.join(training_args.output_dir, "eval_results_plm.txt") if trainer.is_world_process_zero(): with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key, value in results.items(): logger.info(f" {key} = {value}") writer.write(f"{key} = {value}\n") return results
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) if (os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir): raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty." "Use --overwrite_output_dir to overcome.") # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO if is_main_process(training_args.local_rank) else logging.WARN, ) # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) # Set the verbosity to info of the Transformers logger (on main process only): if is_main_process(training_args.local_rank): transformers.utils.logging.set_verbosity_info() transformers.utils.logging.enable_default_handler() transformers.utils.logging.enable_explicit_format() logger.info("Training/evaluation parameters %s", training_args) # Set seed before initializing model. set_seed(training_args.seed) # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ # (the dataset will be downloaded automatically from the datasets Hub). # # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called # 'text' is found. You can easily tweak this behavior (see below). # # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. data_files = {} if data_args.train_file is not None: data_files["train"] = data_args.train_file if data_args.validation_file is not None: data_files["validation"] = data_args.validation_file extension = data_args.train_file.split(".")[-1] if extension == "txt": extension = "text" datasets = load_dataset(extension, data_files=data_files) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. # Load pretrained model and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. if model_args.config_name: config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir) elif model_args.model_name_or_path: config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir) else: config = CONFIG_MAPPING[model_args.model_type]() logger.warning( "You are instantiating a new config instance from scratch.") if model_args.tokenizer_name: tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer) elif model_args.model_name_or_path: tokenizer = AutoTokenizer.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer) else: raise ValueError( "You are instantiating a new tokenizer from scratch. This is not supported by this script." "You can do it from another script, save it, and load it from here, using --tokenizer_name." ) if model_args.model_name_or_path: model = AutoModelForMaskedLM.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, ) else: logger.info("Training new model from scratch") model = AutoModelForMaskedLM.from_config(config) model.resize_token_embeddings(len(tokenizer)) # Preprocessing the datasets. # First we tokenize all the texts. if training_args.do_train: column_names = datasets["train"].column_names else: column_names = datasets["validation"].column_names text_column_name = "text" if "text" in column_names else column_names[0] padding = "max_length" if data_args.pad_to_max_length else False def tokenize_function(examples): # Remove empty lines examples["text"] = [ line for line in examples["text"] if len(line) > 0 and not line.isspace() ] return tokenizer(examples["text"], padding=padding, truncation=True, max_length=data_args.max_seq_length) tokenized_datasets = datasets.map( tokenize_function, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=[text_column_name], load_from_cache_file=not data_args.overwrite_cache, ) # Add the chinese references if provided if data_args.train_ref_file is not None: tokenized_datasets["train"] = add_chinese_references( tokenized_datasets["train"], data_args.train_ref_file) if data_args.valid_ref_file is not None: tokenized_datasets["validation"] = add_chinese_references( tokenized_datasets["validation"], data_args.validation_ref_file) # Data collator # This one will take care of randomly masking the tokens. data_collator = DataCollatorForWholeWordMask( tokenizer=tokenizer, mlm_probability=data_args.mlm_probability) # Initialize our Trainer trainer = Trainer( model=model, args=training_args, train_dataset=tokenized_datasets["train"] if training_args.do_train else None, eval_dataset=tokenized_datasets["validation"] if training_args.do_eval else None, tokenizer=tokenizer, data_collator=data_collator, ) # Training if training_args.do_train: trainer.train(model_path=model_args.model_name_or_path if os.path. isdir(model_args.model_name_or_path) else None) trainer.save_model() # Saves the tokenizer too for easy upload # Evaluation results = {} if training_args.do_eval: logger.info("*** Evaluate ***") eval_output = trainer.evaluate() perplexity = math.exp(eval_output["eval_loss"]) results["perplexity"] = perplexity output_eval_file = os.path.join(training_args.output_dir, "eval_results_mlm_wwm.txt") if trainer.is_world_process_zero(): with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key, value in results.items(): logger.info(f" {key} = {value}") writer.write(f"{key} = {value}\n") return results
import transformers from pathlib import Path import os import json import torch from transformers import (AutoModelForSequenceClassification, AutoTokenizer, AutoModelForQuestionAnswering, AutoModelForTokenClassification, AutoConfig) from transformers import set_seed """ This function, save the checkpoint, config file along with tokenizer config and vocab files of a transformer model of your choice. """ print('Transformers version', transformers.__version__) set_seed(1) device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') def transformers_model_dowloader(mode, pretrained_model_name, num_labels, do_lower_case, max_length, torchscript): print("Download model and tokenizer", pretrained_model_name) #loading pre-trained model and tokenizer if mode == "sequence_classification": config = AutoConfig.from_pretrained(pretrained_model_name, num_labels=num_labels, torchscript=torchscript) model = AutoModelForSequenceClassification.from_pretrained( pretrained_model_name, config=config) tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name, do_lower_case=do_lower_case) elif mode == "question_answering": config = AutoConfig.from_pretrained(pretrained_model_name,
def main(): parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TrainingArguments)) model_args, data_args, training_args = parser.parse_args_into_dataclasses() model_args: ModelArguments data_args: DataTrainingArguments training_args: TrainingArguments if training_args.fp16: try: import apex apex.amp.register_half_function(torch, "einsum") except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) # if training_args.do_eval and not training_args.do_train and not data_args.predictions_folder: # raise ValueError("Supply predictions folder destination to save the predictions!") logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN, ) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", training_args.local_rank, training_args.device, training_args.n_gpu, bool(training_args.local_rank != -1), training_args.fp16, ) logger.debug(model_args) logger.debug(training_args) logger.debug(data_args) # raise NotImplementedError if (os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir): raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " f"Use --overwrite_output_dir to overcome.") # Set seed set_seed(training_args.seed) if training_args.local_rank not in [-1, 0]: # Make sure only the first process in distributed training will download model & vocab torch.distributed.barrier() tokenizer = get_tokenizer(model_args.model_name_or_path, do_lower_case=False) if data_args.model_parallel == 4: model = T5ForConditionalGeneration4WayParallel.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir, ) elif data_args.model_parallel == 2: model = T5ForConditionalGeneration2WayParallel.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir, ) elif data_args.model_parallel is None: model = T5ForConditionalGeneration.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir, ) else: raise ValueError( f"Can only have no, 2way or 4way model parallelism! (expected: {data_args.model_parallel})" ) if training_args.local_rank == 0: # Make sure only the first process in distributed training will download model & vocab torch.distributed.barrier() # Get datasets if training_args.do_eval and training_args.local_rank in [-1, 0]: eval_dataset, examples = get_dataset(data_args.eval_file_path, tokenizer, data_args, evaluate=True) else: eval_dataset, examples = None, None # Training if training_args.do_train: if training_args.local_rank in [-1, 0]: train_dataset, _ = get_dataset(data_args.train_file_path, tokenizer, data_args) torch.save(train_dataset, 'features.bin') else: torch.distributed.barrier() train_dataset = None if training_args.local_rank == 0: torch.distributed.barrier() else: train_dataset = torch.load('features.bin') # Initialize our Trainer if data_args.model_parallel: trainer = MyTrainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset, data_collator=collate_training, prediction_loss_only=True) model.set_parallel() else: trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset, data_collator=collate_training, prediction_loss_only=True) trainer.train(model_path=model_args.model_name_or_path if os.path. isdir(model_args.model_name_or_path) else None) trainer.save_model() # For convenience, we also re-save the tokenizer to the same directory, # so that you can share your model easily on huggingface.co/models =) if trainer.is_world_master(): tokenizer.save_pretrained(training_args.output_dir) # Evaluation if training_args.do_eval and training_args.local_rank in [-1, 0]: if training_args.do_train: model_path = os.path.basename(training_args.output_dir) else: model_path = os.path.basename(model_args.model_name_or_path) checkpoints = [training_args.output_dir] if data_args.eval_all_checkpoints and training_args.do_train: logger.info( "Loading checkpoints saved during training for evaluation") checkpoints = list( os.path.dirname(c) for c in sorted( glob.glob(training_args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))) # logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce model loading logs logger.info(f"Evaluate the following checkpoints: {checkpoints}") results = {} logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) for checkpoint in checkpoints: # Reload the model global_step = checkpoint.split("-")[-1] if not all(s in string.digits for s in global_step): global_step = '' # no model parallelism here (didnt check model.generate) model = T5ForConditionalGeneration.from_pretrained(checkpoint) device = torch.device("cuda" if torch.cuda.is_available() and not training_args.no_cuda else "cpu") model.to(device) model_str = f'{model_path}-{global_step}' if global_step else model_path # Note that DistributedSampler samples click.echo( f"Generating predictions for model {click.style(model_str, fg='blue')}, " f"running on {click.style(str(training_args.device), fg='green')}" ) predictions = generate_predictions(eval_dataset, examples, model, tokenizer, training_args) final_metric = squad_evaluate(examples, predictions) if is_wandb_available(): if training_args.do_train: step = int( global_step) if global_step else trainer.global_step else: step = 0 # for now WANDB cannot 'log back in time' wandb.log(final_metric, step=step) print(f"GLOBAL STEP: {global_step}") result = dict( (k + ("_{}".format(global_step) if global_step else '_final'), v) for k, v in final_metric.items()) logger.info(f"Result for {model_str}: {result}") results.update(result) # sort results by best checkpoint_scores = { c.split('_')[-1]: v for c, v in results.items() if any(c.endswith(digit) for digit in string.digits) and c.startswith('exact') } sorted_checkpoint_scores = { k: v for k, v in sorted(checkpoint_scores.items(), key=lambda k_v: k_v[1], reverse=True) } best_cp = next((c for c, v in sorted_checkpoint_scores.items() if v > results['exact_final']), None) if best_cp: click.echo(f"Best checkpoint is: {best_cp}") # copy over best results best_cp_folder = f'checkpoint-{best_cp}' click.echo( f"Copying over files: from {os.path.join(training_args.output_dir, best_cp_folder)} " f"to {training_args.output_dir}") files_to_copy = glob.glob( os.path.join(training_args.output_dir, best_cp_folder, '*')) for file in files_to_copy: shutil.copy(file, training_args.output_dir) else: click.echo("best checkpoint is the last step...") # remove 'kek'points folders_to_remove = [ p for p in glob.glob(os.path.join(training_args.output_dir, '*')) if os.path.isdir(p) ] click.echo('Folders to remove: ') for folder in folders_to_remove: click.echo(f"Removing {folder}") shutil.rmtree(folder) if training_args.do_train: logger.info(results) write_json( results, os.path.join(training_args.output_dir, 'dev-results.json')) else: write_json( predictions, get_output_predictions_file_name( data_args.eval_file_path, training_args.output_dir, os.path.basename( os.path.normpath(model_args.model_name_or_path))))
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. send_example_telemetry("run_speech_recognition_ctc", model_args, data_args) # Detecting last checkpoint. last_checkpoint = None if os.path.isdir( training_args.output_dir ) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is None and len(os.listdir( training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome.") elif last_checkpoint is not None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) logger.setLevel(logging.INFO if is_main_process(training_args.local_rank ) else logging.WARN) # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) # Set the verbosity to info of the Transformers logger (on main process only): if is_main_process(training_args.local_rank): transformers.utils.logging.set_verbosity_info() logger.info("Training/evaluation parameters %s", training_args) # Set seed before initializing model. set_seed(training_args.seed) # 1. First, let's load the dataset raw_datasets = DatasetDict() if training_args.do_train: raw_datasets["train"] = load_dataset( data_args.dataset_name, data_args.dataset_config_name, split=data_args.train_split_name, use_auth_token=data_args.use_auth_token, ) if data_args.audio_column_name not in raw_datasets[ "train"].column_names: raise ValueError( f"--audio_column_name '{data_args.audio_column_name}' not found in dataset '{data_args.dataset_name}'." " Make sure to set `--audio_column_name` to the correct audio column - one of" f" {', '.join(raw_datasets['train'].column_names)}.") if data_args.text_column_name not in raw_datasets[ "train"].column_names: raise ValueError( f"--text_column_name {data_args.text_column_name} not found in dataset '{data_args.dataset_name}'. " "Make sure to set `--text_column_name` to the correct text column - one of " f"{', '.join(raw_datasets['train'].column_names)}.") if data_args.max_train_samples is not None: raw_datasets["train"] = raw_datasets["train"].select( range(data_args.max_train_samples)) if training_args.do_eval: raw_datasets["eval"] = load_dataset( data_args.dataset_name, data_args.dataset_config_name, split=data_args.eval_split_name, use_auth_token=data_args.use_auth_token, ) if data_args.max_eval_samples is not None: raw_datasets["eval"] = raw_datasets["eval"].select( range(data_args.max_eval_samples)) # 2. We remove some special characters from the datasets # that make training complicated and do not help in transcribing the speech # E.g. characters, such as `,` and `.` do not really have an acoustic characteristic # that could be easily picked up by the model chars_to_ignore_regex = (f'[{"".join(data_args.chars_to_ignore)}]' if data_args.chars_to_ignore is not None else None) text_column_name = data_args.text_column_name def remove_special_characters(batch): if chars_to_ignore_regex is not None: batch["target_text"] = re.sub( chars_to_ignore_regex, "", batch[text_column_name]).lower() + " " else: batch["target_text"] = batch[text_column_name].lower() + " " return batch with training_args.main_process_first( desc="dataset map special characters removal"): raw_datasets = raw_datasets.map( remove_special_characters, remove_columns=[text_column_name], desc="remove special characters from datasets", ) # save special tokens for tokenizer word_delimiter_token = data_args.word_delimiter_token unk_token = data_args.unk_token pad_token = data_args.pad_token # 3. Next, let's load the config as we might need it to create # the tokenizer # load config config = AutoConfig.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_auth_token=data_args.use_auth_token) # 4. Next, if no tokenizer file is defined, # we create the vocabulary of the model by extracting all unique characters from # the training and evaluation datasets # We need to make sure that only first rank saves vocabulary # make sure all processes wait until vocab is created tokenizer_name_or_path = model_args.tokenizer_name_or_path tokenizer_kwargs = {} if tokenizer_name_or_path is None: # save vocab in training output dir tokenizer_name_or_path = training_args.output_dir vocab_file = os.path.join(tokenizer_name_or_path, "vocab.json") with training_args.main_process_first(): if training_args.overwrite_output_dir and os.path.isfile( vocab_file): try: os.remove(vocab_file) except OSError: # in shared file-systems it might be the case that # two processes try to delete the vocab file at the some time pass with training_args.main_process_first( desc="dataset map vocabulary creation"): if not os.path.isfile(vocab_file): os.makedirs(tokenizer_name_or_path, exist_ok=True) vocab_dict = create_vocabulary_from_data( raw_datasets, word_delimiter_token=word_delimiter_token, unk_token=unk_token, pad_token=pad_token, ) # save vocab dict to be loaded into tokenizer with open(vocab_file, "w") as file: json.dump(vocab_dict, file) # if tokenizer has just been created # it is defined by `tokenizer_class` if present in config else by `model_type` tokenizer_kwargs = { "config": config if config.tokenizer_class is not None else None, "tokenizer_type": config.model_type if config.tokenizer_class is None else None, "unk_token": unk_token, "pad_token": pad_token, "word_delimiter_token": word_delimiter_token, } # 5. Now we can instantiate the feature extractor, tokenizer and model # Note for distributed training, the .from_pretrained methods guarantee that only # one local process can concurrently download model & vocab. # load feature_extractor and tokenizer tokenizer = AutoTokenizer.from_pretrained( tokenizer_name_or_path, use_auth_token=data_args.use_auth_token, **tokenizer_kwargs, ) feature_extractor = AutoFeatureExtractor.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_auth_token=data_args.use_auth_token) # adapt config config.update({ "feat_proj_dropout": model_args.feat_proj_dropout, "attention_dropout": model_args.attention_dropout, "hidden_dropout": model_args.hidden_dropout, "final_dropout": model_args.final_dropout, "mask_time_prob": model_args.mask_time_prob, "mask_time_length": model_args.mask_time_length, "mask_feature_prob": model_args.mask_feature_prob, "mask_feature_length": model_args.mask_feature_length, "gradient_checkpointing": training_args.gradient_checkpointing, "layerdrop": model_args.layerdrop, "ctc_loss_reduction": model_args.ctc_loss_reduction, "pad_token_id": tokenizer.pad_token_id, "vocab_size": len(tokenizer), "activation_dropout": model_args.activation_dropout, }) # create model model = AutoModelForCTC.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir, config=config, use_auth_token=data_args.use_auth_token, ) # freeze encoder if model_args.freeze_feature_encoder: model.freeze_feature_encoder() # 6. Now we preprocess the datasets including loading the audio, resampling and normalization # Thankfully, `datasets` takes care of automatically loading and resampling the audio, # so that we just need to set the correct target sampling rate and normalize the input # via the `feature_extractor` # make sure that dataset decodes audio with correct sampling rate dataset_sampling_rate = next(iter(raw_datasets.values())).features[ data_args.audio_column_name].sampling_rate if dataset_sampling_rate != feature_extractor.sampling_rate: raw_datasets = raw_datasets.cast_column( data_args.audio_column_name, datasets.features.Audio( sampling_rate=feature_extractor.sampling_rate)) # derive max & min input length for sample rate & max duration max_input_length = data_args.max_duration_in_seconds * feature_extractor.sampling_rate min_input_length = data_args.min_duration_in_seconds * feature_extractor.sampling_rate audio_column_name = data_args.audio_column_name num_workers = data_args.preprocessing_num_workers # `phoneme_language` is only relevant if the model is fine-tuned on phoneme classification phoneme_language = data_args.phoneme_language # Preprocessing the datasets. # We need to read the audio files as arrays and tokenize the targets. def prepare_dataset(batch): # load audio sample = batch[audio_column_name] inputs = feature_extractor(sample["array"], sampling_rate=sample["sampling_rate"]) batch["input_values"] = inputs.input_values[0] batch["input_length"] = len(batch["input_values"]) # encode targets additional_kwargs = {} if phoneme_language is not None: additional_kwargs["phonemizer_lang"] = phoneme_language batch["labels"] = tokenizer(batch["target_text"], **additional_kwargs).input_ids return batch with training_args.main_process_first(desc="dataset map preprocessing"): vectorized_datasets = raw_datasets.map( prepare_dataset, remove_columns=next(iter(raw_datasets.values())).column_names, num_proc=num_workers, desc="preprocess datasets", ) def is_audio_in_length_range(length): return length > min_input_length and length < max_input_length # filter data that is shorter than min_input_length vectorized_datasets = vectorized_datasets.filter( is_audio_in_length_range, num_proc=num_workers, input_columns=["input_length"], ) # 7. Next, we can prepare the training. # Let's use word error rate (WER) as our evaluation metric, # instantiate a data collator and the trainer # Define evaluation metrics during training, *i.e.* word error rate, character error rate eval_metrics = { metric: evaluate.load(metric) for metric in data_args.eval_metrics } # for large datasets it is advised to run the preprocessing on a # single machine first with ``args.preprocessing_only`` since there will mostly likely # be a timeout when running the script in distributed mode. # In a second step ``args.preprocessing_only`` can then be set to `False` to load the # cached dataset if data_args.preprocessing_only: logger.info( f"Data preprocessing finished. Files cached at {vectorized_datasets.cache_files}" ) return def compute_metrics(pred): pred_logits = pred.predictions pred_ids = np.argmax(pred_logits, axis=-1) pred.label_ids[pred.label_ids == -100] = tokenizer.pad_token_id pred_str = tokenizer.batch_decode(pred_ids) # we do not want to group tokens when computing the metrics label_str = tokenizer.batch_decode(pred.label_ids, group_tokens=False) metrics = { k: v.compute(predictions=pred_str, references=label_str) for k, v in eval_metrics.items() } return metrics # Now save everything to be able to create a single processor later if is_main_process(training_args.local_rank): # save feature extractor, tokenizer and config feature_extractor.save_pretrained(training_args.output_dir) tokenizer.save_pretrained(training_args.output_dir) config.save_pretrained(training_args.output_dir) try: processor = AutoProcessor.from_pretrained(training_args.output_dir) except (OSError, KeyError): warnings.warn( "Loading a processor from a feature extractor config that does not" " include a `processor_class` attribute is deprecated and will be removed in v5. Please add the following " " attribute to your `preprocessor_config.json` file to suppress this warning: " " `'processor_class': 'Wav2Vec2Processor'`", FutureWarning, ) processor = Wav2Vec2Processor.from_pretrained(training_args.output_dir) # Instantiate custom data collator data_collator = DataCollatorCTCWithPadding(processor=processor) # Initialize Trainer trainer = Trainer( model=model, data_collator=data_collator, args=training_args, compute_metrics=compute_metrics, train_dataset=vectorized_datasets["train"] if training_args.do_train else None, eval_dataset=vectorized_datasets["eval"] if training_args.do_eval else None, tokenizer=feature_extractor, ) # 8. Finally, we can start training # Training if training_args.do_train: # use last checkpoint if exist if last_checkpoint is not None: checkpoint = last_checkpoint elif os.path.isdir(model_args.model_name_or_path): checkpoint = model_args.model_name_or_path else: checkpoint = None train_result = trainer.train(resume_from_checkpoint=checkpoint) trainer.save_model() metrics = train_result.metrics max_train_samples = (data_args.max_train_samples if data_args.max_train_samples is not None else len(vectorized_datasets["train"])) metrics["train_samples"] = min(max_train_samples, len(vectorized_datasets["train"])) trainer.log_metrics("train", metrics) trainer.save_metrics("train", metrics) trainer.save_state() # Evaluation results = {} if training_args.do_eval: logger.info("*** Evaluate ***") metrics = trainer.evaluate() max_eval_samples = (data_args.max_eval_samples if data_args.max_eval_samples is not None else len( vectorized_datasets["eval"])) metrics["eval_samples"] = min(max_eval_samples, len(vectorized_datasets["eval"])) trainer.log_metrics("eval", metrics) trainer.save_metrics("eval", metrics) # Write model card and (optionally) push to hub config_name = data_args.dataset_config_name if data_args.dataset_config_name is not None else "na" kwargs = { "finetuned_from": model_args.model_name_or_path, "tasks": "speech-recognition", "tags": ["automatic-speech-recognition", data_args.dataset_name], "dataset_args": (f"Config: {config_name}, Training split: {data_args.train_split_name}, Eval split:" f" {data_args.eval_split_name}"), "dataset": f"{data_args.dataset_name.upper()} - {config_name.upper()}", } if "common_voice" in data_args.dataset_name: kwargs["language"] = config_name if training_args.push_to_hub: trainer.push_to_hub(**kwargs) else: trainer.create_model_card(**kwargs) return results
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser((ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() # n_sample for evluating the models during training training_args.k_out = data_args.k_out training_args.data_dir = data_args.data_dir # Ensure output dir is not existed if ( os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir ): raise ValueError(f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome.") # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN, ) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", training_args.local_rank, training_args.device, training_args.n_gpu, bool(training_args.local_rank != -1), training_args.fp16, ) logger.info("Training/evaluation parameters %s", training_args) # Set seed set_seed(training_args.seed) config = BartConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, ) extra_model_params = ("encoder_layerdrop", "decoder_layerdrop", "dropout", "attention_dropout") for p in extra_model_params: if getattr(training_args, p, None): assert hasattr(config, p), f"({config.__class__.__name__}) doesn't have a `{p}` attribute" setattr(config, p, getattr(training_args, p)) tokenizer = BartTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, ) ## TODO special token format: <E1>, <E2>, ... <P1>, <P2> ... special_tokens = ['<E{}>'.format(i) for i in range(data_args.n_special_tokens)] + ['<P{}>'.format(i) for i in range(10)] tokenizer.add_tokens(special_tokens) model = BartForConditionalGeneration.from_pretrained( model_args.model_name_or_path, from_tf=".ckpt" in model_args.model_name_or_path, config=config, cache_dir=model_args.cache_dir, ) model.resize_token_embeddings(len(tokenizer)) # use task specific params, e.g., data_args.task = 'summarization' use_task_specific_params(model, data_args.task) # set num_beams for evaluation if data_args.eval_beams is None: data_args.eval_beams = model.config.num_beams # set decoder_start_token_id for MBart if model.config.decoder_start_token_id is None and isinstance(tokenizer, MBartTokenizer): assert ( data_args.tgt_lang is not None and data_args.src_lang is not None ), "mBart requires --tgt_lang and --src_lang" model.config.decoder_start_token_id = tokenizer.lang_code_to_id[data_args.tgt_lang] if model_args.freeze_embeds: freeze_embeds(model) if model_args.freeze_encoder: freeze_params(model.get_encoder()) assert_all_frozen(model.get_encoder()) # Get datasets train_dataset = ( LegacySeq2SeqDataset( tokenizer=tokenizer, type_path="train", data_dir=data_args.data_dir, n_obs=data_args.n_train, max_target_length=data_args.max_target_length, max_source_length=data_args.max_source_length, prefix=model.config.prefix or "", ) if training_args.do_train else None ) eval_dataset = ( LegacySeq2SeqDataset( tokenizer=tokenizer, type_path="val", data_dir=data_args.data_dir, n_obs=data_args.n_val, max_target_length=data_args.val_max_target_length, max_source_length=data_args.max_source_length, prefix=model.config.prefix or "", ) if training_args.do_eval or training_args.evaluation_strategy != EvaluationStrategy.NO else None ) test_dataset = ( LegacySeq2SeqDataset( tokenizer=tokenizer, type_path="test", data_dir=data_args.data_dir, n_obs=data_args.n_test, max_target_length=data_args.test_max_target_length, max_source_length=data_args.max_source_length, prefix=model.config.prefix or "", ) if training_args.do_predict else None ) trainer = Seq2SeqTrainer( model=model, config=config, tokenizer=tokenizer, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset, data_collator=Seq2SeqDataCollator(tokenizer, data_args, training_args.tpu_num_cores), data_args=data_args, ) # Training if training_args.do_train: trainer.train(model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None) # Evaluation (on dev set) eval_results = {} if training_args.do_eval: output = trainer.evaluate() predictions = output.predictions.tolist() out_pred_path = training_args.output_dir + '/output_pred_dev.txt' out_pred_metric = training_args.output_dir + '/output_metric_dev.json' out_pred_ref = data_args.data_dir + '/val.target' with open(out_pred_path, 'w') as eval_out: for pred in predictions: output_line = tokenizer.decode(pred, skip_special_tokens=True, clean_up_tokenization_spaces=False) eval_out.write(output_line + '\n') metrics = {'epoch': 'eval_mode'} metrics.update(eval_top1_acc(out_pred_path, out_pred_ref, data_args.k_out)) ## top1_metrics metrics.update(eval_topk_acc(out_pred_path, out_pred_ref, data_args.k_out)) ## topk_metrics metrics.update(eval_diversity(out_pred_path, data_args.k_out)) ## diversity_metrics with open(out_pred_metric, 'w') as metric_out: json.dump(metrics, metric_out, indent=1) # Prediction (on test set) if training_args.do_predict: logging.info("*** Test ***") test_output = trainer.predict(test_dataset=test_dataset) predictions = test_output.predictions.tolist() out_pred_path = training_args.output_dir + '/output_pred_test.txt' out_pred_metric = training_args.output_dir + '/output_metric_test.json' out_pred_ref = data_args.data_dir + '/test.target' with open(out_pred_path, 'w') as eval_out: for pred in predictions: output_line = tokenizer.decode(pred, skip_special_tokens=True, clean_up_tokenization_spaces=False) eval_out.write(output_line + '\n') metrics = {'epoch': 'test_mode'} metrics.update(eval_top1_acc(out_pred_path, out_pred_ref, data_args.k_out)) ## top1_metrics metrics.update(eval_topk_acc(out_pred_path, out_pred_ref, data_args.k_out)) ## topk_metrics metrics.update(eval_diversity(out_pred_path, data_args.k_out)) ## diversity_metrics with open(out_pred_metric, 'w') as metric_out: json.dump(metrics, metric_out, indent=1)
def train_func(config: Dict[str, Any]): args = config["args"] # Initialize the accelerator. We will let the accelerator handle device # placement for us in this example. accelerator = Accelerator() # Make one log on every process with the configuration for debugging. logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO, ) logger.info(accelerator.state) # Setup logging, we only want one process per machine to log things on # the screen. accelerator.is_local_main_process is only True for one # process per machine. logger.setLevel( logging.INFO if accelerator.is_local_main_process else logging.ERROR ) if accelerator.is_local_main_process: datasets.utils.logging.set_verbosity_warning() transformers.utils.logging.set_verbosity_info() else: datasets.utils.logging.set_verbosity_error() transformers.utils.logging.set_verbosity_error() # If passed along, set the training seed now. if args.seed is not None: set_seed(args.seed) # Get the datasets: you can either provide your own CSV/JSON training and # evaluation files (see below) or specify a GLUE benchmark task (the # dataset will be downloaded automatically from the datasets Hub). # For CSV/JSON files, this script will use as labels the column called # 'label' and as pair of sentences the sentences in columns called # 'sentence1' and 'sentence2' if such column exists or the first two # columns not named label if at least two columns are provided. # If the CSVs/JSONs contain only one non-label column, the script does # single sentence classification on this single column. You can easily # tweak this behavior (see below) # In distributed training, the load_dataset function guarantee that only # one local process can concurrently download the dataset. if args.task_name is not None: # Downloading and loading a dataset from the hub. raw_datasets = load_dataset("glue", args.task_name) else: # Loading the dataset from local csv or json file. data_files = {} if args.train_file is not None: data_files["train"] = args.train_file if args.validation_file is not None: data_files["validation"] = args.validation_file extension = ( args.train_file if args.train_file is not None else args.valid_file ).split(".")[-1] raw_datasets = load_dataset(extension, data_files=data_files) # See more about loading any type of standard or custom dataset at # https://huggingface.co/docs/datasets/loading_datasets.html. # Labels if args.task_name is not None: is_regression = args.task_name == "stsb" if not is_regression: label_list = raw_datasets["train"].features["label"].names num_labels = len(label_list) else: num_labels = 1 else: # Trying to have good defaults here, don't hesitate to tweak to your # needs. is_regression = raw_datasets["train"].features["label"].dtype in [ "float32", "float64", ] if is_regression: num_labels = 1 else: # A useful fast method: # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique # noqa:E501 label_list = raw_datasets["train"].unique("label") label_list.sort() # Let's sort it for determinism num_labels = len(label_list) # Load pretrained model and tokenizer # # In distributed training, the .from_pretrained methods guarantee that # only one local process can concurrently download model & vocab. config = AutoConfig.from_pretrained( args.model_name_or_path, num_labels=num_labels, finetuning_task=args.task_name ) tokenizer = AutoTokenizer.from_pretrained( args.model_name_or_path, use_fast=not args.use_slow_tokenizer ) model = AutoModelForSequenceClassification.from_pretrained( args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config, ) # Preprocessing the datasets if args.task_name is not None: sentence1_key, sentence2_key = task_to_keys[args.task_name] else: # Again, we try to have some nice defaults but don't hesitate to # tweak to your use case. non_label_column_names = [ name for name in raw_datasets["train"].column_names if name != "label" ] if ( "sentence1" in non_label_column_names and "sentence2" in non_label_column_names ): sentence1_key, sentence2_key = "sentence1", "sentence2" else: if len(non_label_column_names) >= 2: sentence1_key, sentence2_key = non_label_column_names[:2] else: sentence1_key, sentence2_key = non_label_column_names[0], None # Some models have set the order of the labels to use, # so let's make sure we do use it. label_to_id = None if ( model.config.label2id != PretrainedConfig(num_labels=num_labels).label2id and args.task_name is not None and not is_regression ): # Some have all caps in their config, some don't. label_name_to_id = {k.lower(): v for k, v in model.config.label2id.items()} if list(sorted(label_name_to_id.keys())) == list( # noqa:C413 sorted(label_list) ): # noqa:C413 logger.info( f"The configuration of the model provided the following label " f"correspondence: {label_name_to_id}. Using it!" ) label_to_id = { i: label_name_to_id[label_list[i]] for i in range(num_labels) } else: logger.warning( "Your model seems to have been trained with labels, " "but they don't match the dataset: ", f"model labels: {list(sorted(label_name_to_id.keys()))}, " # noqa:C413,E501 f"dataset labels: {list(sorted(label_list))}." # noqa:C413 "\nIgnoring the model labels as a result.", ) elif args.task_name is None: label_to_id = {v: i for i, v in enumerate(label_list)} if label_to_id is not None: model.config.label2id = label_to_id model.config.id2label = {id: label for label, id in config.label2id.items()} padding = "max_length" if args.pad_to_max_length else False def preprocess_function(examples): # Tokenize the texts texts = ( (examples[sentence1_key],) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key]) ) result = tokenizer( *texts, padding=padding, max_length=args.max_length, truncation=True ) if "label" in examples: if label_to_id is not None: # Map labels to IDs (not necessary for GLUE tasks) result["labels"] = [ label_to_id[l] for l in examples["label"] # noqa:E741 ] else: # In all cases, rename the column to labels because the model # will expect that. result["labels"] = examples["label"] return result processed_datasets = raw_datasets.map( preprocess_function, batched=True, remove_columns=raw_datasets["train"].column_names, desc="Running tokenizer on dataset", ) train_dataset = processed_datasets["train"] eval_dataset = processed_datasets[ "validation_matched" if args.task_name == "mnli" else "validation" ] # Log a few random samples from the training set: for index in random.sample(range(len(train_dataset)), 3): logger.info(f"Sample {index} of the training set: {train_dataset[index]}.") # DataLoaders creation: if args.pad_to_max_length: # If padding was already done ot max length, we use the default data # collator that will just convert everything to tensors. data_collator = default_data_collator else: # Otherwise, `DataCollatorWithPadding` will apply dynamic padding for # us (by padding to the maximum length of the samples passed). When # using mixed precision, we add `pad_to_multiple_of=8` to pad all # tensors to multiple of 8s, which will enable the use of Tensor # Cores on NVIDIA hardware with compute capability >= 7.5 (Volta). data_collator = DataCollatorWithPadding( tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None) ) train_dataloader = DataLoader( train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size, ) eval_dataloader = DataLoader( eval_dataset, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size, ) # Optimizer # Split weights in two groups, one with weight decay and the other not. no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": args.weight_decay, }, { "params": [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0, }, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate) # Prepare everything with our `accelerator`. model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare( model, optimizer, train_dataloader, eval_dataloader ) # Note -> the training dataloader needs to be prepared before we grab # his length below (cause its length will be shorter in multiprocess) # Scheduler and math around the number of training steps. num_update_steps_per_epoch = math.ceil( len(train_dataloader) / args.gradient_accumulation_steps ) if args.max_train_steps is None: args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch else: args.num_train_epochs = math.ceil( args.max_train_steps / num_update_steps_per_epoch ) lr_scheduler = get_scheduler( name=args.lr_scheduler_type, optimizer=optimizer, num_warmup_steps=args.num_warmup_steps, num_training_steps=args.max_train_steps, ) # Get the metric function if args.task_name is not None: metric = load_metric("glue", args.task_name) else: metric = load_metric("accuracy") # Train! total_batch_size = ( args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps ) logger.info("***** Running training *****") logger.info(f" Num examples = {len(train_dataset)}") logger.info(f" Num Epochs = {args.num_train_epochs}") logger.info( f" Instantaneous batch size per device =" f" {args.per_device_train_batch_size}" ) logger.info( f" Total train batch size (w. parallel, distributed & accumulation) " f"= {total_batch_size}" ) logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") logger.info(f" Total optimization steps = {args.max_train_steps}") # Only show the progress bar once on each machine. progress_bar = tqdm( range(args.max_train_steps), disable=not accelerator.is_local_main_process ) completed_steps = 0 for epoch in range(args.num_train_epochs): model.train() for step, batch in enumerate(train_dataloader): outputs = model(**batch) loss = outputs.loss loss = loss / args.gradient_accumulation_steps accelerator.backward(loss) if ( step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1 ): optimizer.step() lr_scheduler.step() optimizer.zero_grad() progress_bar.update(1) completed_steps += 1 if completed_steps >= args.max_train_steps: break model.eval() for step, batch in enumerate(eval_dataloader): outputs = model(**batch) predictions = ( outputs.logits.argmax(dim=-1) if not is_regression else outputs.logits.squeeze() ) metric.add_batch( predictions=accelerator.gather(predictions), references=accelerator.gather(batch["labels"]), ) eval_metric = metric.compute() logger.info(f"epoch {epoch}: {eval_metric}") if args.output_dir is not None: accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save) if args.task_name == "mnli": # Final evaluation on mismatched validation set eval_dataset = processed_datasets["validation_mismatched"] eval_dataloader = DataLoader( eval_dataset, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size, ) eval_dataloader = accelerator.prepare(eval_dataloader) model.eval() for step, batch in enumerate(eval_dataloader): outputs = model(**batch) predictions = outputs.logits.argmax(dim=-1) metric.add_batch( predictions=accelerator.gather(predictions), references=accelerator.gather(batch["labels"]), ) eval_metric = metric.compute() logger.info(f"mnli-mm: {eval_metric}")
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) # Detecting last checkpoint. last_checkpoint = None #training_args.output_dir = f"{training_args.output_dir}/{data_args.task_name}/{model_args.model_name_or_path}/{model_args.prune_method}/{str(model_args.prune_n_layers)}/{str(training_args.seed)}" if os.path.isdir( training_args.output_dir ) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is None and len(os.listdir( training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome.") elif last_checkpoint is not None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) logger.setLevel(logging.INFO if is_main_process(training_args.local_rank ) else logging.WARN) # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) # Set the verbosity to info of the Transformers logger (on main process only): if is_main_process(training_args.local_rank): transformers.utils.logging.set_verbosity_info() transformers.utils.logging.enable_default_handler() transformers.utils.logging.enable_explicit_format() logger.info(f"Training/evaluation parameters {training_args}") # Set seed before initializing model. set_seed(training_args.seed) # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below) # or specify a GLUE benchmark task (the dataset will be downloaded automatically from the datasets Hub). # # For CSV/JSON files, this script will use as labels the column called 'label' and as pair of sentences the # sentences in columns called 'sentence1' and 'sentence2' if such column exists or the first two columns not named # label if at least two columns are provided. # # If the CSVs/JSONs contain only one non-label column, the script does single sentence classification on this # single column. You can easily tweak this behavior (see below) # # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. if data_args.task_name is not None: # Downloading and loading a dataset from the hub. datasets = load_dataset("glue", data_args.task_name) else: # Loading a dataset from your local files. # CSV/JSON training and evaluation files are needed. data_files = { "train": data_args.train_file, "validation": data_args.validation_file } # Get the test dataset: you can provide your own CSV/JSON test file (see below) # when you use `do_predict` without specifying a GLUE benchmark task. if training_args.do_predict: if data_args.test_file is not None: train_extension = data_args.train_file.split(".")[-1] test_extension = data_args.test_file.split(".")[-1] assert ( test_extension == train_extension ), "`test_file` should have the same extension (csv or json) as `train_file`." data_files["test"] = data_args.test_file else: raise ValueError( "Need either a GLUE task or a test file for `do_predict`.") for key in data_files.keys(): logger.info(f"load a local file for {key}: {data_files[key]}") if data_args.train_file.endswith(".csv"): # Loading a dataset from local csv files datasets = load_dataset("csv", data_files=data_files) else: # Loading a dataset from local json files datasets = load_dataset("json", data_files=data_files) # See more about loading any type of standard or custom dataset at # https://huggingface.co/docs/datasets/loading_datasets.html. # Labels if data_args.task_name is not None: is_regression = data_args.task_name == "stsb" if not is_regression: label_list = datasets["train"].features["label"].names num_labels = len(label_list) else: num_labels = 1 else: # Trying to have good defaults here, don't hesitate to tweak to your needs. is_regression = datasets["train"].features["label"].dtype in [ "float32", "float64" ] if is_regression: num_labels = 1 else: # A useful fast method: # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique label_list = datasets["train"].unique("label") label_list.sort() # Let's sort it for determinism num_labels = len(label_list) # Load pretrained model and tokenizer # # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, num_labels=num_labels, finetuning_task=data_args.task_name, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) model = create_model(config, model_args) # # Prune model before training starts # if (model_args.prune_method == "prune-greedy"): current_path = pathlib.Path(__file__).parent.absolute() layer_file_path = os.path.join( current_path, "layer_files/", f"{model.name_or_path}_{data_args.task_name}_greedy.txt") with open(layer_file_path, 'r') as f: layers_to_prune = f.readlines() layers_to_prune = layers_to_prune[:model_args.prune_n_layers] layers_to_prune = [int(l.replace("\n", "")) for l in layers_to_prune] print(f"Pruned {str(layers_to_prune)}") model.prune_layers(layers_to_prune) elif (model_args.prune_method == "top-layers"): print( f"# Prune {model_args.prune_n_layers} layers with {model_args.prune_method}" ) first_layer_to_prune = config.num_hidden_layers - model_args.prune_n_layers model.prune_layers( [i for i in range(first_layer_to_prune, config.num_hidden_layers)]) # # Measure number of parameters # It really depends how pruning is implemented - if its deleted from the layers # module list or if the layer is simply skipped (then torch still measures those values) # if hasattr(model.base_model, "encoder"): # base_class = model.base_model.encoder # else: # base_class = model.base_model # layers = base_class.layer # layers = [l for (i, l) in enumerate(layers) if i not in model.get_pruned_layers()] # layers = nn.ModuleList(layers) # setattr(base_class, "layer", layers) # Print number of parameters num_params = sum(p.numel() for p in model.parameters()) print("NUM Paramerers: %d" % num_params) # Preprocessing the datasets if data_args.task_name is not None: sentence1_key, sentence2_key = task_to_keys[data_args.task_name] else: # Again, we try to have some nice defaults but don't hesitate to tweak to your use case. non_label_column_names = [ name for name in datasets["train"].column_names if name != "label" ] if "sentence1" in non_label_column_names and "sentence2" in non_label_column_names: sentence1_key, sentence2_key = "sentence1", "sentence2" else: if len(non_label_column_names) >= 2: sentence1_key, sentence2_key = non_label_column_names[:2] else: sentence1_key, sentence2_key = non_label_column_names[0], None # Padding strategy if data_args.pad_to_max_length: padding = "max_length" else: # We will pad later, dynamically at batch creation, to the max sequence length in each batch padding = False # Some models have set the order of the labels to use, so let's make sure we do use it. label_to_id = None if (model.config.label2id != PretrainedConfig(num_labels=num_labels).label2id and data_args.task_name is not None and not is_regression): # Some have all caps in their config, some don't. label_name_to_id = { k.lower(): v for k, v in model.config.label2id.items() } if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)): label_to_id = { i: label_name_to_id[label_list[i]] for i in range(num_labels) } else: logger.warn( "Your model seems to have been trained with labels, but they don't match the dataset: ", f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}." "\nIgnoring the model labels as a result.", ) elif data_args.task_name is None and not is_regression: label_to_id = {v: i for i, v in enumerate(label_list)} if data_args.max_seq_length > tokenizer.model_max_length: logger.warn( f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." ) max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) def preprocess_function(examples): # Tokenize the texts args = ((examples[sentence1_key], ) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key])) result = tokenizer(*args, padding=padding, max_length=max_seq_length, truncation=True) # Map labels to IDs (not necessary for GLUE tasks) if label_to_id is not None and "label" in examples: result["label"] = [label_to_id[l] for l in examples["label"]] return result datasets = datasets.map(preprocess_function, batched=True, load_from_cache_file=not data_args.overwrite_cache) train_dataset = datasets["train"] eval_dataset = datasets["validation_matched" if data_args.task_name == "mnli" else "validation"] if data_args.task_name is not None or data_args.test_file is not None: test_dataset = datasets["test_matched" if data_args.task_name == "mnli" else "test"] # Get the metric function if data_args.task_name is not None: metric = load_metric("glue", data_args.task_name) # TODO: When datasets metrics include regular accuracy, make an else here and remove special branch from # compute_metrics # You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a # predictions and label_ids field) and has to return a dictionary string to float. def compute_metrics(p: EvalPrediction): preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1) if data_args.task_name is not None: result = metric.compute(predictions=preds, references=p.label_ids) if len(result) > 1: result["combined_score"] = np.mean(list( result.values())).item() return result elif is_regression: return {"mse": ((preds - p.label_ids)**2).mean().item()} else: return { "accuracy": (preds == p.label_ids).astype(np.float32).mean().item() } # Data collator will default to DataCollatorWithPadding, so we change it if we already did the padding. if data_args.pad_to_max_length: data_collator = default_data_collator elif training_args.fp16: data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8) else: data_collator = None # Initialize our Trainer trainer = Trainer( model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset if training_args.do_eval else None, compute_metrics=compute_metrics, tokenizer=tokenizer, data_collator=data_collator, ) # trainer.add_callback( # PruneCallbackHandler( # model_args.prune_method, # model_args.prune_n_layers, # data_args.task_name) # ) trainer.add_callback(DisableCheckpointCallbackHandler()) # Training if training_args.do_train: if last_checkpoint is not None: checkpoint = last_checkpoint elif os.path.isdir(model_args.model_name_or_path): checkpoint = model_args.model_name_or_path else: checkpoint = None train_result = trainer.train(resume_from_checkpoint=checkpoint) metrics = train_result.metrics #trainer.save_model() # Saves the tokenizer too for easy upload output_train_file = os.path.join(training_args.output_dir, "train_results.txt") if trainer.is_world_process_zero(): with open(output_train_file, "w") as writer: logger.info("***** Train results *****") for key, value in sorted(metrics.items()): logger.info(f" {key}: {value}") writer.write(f"{key}: {value}\n") # Need to save the state, since Trainer.save_model saves only the tokenizer with the model trainer.state.save_to_json( os.path.join(training_args.output_dir, "trainer_state.json")) # Evaluation eval_results = {} if training_args.do_eval: logger.info("*** Evaluate ***") # Loop to handle MNLI double evaluation (matched, mis-matched) tasks = [data_args.task_name] eval_datasets = [eval_dataset] if data_args.task_name == "mnli": tasks.append("mnli-mm") eval_datasets.append(datasets["validation_mismatched"]) for eval_dataset, task in zip(eval_datasets, tasks): # eval_result = trainer.evaluate(eval_dataset=eval_dataset) start = time.time() eval_result = trainer.evaluate(eval_dataset=eval_dataset) print(f"TIMING: {(time.time() - start)} ") exit() output_eval_file = os.path.join(training_args.output_dir, f"eval_results_{task}.txt") if trainer.is_world_process_zero(): with open(output_eval_file, "w") as writer: logger.info(f"***** Eval results {task} *****") for key, value in sorted(eval_result.items()): logger.info(f" {key}: {value}") writer.write(f"{key}: {value}\n") eval_results.update(eval_result) if training_args.do_predict: logger.info("*** Test ***") # Loop to handle MNLI double evaluation (matched, mis-matched) tasks = [data_args.task_name] test_datasets = [test_dataset] if data_args.task_name == "mnli": tasks.append("mnli-mm") test_datasets.append(datasets["test_mismatched"]) for test_dataset, task in zip(test_datasets, tasks): # Removing the `label` columns because it contains -1 and Trainer won't like that. test_dataset.remove_columns_("label") predictions = trainer.predict( test_dataset=test_dataset).predictions predictions = np.squeeze( predictions) if is_regression else np.argmax(predictions, axis=1) output_test_file = os.path.join(training_args.output_dir, f"test_results_{task}.txt") if trainer.is_world_process_zero(): with open(output_test_file, "w") as writer: logger.info(f"***** Test results {task} *****") writer.write("index\tprediction\n") for index, item in enumerate(predictions): if is_regression: writer.write(f"{index}\t{item:3.3f}\n") else: item = label_list[item] writer.write(f"{index}\t{item}\n") return eval_results
def _test_no_pad_and_pad(self, no_pad_features, pad_features): tokenizer = BertTokenizer(self.vocab_file) data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False, return_tensors="np") batch = data_collator(no_pad_features) self.assertEqual(batch["input_ids"].shape, (2, 10)) self.assertEqual(batch["labels"].shape, (2, 10)) batch = data_collator(pad_features, return_tensors="np") self.assertEqual(batch["input_ids"].shape, (2, 10)) self.assertEqual(batch["labels"].shape, (2, 10)) data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False, pad_to_multiple_of=8, return_tensors="np") batch = data_collator(no_pad_features) self.assertEqual(batch["input_ids"].shape, (2, 16)) self.assertEqual(batch["labels"].shape, (2, 16)) batch = data_collator(pad_features, return_tensors="np") self.assertEqual(batch["input_ids"].shape, (2, 16)) self.assertEqual(batch["labels"].shape, (2, 16)) tokenizer._pad_token = None data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False, return_tensors="np") with self.assertRaises(ValueError): # Expect error due to padding token missing data_collator(pad_features) set_seed(42) # For reproducibility tokenizer = BertTokenizer(self.vocab_file) data_collator = DataCollatorForLanguageModeling(tokenizer, return_tensors="np") batch = data_collator(no_pad_features) self.assertEqual(batch["input_ids"].shape, (2, 10)) self.assertEqual(batch["labels"].shape, (2, 10)) masked_tokens = batch["input_ids"] == tokenizer.mask_token_id self.assertTrue(np.any(masked_tokens)) # self.assertTrue(all(x == -100 for x in batch["labels"][~masked_tokens].tolist())) batch = data_collator(pad_features) self.assertEqual(batch["input_ids"].shape, (2, 10)) self.assertEqual(batch["labels"].shape, (2, 10)) masked_tokens = batch["input_ids"] == tokenizer.mask_token_id self.assertTrue(np.any(masked_tokens)) # self.assertTrue(all(x == -100 for x in batch["labels"][~masked_tokens].tolist())) data_collator = DataCollatorForLanguageModeling(tokenizer, pad_to_multiple_of=8, return_tensors="np") batch = data_collator(no_pad_features) self.assertEqual(batch["input_ids"].shape, (2, 16)) self.assertEqual(batch["labels"].shape, (2, 16)) masked_tokens = batch["input_ids"] == tokenizer.mask_token_id self.assertTrue(np.any(masked_tokens)) # self.assertTrue(all(x == -100 for x in batch["labels"][~masked_tokens].tolist())) batch = data_collator(pad_features) self.assertEqual(batch["input_ids"].shape, (2, 16)) self.assertEqual(batch["labels"].shape, (2, 16)) masked_tokens = batch["input_ids"] == tokenizer.mask_token_id self.assertTrue(np.any(masked_tokens))
def main(): args = TrainConfig() accelerator = Accelerator() logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO, ) logger.info(accelerator.state) logger.setLevel( logging.INFO if accelerator.is_local_main_process else logging.ERROR) if accelerator.is_local_main_process: datasets.utils.logging.set_verbosity_warning() transformers.utils.logging.set_verbosity_info() else: datasets.utils.logging.set_verbosity_error() transformers.utils.logging.set_verbosity_error() if args.seed is not None: set_seed(args.seed) data_files = {} if args.train_file is not None: data_files["train"] = args.train_file if args.validation_file is not None: data_files["validation"] = args.validation_file extension = args.train_file.split(".")[-1] if extension == "txt": extension = "text" raw_datasets = load_dataset(extension, data_files=data_files) if args.config_name: config = AutoConfig.from_pretrained(args.config_name) elif args.model_name_or_path: config = AutoConfig.from_pretrained(args.model_name_or_path) else: config = CONFIG_MAPPING[args.model_type]() logger.warning( "You are instantiating a new config instance from scratch.") if args.tokenizer_name: tokenizer = AutoTokenizer.from_pretrained( args.tokenizer_name, use_fast=not args.use_slow_tokenizer) elif args.model_name_or_path: tokenizer = AutoTokenizer.from_pretrained( args.model_name_or_path, use_fast=not args.use_slow_tokenizer) else: raise ValueError( "You are instantiating a new tokenizer from scratch. This is not supported by this script." "You can do it from another script, save it, and load it from here, using --tokenizer_name." ) if args.model_name_or_path: model = AutoModelForMaskedLM.from_pretrained( args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config, ) else: logger.info("Training new model from scratch") model = AutoModelForMaskedLM.from_config(config) model.resize_token_embeddings(len(tokenizer)) column_names = raw_datasets["train"].column_names text_column_name = "text" if "text" in column_names else column_names[0] if args.max_seq_length is None: max_seq_length = tokenizer.model_max_length if max_seq_length > 1024: logger.warning( f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " "Picking 1024 instead. You can change that default value by passing --max_seq_length xxx." ) max_seq_length = 1024 else: if args.max_seq_length > tokenizer.model_max_length: logger.warning( f"The max_seq_length passed ({args.max_seq_length}) is larger than the maximum length for the" f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." ) max_seq_length = min(args.max_seq_length, tokenizer.model_max_length) def tokenize_function(examples): return tokenizer(examples[text_column_name], return_special_tokens_mask=True) tokenized_datasets = raw_datasets.map( tokenize_function, batched=True, num_proc=args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not args.overwrite_cache, ) def group_texts(examples): concatenated_examples = { k: sum(examples[k], []) for k in examples.keys() } total_length = len(concatenated_examples[list(examples.keys())[0]]) total_length = (total_length // max_seq_length) * max_seq_length result = { k: [ t[i:i + max_seq_length] for i in range(0, total_length, max_seq_length) ] for k, t in concatenated_examples.items() } return result tokenized_datasets = tokenized_datasets.map( group_texts, batched=True, num_proc=args.preprocessing_num_workers, load_from_cache_file=not args.overwrite_cache, ) train_dataset = tokenized_datasets["train"] eval_dataset = tokenized_datasets["validation"] data_collator = DataCollatorForLanguageModeling( tokenizer=tokenizer, mlm_probability=args.mlm_probability) train_dataloader = DataLoader(train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size) eval_dataloader = DataLoader(eval_dataset, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": args.weight_decay, }, { "params": [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0, }, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate) model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare( model, optimizer, train_dataloader, eval_dataloader) num_update_steps_per_epoch = math.ceil( len(train_dataloader) / args.gradient_accumulation_steps) if args.max_train_steps is None: args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch else: args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) lr_scheduler = get_scheduler( name=args.lr_scheduler_type, optimizer=optimizer, num_warmup_steps=args.num_warmup_steps, num_training_steps=args.max_train_steps, ) total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps logger.info("***** Running training *****") logger.info(f" Num examples = {len(train_dataset)}") logger.info(f" Num Epochs = {args.num_train_epochs}") logger.info( f" Instantaneous batch size per device = {args.per_device_train_batch_size}" ) logger.info( f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}" ) logger.info( f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") logger.info(f" Total optimization steps = {args.max_train_steps}") # Only show the progress bar once on each machine. progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process) completed_steps = 0 for epoch in range(args.num_train_epochs): model.train() for step, batch in enumerate(train_dataloader): outputs = model(**batch) loss = outputs.loss loss = loss / args.gradient_accumulation_steps accelerator.backward(loss) if step % args.gradient_accumulation_steps == 0 or step == len( train_dataloader) - 1: optimizer.step() lr_scheduler.step() optimizer.zero_grad() progress_bar.update(1) completed_steps += 1 if completed_steps >= args.max_train_steps: break model.eval() losses = [] for step, batch in enumerate(eval_dataloader): with torch.no_grad(): outputs = model(**batch) loss = outputs.loss losses.append( accelerator.gather(loss.repeat( args.per_device_eval_batch_size))) losses = torch.cat(losses) losses = losses[:len(eval_dataset)] perplexity = math.exp(torch.mean(losses)) logger.info(f"epoch {epoch}: perplexity: {perplexity}") if args.output_dir is not None: accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
def run_multiple_choice(self, model_name, task_name, fp16): model_args = ModelArguments(model_name_or_path=model_name, cache_dir=self.cache_dir) data_args = DataTrainingArguments(task_name=task_name, data_dir=self.data_dir, max_seq_length=self.max_seq_length) training_args = TrainingArguments( output_dir=os.path.join(self.output_dir, task_name), do_train=True, do_eval=True, per_gpu_train_batch_size=self.train_batch_size, per_gpu_eval_batch_size=self.eval_batch_size, learning_rate=self.learning_rate, num_train_epochs=self.num_train_epochs, local_rank=self.local_rank, overwrite_output_dir=self.overwrite_output_dir, gradient_accumulation_steps=self.gradient_accumulation_steps, fp16=fp16, logging_steps=self.logging_steps) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN, ) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", training_args.local_rank, training_args.device, training_args.n_gpu, bool(training_args.local_rank != -1), training_args.fp16, ) logger.info("Training/evaluation parameters %s", training_args) set_seed(training_args.seed) onnxruntime.set_seed(training_args.seed) try: processor = SwagProcessor() label_list = processor.get_labels() num_labels = len(label_list) except KeyError: raise ValueError("Task not found: %s" % (data_args.task_name)) config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, num_labels=num_labels, finetuning_task=data_args.task_name, cache_dir=model_args.cache_dir, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, ) model = AutoModelForMultipleChoice.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, ) # Get datasets train_dataset = (MultipleChoiceDataset( data_dir=data_args.data_dir, tokenizer=tokenizer, task=data_args.task_name, processor=processor, max_seq_length=data_args.max_seq_length, overwrite_cache=data_args.overwrite_cache, mode=Split.train, ) if training_args.do_train else None) eval_dataset = (MultipleChoiceDataset( data_dir=data_args.data_dir, tokenizer=tokenizer, task=data_args.task_name, processor=processor, max_seq_length=data_args.max_seq_length, overwrite_cache=data_args.overwrite_cache, mode=Split.dev, ) if training_args.do_eval else None) def compute_metrics(p: EvalPrediction) -> Dict: preds = np.argmax(p.predictions, axis=1) return {"acc": simple_accuracy(preds, p.label_ids)} if model_name.startswith('bert'): model_desc = { 'inputs': [( 'input_ids', ['batch', num_labels, 'max_seq_len_in_batch'], ), ( 'attention_mask', ['batch', num_labels, 'max_seq_len_in_batch'], ), ( 'token_type_ids', ['batch', num_labels, 'max_seq_len_in_batch'], ), ( 'labels', ['batch', num_labels], )], 'outputs': [('loss', [], True), ('reshaped_logits', ['batch', num_labels])] } else: model_desc = { 'inputs': [( 'input_ids', ['batch', num_labels, 'max_seq_len_in_batch'], ), ( 'attention_mask', ['batch', num_labels, 'max_seq_len_in_batch'], ), ( 'labels', ['batch', num_labels], )], 'outputs': [('loss', [], True), ('reshaped_logits', ['batch', num_labels])] } # Initialize the ORTTrainer within ORTTransformerTrainer trainer = ORTTransformerTrainer( model=model, model_desc=model_desc, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset, compute_metrics=compute_metrics, ) # Training if training_args.do_train: trainer.train() trainer.save_model() # Evaluation results = {} if training_args.do_eval and training_args.local_rank in [-1, 0]: logger.info("*** Evaluate ***") result = trainer.evaluate() logger.info("***** Eval results {} *****".format( data_args.task_name)) for key, value in result.items(): logger.info(" %s = %s", key, value) results.update(result) return results
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() if ( os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir ): raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome." ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN, ) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", training_args.local_rank, training_args.device, training_args.n_gpu, bool(training_args.local_rank != -1), training_args.fp16, ) logger.info("Training/evaluation parameters %s", training_args) # Set seed set_seed(training_args.seed) # Prepare CONLL-2003 task labels = get_labels(data_args.labels) label_map: Dict[int, str] = {i: label for i, label in enumerate(labels)} num_labels = len(labels) # Load pretrained model and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, num_labels=num_labels, id2label=label_map, label2id={label: i for i, label in enumerate(labels)}, cache_dir=model_args.cache_dir, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast, ) model = AutoModelForTokenClassification.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, ) # Get datasets train_dataset = ( NerDataset( data_dir=data_args.data_dir, tokenizer=tokenizer, labels=labels, model_type=config.model_type, max_seq_length=data_args.max_seq_length, overwrite_cache=data_args.overwrite_cache, mode=Split.train, ) if training_args.do_train else None ) eval_dataset = ( NerDataset( data_dir=data_args.data_dir, tokenizer=tokenizer, labels=labels, model_type=config.model_type, max_seq_length=data_args.max_seq_length, overwrite_cache=data_args.overwrite_cache, mode=Split.dev, ) if training_args.do_eval else None ) def align_predictions(predictions: np.ndarray, label_ids: np.ndarray) -> Tuple[List[int], List[int]]: preds = np.argmax(predictions, axis=2) batch_size, seq_len = preds.shape out_label_list = [[] for _ in range(batch_size)] preds_list = [[] for _ in range(batch_size)] for i in range(batch_size): for j in range(seq_len): if label_ids[i, j] != nn.CrossEntropyLoss().ignore_index: out_label_list[i].append(label_map[label_ids[i][j]]) preds_list[i].append(label_map[preds[i][j]]) return preds_list, out_label_list def compute_metrics(p: EvalPrediction) -> Dict: preds_list, out_label_list = align_predictions(p.predictions, p.label_ids) return { "precision": precision_score(out_label_list, preds_list), "recall": recall_score(out_label_list, preds_list), "f1": f1_score(out_label_list, preds_list), } # Initialize our Trainer import pdb; pdb.set_trace() trainer = Trainer( model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset, compute_metrics=compute_metrics, ) # Training if training_args.do_train: trainer.train( model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None ) trainer.save_model() # For convenience, we also re-save the tokenizer to the same directory, # so that you can share your model easily on huggingface.co/models =) if trainer.is_world_master(): tokenizer.save_pretrained(training_args.output_dir) # Evaluation results = {} if training_args.do_eval: logger.info("*** Evaluate ***") result = trainer.evaluate() output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt") if trainer.is_world_master(): with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key, value in result.items(): logger.info(" %s = %s", key, value) writer.write("%s = %s\n" % (key, value)) results.update(result) # Predict if training_args.do_predict: test_dataset = NerDataset( data_dir=data_args.data_dir, tokenizer=tokenizer, labels=labels, model_type=config.model_type, max_seq_length=data_args.max_seq_length, overwrite_cache=data_args.overwrite_cache, mode=Split.test, ) predictions, label_ids, metrics = trainer.predict(test_dataset) preds_list, _ = align_predictions(predictions, label_ids) output_test_results_file = os.path.join(training_args.output_dir, "test_results.txt") if trainer.is_world_master(): with open(output_test_results_file, "w") as writer: for key, value in metrics.items(): logger.info(" %s = %s", key, value) writer.write("%s = %s\n" % (key, value)) # Save predictions output_test_predictions_file = os.path.join(training_args.output_dir, "test_predictions.txt") if trainer.is_world_master(): with open(output_test_predictions_file, "w") as writer: with open(os.path.join(data_args.data_dir, "test.txt"), "r") as f: example_id = 0 for line in f: if line.startswith("-DOCSTART-") or line == "" or line == "\n": writer.write(line) if not preds_list[example_id]: example_id += 1 elif preds_list[example_id]: output_line = line.split()[0] + " " + preds_list[example_id].pop(0) + "\n" writer.write(output_line) else: logger.warning( "Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0] ) return results
def main(): # region Argument parsing # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TFTrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() # endregion # region Logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) logger.setLevel(logging.INFO) datasets.utils.logging.set_verbosity(logging.INFO) transformers.utils.logging.set_verbosity(logging.INFO) # Log on each process the small summary: logger.info(f"Training/evaluation parameters {training_args}") # endregion # region T5 special-casing if data_args.source_prefix is None and model_args.model_name_or_path in [ "t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b", ]: logger.warning( "You're running a t5 model but didn't provide a source prefix, which is the expected, e.g. with " "`--source_prefix 'summarize: ' `" ) # endregion # region Detecting last checkpoint last_checkpoint = None if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome." ) elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) # endregion # Set seed before initializing model. set_seed(training_args.seed) # region Load datasets # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below) # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ # (the dataset will be downloaded automatically from the datasets Hub). # # For CSV/JSON files this script will use the first column for the full texts and the second column for the # summaries (unless you specify column names for this with the `text_column` and `summary_column` arguments). # # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. if data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. raw_datasets = load_dataset( data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, ) else: data_files = {} if data_args.train_file is not None: data_files["train"] = data_args.train_file extension = data_args.train_file.split(".")[-1] if data_args.validation_file is not None: data_files["validation"] = data_args.validation_file extension = data_args.validation_file.split(".")[-1] if data_args.test_file is not None: data_files["test"] = data_args.test_file extension = data_args.test_file.split(".")[-1] raw_datasets = load_dataset( extension, data_files=data_files, cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, ) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. # endregion # region Load model config and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) prefix = data_args.source_prefix if data_args.source_prefix is not None else "" # endregion # region Dataset preprocessing # We need to tokenize inputs and targets. if training_args.do_train: column_names = raw_datasets["train"].column_names elif training_args.do_eval: column_names = raw_datasets["validation"].column_names else: logger.info("There is nothing to do. Please pass `do_train`, and/or `do_eval`.") return # Get the column names for input/target. dataset_columns = summarization_name_mapping.get(data_args.dataset_name, None) if data_args.text_column is None: text_column = dataset_columns[0] if dataset_columns is not None else column_names[0] else: text_column = data_args.text_column if text_column not in column_names: raise ValueError( f"--text_column' value '{data_args.text_column}' needs to be one of: {', '.join(column_names)}" ) if data_args.summary_column is None: summary_column = dataset_columns[1] if dataset_columns is not None else column_names[1] else: summary_column = data_args.summary_column if summary_column not in column_names: raise ValueError( f"--summary_column' value '{data_args.summary_column}' needs to be one of: {', '.join(column_names)}" ) # Temporarily set max_target_length for training. max_target_length = data_args.max_target_length padding = "max_length" if data_args.pad_to_max_length else False def preprocess_function(examples): inputs = examples[text_column] targets = examples[summary_column] inputs = [prefix + inp for inp in inputs] model_inputs = tokenizer(inputs, max_length=data_args.max_source_length, padding=padding, truncation=True) # Setup the tokenizer for targets with tokenizer.as_target_tokenizer(): labels = tokenizer(targets, max_length=max_target_length, padding=padding, truncation=True) # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore # padding in the loss. if padding == "max_length" and data_args.ignore_pad_token_for_loss: labels["input_ids"] = [ [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"] ] model_inputs["labels"] = labels["input_ids"] return model_inputs if training_args.do_train: if "train" not in raw_datasets: raise ValueError("--do_train requires a train dataset") train_dataset = raw_datasets["train"] if data_args.max_train_samples is not None: max_train_samples = min(len(train_dataset), data_args.max_train_samples) train_dataset = train_dataset.select(range(max_train_samples)) with training_args.main_process_first(desc="train dataset map pre-processing"): train_dataset = train_dataset.map( preprocess_function, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, desc="Running tokenizer on train dataset", ) else: train_dataset = None if training_args.do_eval: max_target_length = data_args.val_max_target_length if "validation" not in raw_datasets: raise ValueError("--do_eval requires a validation dataset") eval_dataset = raw_datasets["validation"] if data_args.max_eval_samples is not None: max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) eval_dataset = eval_dataset.select(range(max_eval_samples)) with training_args.main_process_first(desc="validation dataset map pre-processing"): eval_dataset = eval_dataset.map( preprocess_function, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, desc="Running tokenizer on validation dataset", ) else: eval_dataset = None # endregion # region Text preprocessing def postprocess_text(preds, labels): preds = [pred.strip() for pred in preds] labels = [label.strip() for label in labels] # rougeLSum expects newline after each sentence preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds] labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels] return preds, labels # endregion with training_args.strategy.scope(): # region Prepare model model = TFAutoModelForSeq2SeqLM.from_pretrained( model_args.model_name_or_path, config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) model.resize_token_embeddings(len(tokenizer)) # endregion # region Prepare TF Dataset objects if model.config.decoder_start_token_id is None: raise ValueError("Make sure that `config.decoder_start_token_id` is correctly defined") num_replicas = training_args.strategy.num_replicas_in_sync total_train_batch_size = training_args.per_device_train_batch_size * num_replicas total_eval_batch_size = training_args.per_device_eval_batch_size * num_replicas tf_train_dataset = dataset_to_tf( train_dataset, model, tokenizer, total_batch_size=total_train_batch_size, num_epochs=training_args.num_train_epochs, shuffle=True, ) tf_eval_dataset = dataset_to_tf( eval_dataset, model, tokenizer, total_eval_batch_size, num_epochs=1, shuffle=False, ) # endregion # region Optimizer, loss and LR scheduling # Scheduler and math around the number of training steps. num_update_steps_per_epoch = len(train_dataset) // total_train_batch_size num_train_steps = training_args.num_train_epochs * num_update_steps_per_epoch optimizer, lr_schedule = create_optimizer( init_lr=training_args.learning_rate, num_train_steps=num_train_steps, num_warmup_steps=0 ) def masked_sparse_categorical_crossentropy(y_true, y_pred): # We clip the negative labels to 0 to avoid NaNs appearing in the output and # fouling up everything that comes afterwards. The loss values corresponding to clipped values # will be masked later anyway, but even masked NaNs seem to cause overflows for some reason. # 1e6 is chosen as a reasonable upper bound for the number of token indices - in the unlikely # event that you have more than 1 million tokens in your vocabulary, consider increasing this value. # More pragmatically, consider redesigning your tokenizer. losses = tf.keras.losses.sparse_categorical_crossentropy( tf.clip_by_value(y_true, 0, int(1e6)), y_pred, from_logits=True ) # Compute the per-sample loss only over the unmasked tokens losses = tf.ragged.boolean_mask(losses, y_true != -100) losses = tf.reduce_mean(losses, axis=-1) return losses # endregion # region Metric metric = load_metric("rouge") # endregion # region Training model.compile(loss={"logits": masked_sparse_categorical_crossentropy}, optimizer=optimizer) if training_args.do_train: logger.info("***** Running training *****") logger.info(f" Num examples = {len(train_dataset)}") logger.info(f" Num Epochs = {training_args.num_train_epochs}") logger.info(f" Instantaneous batch size per device = {training_args.per_device_train_batch_size}") logger.info(f" Total train batch size = {total_train_batch_size}") logger.info(f" Total optimization steps = {num_train_steps}") model.fit( tf_train_dataset, epochs=int(training_args.num_train_epochs), steps_per_epoch=num_update_steps_per_epoch, ) # endregion # region Validation if data_args.val_max_target_length is None: data_args.val_max_target_length = data_args.max_target_length gen_kwargs = { "max_length": data_args.val_max_target_length if data_args is not None else config.max_length, "num_beams": data_args.num_beams, } if training_args.do_eval: logger.info("Evaluation...") for batch, labels in tqdm( tf_eval_dataset, total=len(eval_dataset) // training_args.per_device_eval_batch_size ): batch.update(gen_kwargs) generated_tokens = model.generate(**batch) if isinstance(generated_tokens, tuple): generated_tokens = generated_tokens[0] decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) labels = np.where(labels != -100, labels, tokenizer.pad_token_id) decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels) metric.add_batch(predictions=decoded_preds, references=decoded_labels) result = metric.compute(use_stemmer=True) # Extract a few results from ROUGE result = {key: value.mid.fmeasure * 100 for key, value in result.items()} result = {k: round(v, 4) for k, v in result.items()} logger.info(result) # endregion if training_args.output_dir is not None: model.save_pretrained(training_args.output_dir)
def main(): args = parse_args() # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. accelerator = Accelerator() # Make one log on every process with the configuration for debugging. logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO, ) logger.info(accelerator.state) # Setup logging, we only want one process per machine to log things on the screen. # accelerator.is_local_main_process is only True for one process per machine. logger.setLevel( logging.INFO if accelerator.is_local_main_process else logging.ERROR) if accelerator.is_local_main_process: datasets.utils.logging.set_verbosity_warning() transformers.utils.logging.set_verbosity_info() else: datasets.utils.logging.set_verbosity_error() transformers.utils.logging.set_verbosity_error() # If passed along, set the training seed now. if args.seed is not None: set_seed(args.seed) # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ # (the dataset will be downloaded automatically from the datasets Hub). # # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called # 'text' is found. You can easily tweak this behavior (see below). # # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. if args.dataset_name is not None: # Downloading and loading a dataset from the hub. raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name) if "validation" not in raw_datasets.keys(): raw_datasets["validation"] = load_dataset( args.dataset_name, args.dataset_config_name, split=f"train[:{args.validation_split_percentage}%]", ) raw_datasets["train"] = load_dataset( args.dataset_name, args.dataset_config_name, split=f"train[{args.validation_split_percentage}%:]", ) else: data_files = {} if args.train_file is not None: data_files["train"] = args.train_file if args.validation_file is not None: data_files["validation"] = args.validation_file extension = args.train_file.split(".")[-1] if extension == "txt": extension = "text" raw_datasets = load_dataset(extension, data_files=data_files) # If no validation data is there, validation_split_percentage will be used to divide the dataset. if "validation" not in raw_datasets.keys(): raw_datasets["validation"] = load_dataset( extension, data_files=data_files, split=f"train[:{args.validation_split_percentage}%]", ) raw_datasets["train"] = load_dataset( extension, data_files=data_files, split=f"train[{args.validation_split_percentage}%:]", ) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. # Load pretrained model and tokenizer # # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. if args.config_name: config = AutoConfig.from_pretrained(args.config_name) elif args.model_name_or_path: config = AutoConfig.from_pretrained(args.model_name_or_path) else: config = CONFIG_MAPPING[args.model_type]() logger.warning( "You are instantiating a new config instance from scratch.") if args.tokenizer_name: tokenizer = AutoTokenizer.from_pretrained( args.tokenizer_name, use_fast=not args.use_slow_tokenizer) elif args.model_name_or_path: tokenizer = AutoTokenizer.from_pretrained( args.model_name_or_path, use_fast=not args.use_slow_tokenizer) else: raise ValueError( "You are instantiating a new tokenizer from scratch. This is not supported by this script." "You can do it from another script, save it, and load it from here, using --tokenizer_name." ) if args.model_name_or_path: model = AutoModelForCausalLM.from_pretrained( args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config, ) else: logger.info("Training new model from scratch") model = AutoModelForCausalLM.from_config(config) model.resize_token_embeddings(len(tokenizer)) # Preprocessing the datasets. # First we tokenize all the texts. column_names = raw_datasets["train"].column_names text_column_name = "text" if "text" in column_names else column_names[0] def tokenize_function(examples): return tokenizer(examples[text_column_name]) tokenized_datasets = raw_datasets.map( tokenize_function, batched=True, num_proc=args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not args.overwrite_cache, desc="Running tokenizer on dataset", ) if args.block_size is None: block_size = tokenizer.model_max_length if block_size > 1024: logger.warning( f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " "Picking 1024 instead. You can change that default value by passing --block_size xxx." ) block_size = 1024 else: if args.block_size > tokenizer.model_max_length: logger.warning( f"The block_size passed ({args.block_size}) is larger than the maximum length for the model" f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}." ) block_size = min(args.block_size, tokenizer.model_max_length) # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size. def group_texts(examples): # Concatenate all texts. concatenated_examples = { k: sum(examples[k], []) for k in examples.keys() } total_length = len(concatenated_examples[list(examples.keys())[0]]) # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can # customize this part to your needs. if total_length >= block_size: total_length = (total_length // block_size) * block_size # Split by chunks of max_len. result = { k: [t[i:i + block_size] for i in range(0, total_length, block_size)] for k, t in concatenated_examples.items() } result["labels"] = result["input_ids"].copy() return result # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower # to preprocess. # # To speed up this part, we use multiprocessing. See the documentation of the map method for more information: # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map lm_datasets = tokenized_datasets.map( group_texts, batched=True, num_proc=args.preprocessing_num_workers, load_from_cache_file=not args.overwrite_cache, desc=f"Grouping texts in chunks of {block_size}", ) train_dataset = lm_datasets["train"] eval_dataset = lm_datasets["validation"] # Log a few random samples from the training set: for index in random.sample(range(len(train_dataset)), 3): logger.info( f"Sample {index} of the training set: {train_dataset[index]}.") # DataLoaders creation: train_dataloader = DataLoader(train_dataset, shuffle=True, collate_fn=default_data_collator, batch_size=args.per_device_train_batch_size) eval_dataloader = DataLoader(eval_dataset, collate_fn=default_data_collator, batch_size=args.per_device_eval_batch_size) # Optimizer # Split weights in two groups, one with weight decay and the other not. no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": args.weight_decay, }, { "params": [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0, }, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate) # Prepare everything with our `accelerator`. model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare( model, optimizer, train_dataloader, eval_dataloader) # Note -> the training dataloader needs to be prepared before we grab his length below (cause its length will be # shorter in multiprocess) # Scheduler and math around the number of training steps. num_update_steps_per_epoch = math.ceil( len(train_dataloader) / args.gradient_accumulation_steps) if args.max_train_steps is None: args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch else: args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) lr_scheduler = get_scheduler( name=args.lr_scheduler_type, optimizer=optimizer, num_warmup_steps=args.num_warmup_steps, num_training_steps=args.max_train_steps, ) # Train! total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps logger.info("***** Running training *****") logger.info(f" Num examples = {len(train_dataset)}") logger.info(f" Num Epochs = {args.num_train_epochs}") logger.info( f" Instantaneous batch size per device = {args.per_device_train_batch_size}" ) logger.info( f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}" ) logger.info( f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") logger.info(f" Total optimization steps = {args.max_train_steps}") # Only show the progress bar once on each machine. progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process) completed_steps = 0 for epoch in range(args.num_train_epochs): model.train() for step, batch in enumerate(train_dataloader): outputs = model(**batch) loss = outputs.loss loss = loss / args.gradient_accumulation_steps accelerator.backward(loss) if step % args.gradient_accumulation_steps == 0 or step == len( train_dataloader) - 1: optimizer.step() lr_scheduler.step() optimizer.zero_grad() progress_bar.update(1) completed_steps += 1 if completed_steps >= args.max_train_steps: break model.eval() losses = [] for step, batch in enumerate(eval_dataloader): with torch.no_grad(): outputs = model(**batch) loss = outputs.loss losses.append( accelerator.gather(loss.repeat( args.per_device_eval_batch_size))) losses = torch.cat(losses) losses = losses[:len(eval_dataset)] try: perplexity = math.exp(torch.mean(losses)) except OverflowError: perplexity = float("inf") logger.info(f"epoch {epoch}: perplexity: {perplexity}") if args.output_dir is not None: accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TrainingArguments)) model_args, data_args, training_args = parser.parse_args_into_dataclasses() if data_args.eval_data_file is None and training_args.do_eval: raise ValueError( "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file " "or remove the --do_eval argument.") if (os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir): raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome." ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN, ) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", training_args.local_rank, training_args.device, training_args.n_gpu, bool(training_args.local_rank != -1), training_args.fp16, ) logger.info("Training/evaluation parameters %s", training_args) # Set seed set_seed(training_args.seed) # Load pretrained model and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. if model_args.config_name: config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir) elif model_args.model_name_or_path: config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir) else: config = CONFIG_MAPPING[model_args.model_type]() logger.warning( "You are instantiating a new config instance from scratch.") if model_args.tokenizer_name: tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name, cache_dir=model_args.cache_dir) elif model_args.model_name_or_path: tokenizer = AutoTokenizer.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir) else: raise ValueError( "You are instantiating a new tokenizer from scratch. This is not supported, but you can do it from another script, save it," "and load it from here, using --tokenizer_name") if model_args.model_name_or_path: model = AutoModelWithLMHead.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, ) else: logger.info("Training new model from scratch") model = AutoModelWithLMHead.from_config(config) model.resize_token_embeddings(len(tokenizer)) if (config.model_type in ["bert", "roberta", "distilbert", "camembert"] and not data_args.mlm): raise ValueError( "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the" "--mlm flag (masked language modeling).") if data_args.block_size <= 0: data_args.block_size = tokenizer.max_len # Our input block size will be the max possible for the model else: data_args.block_size = min(data_args.block_size, tokenizer.max_len) # Get datasets train_dataset = (get_dataset( data_args, tokenizer=tokenizer, cache_dir=model_args.cache_dir) if training_args.do_train else None) eval_dataset = (get_dataset(data_args, tokenizer=tokenizer, evaluate=True, cache_dir=model_args.cache_dir) if training_args.do_eval else None) if config.model_type == "xlnet": data_collator = DataCollatorForPermutationLanguageModeling( tokenizer=tokenizer, plm_probability=data_args.plm_probability, max_span_length=data_args.max_span_length, ) else: data_collator = DataCollatorForLanguageModeling( tokenizer=tokenizer, mlm=data_args.mlm, mlm_probability=data_args.mlm_probability, ) # Initialize our Trainer trainer = Trainer( model=model, args=training_args, data_collator=data_collator, train_dataset=train_dataset, eval_dataset=eval_dataset, prediction_loss_only=True, ) # Training if training_args.do_train: model_path = (model_args.model_name_or_path if model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path) else None) trainer.train(model_path=model_path) trainer.save_model() # For convenience, we also re-save the tokenizer to the same directory, # so that you can share your model easily on huggingface.co/models =) if trainer.is_world_master(): tokenizer.save_pretrained(training_args.output_dir) # Evaluation results = {} if training_args.do_eval: logger.info("*** Evaluate ***") eval_output = trainer.evaluate() perplexity = math.exp(eval_output["eval_loss"]) result = {"perplexity": perplexity} output_eval_file = os.path.join(training_args.output_dir, "eval_results_lm.txt") if trainer.is_world_master(): with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) results.update(result) return results
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() if ( os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir ): raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty." "Use --overwrite_output_dir to overcome." ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", level=logging.INFO, datefmt="[%X]", ) # Log on each process the small summary: logger = logging.getLogger(__name__) # Set the verbosity to info of the Transformers logger (on main process only): logger.info(f"Training/evaluation parameters {training_args}") # Set seed before initializing model. set_seed(training_args.seed) # Handle the repository creation if training_args.push_to_hub: if training_args.hub_model_id is None: repo_name = get_full_repo_name( Path(training_args.output_dir).absolute().name, token=training_args.hub_token ) else: repo_name = training_args.hub_model_id repo = Repository(training_args.output_dir, clone_from=repo_name) # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ # (the dataset will be downloaded automatically from the datasets Hub). # # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called # 'text' is found. You can easily tweak this behavior (see below). if data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. datasets = load_dataset( data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, ) if "validation" not in datasets.keys(): datasets["validation"] = load_dataset( data_args.dataset_name, data_args.dataset_config_name, split=f"train[:{data_args.validation_split_percentage}%]", cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, ) datasets["train"] = load_dataset( data_args.dataset_name, data_args.dataset_config_name, split=f"train[{data_args.validation_split_percentage}%:]", cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, ) else: data_files = {} if data_args.train_file is not None: data_files["train"] = data_args.train_file if data_args.validation_file is not None: data_files["validation"] = data_args.validation_file extension = data_args.train_file.split(".")[-1] if extension == "txt": extension = "text" datasets = load_dataset( extension, data_files=data_files, cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, ) if "validation" not in datasets.keys(): datasets["validation"] = load_dataset( extension, data_files=data_files, split=f"train[:{data_args.validation_split_percentage}%]", cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, ) datasets["train"] = load_dataset( extension, data_files=data_files, split=f"train[{data_args.validation_split_percentage}%:]", cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, ) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. # Load pretrained model and tokenizer if model_args.tokenizer_name: tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer, use_auth_token=True if model_args.use_auth_token else None, ) elif model_args.model_name_or_path: tokenizer = AutoTokenizer.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer, use_auth_token=True if model_args.use_auth_token else None, ) else: raise ValueError( "You are instantiating a new tokenizer from scratch. This is not supported by this script." "You can do it from another script, save it, and load it from here, using --tokenizer_name." ) if model_args.config_name: config = T5Config.from_pretrained( model_args.config_name, cache_dir=model_args.cache_dir, vocab_size=len(tokenizer), use_auth_token=True if model_args.use_auth_token else None, ) elif model_args.model_name_or_path: config = T5Config.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, ) else: config = CONFIG_MAPPING[model_args.model_type]() logger.warning("You are instantiating a new config instance from scratch.") # Preprocessing the datasets. # First we tokenize all the texts. if training_args.do_train: column_names = datasets["train"].column_names else: column_names = datasets["validation"].column_names text_column_name = "text" if "text" in column_names else column_names[0] max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts. # Since we make sure that all sequences are of the same length, no attention_mask is needed. def tokenize_function(examples): return tokenizer(examples[text_column_name], return_attention_mask=False) tokenized_datasets = datasets.map( tokenize_function, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, ) # T5-like span masked language modeling will fuse consecutively masked tokens to a single sentinel token. # To ensure that the input length is `max_seq_length`, we need to increase the maximum length # according to `mlm_probability` and `mean_noise_span_length`. We can also define the label length accordingly. expanded_inputs_length, targets_length = compute_input_and_target_lengths( inputs_length=max_seq_length, noise_density=data_args.mlm_probability, mean_noise_span_length=data_args.mean_noise_span_length, ) # Main data processing function that will concatenate all texts from our dataset and generate chunks of expanded_inputs_length. def group_texts(examples): # Concatenate all texts. concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()} total_length = len(concatenated_examples[list(examples.keys())[0]]) # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can # customize this part to your needs. if total_length >= expanded_inputs_length: total_length = (total_length // expanded_inputs_length) * expanded_inputs_length # Split by chunks of max_len. result = { k: [t[i : i + expanded_inputs_length] for i in range(0, total_length, expanded_inputs_length)] for k, t in concatenated_examples.items() } return result # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value # might be slower to preprocess. # # To speed up this part, we use multiprocessing. See the documentation of the map method for more information: # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map tokenized_datasets = tokenized_datasets.map( group_texts, batched=True, num_proc=data_args.preprocessing_num_workers, load_from_cache_file=not data_args.overwrite_cache, ) # Enable tensorboard only on the master node has_tensorboard = is_tensorboard_available() if has_tensorboard and jax.process_index() == 0: try: from flax.metrics.tensorboard import SummaryWriter summary_writer = SummaryWriter(log_dir=Path(training_args.output_dir)) except ImportError as ie: has_tensorboard = False logger.warning( f"Unable to display metrics through TensorBoard because some package are not installed: {ie}" ) else: logger.warning( "Unable to display metrics through TensorBoard because the package is not installed: " "Please run pip install tensorboard to enable." ) # Initialize our training rng = jax.random.PRNGKey(training_args.seed) dropout_rngs = jax.random.split(rng, jax.local_device_count()) if model_args.model_name_or_path: model = FlaxT5ForConditionalGeneration.from_pretrained( model_args.model_name_or_path, config=config, seed=training_args.seed, dtype=getattr(jnp, model_args.dtype), use_auth_token=True if model_args.use_auth_token else None, ) else: config.vocab_size = len(tokenizer) model = FlaxT5ForConditionalGeneration( config, seed=training_args.seed, dtype=getattr(jnp, model_args.dtype), use_auth_token=True if model_args.use_auth_token else None, ) # Data collator # This one will take care of randomly masking the tokens. data_collator = FlaxDataCollatorForT5MLM( tokenizer=tokenizer, noise_density=data_args.mlm_probability, mean_noise_span_length=data_args.mean_noise_span_length, input_length=max_seq_length, target_length=targets_length, pad_token_id=model.config.pad_token_id, decoder_start_token_id=model.config.decoder_start_token_id, ) # Store some constant num_epochs = int(training_args.num_train_epochs) train_batch_size = int(training_args.per_device_train_batch_size) * jax.device_count() eval_batch_size = int(training_args.per_device_eval_batch_size) * jax.device_count() num_train_steps = len(tokenized_datasets["train"]) // train_batch_size * num_epochs num_of_hosts = jax.process_count() current_host_idx = jax.process_index() # Create learning rate schedule warmup_fn = optax.linear_schedule( init_value=0.0, end_value=training_args.learning_rate, transition_steps=training_args.warmup_steps ) decay_fn = optax.linear_schedule( init_value=training_args.learning_rate, end_value=0, transition_steps=num_train_steps - training_args.warmup_steps, ) linear_decay_lr_schedule_fn = optax.join_schedules( schedules=[warmup_fn, decay_fn], boundaries=[training_args.warmup_steps] ) # We use Optax's "masking" functionality to not apply weight decay # to bias and LayerNorm scale parameters. decay_mask_fn returns a # mask boolean with the same structure as the parameters. # The mask is True for parameters that should be decayed. def decay_mask_fn(params): flat_params = traverse_util.flatten_dict(params) flat_mask = { path: (path[-1] != "bias" and path[-2:] not in [("layer_norm", "scale"), ("final_layer_norm", "scale")]) for path in flat_params } return traverse_util.unflatten_dict(flat_mask) # create adam optimizer if training_args.adafactor: # We use the default parameters here to initialize adafactor, # For more details about the parameters please check https://github.com/deepmind/optax/blob/ed02befef9bf81cbbf236be3d2b0e032e9ed4a40/optax/_src/alias.py#L74 optimizer = optax.adafactor( learning_rate=linear_decay_lr_schedule_fn, ) else: optimizer = optax.adamw( learning_rate=linear_decay_lr_schedule_fn, b1=training_args.adam_beta1, b2=training_args.adam_beta2, weight_decay=training_args.weight_decay, mask=decay_mask_fn, ) # Setup train state state = train_state.TrainState.create(apply_fn=model.__call__, params=model.params, tx=optimizer) # Define gradient update step fn def train_step(state, batch, dropout_rng): dropout_rng, new_dropout_rng = jax.random.split(dropout_rng) def loss_fn(params): labels = batch.pop("labels") logits = state.apply_fn(**batch, params=params, dropout_rng=dropout_rng, train=True)[0] # compute loss loss = optax.softmax_cross_entropy(logits, onehot(labels, logits.shape[-1])).mean() return loss grad_fn = jax.value_and_grad(loss_fn) loss, grad = grad_fn(state.params) grad = jax.lax.pmean(grad, "batch") new_state = state.apply_gradients(grads=grad) metrics = jax.lax.pmean( {"loss": loss, "learning_rate": linear_decay_lr_schedule_fn(state.step)}, axis_name="batch" ) return new_state, metrics, new_dropout_rng # Create parallel version of the train step p_train_step = jax.pmap(train_step, "batch", donate_argnums=(0,)) # Define eval fn def eval_step(params, batch): labels = batch.pop("labels") logits = model(**batch, params=params, train=False)[0] # compute loss loss = optax.softmax_cross_entropy(logits, onehot(labels, logits.shape[-1])) # compute accuracy accuracy = jnp.equal(jnp.argmax(logits, axis=-1), labels) # summarize metrics metrics = {"loss": loss.mean(), "accuracy": accuracy.mean()} metrics = jax.lax.pmean(metrics, axis_name="batch") return metrics p_eval_step = jax.pmap(eval_step, "batch", donate_argnums=(0,)) # Replicate the train state on each device state = jax_utils.replicate(state) train_time = 0 epochs = tqdm(range(num_epochs), desc="Epoch ... ", position=0) for epoch in epochs: # ======================== Training ================================ train_start = time.time() train_metrics = [] # Create sampling rng rng, input_rng = jax.random.split(rng) # Generate an epoch by shuffling sampling indices from the train dataset num_train_samples = len(tokenized_datasets["train"]) train_samples_idx = np.random.permutation(np.arange(num_train_samples)) train_batch_idx = generate_batch_splits(train_samples_idx, train_batch_size) # Gather the indexes for creating the batch and do a training step for step, batch_idx in enumerate(tqdm(train_batch_idx, desc="Training...", position=1)): samples = [tokenized_datasets["train"][int(idx)] for idx in batch_idx] model_inputs = data_collator(samples) local_host_model_inputs = { key: np.split(model_inputs.data[key], num_of_hosts, axis=0)[current_host_idx] for key, value in model_inputs.data.items() } # Model forward model_inputs = shard(local_host_model_inputs) state, train_metric, dropout_rngs = p_train_step(state, model_inputs, dropout_rngs) train_metrics.append(train_metric) cur_step = epoch * (num_train_samples // train_batch_size) + step if cur_step % training_args.logging_steps == 0 and cur_step > 0: # Save metrics train_metric = jax_utils.unreplicate(train_metric) train_time += time.time() - train_start if has_tensorboard and jax.process_index() == 0: write_train_metric(summary_writer, train_metrics, train_time, cur_step) epochs.write( f"Step... ({cur_step} | Loss: {train_metric['loss'].mean()}, Learning Rate:" f" {train_metric['learning_rate'].mean()})" ) train_metrics = [] if cur_step % training_args.eval_steps == 0 and cur_step > 0: # ======================== Evaluating ============================== num_eval_samples = len(tokenized_datasets["validation"]) eval_samples_idx = jnp.arange(num_eval_samples) eval_batch_idx = generate_batch_splits(eval_samples_idx, eval_batch_size) eval_metrics = [] for i, batch_idx in enumerate(tqdm(eval_batch_idx, desc="Evaluating ...", position=2)): samples = [tokenized_datasets["validation"][int(idx)] for idx in batch_idx] model_inputs = data_collator(samples) # Model forward model_inputs = shard(model_inputs.data) metrics = p_eval_step(state.params, model_inputs) eval_metrics.append(metrics) # get eval metrics eval_metrics = get_metrics(eval_metrics) eval_metrics = jax.tree_map(jnp.mean, eval_metrics) # Update progress bar epochs.write(f"Step... ({cur_step} | Loss: {eval_metrics['loss']}, Acc: {eval_metrics['accuracy']})") # Save metrics if has_tensorboard and jax.process_index() == 0: write_eval_metric(summary_writer, eval_metrics, cur_step) if cur_step % training_args.save_steps == 0 and cur_step > 0: # save checkpoint after each epoch and push checkpoint to the hub if jax.process_index() == 0: params = jax.device_get(jax.tree_map(lambda x: x[0], state.params)) model.save_pretrained(training_args.output_dir, params=params) tokenizer.save_pretrained(training_args.output_dir) if training_args.push_to_hub: repo.push_to_hub(commit_message=f"Saving weights and logs of step {cur_step}", blocking=False) # Eval after training if training_args.do_eval: num_eval_samples = len(tokenized_datasets["validation"]) eval_samples_idx = jnp.arange(num_eval_samples) eval_batch_idx = generate_batch_splits(eval_samples_idx, eval_batch_size) eval_metrics = [] for i, batch_idx in enumerate(tqdm(eval_batch_idx, desc="Evaluating ...", position=2)): samples = [tokenized_datasets["validation"][int(idx)] for idx in batch_idx] model_inputs = data_collator(samples) # Model forward model_inputs = shard(model_inputs.data) metrics = p_eval_step(state.params, model_inputs) eval_metrics.append(metrics) # get eval metrics eval_metrics = get_metrics(eval_metrics) eval_metrics = jax.tree_map(lambda metric: jnp.mean(metric).item(), eval_metrics) if jax.process_index() == 0: eval_metrics = {f"eval_{metric_name}": value for metric_name, value in eval_metrics.items()} path = os.path.join(training_args.output_dir, "eval_results.json") with open(path, "w") as f: json.dump(eval_metrics, f, indent=4, sort_keys=True)
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments, MultiLingAdapterArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args, adapter_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args, adapter_args = parser.parse_args_into_dataclasses( ) # Detecting last checkpoint. last_checkpoint = None if os.path.isdir( training_args.output_dir ) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is None and len(os.listdir( training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome.") elif last_checkpoint is not None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) logger.setLevel(logging.INFO if is_main_process(training_args.local_rank ) else logging.WARN) # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) # Set the verbosity to info of the Transformers logger (on main process only): if is_main_process(training_args.local_rank): transformers.utils.logging.set_verbosity_info() transformers.utils.logging.enable_default_handler() transformers.utils.logging.enable_explicit_format() logger.info(f"Training/evaluation parameters {training_args}") # Set seed before initializing model. set_seed(training_args.seed) # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ # (the dataset will be downloaded automatically from the datasets Hub). # # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called # 'text' is found. You can easily tweak this behavior (see below). # # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. if data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name) if "validation" not in datasets.keys(): datasets["validation"] = load_dataset( data_args.dataset_name, data_args.dataset_config_name, split=f"train[:{data_args.validation_split_percentage}%]", ) datasets["train"] = load_dataset( data_args.dataset_name, data_args.dataset_config_name, split=f"train[{data_args.validation_split_percentage}%:]", ) else: data_files = {} if data_args.train_file is not None: data_files["train"] = data_args.train_file if data_args.validation_file is not None: data_files["validation"] = data_args.validation_file extension = (data_args.train_file.split(".")[-1] if data_args.train_file is not None else data_args.validation_file.split(".")[-1]) if extension == "txt": extension = "text" datasets = load_dataset(extension, data_files=data_files) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. # Load pretrained model and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config_kwargs = { "cache_dir": model_args.cache_dir, "revision": model_args.model_revision, "use_auth_token": True if model_args.use_auth_token else None, } if model_args.config_name: config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs) elif model_args.model_name_or_path: config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs) else: config = CONFIG_MAPPING[model_args.model_type]() logger.warning( "You are instantiating a new config instance from scratch.") tokenizer_kwargs = { "cache_dir": model_args.cache_dir, "use_fast": model_args.use_fast_tokenizer, "revision": model_args.model_revision, "use_auth_token": True if model_args.use_auth_token else None, } if model_args.tokenizer_name: tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs) elif model_args.model_name_or_path: tokenizer = AutoTokenizer.from_pretrained( model_args.model_name_or_path, **tokenizer_kwargs) else: raise ValueError( "You are instantiating a new tokenizer from scratch. This is not supported by this script." "You can do it from another script, save it, and load it from here, using --tokenizer_name." ) if model_args.model_name_or_path: model = AutoModelForCausalLM.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) else: logger.info("Training new model from scratch") model = AutoModelForCausalLM.from_config(config) model.resize_token_embeddings(len(tokenizer)) # Setup adapters if adapter_args.train_adapter: task_name = data_args.dataset_name or "clm" # check if adapter already exists, otherwise add it if task_name not in model.config.adapters: # resolve the adapter config adapter_config = AdapterConfig.load( adapter_args.adapter_config, non_linearity=adapter_args.adapter_non_linearity, reduction_factor=adapter_args.adapter_reduction_factor, ) # load a pre-trained from Hub if specified if adapter_args.load_adapter: model.load_adapter( adapter_args.load_adapter, config=adapter_config, load_as=task_name, ) # otherwise, add a fresh adapter else: model.add_adapter(task_name, config=adapter_config) # optionally load a pre-trained language adapter if adapter_args.load_lang_adapter: # resolve the language adapter config lang_adapter_config = AdapterConfig.load( adapter_args.lang_adapter_config, non_linearity=adapter_args.lang_adapter_non_linearity, reduction_factor=adapter_args.lang_adapter_reduction_factor, ) # load the language adapter from Hub lang_adapter_name = model.load_adapter( adapter_args.load_lang_adapter, config=lang_adapter_config, load_as=adapter_args.language, ) else: lang_adapter_name = None # Freeze all model weights except of those of this adapter model.train_adapter([task_name]) # Set the adapters to be used in every forward pass if lang_adapter_name: model.set_active_adapters([lang_adapter_name, task_name]) else: model.set_active_adapters([task_name]) else: if adapter_args.load_adapter or adapter_args.load_lang_adapter: raise ValueError( "Adapters can only be loaded in adapters training mode." "Use --train_adapter to enable adapter training") # Preprocessing the datasets. # First we tokenize all the texts. if training_args.do_train: column_names = datasets["train"].column_names else: column_names = datasets["validation"].column_names text_column_name = "text" if "text" in column_names else column_names[0] def tokenize_function(examples): return tokenizer(examples[text_column_name]) tokenized_datasets = datasets.map( tokenize_function, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, ) if data_args.block_size is None: block_size = tokenizer.model_max_length if block_size > 1024: logger.warn( f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " "Picking 1024 instead. You can change that default value by passing --block_size xxx." ) block_size = 1024 else: if data_args.block_size > tokenizer.model_max_length: logger.warn( f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model" f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}." ) block_size = min(data_args.block_size, tokenizer.model_max_length) # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size. def group_texts(examples): # Concatenate all texts. concatenated_examples = { k: sum(examples[k], []) for k in examples.keys() } total_length = len(concatenated_examples[list(examples.keys())[0]]) # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can # customize this part to your needs. total_length = (total_length // block_size) * block_size # Split by chunks of max_len. result = { k: [t[i:i + block_size] for i in range(0, total_length, block_size)] for k, t in concatenated_examples.items() } result["labels"] = result["input_ids"].copy() return result # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower # to preprocess. # # To speed up this part, we use multiprocessing. See the documentation of the map method for more information: # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map lm_datasets = tokenized_datasets.map( group_texts, batched=True, num_proc=data_args.preprocessing_num_workers, load_from_cache_file=not data_args.overwrite_cache, ) if training_args.do_train: if "train" not in tokenized_datasets: raise ValueError("--do_train requires a train dataset") train_dataset = lm_datasets["train"] if data_args.max_train_samples is not None: train_dataset = train_dataset.select( range(data_args.max_train_samples)) if training_args.do_eval: if "validation" not in tokenized_datasets: raise ValueError("--do_eval requires a validation dataset") eval_dataset = lm_datasets["validation"] if data_args.max_val_samples is not None: eval_dataset = eval_dataset.select(range( data_args.max_val_samples)) # Initialize our Trainer trainer = Trainer( model=model, args=training_args, train_dataset=train_dataset if training_args.do_train else None, eval_dataset=eval_dataset if training_args.do_eval else None, tokenizer=tokenizer, # Data collator will default to DataCollatorWithPadding, so we change it. data_collator=default_data_collator, do_save_full_model=not adapter_args.train_adapter, do_save_adapters=adapter_args.train_adapter, ) # Training if training_args.do_train: if last_checkpoint is not None: checkpoint = last_checkpoint elif model_args.model_name_or_path is not None and os.path.isdir( model_args.model_name_or_path): checkpoint = model_args.model_name_or_path else: checkpoint = None train_result = trainer.train(resume_from_checkpoint=checkpoint) trainer.save_model() # Saves the tokenizer too for easy upload metrics = train_result.metrics max_train_samples = (data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)) metrics["train_samples"] = min(max_train_samples, len(train_dataset)) trainer.log_metrics("train", metrics) trainer.save_metrics("train", metrics) trainer.save_state() # Evaluation if training_args.do_eval: logger.info("*** Evaluate ***") metrics = trainer.evaluate() max_val_samples = data_args.max_val_samples if data_args.max_val_samples is not None else len( eval_dataset) metrics["eval_samples"] = min(max_val_samples, len(eval_dataset)) perplexity = math.exp(metrics["eval_loss"]) metrics["perplexity"] = perplexity trainer.log_metrics("eval", metrics) trainer.save_metrics("eval", metrics)
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser((ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() # Detecting last checkpoint. last_checkpoint = None if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome." ) elif last_checkpoint is not None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN) # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) # Set the verbosity to info of the Transformers logger (on main process only): if is_main_process(training_args.local_rank): transformers.utils.logging.set_verbosity_info() logger.info("Training/evaluation parameters %s", training_args) # Set seed before initializing model. set_seed(training_args.seed) # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below) # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ # (the dataset will be downloaded automatically from the datasets Hub). # # For CSV/JSON files in the summarization task, this script will use the first column for the full texts and the # second column for the summaries (unless you specify column names for this with the `text_column` and # `summary_column` arguments). # For translation, only JSON files are supported, with one field named "translation" containing two keys for the # source and target languages (unless you adapt what follows). # # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. if data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name) else: data_files = {} if data_args.train_file is not None: data_files["train"] = data_args.train_file extension = data_args.train_file.split(".")[-1] if data_args.validation_file is not None: data_files["validation"] = data_args.validation_file extension = data_args.validation_file.split(".")[-1] if data_args.test_file is not None: data_files["test"] = data_args.test_file extension = data_args.test_file.split(".")[-1] datasets = load_dataset(extension, data_files=data_files) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. # Load pretrained model and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) model = AutoModelForSeq2SeqLM.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) # Set decoder_start_token_id if model.config.decoder_start_token_id is None and isinstance(tokenizer, (MBartTokenizer, MBartTokenizerFast)): assert ( data_args.target_lang is not None and data_args.source_lang is not None ), "mBart requires --target_lang and --source_lang" if isinstance(tokenizer, MBartTokenizer): model.config.decoder_start_token_id = tokenizer.lang_code_to_id[data_args.target_lang] else: model.config.decoder_start_token_id = tokenizer.convert_tokens_to_ids(data_args.target_lang) if model.config.decoder_start_token_id is None: raise ValueError("Make sure that `config.decoder_start_token_id` is correctly defined") prefix = data_args.source_prefix if data_args.source_prefix is not None else "" # Preprocessing the datasets. # We need to tokenize inputs and targets. if training_args.do_train: column_names = datasets["train"].column_names elif training_args.do_eval: column_names = datasets["validation"].column_names elif training_args.do_predict: column_names = datasets["test"].column_names else: logger.info("There is nothing to do. Please pass `do_train`, `do_eval` and/or `do_predict`.") return # For translation we set the codes of our source and target languages (only useful for mBART, the others will # ignore those attributes). if data_args.task.startswith("translation") or isinstance(tokenizer, (MBartTokenizer, MBartTokenizerFast)): if data_args.source_lang is not None: tokenizer.src_lang = data_args.source_lang if data_args.target_lang is not None: tokenizer.tgt_lang = data_args.target_lang # To serialize preprocess_function below, each of those four variables needs to be defined (even if we won't use # them all). source_lang, target_lang, text_column, summary_column = None, None, None, None if data_args.task.startswith("summarization"): # Get the column names for input/target. dataset_columns = summarization_name_mapping.get(data_args.dataset_name, None) if data_args.text_column is None: text_column = dataset_columns[0] if dataset_columns is not None else column_names[0] else: text_column = data_args.text_column if text_column not in column_names: raise ValueError( f"--text_column' value '{data_args.text_column}' needs to be one of: {', '.join(column_names)}" ) if data_args.summary_column is None: summary_column = dataset_columns[1] if dataset_columns is not None else column_names[1] else: summary_column = data_args.summary_column if summary_column not in column_names: raise ValueError( f"--summary_column' value '{data_args.summary_column}' needs to be one of: {', '.join(column_names)}" ) else: # Get the language codes for input/target. lang_search = re.match("translation_([a-z]+)_to_([a-z]+)", data_args.task) if data_args.source_lang is not None: source_lang = data_args.source_lang.split("_")[0] else: assert ( lang_search is not None ), "Provide a source language via --source_lang or rename your task 'translation_xx_to_yy'." source_lang = lang_search.groups()[0] if data_args.target_lang is not None: target_lang = data_args.target_lang.split("_")[0] else: assert ( lang_search is not None ), "Provide a target language via --target_lang or rename your task 'translation_xx_to_yy'." target_lang = lang_search.groups()[1] # Temporarily set max_target_length for training. max_target_length = data_args.max_target_length padding = "max_length" if data_args.pad_to_max_length else False if training_args.label_smoothing_factor > 0 and not hasattr(model, "prepare_decoder_input_ids_from_labels"): logger.warn( "label_smoothing is enabled but the `prepare_decoder_input_ids_from_labels` method is not defined for" f"`{model.__class__.__name__}`. This will lead to loss being calculated twice and will take up more memory" ) def preprocess_function(examples): if data_args.task.startswith("translation"): inputs = [ex[source_lang] for ex in examples["translation"]] targets = [ex[target_lang] for ex in examples["translation"]] else: inputs = examples[text_column] targets = examples[summary_column] inputs = [prefix + inp for inp in inputs] model_inputs = tokenizer(inputs, max_length=data_args.max_source_length, padding=padding, truncation=True) # Setup the tokenizer for targets with tokenizer.as_target_tokenizer(): labels = tokenizer(targets, max_length=max_target_length, padding=padding, truncation=True) # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore # padding in the loss. if padding == "max_length" and data_args.ignore_pad_token_for_loss: labels["input_ids"] = [ [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"] ] model_inputs["labels"] = labels["input_ids"] return model_inputs if training_args.do_train: train_dataset = datasets["train"] if "train" not in datasets: raise ValueError("--do_train requires a train dataset") if data_args.max_train_samples is not None: train_dataset = train_dataset.select(range(data_args.max_train_samples)) train_dataset = train_dataset.map( preprocess_function, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, ) if training_args.do_eval: max_target_length = data_args.val_max_target_length if "validation" not in datasets: raise ValueError("--do_eval requires a validation dataset") eval_dataset = datasets["validation"] if data_args.max_val_samples is not None: eval_dataset = eval_dataset.select(range(data_args.max_val_samples)) eval_dataset = eval_dataset.map( preprocess_function, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, ) if training_args.do_predict: max_target_length = data_args.val_max_target_length if "test" not in datasets: raise ValueError("--do_predict requires a test dataset") test_dataset = datasets["test"] if data_args.max_test_samples is not None: test_dataset = test_dataset.select(range(data_args.max_test_samples)) test_dataset = test_dataset.map( preprocess_function, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, ) # Data collator label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id if data_args.pad_to_max_length: data_collator = default_data_collator else: data_collator = DataCollatorForSeq2Seq( tokenizer, model=model, label_pad_token_id=label_pad_token_id, pad_to_multiple_of=8 if training_args.fp16 else None, ) # Metric metric_name = "rouge" if data_args.task.startswith("summarization") else "sacrebleu" metric = load_metric(metric_name) def postprocess_text(preds, labels): preds = [pred.strip() for pred in preds] labels = [label.strip() for label in labels] # rougeLSum expects newline after each sentence if metric_name == "rouge": preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds] labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels] else: # sacrebleu labels = [[label] for label in labels] return preds, labels def compute_metrics(eval_preds): preds, labels = eval_preds if isinstance(preds, tuple): preds = preds[0] decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True) if data_args.ignore_pad_token_for_loss: # Replace -100 in the labels as we can't decode them. labels = np.where(labels != -100, labels, tokenizer.pad_token_id) decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) # Some simple post-processing decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels) if metric_name == "rouge": result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True) # Extract a few results from ROUGE result = {key: value.mid.fmeasure * 100 for key, value in result.items()} else: result = metric.compute(predictions=decoded_preds, references=decoded_labels) result = {"bleu": result["score"]} prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds] result["gen_len"] = np.mean(prediction_lens) result = {k: round(v, 4) for k, v in result.items()} return result # Initialize our Trainer trainer = Seq2SeqTrainer( model=model, args=training_args, train_dataset=train_dataset if training_args.do_train else None, eval_dataset=eval_dataset if training_args.do_eval else None, tokenizer=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics if training_args.predict_with_generate else None, ) # Training if training_args.do_train: if last_checkpoint is not None: checkpoint = last_checkpoint elif os.path.isdir(model_args.model_name_or_path): checkpoint = model_args.model_name_or_path else: checkpoint = None train_result = trainer.train(resume_from_checkpoint=checkpoint) trainer.save_model() # Saves the tokenizer too for easy upload metrics = train_result.metrics max_train_samples = ( data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) ) metrics["train_samples"] = min(max_train_samples, len(train_dataset)) trainer.log_metrics("train", metrics) trainer.save_metrics("train", metrics) trainer.save_state() # Evaluation results = {} if training_args.do_eval: logger.info("*** Evaluate ***") metrics = trainer.evaluate( max_length=data_args.val_max_target_length, num_beams=data_args.num_beams, metric_key_prefix="eval" ) max_val_samples = data_args.max_val_samples if data_args.max_val_samples is not None else len(eval_dataset) metrics["eval_samples"] = min(max_val_samples, len(eval_dataset)) trainer.log_metrics("eval", metrics) trainer.save_metrics("eval", metrics) if training_args.do_predict: logger.info("*** Test ***") test_results = trainer.predict( test_dataset, metric_key_prefix="test", max_length=data_args.val_max_target_length, num_beams=data_args.num_beams, ) metrics = test_results.metrics max_test_samples = data_args.max_test_samples if data_args.max_test_samples is not None else len(test_dataset) metrics["test_samples"] = min(max_test_samples, len(test_dataset)) trainer.log_metrics("test", metrics) trainer.save_metrics("test", metrics) if trainer.is_world_process_zero(): if training_args.predict_with_generate: test_preds = tokenizer.batch_decode( test_results.predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True ) test_preds = [pred.strip() for pred in test_preds] output_test_preds_file = os.path.join(training_args.output_dir, "test_generations.txt") with open(output_test_preds_file, "w") as writer: writer.write("\n".join(test_preds)) return results
def main(): args = parse_args() # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. accelerator = Accelerator() # Make one log on every process with the configuration for debugging. logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO, ) logger.info(accelerator.state) # Setup logging, we only want one process per machine to log things on the screen. # accelerator.is_local_main_process is only True for one process per machine. logger.setLevel( logging.INFO if accelerator.is_local_main_process else logging.ERROR) if accelerator.is_local_main_process: datasets.utils.logging.set_verbosity_warning() transformers.utils.logging.set_verbosity_info() else: datasets.utils.logging.set_verbosity_error() transformers.utils.logging.set_verbosity_error() # If passed along, set the training seed now. if args.seed is not None: set_seed(args.seed) # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ # (the dataset will be downloaded automatically from the datasets Hub). # # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called # 'text' is found. You can easily tweak this behavior (see below). # # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. if args.dataset_name is not None: # Downloading and loading a dataset from the hub. raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name) else: data_files = {} if args.train_file is not None: data_files["train"] = args.train_file if args.validation_file is not None: data_files["validation"] = args.validation_file if args.test_file is not None: data_files["test"] = args.test_file extension = args.train_file.split(".")[-1] raw_datasets = load_dataset(extension, data_files=data_files, field="data") # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. # Load pretrained model and tokenizer # # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config = XLNetConfig.from_pretrained(args.model_name_or_path) tokenizer = XLNetTokenizerFast.from_pretrained(args.model_name_or_path) model = XLNetForQuestionAnswering.from_pretrained( args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config) # Preprocessing the datasets. # Preprocessing is slighlty different for training and evaluation. column_names = raw_datasets["train"].column_names question_column_name = "question" if "question" in column_names else column_names[ 0] context_column_name = "context" if "context" in column_names else column_names[ 1] answer_column_name = "answers" if "answers" in column_names else column_names[ 2] # Padding side determines if we do (question|context) or (context|question). pad_on_right = tokenizer.padding_side == "right" if args.max_seq_length > tokenizer.model_max_length: logger.warning( f"The max_seq_length passed ({args.max_seq_length}) is larger than the maximum length for the" f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." ) max_seq_length = min(args.max_seq_length, tokenizer.model_max_length) # Training preprocessing def prepare_train_features(examples): # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results # in one example possible giving several features when a context is long, each of those features having a # context that overlaps a bit the context of the previous feature. tokenized_examples = tokenizer( examples[ question_column_name if pad_on_right else context_column_name], examples[ context_column_name if pad_on_right else question_column_name], truncation="only_second" if pad_on_right else "only_first", max_length=max_seq_length, stride=args.doc_stride, return_overflowing_tokens=True, return_offsets_mapping=True, return_special_tokens_mask=True, return_token_type_ids=True, padding="max_length", ) # Since one example might give us several features if it has a long context, we need a map from a feature to # its corresponding example. This key gives us just that. sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") # The offset mappings will give us a map from token to character position in the original context. This will # help us compute the start_positions and end_positions. offset_mapping = tokenized_examples.pop("offset_mapping") # The special tokens will help us build the p_mask (which indicates the tokens that can't be in answers). special_tokens = tokenized_examples.pop("special_tokens_mask") # Let's label those examples! tokenized_examples["start_positions"] = [] tokenized_examples["end_positions"] = [] tokenized_examples["is_impossible"] = [] tokenized_examples["cls_index"] = [] tokenized_examples["p_mask"] = [] for i, offsets in enumerate(offset_mapping): # We will label impossible answers with the index of the CLS token. input_ids = tokenized_examples["input_ids"][i] cls_index = input_ids.index(tokenizer.cls_token_id) tokenized_examples["cls_index"].append(cls_index) # Grab the sequence corresponding to that example (to know what is the context and what is the question). sequence_ids = tokenized_examples["token_type_ids"][i] for k, s in enumerate(special_tokens[i]): if s: sequence_ids[k] = 3 context_idx = 1 if pad_on_right else 0 # Build the p_mask: non special tokens and context gets 0.0, the others get 1.0. # The cls token gets 1.0 too (for predictions of empty answers). tokenized_examples["p_mask"].append([ 0.0 if (not special_tokens[i][k] and s == context_idx) or k == cls_index else 1.0 for k, s in enumerate(sequence_ids) ]) # One example can give several spans, this is the index of the example containing this span of text. sample_index = sample_mapping[i] answers = examples[answer_column_name][sample_index] # If no answers are given, set the cls_index as answer. if len(answers["answer_start"]) == 0: tokenized_examples["start_positions"].append(cls_index) tokenized_examples["end_positions"].append(cls_index) tokenized_examples["is_impossible"].append(1.0) else: # Start/end character index of the answer in the text. start_char = answers["answer_start"][0] end_char = start_char + len(answers["text"][0]) # Start token index of the current span in the text. token_start_index = 0 while sequence_ids[token_start_index] != context_idx: token_start_index += 1 # End token index of the current span in the text. token_end_index = len(input_ids) - 1 while sequence_ids[token_end_index] != context_idx: token_end_index -= 1 # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index). if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char): tokenized_examples["start_positions"].append(cls_index) tokenized_examples["end_positions"].append(cls_index) tokenized_examples["is_impossible"].append(1.0) else: # Otherwise move the token_start_index and token_end_index to the two ends of the answer. # Note: we could go after the last offset if the answer is the last word (edge case). while token_start_index < len(offsets) and offsets[ token_start_index][0] <= start_char: token_start_index += 1 tokenized_examples["start_positions"].append( token_start_index - 1) while offsets[token_end_index][1] >= end_char: token_end_index -= 1 tokenized_examples["end_positions"].append( token_end_index + 1) tokenized_examples["is_impossible"].append(0.0) return tokenized_examples if "train" not in raw_datasets: raise ValueError("--do_train requires a train dataset") train_dataset = raw_datasets["train"] if args.max_train_samples is not None: # We will select sample from whole data if agument is specified train_dataset = train_dataset.select(range(args.max_train_samples)) # Create train feature from dataset train_dataset = train_dataset.map( prepare_train_features, batched=True, num_proc=args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not args.overwrite_cache, desc="Running tokenizer on train dataset", ) if args.max_train_samples is not None: # Number of samples might increase during Feature Creation, We select only specified max samples train_dataset = train_dataset.select(range(args.max_train_samples)) # Validation preprocessing def prepare_validation_features(examples): # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results # in one example possible giving several features when a context is long, each of those features having a # context that overlaps a bit the context of the previous feature. tokenized_examples = tokenizer( examples[ question_column_name if pad_on_right else context_column_name], examples[ context_column_name if pad_on_right else question_column_name], truncation="only_second" if pad_on_right else "only_first", max_length=max_seq_length, stride=args.doc_stride, return_overflowing_tokens=True, return_offsets_mapping=True, return_special_tokens_mask=True, return_token_type_ids=True, padding="max_length", ) # Since one example might give us several features if it has a long context, we need a map from a feature to # its corresponding example. This key gives us just that. sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") # The special tokens will help us build the p_mask (which indicates the tokens that can't be in answers). special_tokens = tokenized_examples.pop("special_tokens_mask") # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the # corresponding example_id and we will store the offset mappings. tokenized_examples["example_id"] = [] # We still provide the index of the CLS token and the p_mask to the model, but not the is_impossible label. tokenized_examples["cls_index"] = [] tokenized_examples["p_mask"] = [] for i, input_ids in enumerate(tokenized_examples["input_ids"]): # Find the CLS token in the input ids. cls_index = input_ids.index(tokenizer.cls_token_id) tokenized_examples["cls_index"].append(cls_index) # Grab the sequence corresponding to that example (to know what is the context and what is the question). sequence_ids = tokenized_examples["token_type_ids"][i] for k, s in enumerate(special_tokens[i]): if s: sequence_ids[k] = 3 context_idx = 1 if pad_on_right else 0 # Build the p_mask: non special tokens and context gets 0.0, the others 1.0. tokenized_examples["p_mask"].append([ 0.0 if (not special_tokens[i][k] and s == context_idx) or k == cls_index else 1.0 for k, s in enumerate(sequence_ids) ]) # One example can give several spans, this is the index of the example containing this span of text. sample_index = sample_mapping[i] tokenized_examples["example_id"].append( examples["id"][sample_index]) # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token # position is part of the context or not. tokenized_examples["offset_mapping"][i] = [ (o if sequence_ids[k] == context_idx else None) for k, o in enumerate(tokenized_examples["offset_mapping"][i]) ] return tokenized_examples if "validation" not in raw_datasets: raise ValueError("--do_eval requires a validation dataset") eval_examples = raw_datasets["validation"] if args.max_eval_samples is not None: # We will select sample from whole data eval_examples = eval_examples.select(range(args.max_eval_samples)) # Validation Feature Creation eval_dataset = eval_examples.map( prepare_validation_features, batched=True, num_proc=args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not args.overwrite_cache, desc="Running tokenizer on validation dataset", ) if args.max_eval_samples is not None: # During Feature creation dataset samples might increase, we will select required samples again eval_dataset = eval_dataset.select(range(args.max_eval_samples)) if args.do_predict: if "test" not in raw_datasets: raise ValueError("--do_predict requires a test dataset") predict_examples = raw_datasets["test"] if args.max_predict_samples is not None: # We will select sample from whole data predict_examples = predict_examples.select( range(args.max_predict_samples)) # Predict Feature Creation predict_dataset = predict_examples.map( prepare_validation_features, batched=True, num_proc=args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not args.overwrite_cache, desc="Running tokenizer on prediction dataset", ) if args.max_predict_samples is not None: # During Feature creation dataset samples might increase, we will select required samples again predict_dataset = predict_dataset.select( range(args.max_predict_samples)) # Log a few random samples from the training set: for index in random.sample(range(len(train_dataset)), 3): logger.info( f"Sample {index} of the training set: {train_dataset[index]}.") # DataLoaders creation: if args.pad_to_max_length: # If padding was already done ot max length, we use the default data collator that will just convert everything # to tensors. data_collator = default_data_collator else: # Otherwise, `DataCollatorWithPadding` will apply dynamic padding for us (by padding to the maximum length of # the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple # of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta). data_collator = DataCollatorWithPadding( tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None)) train_dataloader = DataLoader(train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size) eval_dataset_for_model = eval_dataset.remove_columns( ["example_id", "offset_mapping"]) eval_dataloader = DataLoader(eval_dataset_for_model, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size) if args.do_predict: predict_dataset_for_model = predict_dataset.remove_columns( ["example_id", "offset_mapping"]) predict_dataloader = DataLoader( predict_dataset_for_model, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size) # Post-processing: def post_processing_function(examples, features, predictions, stage="eval"): # Post-processing: we match the start logits and end logits to answers in the original context. predictions, scores_diff_json = postprocess_qa_predictions_with_beam_search( examples=examples, features=features, predictions=predictions, version_2_with_negative=args.version_2_with_negative, n_best_size=args.n_best_size, max_answer_length=args.max_answer_length, start_n_top=model.config.start_n_top, end_n_top=model.config.end_n_top, output_dir=args.output_dir, prefix=stage, ) # Format the result to the format the metric expects. if args.version_2_with_negative: formatted_predictions = [{ "id": k, "prediction_text": v, "no_answer_probability": scores_diff_json[k] } for k, v in predictions.items()] else: formatted_predictions = [{ "id": k, "prediction_text": v } for k, v in predictions.items()] references = [{ "id": ex["id"], "answers": ex[answer_column_name] } for ex in examples] return EvalPrediction(predictions=formatted_predictions, label_ids=references) metric = load_metric( "squad_v2" if args.version_2_with_negative else "squad") def create_and_fill_np_array(start_or_end_logits, dataset, max_len): """ Create and fill numpy array of size len_of_validation_data * max_length_of_output_tensor Args: start_or_end_logits(:obj:`tensor`): This is the output predictions of the model. We can only enter either start or end logits. eval_dataset: Evaluation dataset max_len(:obj:`int`): The maximum length of the output tensor. ( See the model.eval() part for more details ) """ step = 0 # create a numpy array and fill it with -100. logits_concat = np.full((len(dataset), max_len), -100, dtype=np.float32) # Now since we have create an array now we will populate it with the outputs gathered using accelerator.gather for i, output_logit in enumerate( start_or_end_logits): # populate columns # We have to fill it such that we have to take the whole tensor and replace it on the newly created array # And after every iteration we have to change the step batch_size = output_logit.shape[0] cols = output_logit.shape[1] if step + batch_size < len(dataset): logits_concat[step:step + batch_size, :cols] = output_logit else: logits_concat[step:, :cols] = output_logit[:len(dataset) - step] step += batch_size return logits_concat # Optimizer # Split weights in two groups, one with weight decay and the other not. no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": args.weight_decay, }, { "params": [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0, }, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate) # Prepare everything with our `accelerator`. model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare( model, optimizer, train_dataloader, eval_dataloader) # Note -> the training dataloader needs to be prepared before we grab his length below (cause its length will be # shorter in multiprocess) # Scheduler and math around the number of training steps. num_update_steps_per_epoch = math.ceil( len(train_dataloader) / args.gradient_accumulation_steps) if args.max_train_steps is None: args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch else: args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) lr_scheduler = get_scheduler( name=args.lr_scheduler_type, optimizer=optimizer, num_warmup_steps=args.num_warmup_steps, num_training_steps=args.max_train_steps, ) # Train! total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps logger.info("***** Running training *****") logger.info(f" Num examples = {len(train_dataset)}") logger.info(f" Num Epochs = {args.num_train_epochs}") logger.info( f" Instantaneous batch size per device = {args.per_device_train_batch_size}" ) logger.info( f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}" ) logger.info( f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") logger.info(f" Total optimization steps = {args.max_train_steps}") # Only show the progress bar once on each machine. progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process) completed_steps = 0 for epoch in range(args.num_train_epochs): model.train() for step, batch in enumerate(train_dataloader): outputs = model(**batch) loss = outputs.loss loss = loss / args.gradient_accumulation_steps accelerator.backward(loss) if step % args.gradient_accumulation_steps == 0 or step == len( train_dataloader) - 1: optimizer.step() lr_scheduler.step() optimizer.zero_grad() progress_bar.update(1) completed_steps += 1 if completed_steps >= args.max_train_steps: break # intialize all lists to collect the batches all_start_top_log_probs = [] all_start_top_index = [] all_end_top_log_probs = [] all_end_top_index = [] all_cls_logits = [] for step, batch in enumerate(eval_dataloader): with torch.no_grad(): outputs = model(**batch) start_top_log_probs = outputs.start_top_log_probs start_top_index = outputs.start_top_index end_top_log_probs = outputs.end_top_log_probs end_top_index = outputs.end_top_index cls_logits = outputs.cls_logits if not args.pad_to_max_length: # necessary to pad predictions and labels for being gathered start_top_log_probs = accelerator.pad_across_processes( start_top_log_probs, dim=1, pad_index=-100) start_top_index = accelerator.pad_across_processes( start_top_index, dim=1, pad_index=-100) end_top_log_probs = accelerator.pad_across_processes( end_top_log_probs, dim=1, pad_index=-100) end_top_index = accelerator.pad_across_processes( end_top_index, dim=1, pad_index=-100) cls_logits = accelerator.pad_across_processes(cls_logits, dim=1, pad_index=-100) all_start_top_log_probs.append( accelerator.gather(start_top_log_probs).cpu().numpy()) all_start_top_index.append( accelerator.gather(start_top_index).cpu().numpy()) all_end_top_log_probs.append( accelerator.gather(end_top_log_probs).cpu().numpy()) all_end_top_index.append( accelerator.gather(end_top_index).cpu().numpy()) all_cls_logits.append(accelerator.gather(cls_logits).cpu().numpy()) max_len = max([x.shape[1] for x in all_end_top_log_probs ]) # Get the max_length of the tensor # concatenate all numpy arrays collected above start_top_log_probs_concat = create_and_fill_np_array( all_start_top_log_probs, eval_dataset, max_len) start_top_index_concat = create_and_fill_np_array(all_start_top_index, eval_dataset, max_len) end_top_log_probs_concat = create_and_fill_np_array( all_end_top_log_probs, eval_dataset, max_len) end_top_index_concat = create_and_fill_np_array(all_end_top_index, eval_dataset, max_len) cls_logits_concat = np.concatenate(all_cls_logits, axis=0) # delete the list of numpy arrays del start_top_log_probs del start_top_index del end_top_log_probs del end_top_index del cls_logits outputs_numpy = ( start_top_log_probs_concat, start_top_index_concat, end_top_log_probs_concat, end_top_index_concat, cls_logits_concat, ) prediction = post_processing_function(eval_examples, eval_dataset, outputs_numpy) eval_metric = metric.compute(predictions=prediction.predictions, references=prediction.label_ids) logger.info(f"Evaluation metrics: {eval_metric}") if args.do_predict: # intialize all lists to collect the batches all_start_top_log_probs = [] all_start_top_index = [] all_end_top_log_probs = [] all_end_top_index = [] all_cls_logits = [] for step, batch in enumerate(predict_dataloader): with torch.no_grad(): outputs = model(**batch) start_top_log_probs = outputs.start_top_log_probs start_top_index = outputs.start_top_index end_top_log_probs = outputs.end_top_log_probs end_top_index = outputs.end_top_index cls_logits = outputs.cls_logits if not args.pad_to_max_length: # necessary to pad predictions and labels for being gathered start_top_log_probs = accelerator.pad_across_processes( start_top_log_probs, dim=1, pad_index=-100) start_top_index = accelerator.pad_across_processes( start_top_index, dim=1, pad_index=-100) end_top_log_probs = accelerator.pad_across_processes( end_top_log_probs, dim=1, pad_index=-100) end_top_index = accelerator.pad_across_processes( end_top_index, dim=1, pad_index=-100) cls_logits = accelerator.pad_across_processes( cls_logits, dim=1, pad_index=-100) all_start_top_log_probs.append( accelerator.gather(start_top_log_probs).cpu().numpy()) all_start_top_index.append( accelerator.gather(start_top_index).cpu().numpy()) all_end_top_log_probs.append( accelerator.gather(end_top_log_probs).cpu().numpy()) all_end_top_index.append( accelerator.gather(end_top_index).cpu().numpy()) all_cls_logits.append( accelerator.gather(cls_logits).cpu().numpy()) max_len = max([x.shape[1] for x in all_end_top_log_probs ]) # Get the max_length of the tensor # concatenate all numpy arrays collected above start_top_log_probs_concat = create_and_fill_np_array( all_start_top_log_probs, predict_dataset, max_len) start_top_index_concat = create_and_fill_np_array( all_start_top_index, predict_dataset, max_len) end_top_log_probs_concat = create_and_fill_np_array( all_end_top_log_probs, predict_dataset, max_len) end_top_index_concat = create_and_fill_np_array( all_end_top_index, predict_dataset, max_len) cls_logits_concat = np.concatenate(all_cls_logits, axis=0) # delete the list of numpy arrays del start_top_log_probs del start_top_index del end_top_log_probs del end_top_index del cls_logits outputs_numpy = ( start_top_log_probs_concat, start_top_index_concat, end_top_log_probs_concat, end_top_index_concat, cls_logits_concat, ) prediction = post_processing_function(predict_examples, predict_dataset, outputs_numpy) predict_metric = metric.compute(predictions=prediction.predictions, references=prediction.label_ids) logger.info(f"Predict metrics: {predict_metric}") if args.output_dir is not None: accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) # Detecting last checkpoint. last_checkpoint = None if os.path.isdir( training_args.output_dir ) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is None and len(os.listdir( training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome.") elif last_checkpoint is not None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) logger.setLevel(logging.INFO if is_main_process(training_args.local_rank ) else logging.WARN) # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) # Set the verbosity to info of the Transformers logger (on main process only): if is_main_process(training_args.local_rank): transformers.utils.logging.set_verbosity_info() transformers.utils.logging.enable_default_handler() transformers.utils.logging.enable_explicit_format() logger.info(f"Training/evaluation parameters {training_args}") # Set seed before initializing model. set_seed(training_args.seed) # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ # (the dataset will be downloaded automatically from the datasets Hub). # # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called # 'text' is found. You can easily tweak this behavior (see below). # # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. if data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir) else: data_files = {} if data_args.train_file is not None: data_files["train"] = data_args.train_file extension = data_args.train_file.split(".")[-1] if data_args.validation_file is not None: data_files["validation"] = data_args.validation_file extension = data_args.validation_file.split(".")[-1] if data_args.test_file is not None: data_files["test"] = data_args.test_file extension = data_args.test_file.split(".")[-1] datasets = load_dataset(extension, data_files=data_files, field="data", cache_dir=model_args.cache_dir) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. # Load pretrained model and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=True, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) model = AutoModelForQuestionAnswering.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) # Tokenizer check: this script requires a fast tokenizer. if not isinstance(tokenizer, PreTrainedTokenizerFast): raise ValueError( "This example script only works for models that have a fast tokenizer. Checkout the big table of models " "at https://huggingface.co/transformers/index.html#bigtable to find the model types that meet this " "requirement") # Preprocessing the datasets. # Preprocessing is slighlty different for training and evaluation. if training_args.do_train: column_names = datasets["train"].column_names elif training_args.do_eval: column_names = datasets["validation"].column_names else: column_names = datasets["test"].column_names question_column_name = "question" if "question" in column_names else column_names[ 0] context_column_name = "context" if "context" in column_names else column_names[ 1] answer_column_name = "answers" if "answers" in column_names else column_names[ 2] # Padding side determines if we do (question|context) or (context|question). pad_on_right = tokenizer.padding_side == "right" if data_args.max_seq_length > tokenizer.model_max_length: logger.warning( f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." ) max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) # Training preprocessing def prepare_train_features(examples): # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results # in one example possible giving several features when a context is long, each of those features having a # context that overlaps a bit the context of the previous feature. tokenized_examples = tokenizer( examples[ question_column_name if pad_on_right else context_column_name], examples[ context_column_name if pad_on_right else question_column_name], truncation="only_second" if pad_on_right else "only_first", max_length=max_seq_length, stride=data_args.doc_stride, return_overflowing_tokens=True, return_offsets_mapping=True, padding="max_length" if data_args.pad_to_max_length else False, ) # Since one example might give us several features if it has a long context, we need a map from a feature to # its corresponding example. This key gives us just that. sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") # The offset mappings will give us a map from token to character position in the original context. This will # help us compute the start_positions and end_positions. offset_mapping = tokenized_examples.pop("offset_mapping") # Let's label those examples! tokenized_examples["start_positions"] = [] tokenized_examples["end_positions"] = [] for i, offsets in enumerate(offset_mapping): # We will label impossible answers with the index of the CLS token. input_ids = tokenized_examples["input_ids"][i] cls_index = input_ids.index(tokenizer.cls_token_id) # Grab the sequence corresponding to that example (to know what is the context and what is the question). sequence_ids = tokenized_examples.sequence_ids(i) # One example can give several spans, this is the index of the example containing this span of text. sample_index = sample_mapping[i] answers = examples[answer_column_name][sample_index] # If no answers are given, set the cls_index as answer. if len(answers["answer_start"]) == 0: tokenized_examples["start_positions"].append(cls_index) tokenized_examples["end_positions"].append(cls_index) else: # Start/end character index of the answer in the text. start_char = answers["answer_start"][0] end_char = start_char + len(answers["text"][0]) # Start token index of the current span in the text. token_start_index = 0 while sequence_ids[token_start_index] != (1 if pad_on_right else 0): token_start_index += 1 # End token index of the current span in the text. token_end_index = len(input_ids) - 1 while sequence_ids[token_end_index] != (1 if pad_on_right else 0): token_end_index -= 1 # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index). if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char): tokenized_examples["start_positions"].append(cls_index) tokenized_examples["end_positions"].append(cls_index) else: # Otherwise move the token_start_index and token_end_index to the two ends of the answer. # Note: we could go after the last offset if the answer is the last word (edge case). while token_start_index < len(offsets) and offsets[ token_start_index][0] <= start_char: token_start_index += 1 tokenized_examples["start_positions"].append( token_start_index - 1) while offsets[token_end_index][1] >= end_char: token_end_index -= 1 tokenized_examples["end_positions"].append( token_end_index + 1) return tokenized_examples if training_args.do_train: if "train" not in datasets: raise ValueError("--do_train requires a train dataset") train_dataset = datasets["train"] if data_args.max_train_samples is not None: # We will select sample from whole data if agument is specified train_dataset = train_dataset.select( range(data_args.max_train_samples)) # Create train feature from dataset train_dataset = train_dataset.map( prepare_train_features, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, ) if data_args.max_train_samples is not None: # Number of samples might increase during Feature Creation, We select only specified max samples train_dataset = train_dataset.select( range(data_args.max_train_samples)) # Validation preprocessing def prepare_validation_features(examples): # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results # in one example possible giving several features when a context is long, each of those features having a # context that overlaps a bit the context of the previous feature. tokenized_examples = tokenizer( examples[ question_column_name if pad_on_right else context_column_name], examples[ context_column_name if pad_on_right else question_column_name], truncation="only_second" if pad_on_right else "only_first", max_length=max_seq_length, stride=data_args.doc_stride, return_overflowing_tokens=True, return_offsets_mapping=True, padding="max_length" if data_args.pad_to_max_length else False, ) # Since one example might give us several features if it has a long context, we need a map from a feature to # its corresponding example. This key gives us just that. sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the # corresponding example_id and we will store the offset mappings. tokenized_examples["example_id"] = [] for i in range(len(tokenized_examples["input_ids"])): # Grab the sequence corresponding to that example (to know what is the context and what is the question). sequence_ids = tokenized_examples.sequence_ids(i) context_index = 1 if pad_on_right else 0 # One example can give several spans, this is the index of the example containing this span of text. sample_index = sample_mapping[i] tokenized_examples["example_id"].append( examples["id"][sample_index]) # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token # position is part of the context or not. tokenized_examples["offset_mapping"][i] = [ (o if sequence_ids[k] == context_index else None) for k, o in enumerate(tokenized_examples["offset_mapping"][i]) ] return tokenized_examples if training_args.do_eval: if "validation" not in datasets: raise ValueError("--do_eval requires a validation dataset") eval_examples = datasets["validation"] if data_args.max_val_samples is not None: # We will select sample from whole data eval_examples = eval_examples.select( range(data_args.max_val_samples)) # Validation Feature Creation eval_dataset = eval_examples.map( prepare_validation_features, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, ) if data_args.max_val_samples is not None: # During Feature creation dataset samples might increase, we will select required samples again eval_dataset = eval_dataset.select(range( data_args.max_val_samples)) if training_args.do_predict: if "test" not in datasets: raise ValueError("--do_predict requires a test dataset") test_examples = datasets["test"] if data_args.max_test_samples is not None: # We will select sample from whole data test_examples = test_examples.select( range(data_args.max_test_samples)) # Test Feature Creation test_dataset = test_examples.map( prepare_validation_features, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, ) if data_args.max_test_samples is not None: # During Feature creation dataset samples might increase, we will select required samples again test_dataset = test_dataset.select( range(data_args.max_test_samples)) # Data collator # We have already padded to max length if the corresponding flag is True, otherwise we need to pad in the data # collator. data_collator = (default_data_collator if data_args.pad_to_max_length else DataCollatorWithPadding( tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None)) # Post-processing: def post_processing_function(examples, features, predictions, stage="eval"): # Post-processing: we match the start logits and end logits to answers in the original context. predictions = postprocess_qa_predictions( examples=examples, features=features, predictions=predictions, version_2_with_negative=data_args.version_2_with_negative, n_best_size=data_args.n_best_size, max_answer_length=data_args.max_answer_length, null_score_diff_threshold=data_args.null_score_diff_threshold, output_dir=training_args.output_dir, is_world_process_zero=trainer.is_world_process_zero(), prefix=stage, ) # Format the result to the format the metric expects. if data_args.version_2_with_negative: formatted_predictions = [{ "id": k, "prediction_text": v, "no_answer_probability": 0.0 } for k, v in predictions.items()] else: formatted_predictions = [{ "id": k, "prediction_text": v } for k, v in predictions.items()] references = [{ "id": ex["id"], "answers": ex[answer_column_name] } for ex in examples] return EvalPrediction(predictions=formatted_predictions, label_ids=references) metric = load_metric( "squad_v2" if data_args.version_2_with_negative else "squad") def compute_metrics(p: EvalPrediction): return metric.compute(predictions=p.predictions, references=p.label_ids) # Initialize our Trainer trainer = QuestionAnsweringTrainer( model=model, args=training_args, train_dataset=train_dataset if training_args.do_train else None, eval_dataset=eval_dataset if training_args.do_eval else None, eval_examples=eval_examples if training_args.do_eval else None, tokenizer=tokenizer, data_collator=data_collator, post_process_function=post_processing_function, compute_metrics=compute_metrics, ) # Training if training_args.do_train: if last_checkpoint is not None: checkpoint = last_checkpoint elif os.path.isdir(model_args.model_name_or_path): checkpoint = model_args.model_name_or_path else: checkpoint = None train_result = trainer.train(resume_from_checkpoint=checkpoint) trainer.save_model() # Saves the tokenizer too for easy upload metrics = train_result.metrics max_train_samples = (data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)) metrics["train_samples"] = min(max_train_samples, len(train_dataset)) trainer.log_metrics("train", metrics) trainer.save_metrics("train", metrics) trainer.save_state() # Evaluation if training_args.do_eval: logger.info("*** Evaluate ***") metrics = trainer.evaluate() max_val_samples = data_args.max_val_samples if data_args.max_val_samples is not None else len( eval_dataset) metrics["eval_samples"] = min(max_val_samples, len(eval_dataset)) trainer.log_metrics("eval", metrics) trainer.save_metrics("eval", metrics) # Prediction if training_args.do_predict: logger.info("*** Predict ***") results = trainer.predict(test_dataset, test_examples) metrics = results.metrics max_test_samples = data_args.max_test_samples if data_args.max_test_samples is not None else len( test_dataset) metrics["test_samples"] = min(max_test_samples, len(test_dataset)) trainer.log_metrics("test", metrics) trainer.save_metrics("test", metrics) if training_args.push_to_hub: trainer.push_to_hub()
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) if (os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir): raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty." "Use --overwrite_output_dir to overcome.") # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO if is_main_process(training_args.local_rank) else logging.WARN, ) # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) # Set the verbosity to info of the Transformers logger (on main process only): if is_main_process(training_args.local_rank): transformers.utils.logging.set_verbosity_info() transformers.utils.logging.enable_default_handler() transformers.utils.logging.enable_explicit_format() logger.info("Training/evaluation parameters %s", training_args) # Set seed before initializing model. set_seed(training_args.seed) # Load pretrained model and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. if model_args.config_name: config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir) elif model_args.model_name_or_path: config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir) else: config = CONFIG_MAPPING[model_args.model_type]() logger.warning( "You are instantiating a new config instance from scratch.") if model_args.tokenizer_name: tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer) elif model_args.model_name_or_path: tokenizer = AutoTokenizer.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer) else: raise ValueError( "You are instantiating a new tokenizer from scratch. This is not supported by this script." "You can do it from another script, save it, and load it from here, using --tokenizer_name." ) if model_args.model_name_or_path: model = AutoModelForMaskedLM.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, ) else: logger.info("Training new model from scratch") model = AutoModelForMaskedLM.from_config(config) model.resize_token_embeddings(len(tokenizer)) # Load our custom dataset. # It's a little bit of a pain since we are loading from our own Dataset not from file or huggingface/datasets config = dataset_configs[data_args.dataset_name] torch_dataset, _, _ = get_datasets(config["dataset_name"], augmenter=None, train_size=data_args.train_size, val_size=config["val_size"], test_size=config["test_size"], augmentation_prob=0, load_test=config["load_test"], text_columns=config["text_colname"]) conditional_dataset = ConditionalDataset(torch_dataset, config=config, sep_token=tokenizer.sep_token) print(f"Created ConditionalDataset of size {len(conditional_dataset)}") print("Example from conditional dataset:", conditional_dataset[0]) # Converting to dict as a workaround to read it into huggingface's Dataset column_names = conditional_dataset.column_names dataset_dict = defaultdict(list) for row in conditional_dataset: for col in column_names: dataset_dict[col].append(row[col]) datasets = Dataset.from_dict(dataset_dict).train_test_split( test_size=data_args.test_size) print( f"Created huggingface Dataset: train size: {len(datasets['train'])}, test size: {len(datasets['test'])}" ) # Preprocessing the datasets. # First we tokenize all the texts. if training_args.do_train: column_names = datasets["train"].column_names else: column_names = datasets["test"].column_names text_column_name = "text" if "text" in column_names else column_names[0] if data_args.line_by_line: # When using line_by_line, we just tokenize each nonempty line. padding = "max_length" if data_args.pad_to_max_length else False def tokenize_function(examples): # Remove empty lines examples["text"] = [ line for line in examples["text"] if len(line) > 0 and not line.isspace() ] return tokenizer( examples["text"], padding=padding, truncation=True, max_length=data_args.max_seq_length, # We use this option because DataCollatorForLanguageModeling (see below) is more efficient when it # receives the `special_tokens_mask`. return_special_tokens_mask=True, ) tokenized_datasets = datasets.map( tokenize_function, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=[text_column_name], load_from_cache_file=not data_args.overwrite_cache, ) else: # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts. # We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more # efficient when it receives the `special_tokens_mask`. def tokenize_function(examples): return tokenizer(examples[text_column_name], return_special_tokens_mask=True) tokenized_datasets = datasets.map( tokenize_function, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, ) if data_args.max_seq_length is None: max_seq_length = tokenizer.model_max_length else: if data_args.max_seq_length > tokenizer.model_max_length: logger.warn( f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." ) max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) # Main data processing function that will concatenate all texts from our dataset and generate chunks of # max_seq_length. def group_texts(examples): # Concatenate all texts. concatenated_examples = { k: sum(examples[k], []) for k in examples.keys() } total_length = len(concatenated_examples[list(examples.keys())[0]]) # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can # customize this part to your needs. total_length = (total_length // max_seq_length) * max_seq_length # Split by chunks of max_len. result = { k: [ t[i:i + max_seq_length] for i in range(0, total_length, max_seq_length) ] for k, t in concatenated_examples.items() } return result # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value # might be slower to preprocess. # # To speed up this part, we use multiprocessing. See the documentation of the map method for more information: # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map tokenized_datasets = tokenized_datasets.map( group_texts, batched=True, num_proc=data_args.preprocessing_num_workers, load_from_cache_file=not data_args.overwrite_cache, ) # Data collator # This one will take care of randomly masking the tokens. data_collator = DataCollatorForLanguageModeling( tokenizer=tokenizer, mlm_probability=data_args.mlm_probability) # Initialize our Trainer trainer = Trainer( model=model, args=training_args, train_dataset=tokenized_datasets["train"] if training_args.do_train else None, eval_dataset=tokenized_datasets["test"] if training_args.do_eval else None, tokenizer=tokenizer, data_collator=data_collator, ) # Training if training_args.do_train: model_path = (model_args.model_name_or_path if (model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path)) else None) trainer.train(model_path=model_path) trainer.save_model() # Saves the tokenizer too for easy upload # Evaluation results = {} if training_args.do_eval: logger.info("*** Evaluate ***") eval_output = trainer.evaluate() perplexity = math.exp(eval_output["eval_loss"]) results["perplexity"] = perplexity output_eval_file = os.path.join(training_args.output_dir, "eval_results_mlm.txt") if trainer.is_world_process_zero(): with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key, value in results.items(): logger.info(f" {key} = {value}") writer.write(f"{key} = {value}\n") return results
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) if (os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir): raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome." ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN, ) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", training_args.local_rank, training_args.device, training_args.n_gpu, bool(training_args.local_rank != -1), training_args.fp16, ) logger.info("Training/evaluation parameters %s", training_args) # Set seed set_seed(training_args.seed) num_labels = 2 ## Defined in processor. Change labels before changing this (also hardcoded elsewhere) . output_mode = 'classification' # Load pretrained model and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, num_labels= 2, ## Defined in processor. Change labels before changing this (also hardcoded elsewhere) . # DEFAULT = 2 finetuning_task='mrpc', ## Not sure why we need this! # DEFAULT = cola cache_dir=model_args.cache_dir, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, ) model = AutoModelForSequenceClassification.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, ) ############################################################################# # Set Class Weights ## ############################################################################# class_weights = None if not model_args.class_weights is None: class_weights = model_args.class_weights.lstrip().rstrip().split(',') class_weights = [float(i) for i in class_weights] logger.info("Using Class Weights {}".format(','.join( [str(i) for i in class_weights]))) class_weights = torch.tensor(class_weights, dtype=torch.float, device=training_args.device) model.set_class_weights(class_weights) ############################################################################# # Get datasets train_dataset = GlueDataset(data_args, tokenizer=tokenizer, train=True) if training_args.do_train else None eval_dataset = GlueDataset( data_args, tokenizer=tokenizer, evaluate=True) if training_args.do_eval else None test_dataset = GlueDataset( data_args, tokenizer=tokenizer) if training_args.do_predict else None if test_dataset is not None: for dataset in test_dataset: logger.info(dataset) def compute_metrics(p: EvalPrediction) -> Dict: if output_mode == "classification": preds = np.argmax(p.predictions, axis=1) elif output_mode == "regression": preds = np.squeeze(p.predictions) # return glue_compute_metrics(data_args.task_name, preds, p.label_ids) with open(training_args.output_dir + 'predictions.csv', 'w') as fh: writer = csv.writer(fh) preds = [[i] for i in preds] writer.writerows(preds) return acc_and_f1(preds, p.label_ids) # Initialize our Trainer trainer = Trainer( model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset, compute_metrics=compute_metrics, ) # Training if training_args.do_train: trainer.train(model_path=model_args.model_name_or_path if os.path. isdir(model_args.model_name_or_path) else None) trainer.save_model() # For convenience, we also re-save the tokenizer to the same directory, # so that you can share your model easily on huggingface.co/models =) if trainer.is_world_master(): tokenizer.save_pretrained(training_args.output_dir) # Evaluation results = {} if training_args.do_eval and training_args.local_rank in [-1, 0]: logger.info("*** Evaluate ***") # Loop to handle MNLI double evaluation (matched, mis-matched) eval_datasets = [eval_dataset] # if data_args.task_name == "mnli": # mnli_mm_data_args = dataclasses.replace(data_args, task_name="mnli-mm") # eval_datasets.append(GlueDataset(mnli_mm_data_args, tokenizer=tokenizer, evaluate=True)) for eval_dataset in eval_datasets: result = trainer.evaluate(eval_dataset=eval_dataset) output_eval_file = os.path.join( training_args.output_dir, f"eval_results_classification.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results classification *****") for key, value in result.items(): logger.info(" %s = %s", key, value) writer.write("%s = %s\n" % (key, value)) results.update(result) # Prediction if training_args.do_predict: logger.info("*** Predict ***") result = trainer.predict(test_dataset=test_dataset) output_predict_file = os.path.join(training_args.output_dir, f"predict_results.txt") with open(output_predict_file, "w") as writer: logger.info("***** Predict results classification ******") logger.info(" %s = %s", result) for key, value in result: writer.write("%s, %s\n" % (key, value)) return results