def main(): args = parse_args() if args.source_prefix is None and args.model_name_or_path in [ "t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b", ]: logger.warning( "You're running a t5 model but didn't provide a source prefix, which is the expected, e.g. with " "`--source_prefix 'summarize: ' `") # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. accelerator = Accelerator() # Make one log on every process with the configuration for debugging. logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO, ) logger.info(accelerator.state) # Setup logging, we only want one process per machine to log things on the screen. # accelerator.is_local_main_process is only True for one process per machine. logger.setLevel( logging.INFO if accelerator.is_local_main_process else logging.ERROR) if accelerator.is_local_main_process: datasets.utils.logging.set_verbosity_warning() transformers.utils.logging.set_verbosity_info() else: datasets.utils.logging.set_verbosity_error() transformers.utils.logging.set_verbosity_error() # If passed along, set the training seed now. if args.seed is not None: set_seed(args.seed) # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ # (the dataset will be downloaded automatically from the datasets Hub). # # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called # 'text' is found. You can easily tweak this behavior (see below). # # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. if args.dataset_name is not None: # Downloading and loading a dataset from the hub. raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name) else: data_files = {} if args.train_file is not None: data_files["train"] = args.train_file if args.validation_file is not None: data_files["validation"] = args.validation_file extension = args.train_file.split(".")[-1] raw_datasets = load_dataset(extension, data_files=data_files) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. # Load pretrained model and tokenizer # # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. if args.config_name: config = AutoConfig.from_pretrained(args.model_name_or_path) elif args.model_name_or_path: config = AutoConfig.from_pretrained(args.model_name_or_path) else: config = CONFIG_MAPPING[args.model_type]() logger.warning( "You are instantiating a new config instance from scratch.") if args.tokenizer_name: tokenizer = AutoTokenizer.from_pretrained( args.tokenizer_name, use_fast=not args.use_slow_tokenizer) elif args.model_name_or_path: tokenizer = AutoTokenizer.from_pretrained( args.model_name_or_path, use_fast=not args.use_slow_tokenizer) else: raise ValueError( "You are instantiating a new tokenizer from scratch. This is not supported by this script." "You can do it from another script, save it, and load it from here, using --tokenizer_name." ) if args.model_name_or_path: model = AutoModelForSeq2SeqLM.from_pretrained( args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config, ) else: logger.info("Training new model from scratch") model = AutoModelForSeq2SeqLM.from_config(config) model.resize_token_embeddings(len(tokenizer)) if model.config.decoder_start_token_id is None: raise ValueError( "Make sure that `config.decoder_start_token_id` is correctly defined" ) prefix = args.source_prefix if args.source_prefix is not None else "" # Preprocessing the datasets. # First we tokenize all the texts. column_names = raw_datasets["train"].column_names # Get the column names for input/target. dataset_columns = summarization_name_mapping.get(args.dataset_name, None) text_column_name = dataset_columns[ 0] if dataset_columns is not None else column_names[0] padding = "max_length" if args.pad_to_max_length else False if args.summary_column is None: summary_column = dataset_columns[ 1] if dataset_columns is not None else column_names[1] else: summary_column = args.summary_column if summary_column not in column_names: raise ValueError( f"--summary_column' value '{args.summary_column}' needs to be one of: {', '.join(column_names)}" ) # Temporarily set max_target_length for training. max_target_length = args.max_target_length padding = "max_length" if args.pad_to_max_length else False def preprocess_function(examples): inputs = examples[text_column_name] targets = examples[summary_column] inputs = [prefix + inp for inp in inputs] model_inputs = tokenizer(inputs, max_length=args.max_source_length, padding=padding, truncation=True) # Setup the tokenizer for targets with tokenizer.as_target_tokenizer(): labels = tokenizer(targets, max_length=max_target_length, padding=padding, truncation=True) # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore # padding in the loss. if padding == "max_length" and args.ignore_pad_token_for_loss: labels["input_ids"] = [[ (l if l != tokenizer.pad_token_id else -100) for l in label ] for label in labels["input_ids"]] model_inputs["labels"] = labels["input_ids"] return model_inputs processed_datasets = raw_datasets.map( preprocess_function, batched=True, remove_columns=column_names, load_from_cache_file=not args.overwrite_cache) train_dataset = processed_datasets["train"] eval_dataset = processed_datasets["validation"] # Log a few random samples from the training set: for index in random.sample(range(len(train_dataset)), 1): logger.info( f"Sample {index} of the training set: {train_dataset[index]}.") label_pad_token_id = -100 if args.ignore_pad_token_for_loss else tokenizer.pad_token_id data_collator = DataCollatorForSeq2Seq( tokenizer, model=model, label_pad_token_id=label_pad_token_id, pad_to_multiple_of=8 if accelerator.use_fp16 else None, ) def postprocess_text(preds, labels): preds = [pred.strip() for pred in preds] labels = [label.strip() for label in labels] # rougeLSum expects newline after each sentence preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds] labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels] return preds, labels train_dataloader = DataLoader(train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size) eval_dataloader = DataLoader(eval_dataset, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size) # Optimizer # Split weights in two groups, one with weight decay and the other not. no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": args.weight_decay, }, { "params": [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0, }, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate) # Prepare everything with our `accelerator`. model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare( model, optimizer, train_dataloader, eval_dataloader) # Note -> the training dataloader needs to be prepared before we grab his length below (cause its length will be # shorter in multiprocess) # Scheduler and math around the number of training steps. num_update_steps_per_epoch = math.ceil( len(train_dataloader) / args.gradient_accumulation_steps) if args.max_train_steps is None: args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch else: args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) lr_scheduler = get_scheduler( name=args.lr_scheduler_type, optimizer=optimizer, num_warmup_steps=args.num_warmup_steps, num_training_steps=args.max_train_steps, ) # Metric metric = load_metric("rouge") # Train! total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps logger.info("***** Running training *****") logger.info(f" Num examples = {len(train_dataset)}") logger.info(f" Num Epochs = {args.num_train_epochs}") logger.info( f" Instantaneous batch size per device = {args.per_device_train_batch_size}" ) logger.info( f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}" ) logger.info( f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") logger.info(f" Total optimization steps = {args.max_train_steps}") # Only show the progress bar once on each machine. progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process) completed_steps = 0 for epoch in range(args.num_train_epochs): model.train() for step, batch in enumerate(train_dataloader): outputs = model(**batch) loss = outputs.loss loss = loss / args.gradient_accumulation_steps accelerator.backward(loss) if step % args.gradient_accumulation_steps == 0 or step == len( train_dataloader) - 1: optimizer.step() lr_scheduler.step() optimizer.zero_grad() progress_bar.update(1) completed_steps += 1 if completed_steps >= args.max_train_steps: break model.eval() if args.val_max_target_length is None: args.val_max_target_length = args.max_target_length gen_kwargs = { "max_length": args.val_max_target_length if args is not None else config.max_length, "num_beams": args.num_beams, } for step, batch in enumerate(eval_dataloader): with torch.no_grad(): generated_tokens = accelerator.unwrap_model(model).generate( batch["input_ids"], attention_mask=batch["attention_mask"], **gen_kwargs, ) generated_tokens = accelerator.pad_across_processes( generated_tokens, dim=1, pad_index=tokenizer.pad_token_id) labels = batch["labels"] if not args.pad_to_max_length: # If we did not pad to max length, we need to pad the labels too labels = accelerator.pad_across_processes( batch["labels"], dim=1, pad_index=tokenizer.pad_token_id) generated_tokens = accelerator.gather( generated_tokens).cpu().numpy() labels = accelerator.gather(labels).cpu().numpy() if args.ignore_pad_token_for_loss: # Replace -100 in the labels as we can't decode them. labels = np.where(labels != -100, labels, tokenizer.pad_token_id) if isinstance(generated_tokens, tuple): generated_tokens = generated_tokens[0] decoded_preds = tokenizer.batch_decode( generated_tokens, skip_special_tokens=True) decoded_labels = tokenizer.batch_decode( labels, skip_special_tokens=True) decoded_preds, decoded_labels = postprocess_text( decoded_preds, decoded_labels) metric.add_batch(predictions=decoded_preds, references=decoded_labels) result = metric.compute(use_stemmer=True) # Extract a few results from ROUGE result = { key: value.mid.fmeasure * 100 for key, value in result.items() } result = {k: round(v, 4) for k, v in result.items()} logger.info(result) if args.output_dir is not None: accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
from pytorch_transformers import AdamW from transformers import AutoTokenizer, AutoModelForSeq2SeqLM from torch import device, cuda EN_FILE = './data/corpus.en_ru.1m.en' RU_FILE = './data/corpus.en_ru.1m.ru' BATCH_SIZE = 32 device = device('cuda' if cuda.is_available() else 'cpu') tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-ru-en") model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-ru-en").to( device) data_in = open('eval-ru-100.txt', 'r').readlines() input_ids = tokenizer.batch_encode_plus(data_in, return_tensors="pt", padding=True).data['input_ids'] outputs = model.generate(input_ids.to(device)) decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True) data_out = open('answer.txt', 'w') data_out.write('\n'.join(decoded)) data_out.close()
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) if data_args.source_prefix is None and model_args.model_name_or_path in [ "t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b", ]: logger.warning( "You're running a t5 model but didn't provide a source prefix, which is the expected, e.g. with " "`--source_prefix 'summarize: ' `") # Detecting last checkpoint. last_checkpoint = None if os.path.isdir( training_args.output_dir ) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is None and len(os.listdir( training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome.") elif last_checkpoint is not None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) logger.setLevel(logging.INFO if is_main_process(training_args.local_rank ) else logging.WARN) # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) # Set the verbosity to info of the Transformers logger (on main process only): if is_main_process(training_args.local_rank): transformers.utils.logging.set_verbosity_info() logger.info("Training/evaluation parameters %s", training_args) # Set seed before initializing model. set_seed(training_args.seed) # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below) # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ # (the dataset will be downloaded automatically from the datasets Hub). # # For CSV/JSON files this script will use the first column for the full texts and the second column for the # summaries (unless you specify column names for this with the `text_column` and `summary_column` arguments). # # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. if data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name) else: data_files = {} if data_args.train_file is not None: data_files["train"] = data_args.train_file extension = data_args.train_file.split(".")[-1] if data_args.validation_file is not None: data_files["validation"] = data_args.validation_file extension = data_args.validation_file.split(".")[-1] if data_args.test_file is not None: data_files["test"] = data_args.test_file extension = data_args.test_file.split(".")[-1] datasets = load_dataset(extension, data_files=data_files) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. # Load pretrained model and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) model = AutoModelForSeq2SeqLM.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) if model.config.decoder_start_token_id is None: raise ValueError( "Make sure that `config.decoder_start_token_id` is correctly defined" ) prefix = data_args.source_prefix if data_args.source_prefix is not None else "" # Preprocessing the datasets. # We need to tokenize inputs and targets. if training_args.do_train: column_names = datasets["train"].column_names elif training_args.do_eval: column_names = datasets["validation"].column_names elif training_args.do_predict: column_names = datasets["test"].column_names else: logger.info( "There is nothing to do. Please pass `do_train`, `do_eval` and/or `do_predict`." ) return # Get the column names for input/target. dataset_columns = summarization_name_mapping.get(data_args.dataset_name, None) if data_args.text_column is None: text_column = dataset_columns[ 0] if dataset_columns is not None else column_names[0] else: text_column = data_args.text_column if text_column not in column_names: raise ValueError( f"--text_column' value '{data_args.text_column}' needs to be one of: {', '.join(column_names)}" ) if data_args.summary_column is None: summary_column = dataset_columns[ 1] if dataset_columns is not None else column_names[1] else: summary_column = data_args.summary_column if summary_column not in column_names: raise ValueError( f"--summary_column' value '{data_args.summary_column}' needs to be one of: {', '.join(column_names)}" ) # Temporarily set max_target_length for training. max_target_length = data_args.max_target_length padding = "max_length" if data_args.pad_to_max_length else False if training_args.label_smoothing_factor > 0 and not hasattr( model, "prepare_decoder_input_ids_from_labels"): logger.warn( "label_smoothing is enabled but the `prepare_decoder_input_ids_from_labels` method is not defined for" f"`{model.__class__.__name__}`. This will lead to loss being calculated twice and will take up more memory" ) def preprocess_function(examples): inputs = examples[text_column] targets = examples[summary_column] inputs = [prefix + inp for inp in inputs] model_inputs = tokenizer(inputs, max_length=data_args.max_source_length, padding=padding, truncation=True) # Setup the tokenizer for targets with tokenizer.as_target_tokenizer(): labels = tokenizer(targets, max_length=max_target_length, padding=padding, truncation=True) # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore # padding in the loss. if padding == "max_length" and data_args.ignore_pad_token_for_loss: labels["input_ids"] = [[ (l if l != tokenizer.pad_token_id else -100) for l in label ] for label in labels["input_ids"]] model_inputs["labels"] = labels["input_ids"] return model_inputs if training_args.do_train: train_dataset = datasets["train"] if "train" not in datasets: raise ValueError("--do_train requires a train dataset") if data_args.max_train_samples is not None: train_dataset = train_dataset.select( range(data_args.max_train_samples)) train_dataset = train_dataset.map( preprocess_function, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, ) if training_args.do_eval: max_target_length = data_args.val_max_target_length if "validation" not in datasets: raise ValueError("--do_eval requires a validation dataset") eval_dataset = datasets["validation"] if data_args.max_val_samples is not None: eval_dataset = eval_dataset.select(range( data_args.max_val_samples)) eval_dataset = eval_dataset.map( preprocess_function, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, ) if training_args.do_predict: max_target_length = data_args.val_max_target_length if "test" not in datasets: raise ValueError("--do_predict requires a test dataset") test_dataset = datasets["test"] if data_args.max_test_samples is not None: test_dataset = test_dataset.select( range(data_args.max_test_samples)) test_dataset = test_dataset.map( preprocess_function, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, ) # Data collator label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id if data_args.pad_to_max_length: data_collator = default_data_collator else: data_collator = DataCollatorForSeq2Seq( tokenizer, model=model, label_pad_token_id=label_pad_token_id, pad_to_multiple_of=8 if training_args.fp16 else None, ) # Metric metric = load_metric("rouge") def postprocess_text(preds, labels): preds = [pred.strip() for pred in preds] labels = [label.strip() for label in labels] # rougeLSum expects newline after each sentence preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds] labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels] return preds, labels def compute_metrics(eval_preds): preds, labels = eval_preds if isinstance(preds, tuple): preds = preds[0] decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True) if data_args.ignore_pad_token_for_loss: # Replace -100 in the labels as we can't decode them. labels = np.where(labels != -100, labels, tokenizer.pad_token_id) decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) # Some simple post-processing decoded_preds, decoded_labels = postprocess_text( decoded_preds, decoded_labels) result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True) # Extract a few results from ROUGE result = { key: value.mid.fmeasure * 100 for key, value in result.items() } prediction_lens = [ np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds ] result["gen_len"] = np.mean(prediction_lens) result = {k: round(v, 4) for k, v in result.items()} return result # Initialize our Trainer trainer = Seq2SeqTrainer( model=model, args=training_args, train_dataset=train_dataset if training_args.do_train else None, eval_dataset=eval_dataset if training_args.do_eval else None, tokenizer=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics if training_args.predict_with_generate else None, ) # Training if training_args.do_train: if last_checkpoint is not None: checkpoint = last_checkpoint elif os.path.isdir(model_args.model_name_or_path): checkpoint = model_args.model_name_or_path else: checkpoint = None train_result = trainer.train(resume_from_checkpoint=checkpoint) trainer.save_model() # Saves the tokenizer too for easy upload metrics = train_result.metrics max_train_samples = (data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)) metrics["train_samples"] = min(max_train_samples, len(train_dataset)) trainer.log_metrics("train", metrics) trainer.save_metrics("train", metrics) trainer.save_state() # Evaluation results = {} if training_args.do_eval: logger.info("*** Evaluate ***") metrics = trainer.evaluate(max_length=data_args.val_max_target_length, num_beams=data_args.num_beams, metric_key_prefix="eval") max_val_samples = data_args.max_val_samples if data_args.max_val_samples is not None else len( eval_dataset) metrics["eval_samples"] = min(max_val_samples, len(eval_dataset)) trainer.log_metrics("eval", metrics) trainer.save_metrics("eval", metrics) if training_args.do_predict: logger.info("*** Test ***") test_results = trainer.predict( test_dataset, metric_key_prefix="test", max_length=data_args.val_max_target_length, num_beams=data_args.num_beams, ) metrics = test_results.metrics max_test_samples = data_args.max_test_samples if data_args.max_test_samples is not None else len( test_dataset) metrics["test_samples"] = min(max_test_samples, len(test_dataset)) trainer.log_metrics("test", metrics) trainer.save_metrics("test", metrics) if trainer.is_world_process_zero(): if training_args.predict_with_generate: test_preds = tokenizer.batch_decode( test_results.predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True) test_preds = [pred.strip() for pred in test_preds] output_test_preds_file = os.path.join(training_args.output_dir, "test_generations.txt") with open(output_test_preds_file, "w") as writer: writer.write("\n".join(test_preds)) return results
def eval_data_dir( data_dir, save_dir: str, model_name: str, bs: int = 8, max_source_length: int = 1024, type_path="val", n_obs=None, fp16=False, task="summarization", local_rank=None, num_return_sequences=1, dataset_kwargs: Dict = None, prefix="", **generate_kwargs, ) -> Dict: """Run evaluation on part of the data for one gpu and save to {save_dir}/rank_{rank}_output.json""" model_name = str(model_name) assert local_rank is not None torch.distributed.init_process_group(backend="nccl", rank=local_rank) save_dir = Path(save_dir) save_path = save_dir.joinpath(f"rank_{local_rank}_output.json") torch.cuda.set_device(local_rank) model = AutoModelForSeq2SeqLM.from_pretrained(model_name).cuda() if fp16: model = model.half() # determine if we need to increase num_beams use_task_specific_params(model, task) # update config with task specific params num_beams = generate_kwargs.pop( "num_beams", model.config.num_beams) # AttributeError risk? if num_return_sequences > num_beams: num_beams = num_return_sequences tokenizer = AutoTokenizer.from_pretrained(model_name) logger.info(f"Inferred tokenizer type: {tokenizer.__class__}" ) # if this is wrong, check config.model_type. if max_source_length is None: max_source_length = tokenizer.model_max_length if prefix is None: prefix = prefix or getattr(model.config, "prefix", "") or "" ds = Seq2SeqDataset( tokenizer, data_dir, max_source_length, max_target_length=1024, type_path=type_path, n_obs=n_obs, prefix=prefix, **dataset_kwargs, ) # I set shuffle=True for a more accurate progress bar. # If all the longest samples are first, the prog bar estimate is too high at the beginning. sampler = ds.make_sortish_sampler(bs, distributed=True, add_extra_examples=False, shuffle=True) data_loader = DataLoader(ds, sampler=sampler, batch_size=bs, collate_fn=ds.collate_fn) results = [] for batch in tqdm(data_loader): summaries = model.generate( input_ids=batch["input_ids"].to(model.device), attention_mask=batch["attention_mask"].to(model.device), num_return_sequences=num_return_sequences, num_beams=num_beams, **generate_kwargs, ) preds = tokenizer.batch_decode(summaries, skip_special_tokens=True, clean_up_tokenization_spaces=False) ids = batch["ids"] if num_return_sequences > 1: preds = chunks(preds, num_return_sequences ) # batch size chunks, each of size num_return_seq for i, pred in enumerate(preds): results.append(dict(pred=pred, id=ids[i].item())) save_json(results, save_path) return results, sampler.num_replicas
from sklearn.metrics.pairwise import cosine_similarity from sklearn.metrics.pairwise import euclidean_distances import nltk # nltk.download('punkt') # nltk.download('stopwords') # nltk.download('averaged_perceptron_tagger') # nltk.download('wordnet') from nltk.tokenize import sent_tokenize embed = hub.load('use') model = keras.models.load_model('model') # From google colab. 40 epochs 2**16 batchsize product_embeddings = pd.read_pickle('data/product_embeds.pkl') # Run google colab GPU to get the embeddings of all unique products in the dataset # Import model for summarization tokenizer_reddit = AutoTokenizer.from_pretrained("google/pegasus-reddit_tifu") model_reddit = AutoModelForSeq2SeqLM.from_pretrained("google/pegasus-reddit_tifu") MAX_LEN = 150 # For the summary length def euclidean_similarity(embed1, embed2): distance_matrix = euclidean_distances(embed1, embed2) similarity_matrix = 1 - distance_matrix / np.max(distance_matrix) return similarity_matrix def get_sentiment_scores(review, sentiment='positive'): """ Returns sentiment scores for each sentence in one review and embeddings of review sentences args:
def model(self): """Only load the model if needed.""" model = AutoModelForSeq2SeqLM.from_pretrained("facebook/mbart-large-en-ro").to(torch_device) if "cuda" in torch_device: model = model.half() return model
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) if (os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir): raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty." "Use --overwrite_output_dir to overcome.") # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO if is_main_process(training_args.local_rank) else logging.WARN, ) # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) # Set the verbosity to info of the Transformers logger (on main process only): if is_main_process(training_args.local_rank): transformers.utils.logging.set_verbosity_info() transformers.utils.logging.enable_default_handler() transformers.utils.logging.enable_explicit_format() logger.info("Training/evaluation parameters %s", training_args) # Set seed before initializing model. set_seed(training_args.seed) # Load pretrained model and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. if model_args.config_name: config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir) elif model_args.model_name_or_path: config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir) else: config = CONFIG_MAPPING[model_args.model_type]() logger.warning( "You are instantiating a new config instance from scratch.") if model_args.tokenizer_name: tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer) elif model_args.model_name_or_path: tokenizer = AutoTokenizer.from_pretrained( model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer) else: raise ValueError( "You are instantiating a new tokenizer from scratch. This is not supported by this script." "You can do it from another script, save it, and load it from here, using --tokenizer_name." ) if model_args.model_name_or_path: model = AutoModelForSeq2SeqLM.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, ) else: logger.info("Training new model from scratch") model = AutoModelForSeq2SeqLM.from_config(config) model.resize_token_embeddings(len(tokenizer)) import pickle logger.info("Loading dataset...") with open(data_args.dataset_name, 'rb') as f: pkl_data = pickle.load(f) logger.info("Finish loading dataset...") bart_datasets = pkl_data['tokenized_datasets'] # Data collator # This one will take care of randomly masking the tokens and sentence permutation. data_collator = DataCollatorForBartLanguageModeling( tokenizer=tokenizer, mlm_probability=data_args.mlm_probability) # Initialize our Trainer trainer = Trainer( model=model, args=training_args, train_dataset=bart_datasets if training_args.do_train else None, eval_dataset=None, tokenizer=tokenizer, data_collator=data_collator, ) # Training if training_args.do_train: trainer.train(model_path=model_args.model_name_or_path if os.path. isdir(model_args.model_name_or_path) else None) trainer.save_model() # Saves the tokenizer too for easy upload
def test_integration_torch_conversation_blenderbot_400M_input_ids(self): tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill") model = AutoModelForSeq2SeqLM.from_pretrained("facebook/blenderbot-400M-distill") conversation_agent = ConversationalPipeline(model=model, tokenizer=tokenizer) # test1 conversation_1 = Conversation("hello") inputs = conversation_agent._parse_and_tokenize([conversation_1]) self.assertEqual(inputs["input_ids"].tolist(), [[1710, 86, 2]]) # test2 conversation_1 = Conversation( "I like lasagne.", past_user_inputs=["hello"], generated_responses=[ " Do you like lasagne? It is a traditional Italian dish consisting of a shepherd's pie." ], ) inputs = conversation_agent._parse_and_tokenize([conversation_1]) self.assertEqual( inputs["input_ids"].tolist(), [ # This should be compared with the same conversation on ParlAI `safe_interactive` demo. [ 1710, # hello 86, 228, # Double space 228, 946, 304, 398, 6881, 558, 964, 38, 452, 315, 265, 6252, 452, 322, 968, 6884, 3146, 278, 306, 265, 617, 87, 388, 75, 341, 286, 521, 21, 228, # Double space 228, 281, # I like lasagne. 398, 6881, 558, 964, 21, 2, # EOS ] ], )
def main(): parser = MyArgumentParser((InferenceArguments, )) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. (args, ) = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: (args, ) = parser.parse_args_into_dataclasses() params = dict( pretrained_model_name_or_path=args.model_name_or_path, cache_dir=args.cache_dir, ) config = AutoConfig.from_pretrained(**params) tokenizer = AutoTokenizer.from_pretrained(**params) model = AutoModelForSeq2SeqLM.from_pretrained(config=config, **params) if args.model_parameters: print("====== MODEL PARAMETER LOADING... ======\n" f" {args.model_parameters}") model.load_state_dict(torch.load(args.model_parameters)) max_length = args.test_max_target_length # set num_beams for evaluation num_beams = args.num_beams if args.num_beams else model.config.num_beams test_dataset = Seq2SeqDataset( tokenizer=tokenizer, type_path='test', data_dir=args.data_dir, max_target_length=args.test_max_target_length, max_source_length=args.max_source_length, ) test_sampler = SequentialSampler(test_dataset) data_collator = Seq2SeqDataCollator(tokenizer, args) test_dataloader = DataLoader( test_dataset, sampler=test_sampler, batch_size=args.per_device_test_batch_size, collate_fn=data_collator, drop_last=False, ) # prediction_loop description = "Prediction" batch_size = test_dataloader.batch_size num_examples = len(test_dataloader.dataset) print(f"***** Running {description} *****") print(f" Num examples = {num_examples}") print(f" Batch size = {batch_size}") device = "cuda" if torch.cuda.is_available() else "cpu" model = model.to(device) res = [] for step, inputs in enumerate(test_dataloader): # prediction_step, generative based has_labels = "labels" in inputs # False # _prepare_inputs # 1. device로 보내기 # 2. memory에 _past 올리기 for k, v in inputs.items(): if isinstance(v, torch.Tensor): inputs[k] = v.to(device) gen_kwargs = {"max_length": max_length, "num_beams": num_beams} generated_tokens = model.generate( inputs['input_ids'], attention_mask=inputs['attention_mask'], **gen_kwargs, ) # in case the batch is shorter than max length, the output should be padded if generated_tokens.shape[-1] < gen_kwargs["max_length"]: # If PAD token is not defined at least EOS token has to be defined padded_tensor = tokenizer.pad_token_id * torch.ones( (generated_tokens.shape[0], gen_kwargs["max_length"]), dtype=generated_tokens.dtype, device=generated_tokens.device, ) padded_tensor[:, :generated_tokens.shape[-1]] = generated_tokens generated_tokens = padded_tensor loss = None labels = None res.extend(list(generated_tokens)) submit(args, tokenizer, res) print("Finished!")
'--bart', type=int, action='store', default=0, help='use BART base (1) or BART large (2) as abstractor') parser.add_argument('--clip', type=int, action='store', default=-1, help='max summary sentences to clip at') args = parser.parse_args() args.cuda = torch.cuda.is_available() and not args.no_cuda if args.bart == 1: tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base") bart_model = AutoModelForSeq2SeqLM.from_pretrained( "/exp/yashgupta/transformers/examples/seq2seq/absm_cnn_bart_eval/") torch_device = 'cuda' if args.cuda else 'cpu' #torch.cuda.is_available() bart_model.to(torch_device) elif args.bart == 2: tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large") bart_model = AutoModelForSeq2SeqLM.from_pretrained( "/exp/yashgupta/transformers/examples/seq2seq/absm_cnn_bart_large/" ) torch_device = 'cuda' if args.cuda else 'cpu' #torch.cuda.is_available() bart_model.to(torch_device) data_split = 'test' if args.test else 'val' decode(args.path, args.model_dir, data_split, args.batch,
import os import random from .utils import split_into_sentences from transformers import AutoTokenizer, AutoModelForSeq2SeqLM # Import the model and tokenizer tokenizer = AutoTokenizer.from_pretrained("Vamsi/T5_Paraphrase_Paws") model = AutoModelForSeq2SeqLM.from_pretrained("Vamsi/T5_Paraphrase_Paws").to( "cpu") def paraphrase_text(text): paraphrased_sentences = [] paraphrased_text = "" sentences = split_into_sentences(text) print(sentences) for sentence in sentences: lines = [] sentence = "paraphrase: " + sentence encoding = tokenizer.encode_plus(sentence, padding=True, return_tensors="pt") input_ids, attention_masks = encoding["input_ids"].to( "cpu"), encoding["attention_mask"].to("cpu") outputs = model.generate(input_ids=input_ids, attention_mask=attention_masks, max_length=256, do_sample=True, top_k=150, top_p=0.99, early_stopping=True,
def create_student_by_copying_alternating_layers( teacher: Union[str, PreTrainedModel], save_path: Union[str, Path] = "student", e: Union[int, None] = None, d: Union[int, None] = None, copy_first_teacher_layers=False, **extra_config_kwargs ) -> Tuple[PreTrainedModel, List[int], List[int]]: """Make a student by copying alternating layers from a teacher, save it to save_path. Args: teacher: str or PreTrainedModel if str, this will call AutoModelForSeq2SeqLM.from_pretrained(teacher) before copying layers save_path: where to save the student, defaults to student directory. e: how many Encoder layers should the student have, default is fully copy of teacher d: how many Decoder layers should the student have, default is fully copy of teacher copy_first_teacher_layers: [bool] dont copy alternating layers, just the first e/d. **extra_config_kwargs: extra kwargs to pass to the student, by default the teacher config is used. Returns: student: new, smaller model. (Also saves it to save_path) e_layers_to_copy: list of which teacher encoder layers were used d_layers_to_copy: list of which teacher decoder layers were used """ _msg = "encoder_layers and decoder_layers cannot be both None-- you would just have an identical teacher." assert (e is not None) or (d is not None), _msg if isinstance(teacher, str): AutoTokenizer.from_pretrained(teacher).save_pretrained(save_path) # purely for convenience teacher = AutoModelForSeq2SeqLM.from_pretrained(teacher).eval() else: assert isinstance(teacher, PreTrainedModel), f"teacher must be a model or string got type {type(teacher)}" init_kwargs = teacher.config.to_diff_dict() try: teacher_e, teacher_d = teacher.config.encoder_layers, teacher.config.decoder_layers if e is None: e = teacher_e if d is None: d = teacher_d init_kwargs.update({"encoder_layers": e, "decoder_layers": d}) except AttributeError: # T5 teacher_e, teacher_d = teacher.config.num_layers, teacher.config.num_decoder_layers if e is None: e = teacher_e if d is None: d = teacher_d init_kwargs.update({"num_layers": e, "num_decoder_layers": d}) # Kwargs to instantiate student: teacher kwargs with updated layer numbers + **extra_config_kwargs init_kwargs.update(extra_config_kwargs) # Copy weights student_cfg = teacher.config_class(**init_kwargs) student = AutoModelForSeq2SeqLM.from_config(student_cfg) # Start by copying the full teacher state dict this will copy the first N teacher layers to the student. info = student.load_state_dict(teacher.state_dict(), strict=False) assert info.missing_keys == [], info.missing_keys # every student key should have a teacher keys. if copy_first_teacher_layers: # Our copying is done. We just log and save e_layers_to_copy, d_layers_to_copy = list(range(e)), list(range(d)) logger.info( f"Copied encoder layers {e_layers_to_copy} and decoder layers {d_layers_to_copy}. Saving them to {save_path}" ) student.save_pretrained(save_path) return student, e_layers_to_copy, d_layers_to_copy # Decide which layers of the teacher to copy. Not exactly alternating -- we try to keep first and last layer. e_layers_to_copy: List[int] = pick_layers_to_copy(e, teacher_e) d_layers_to_copy: List[int] = pick_layers_to_copy(d, teacher_d) try: copy_layers(teacher.model.encoder.layers, student.model.encoder.layers, e_layers_to_copy) copy_layers(teacher.model.decoder.layers, student.model.decoder.layers, d_layers_to_copy) except AttributeError: # For t5, student.model.encoder.layers is called student.encoder.block copy_layers(teacher.encoder.block, student.encoder.block, e_layers_to_copy) copy_layers(teacher.decoder.block, student.decoder.block, d_layers_to_copy) logger.info( f"Copied encoder layers {e_layers_to_copy} and decoder layers {d_layers_to_copy}. Saving them to {save_path}" ) student.config.init_metadata = dict( teacher_type=teacher.config.model_type, copied_encoder_layers=e_layers_to_copy, copied_decoder_layers=d_layers_to_copy, ) student.save_pretrained(save_path) # Save information about copying for easier reproducibility return student, e_layers_to_copy, d_layers_to_copy
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. args = BaseArguments() args = args.parse_args() parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) model_args, data_args, training_args = parser.parse_json_file(json_file=args.config_file) assert model_args.model_type in list(MODEL_TYPE_TO_TOKENIZER.keys()), "model type should be 't5' or 'bart' or gpt2" if ( os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir ): raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome." ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN, ) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", training_args.local_rank, training_args.device, training_args.n_gpu, bool(training_args.local_rank != -1), training_args.fp16, ) logger.info("Training/evaluation parameters %s", training_args) # Set seed set_seed(training_args.seed) # Load pretrained model and tokenizer tokenizer_cls = MODEL_TYPE_TO_TOKENIZER[model_args.model_type] tokenizer = tokenizer_cls.from_pretrained( model_args.tokenizer_name_or_path if model_args.tokenizer_name_or_path else model_args.model_name_or_path, ) model = AutoModelForSeq2SeqLM.from_pretrained( model_args.model_name_or_path, ) # Extend the emb dim for special toks model.resize_token_embeddings(len(tokenizer)) if model_args.freeze_embeds: logger.info("freezing embeddings of the model") freeze_embeds(model) assert_not_all_frozen(model) # Get datasets logger.info('loading dataset') train_dataset = torch.load('data/' + data_args.train_file_path) if training_args.do_train else None eval_dataset = torch.load('data/' + data_args.eval_file_path) if training_args.do_eval else None # breakpoint() logger.info('finished loading dataset') # Initialize data_collator data_collator = T2TDataCollator( tokenizer=tokenizer, model_type=model_args.model_type, mode="training", using_tpu=training_args.tpu_num_cores is not None ) # Initialize our Trainer trainer = Trainer( model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset, data_collator=data_collator, prediction_loss_only=True, label_smoothing=model_args.label_smoothing ) # breakpoint() # disable wandb console logs logging.getLogger('wandb.run_manager').setLevel(logging.WARNING) # Training if training_args.do_train: trainer.train( model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None ) trainer.save_model() # For convenience, we also re-save the tokenizer to the same directory, # so that you can share your model easily on huggingface.co/models =) if trainer.is_world_master(): tokenizer.save_pretrained(training_args.output_dir) # Evaluation results = {} if training_args.do_eval and training_args.local_rank in [-1, 0]: logger.info("*** Evaluate ***") eval_output = trainer.evaluate() output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(eval_output.keys()): logger.info(" %s = %s", key, str(eval_output[key])) writer.write("%s = %s\n" % (key, str(eval_output[key]))) results.update(eval_output) logger.info('Results {}'.format(results)) return results
def __init__(self): self.url = "https://vz.ru/" model_name = 'Helsinki-NLP/opus-mt-ru-en' self.tokenizer = AutoTokenizer.from_pretrained(model_name) self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) if (os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir): raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome." ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN, ) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", training_args.local_rank, training_args.device, training_args.n_gpu, bool(training_args.local_rank != -1), training_args.fp16, ) logger.info("Training/evaluation parameters %s", training_args) # Set seed set_seed(training_args.seed) # Load pretrained model and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, ) extra_model_params = ("encoder_layerdrop", "decoder_layerdrop", "dropout", "attention_dropout") for p in extra_model_params: if getattr(training_args, p, None): assert hasattr( config, p ), f"({config.__class__.__name__}) doesn't have a `{p}` attribute" setattr(config, p, getattr(training_args, p)) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, ) model = AutoModelForSeq2SeqLM.from_pretrained( model_args.model_name_or_path, from_tf=".ckpt" in model_args.model_name_or_path, config=config, cache_dir=model_args.cache_dir, ) # use task specific params use_task_specific_params(model, data_args.task) # set num_beams for evaluation if data_args.eval_beams is None: data_args.eval_beams = model.config.num_beams # set decoder_start_token_id for MBart if model.config.decoder_start_token_id is None and isinstance( tokenizer, MBartTokenizer): assert (data_args.tgt_lang is not None and data_args.src_lang is not None), "mBart requires --tgt_lang and --src_lang" model.config.decoder_start_token_id = tokenizer.lang_code_to_id[ data_args.tgt_lang] if model_args.freeze_embeds: freeze_embeds(model) if model_args.freeze_encoder: freeze_params(model.get_encoder()) assert_all_frozen(model.get_encoder()) dataset_class = Seq2SeqDataset if hasattr( tokenizer, "prepare_seq2seq_batch") else LegacySeq2SeqDataset # Get datasets train_dataset = (dataset_class( tokenizer, type_path="train", data_dir=data_args.data_dir, n_obs=data_args.n_train, max_target_length=data_args.max_target_length, max_source_length=data_args.max_source_length, prefix=model.config.prefix or "", ) if training_args.do_train else None) eval_dataset = (dataset_class( tokenizer, type_path="val", data_dir=data_args.data_dir, n_obs=data_args.n_val, max_target_length=data_args.val_max_target_length, max_source_length=data_args.max_source_length, prefix=model.config.prefix or "", ) if training_args.do_eval or training_args.evaluation_strategy != EvaluationStrategy.NO else None) test_dataset = (dataset_class( tokenizer, type_path="test", data_dir=data_args.data_dir, n_obs=data_args.n_test, max_target_length=data_args.test_max_target_length, max_source_length=data_args.max_source_length, prefix=model.config.prefix or "", ) if training_args.do_predict else None) # Initialize our Trainer compute_metrics_fn = (build_compute_metrics_fn(data_args.task, tokenizer) if training_args.predict_with_generate else None) trainer = Seq2SeqTrainer( model=model, config=config, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset, data_collator=Seq2SeqDataCollator(tokenizer, data_args, training_args.tpu_num_cores), compute_metrics=compute_metrics_fn, data_args=data_args, ) # Training if training_args.do_train: trainer.train(model_path=model_args.model_name_or_path if os.path. isdir(model_args.model_name_or_path) else None) trainer.save_model() # For convenience, we also re-save the tokenizer to the same directory, # so that you can share your model easily on huggingface.co/models =) if trainer.is_world_process_zero(): trainer.state.save_to_json( os.path.join(training_args.output_dir, "trainer_state.json")) tokenizer.save_pretrained(training_args.output_dir) # Evaluation eval_results = {} if training_args.do_eval: logger.info("*** Evaluate ***") result = trainer.evaluate() if trainer.is_world_process_zero(): logger.info("***** Eval results *****") for key, value in result.items(): logger.info(" %s = %s", key, value) save_json( result, os.path.join(training_args.output_dir, "eval_results.json")) eval_results.update(result) if training_args.do_predict: logging.info("*** Test ***") test_output = trainer.predict(test_dataset=test_dataset) test_metrics = { k.replace("eval", "test"): v for k, v in test_output.metrics.items() } if trainer.is_world_process_zero(): logger.info("***** Test results *****") for key, value in test_metrics.items(): logger.info(" %s = %s", key, value) save_json( test_metrics, os.path.join(training_args.output_dir, "test_results.json")) eval_results.update(test_metrics) if training_args.predict_with_generate: test_preds = tokenizer.batch_decode( test_output.predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True) test_preds = lmap(str.strip, test_preds) write_txt_file( test_preds, os.path.join(training_args.output_dir, "test_generations.txt")) if trainer.is_world_process_zero(): save_json(eval_results, "all_results.json") return eval_results
def __init__(self, hparams): assert Path(hparams.data_dir).exists() self.output_dir = Path(hparams.output_dir) self.output_dir.mkdir(exist_ok=True) save_dir = self.output_dir.joinpath("student") hparams.model_name_or_path = str( save_dir) # Tell lightning we are training the student teacher = AutoModelForSeq2SeqLM.from_pretrained(hparams.teacher).eval() use_task_specific_params( teacher, hparams.task ) # We copy good generation parameters to student by default if hparams.student is not None: student = AutoModelForSeq2SeqLM.from_pretrained(hparams.student) use_task_specific_params(student, hparams.task) e_layer_ids, d_layer_ids = None, None else: student, e_layer_ids, d_layer_ids = create_student_by_copying_alternating_layers( teacher, e=hparams.student_encoder_layers, d=hparams.student_decoder_layers, save_path=save_dir) if hparams.length_penalty != -1: student.config.length_penalty = hparams.length_penalty hparams.tokenizer_name = hparams.teacher # Use teacher's tokenizer super().__init__(hparams, model=student, config=student.config) assert ( student.config.model_type == teacher.config.model_type ), f"teacher, student model types should be the same, got {student.config.model_type} != {teacher.config.model_type}" if student.config.model_type == "t5": student_encoder_layers = len(student.get_encoder().block) student_decoder_layers = len(student.get_decoder().block) teacher_encoder_layers = len(teacher.get_encoder().block) teacher_decoder_layers = len(teacher.get_decoder().block) else: student_encoder_layers = student.config.encoder_layers student_decoder_layers = student.config.decoder_layers teacher_encoder_layers = teacher.config.encoder_layers teacher_decoder_layers = teacher.config.decoder_layers self.different_base_models = not (hparams.student is None or hparams.teacher == hparams.student) self.do_calc_hidden_loss = ( not self.different_base_models) and hparams.alpha_hid > 0 self.different_encoder = self.different_base_models or ( student_encoder_layers != teacher_encoder_layers) # self.different_encoder determines whether we need to run the teacher encoder self.teacher = teacher freeze_params(self.teacher) if not self.different_encoder: # To save RAM, delete teacher encoder and freeze student encoder. try: del self.teacher.model.encoder except AttributeError: # T5 del self.teacher.encoder if e_layer_ids is None: e_layer_ids = list(range(student_encoder_layers)) if d_layer_ids is None: d_layer_ids = list(range(student_decoder_layers)) self.e_layer_ids, self.d_layer_ids = e_layer_ids, d_layer_ids # type: List[int], List[int] if self.do_calc_hidden_loss: # Intermediate supervision: Decide which layers to supervise if hparams.supervise_forward: self.e_matches = get_layers_to_supervise( n_student=len(self.e_layer_ids), n_teacher=teacher_encoder_layers) self.d_matches = get_layers_to_supervise( n_student=len(self.d_layer_ids), n_teacher=teacher_decoder_layers) else: # student layer should emulate hidden states of the teacher layer it was copied from self.e_matches = self.e_layer_ids self.d_matches = self.d_layer_ids else: self.e_matches = None self.d_matches = None self.ce_loss_fct = nn.KLDivLoss(reduction="batchmean") self.temperature = 2.0 self.alpha_mlm = hparams.alpha_mlm self.alpha_ce = hparams.alpha_ce self.alpha_hid = hparams.alpha_hid gc.collect() torch.cuda.empty_cache()
def pipeline( task: str, model: Optional = None, tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None, use_cuda: Optional[bool] = True, ) -> K2TPipeline: """ :param task: (:obj:`str`): The task defining which pipeline will be returned. Currently accepted tasks are: - :obj:`"k2t"`: will return a :class:`K2TPipeline` which is based on the k2t model based on t5-small - :obj:`"k2t-tiny"`: will return a :class:`K2TPipeline` which is based on the k2t model based on t5-tiny - :obj:`"k2t-base"`: will return a :class:`K2TPipeline` which is based on the k2t model based on t5-base :param model: (:obj:`str` or `optional`): The model that will be used by the pipeline to make predictions. If not provided, the default for the :obj:`task` will be loaded. :param tokenizer: (:obj:`str` or `optional`): The tokenizer that will be used by the pipeline to encode data for the model. This can be a model identifier or an actual pretrained tokenizer inheriting from :class:`~transformers.PreTrainedTokenizer`. If not provided, the default tokenizer for the given :obj:`model` will be loaded (if it is a string). :param use_cuda: (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether or not to use a GPU or not Default: True :return: (:class:): `K2TPipeline`: A Keytotext pipeline for the task. """ if task not in SUPPORTED_TASKS: raise KeyError("Unknown task {}, available tasks are {}".format( task, list(SUPPORTED_TASKS.keys()))) targeted_task = SUPPORTED_TASKS[task] task_class = targeted_task["impl"] if model is None: model = targeted_task["default"]["model"] if tokenizer is None: if isinstance(model, str): tokenizer = model else: # Impossible to guest what is the right tokenizer here raise Exception( "Please provided a PretrainedTokenizer " "class or a path/identifier to a pretrained tokenizer.") if isinstance(tokenizer, (str, tuple)): tokenizer = AutoTokenizer.from_pretrained(tokenizer) # Instantiate model if needed if isinstance(model, str): model = AutoModelForSeq2SeqLM.from_pretrained(model) if task == "k2t": return task_class(model=model, tokenizer=tokenizer, use_cuda=use_cuda) if task == "k2t-base": return task_class(model=model, tokenizer=tokenizer, use_cuda=use_cuda) if task == "mrm8488/t5-base-finetuned-common_gen": return task_class(model=model, tokenizer=tokenizer, use_cuda=use_cuda)
# -*- coding: utf-8 -*- # @Time : 2021/2/8 11:28 # @Author : Jclian91 # @File : en_zh_mt.py # @Place : Yangpu, Shanghai import shap from transformers import AutoTokenizer, AutoModelForSeq2SeqLM tokenizer = AutoTokenizer.from_pretrained("./opus-mt-en-zh") model = AutoModelForSeq2SeqLM.from_pretrained("./opus-mt-en-zh") text = "In terms of time, the Chinese space station was built more than 20 years later than the International Space Station." # Tokenize the text batch = tokenizer.prepare_seq2seq_batch(src_texts=[text]) # Make sure that the tokenized text does not exceed the maximum # allowed size of 512 batch["input_ids"] = batch["input_ids"][:, :512] batch["attention_mask"] = batch["attention_mask"][:, :512] # Perform the translation and decode the output translation = model.generate(**batch) result = tokenizer.batch_decode(translation, skip_special_tokens=True) print(result)
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) # Detecting last checkpoint. last_checkpoint = None if os.path.isdir( training_args.output_dir ) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is None and len(os.listdir( training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome.") elif last_checkpoint is not None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) logger.setLevel(logging.INFO if is_main_process(training_args.local_rank ) else logging.WARN) # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) # Set the verbosity to info of the Transformers logger (on main process only): if is_main_process(training_args.local_rank): transformers.utils.logging.set_verbosity_info() logger.info("Training/evaluation parameters %s", training_args) # Set seed before initializing model. set_seed(training_args.seed) # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below) # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ # (the dataset will be downloaded automatically from the datasets Hub). # # For CSV/JSON files in the summarization task, this script will use the first column for the full texts and the # second column for the summaries (unless you specify column names for this with the `text_column` and # `summary_column` arguments). # For translation, only JSON files are supported, with one field named "translation" containing two keys for the # source and target languages (unless you adapt what follows). # # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. if data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name) else: data_files = {} if data_args.train_file is not None: data_files["train"] = data_args.train_file extension = data_args.train_file.split(".")[-1] if data_args.validation_file is not None: data_files["validation"] = data_args.validation_file extension = data_args.validation_file.split(".")[-1] datasets = load_dataset(extension, data_files=data_files) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. # Load pretrained model and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) model = AutoModelForSeq2SeqLM.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) # Set decoder_start_token_id if model.config.decoder_start_token_id is None and isinstance( tokenizer, MBartTokenizer): model.config.decoder_start_token_id = tokenizer.lang_code_to_id[ data_args.target_lang] if model.config.decoder_start_token_id is None: raise ValueError( "Make sure that `config.decoder_start_token_id` is correctly defined" ) # Get the default prefix if None is passed. if data_args.source_prefix is None: task_specific_params = model.config.task_specific_params if task_specific_params is not None: prefix = task_specific_params.get("prefix", "") else: prefix = "" else: prefix = data_args.source_prefix # Preprocessing the datasets. # We need to tokenize inputs and targets. if training_args.do_train: column_names = datasets["train"].column_names else: column_names = datasets["validation"].column_names # For translation we set the codes of our source and target languages (only useful for mBART, the others will # ignore those attributes). if data_args.task.startswith("translation"): if data_args.source_lang is not None: tokenizer.src_lang = data_args.source_lang if data_args.target_lang is not None: tokenizer.tgt_lang = data_args.target_lang # To serialize preprocess_function below, each of those four variables needs to be defined (even if we won't use # them all). source_lang, target_lang, text_column, summary_column = None, None, None, None if data_args.task.startswith("summarization"): # Get the column names for input/target. dataset_columns = summarization_name_mapping.get( data_args.dataset_name, None) if data_args.text_column is None: text_column = dataset_columns[ 0] if dataset_columns is not None else column_names[0] else: text_column = data_args.text_column if data_args.summary_column is None: summary_column = dataset_columns[ 1] if dataset_columns is not None else column_names[1] else: summary_column = data_args.summary_column else: # Get the language codes for input/target. lang_search = re.match("translation_([a-z]+)_to_([a-z]+)", data_args.task) if data_args.source_lang is not None: source_lang = data_args.source_lang.split("_")[0] else: assert ( lang_search is not None ), "Provide a source language via --source_lang or rename your task 'translation_xx_to_yy'." source_lang = lang_search.groups()[0] if data_args.target_lang is not None: target_lang = data_args.target_lang.split("_")[0] else: assert ( lang_search is not None ), "Provide a target language via --target_lang or rename your task 'translation_xx_to_yy'." target_lang = lang_search.groups()[1] # Temporarily set max_target_length for training. max_target_length = data_args.max_target_length padding = "max_length" if data_args.pad_to_max_length else False if training_args.label_smoothing_factor > 0 and not hasattr( model, "prepare_decoder_input_ids_from_labels"): logger.warn( "label_smoothing is enabled but the `prepare_decoder_input_ids_from_labels` method is not defined for" f"`{model.__class__.__name__}`. This will lead to loss being calculated twice and will take up more memory" ) def preprocess_function(examples): if data_args.task.startswith("translation"): inputs = [ex[source_lang] for ex in examples["translation"]] targets = [ex[target_lang] for ex in examples["translation"]] else: inputs = examples[text_column] targets = examples[summary_column] inputs = [prefix + inp for inp in inputs] model_inputs = tokenizer(inputs, max_length=data_args.max_source_length, padding=padding, truncation=True) # Setup the tokenizer for targets with tokenizer.as_target_tokenizer(): labels = tokenizer(targets, max_length=max_target_length, padding=padding, truncation=True) # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore # padding in the loss. if padding == "max_length" and data_args.ignore_pad_token_for_loss: labels["input_ids"] = [[ (l if l != tokenizer.pad_token_id else -100) for l in label ] for label in labels["input_ids"]] model_inputs["labels"] = labels["input_ids"] return model_inputs if training_args.do_train: train_dataset = datasets["train"] if data_args.max_train_samples is not None: train_dataset = train_dataset.select( range(data_args.max_train_samples)) train_dataset = train_dataset.map( preprocess_function, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, ) if training_args.do_eval: max_target_length = data_args.val_max_target_length eval_dataset = datasets["validation"] if data_args.max_val_samples is not None: eval_dataset = eval_dataset.select(range( data_args.max_val_samples)) eval_dataset = eval_dataset.map( preprocess_function, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, ) # Data collator label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id if data_args.pad_to_max_length: data_collator = default_data_collator else: data_collator = DataCollatorForSeq2Seq( tokenizer, model=model, label_pad_token_id=label_pad_token_id, pad_to_multiple_of=8 if training_args.fp16 else None, ) # Metric metric_name = "rouge" if data_args.task.startswith( "summarization") else "sacrebleu" metric = load_metric(metric_name) def compute_metrics(eval_preds): preds, labels = eval_preds if isinstance(preds, tuple): preds = preds[0] decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True) if data_args.ignore_pad_token_for_loss: # Replace -100 in the labels as we can't decode them. labels = np.where(labels != -100, labels, tokenizer.pad_token_id) decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) # Some simple post-processing decoded_preds = [pred.strip() for pred in decoded_preds] decoded_labels = [label.strip() for label in decoded_labels] if metric_name == "sacrebleu": decoded_labels = [[label] for label in decoded_labels] result = metric.compute(predictions=decoded_preds, references=decoded_labels) # Extract a few results from ROUGE if metric_name == "rouge": result = { key: value.mid.fmeasure * 100 for key, value in result.items() } else: result = {"bleu": result["score"]} prediction_lens = [ np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds ] result["gen_len"] = np.mean(prediction_lens) return result # Initialize our Trainer trainer = Seq2SeqTrainer( model=model, args=training_args, train_dataset=train_dataset if training_args.do_train else None, eval_dataset=eval_dataset if training_args.do_eval else None, tokenizer=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics if training_args.predict_with_generate else None, ) # Training if training_args.do_train: if last_checkpoint is not None: checkpoint = last_checkpoint elif os.path.isdir(model_args.model_name_or_path): checkpoint = model_args.model_name_or_path else: checkpoint = None train_result = trainer.train(resume_from_checkpoint=checkpoint) trainer.save_model() # Saves the tokenizer too for easy upload output_train_file = os.path.join(training_args.output_dir, "train_results.txt") if trainer.is_world_process_zero(): with open(output_train_file, "w") as writer: logger.info("***** Train results *****") for key, value in sorted(train_result.metrics.items()): logger.info(f" {key} = {value}") writer.write(f"{key} = {value}\n") # Need to save the state, since Trainer.save_model saves only the tokenizer with the model trainer.state.save_to_json( os.path.join(training_args.output_dir, "trainer_state.json")) # Evaluation results = {} if training_args.do_eval: logger.info("*** Evaluate ***") results = trainer.evaluate() output_eval_file = os.path.join(training_args.output_dir, "eval_results_seq2seq.txt") if trainer.is_world_process_zero(): with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key, value in sorted(results.items()): logger.info(f" {key} = {value}") writer.write(f"{key} = {value}\n") return results
def get_model(self) -> AutoModelForSeq2SeqLM: # pylint: disable=no-value-for-parameter return AutoModelForSeq2SeqLM.from_pretrained(self.get_model_path(), local_files_only = True)
def model(self): return AutoModelForSeq2SeqLM.from_pretrained( self.checkpoint_name).to(torch_device)
from torchtext.datasets import Multi30k import torch import torch.nn.functional as F def distillation(y, teacher_scores, labels, T, alpha): p = F.log_softmax(y/T, dim=1) q = F.softmax(teacher_scores/T, dim=1) l_kl = F.kl_div(p, q, size_average=False) * (T**2) / y.shape[0] l_ce = F.cross_entropy(y, labels) return l_kl * alpha + l_ce * (1. - alpha) device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') german = AutoTokenizer.from_pretrained("google/bert2bert_L-24_wmt_de_en", pad_token="<pad>", eos_token="</s>", bos_token="<s>", unk_token="<unk>") english = AutoTokenizer.from_pretrained("google/bert2bert_L-24_wmt_en_de", pad_token="<pad>", eos_token="</s>", bos_token="<s>", unk_token="<unk>") model = AutoModelForSeq2SeqLM.from_pretrained("google/bert2bert_L-24_wmt_de_en").to(device) sentence = "Willst du einen Kaffee trinken gehen mit mir?" inputs = german(sentence, return_tensors="pt") labels = torch.tensor([1]).unsqueeze(0) # Batch size 1 logits, a = model(input_ids=inputs.input_ids, decoder_input_ids=labels) print(logits.shape, a.shape) out = model.generate(input_ids=inputs.input_ids, output_hidden_states=True) print(out) # print(english.decode(model.generate(), skip_special_tokens=True)) # model. # print(model(input_ids=input_ids)) # print(tokenizer.decode(output_ids, skip_special_tokens=True)) # # student_model =
def __init__(self, hparams): assert Path(hparams.data_dir).exists() self.output_dir = Path(hparams.output_dir) self.output_dir.mkdir(exist_ok=True) save_dir = self.output_dir.joinpath("student") hparams.model_name_or_path = str( save_dir) # Tell lightning we are training the student teacher = AutoModelForSeq2SeqLM.from_pretrained(hparams.teacher).eval() use_task_specific_params( teacher, hparams.task ) # We copy good generation parameters to student by default student, e_layer_ids, d_layer_ids = create_student_by_copying_alternating_layers( teacher, e=hparams.student_encoder_layers, d=hparams.student_decoder_layers, save_path=save_dir) if hparams.length_penalty != -1: student.config.length_penalty = hparams.length_penalty super().__init__(hparams, model=student, config=student.config) model_type = student.config.model_type self.e_layer_ids, self.d_layer_ids = e_layer_ids, d_layer_ids # type: List[int], List[int] if model_type == "t5": teacher_encoder_layers = len(teacher.get_encoder().block) teacher_decoder_layers = len(teacher.get_decoder().block) else: teacher_encoder_layers = teacher.config.encoder_layers teacher_decoder_layers = teacher.config.decoder_layers self.different_encoder = hparams.student_encoder_layers != teacher_encoder_layers self.different_decoder = hparams.student_decoder_layers != teacher_decoder_layers self.teacher = teacher freeze_params(self.teacher) if not self.different_encoder: # To save RAM, delete teacher encoder and freeze student encoder. try: del self.teacher.model.encoder except AttributeError: # T5 del self.teacher.encoder # Intermediate supervision: Decide which layers to supervise if hparams.supervise_forward: self.e_matches = get_layers_to_supervise( n_student=len(self.e_layer_ids), n_teacher=teacher_encoder_layers) self.d_matches = get_layers_to_supervise( n_student=len(self.d_layer_ids), n_teacher=teacher_decoder_layers) else: # student layer should emulate hidden states of the teacher layer it was copied from self.e_matches = self.e_layer_ids self.d_matches = self.d_layer_ids self.ce_loss_fct = nn.KLDivLoss(reduction="batchmean") self.temperature = 2.0 self.alpha_mlm = hparams.alpha_mlm self.alpha_ce = hparams.alpha_ce self.alpha_hid = hparams.alpha_hid gc.collect() torch.cuda.empty_cache()
def test_rag_sequence_from_pretrained(self): rag_config = self.get_rag_config() rag_decoder_tokenizer = BartTokenizer.from_pretrained( "facebook/bart-large-cnn") rag_question_encoder_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained( "facebook/dpr-question_encoder-single-nq-base") rag_retriever = RagRetriever( rag_config, question_encoder_tokenizer=rag_question_encoder_tokenizer, generator_tokenizer=rag_decoder_tokenizer, ) input_ids = rag_question_encoder_tokenizer( "who sings does he love me with reba", return_tensors="pt").input_ids decoder_input_ids = rag_decoder_tokenizer( "Linda Davis", return_tensors="pt").input_ids input_ids = input_ids.to(torch_device) decoder_input_ids = decoder_input_ids.to(torch_device) with tempfile.TemporaryDirectory() as tmp_dirname: rag_sequence = RagSequenceForGeneration.from_pretrained_question_encoder_generator( "facebook/dpr-question_encoder-single-nq-base", "facebook/bart-large-cnn", retriever=rag_retriever, config=rag_config, ).to(torch_device) # check that the from pretrained methods work rag_sequence.save_pretrained(tmp_dirname) rag_sequence.from_pretrained(tmp_dirname, retriever=rag_retriever) rag_sequence.to(torch_device) with torch.no_grad(): output = rag_sequence( input_ids, labels=decoder_input_ids, ) loss_pretrained = output.loss del rag_sequence question_encoder = AutoModel.from_pretrained( "facebook/dpr-question_encoder-single-nq-base") generator = AutoModelForSeq2SeqLM.from_pretrained( "facebook/bart-large-cnn") rag_sequence = RagSequenceForGeneration( config=rag_config, question_encoder=question_encoder, generator=generator, retriever=rag_retriever) rag_sequence.to(torch_device) with torch.no_grad(): output = rag_sequence( input_ids, labels=decoder_input_ids, ) loss_init = output.loss self.assertAlmostEqual(loss_pretrained.item(), loss_init.item(), places=4)
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) log_level = training_args.get_node_log_level() logger.setLevel(log_level) datasets.utils.logging.set_verbosity(log_level) transformers.utils.logging.set_verbosity(log_level) # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") if data_args.source_prefix is None and model_args.model_name_or_path in [ "t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b", ]: logger.warning( "You're running a t5 model but didn't provide a source prefix, which is expected, e.g. with " "`--source_prefix 'translate English to German: ' `") # Detecting last checkpoint. last_checkpoint = None if os.path.isdir( training_args.output_dir ) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is None and len(os.listdir( training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome.") elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) # Set seed before initializing model. set_seed(training_args.seed) # Get the datasets: you can either provide your own JSON training and evaluation files (see below) # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ # (the dataset will be downloaded automatically from the datasets Hub). # # For translation, only JSON files are supported, with one field named "translation" containing two keys for the # source and target languages (unless you adapt what follows). # # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. if data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. raw_datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir) else: data_files = {} if data_args.train_file is not None: data_files["train"] = data_args.train_file extension = data_args.train_file.split(".")[-1] if data_args.validation_file is not None: data_files["validation"] = data_args.validation_file extension = data_args.validation_file.split(".")[-1] if data_args.test_file is not None: data_files["test"] = data_args.test_file extension = data_args.test_file.split(".")[-1] raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. # Load pretrained model and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) model = AutoModelForSeq2SeqLM.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) model.resize_token_embeddings(len(tokenizer)) # Set decoder_start_token_id if model.config.decoder_start_token_id is None and isinstance( tokenizer, (MBartTokenizer, MBartTokenizerFast)): if isinstance(tokenizer, MBartTokenizer): model.config.decoder_start_token_id = tokenizer.lang_code_to_id[ data_args.target_lang] else: model.config.decoder_start_token_id = tokenizer.convert_tokens_to_ids( data_args.target_lang) if model.config.decoder_start_token_id is None: raise ValueError( "Make sure that `config.decoder_start_token_id` is correctly defined" ) prefix = data_args.source_prefix if data_args.source_prefix is not None else "" # Preprocessing the datasets. # We need to tokenize inputs and targets. if training_args.do_train: column_names = raw_datasets["train"].column_names elif training_args.do_eval: column_names = raw_datasets["validation"].column_names elif training_args.do_predict: column_names = raw_datasets["test"].column_names else: logger.info( "There is nothing to do. Please pass `do_train`, `do_eval` and/or `do_predict`." ) return # For translation we set the codes of our source and target languages (only useful for mBART, the others will # ignore those attributes). if isinstance(tokenizer, tuple(MULTILINGUAL_TOKENIZERS)): assert data_args.target_lang is not None and data_args.source_lang is not None, ( f"{tokenizer.__class__.__name__} is a multilingual tokenizer which requires --source_lang and " "--target_lang arguments.") tokenizer.src_lang = data_args.source_lang tokenizer.tgt_lang = data_args.target_lang # For multilingual translation models like mBART-50 and M2M100 we need to force the target language token # as the first generated token. We ask the user to explicitly provide this as --forced_bos_token argument. forced_bos_token_id = ( tokenizer.lang_code_to_id[data_args.forced_bos_token] if data_args.forced_bos_token is not None else None) model.config.forced_bos_token_id = forced_bos_token_id # Get the language codes for input/target. source_lang = data_args.source_lang.split("_")[0] target_lang = data_args.target_lang.split("_")[0] # Temporarily set max_target_length for training. max_target_length = data_args.max_target_length padding = "max_length" if data_args.pad_to_max_length else False if training_args.label_smoothing_factor > 0 and not hasattr( model, "prepare_decoder_input_ids_from_labels"): logger.warning( "label_smoothing is enabled but the `prepare_decoder_input_ids_from_labels` method is not defined for" f"`{model.__class__.__name__}`. This will lead to loss being calculated twice and will take up more memory" ) def preprocess_function(examples): inputs = [ex[source_lang] for ex in examples["translation"]] targets = [ex[target_lang] for ex in examples["translation"]] inputs = [prefix + inp for inp in inputs] model_inputs = tokenizer(inputs, max_length=data_args.max_source_length, padding=padding, truncation=True) # Setup the tokenizer for targets with tokenizer.as_target_tokenizer(): labels = tokenizer(targets, max_length=max_target_length, padding=padding, truncation=True) # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore # padding in the loss. if padding == "max_length" and data_args.ignore_pad_token_for_loss: labels["input_ids"] = [[ (l if l != tokenizer.pad_token_id else -100) for l in label ] for label in labels["input_ids"]] model_inputs["labels"] = labels["input_ids"] return model_inputs if training_args.do_train: if "train" not in raw_datasets: raise ValueError("--do_train requires a train dataset") train_dataset = raw_datasets["train"] if data_args.max_train_samples is not None: train_dataset = train_dataset.select( range(data_args.max_train_samples)) train_dataset = train_dataset.map( preprocess_function, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, desc="Running tokenizer on train dataset", ) if training_args.do_eval: max_target_length = data_args.val_max_target_length if "validation" not in raw_datasets: raise ValueError("--do_eval requires a validation dataset") eval_dataset = raw_datasets["validation"] if data_args.max_eval_samples is not None: eval_dataset = eval_dataset.select( range(data_args.max_eval_samples)) eval_dataset = eval_dataset.map( preprocess_function, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, desc="Running tokenizer on validation dataset", ) if training_args.do_predict: max_target_length = data_args.val_max_target_length if "test" not in raw_datasets: raise ValueError("--do_predict requires a test dataset") predict_dataset = raw_datasets["test"] if data_args.max_predict_samples is not None: predict_dataset = predict_dataset.select( range(data_args.max_predict_samples)) predict_dataset = predict_dataset.map( preprocess_function, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, desc="Running tokenizer on prediction dataset", ) # Data collator label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id if data_args.pad_to_max_length: data_collator = default_data_collator else: data_collator = DataCollatorForSeq2Seq( tokenizer, model=model, label_pad_token_id=label_pad_token_id, pad_to_multiple_of=8 if training_args.fp16 else None, ) # Metric metric = load_metric("sacrebleu") def postprocess_text(preds, labels): preds = [pred.strip() for pred in preds] labels = [[label.strip()] for label in labels] return preds, labels def compute_metrics(eval_preds): preds, labels = eval_preds if isinstance(preds, tuple): preds = preds[0] decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True) if data_args.ignore_pad_token_for_loss: # Replace -100 in the labels as we can't decode them. labels = np.where(labels != -100, labels, tokenizer.pad_token_id) decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) # Some simple post-processing decoded_preds, decoded_labels = postprocess_text( decoded_preds, decoded_labels) result = metric.compute(predictions=decoded_preds, references=decoded_labels) result = {"bleu": result["score"]} prediction_lens = [ np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds ] result["gen_len"] = np.mean(prediction_lens) result = {k: round(v, 4) for k, v in result.items()} return result # Initialize our Trainer trainer = Seq2SeqTrainer( model=model, args=training_args, train_dataset=train_dataset if training_args.do_train else None, eval_dataset=eval_dataset if training_args.do_eval else None, tokenizer=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics if training_args.predict_with_generate else None, ) # Training if training_args.do_train: checkpoint = None if training_args.resume_from_checkpoint is not None: checkpoint = training_args.resume_from_checkpoint elif last_checkpoint is not None: checkpoint = last_checkpoint train_result = trainer.train(resume_from_checkpoint=checkpoint) trainer.save_model() # Saves the tokenizer too for easy upload metrics = train_result.metrics max_train_samples = (data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)) metrics["train_samples"] = min(max_train_samples, len(train_dataset)) trainer.log_metrics("train", metrics) trainer.save_metrics("train", metrics) trainer.save_state() # Evaluation results = {} if training_args.do_eval: logger.info("*** Evaluate ***") metrics = trainer.evaluate(max_length=data_args.val_max_target_length, num_beams=data_args.num_beams, metric_key_prefix="eval") max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len( eval_dataset) metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) trainer.log_metrics("eval", metrics) trainer.save_metrics("eval", metrics) if training_args.do_predict: logger.info("*** Predict ***") predict_results = trainer.predict( predict_dataset, metric_key_prefix="predict", max_length=data_args.val_max_target_length, num_beams=data_args.num_beams, ) metrics = predict_results.metrics max_predict_samples = (data_args.max_predict_samples if data_args.max_predict_samples is not None else len(predict_dataset)) metrics["predict_samples"] = min(max_predict_samples, len(predict_dataset)) trainer.log_metrics("predict", metrics) trainer.save_metrics("predict", metrics) if trainer.is_world_process_zero(): if training_args.predict_with_generate: predictions = tokenizer.batch_decode( predict_results.predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True) predictions = [pred.strip() for pred in predictions] output_prediction_file = os.path.join( training_args.output_dir, "generated_predictions.txt") with open(output_prediction_file, "w", encoding="utf-8") as writer: writer.write("\n".join(predictions)) if training_args.push_to_hub: kwargs = { "finetuned_from": model_args.model_name_or_path, "tasks": "translation" } if data_args.dataset_name is not None: kwargs["dataset_tags"] = data_args.dataset_name if data_args.dataset_config_name is not None: kwargs["dataset_args"] = data_args.dataset_config_name kwargs[ "dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" else: kwargs["dataset"] = data_args.dataset_name languages = [ l for l in [data_args.source_lang, data_args.target_lang] if l is not None ] if len(languages) > 0: kwargs["language"] = languages trainer.push_to_hub(**kwargs) return results
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser((ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. send_example_telemetry("run_seq2seq_qa", model_args, data_args) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) log_level = training_args.get_process_log_level() logger.setLevel(log_level) datasets.utils.logging.set_verbosity(log_level) transformers.utils.logging.set_verbosity(log_level) transformers.utils.logging.enable_default_handler() transformers.utils.logging.enable_explicit_format() # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) logger.info(f"Training/evaluation parameters {training_args}") # Detecting last checkpoint. last_checkpoint = None if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: last_checkpoint = get_last_checkpoint(training_args.output_dir) if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome." ) elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) # Set seed before initializing model. set_seed(training_args.seed) # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ # (the dataset will be downloaded automatically from the datasets Hub). # # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called # 'text' is found. You can easily tweak this behavior (see below). # # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. if data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. raw_datasets = load_dataset( data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir ) else: data_files = {} if data_args.train_file is not None: data_files["train"] = data_args.train_file extension = data_args.train_file.split(".")[-1] if data_args.validation_file is not None: data_files["validation"] = data_args.validation_file extension = data_args.validation_file.split(".")[-1] if data_args.test_file is not None: data_files["test"] = data_args.test_file extension = data_args.test_file.split(".")[-1] raw_datasets = load_dataset(extension, data_files=data_files, field="data", cache_dir=model_args.cache_dir) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. # Load pretrained model and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=True, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) model = AutoModelForSeq2SeqLM.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) model.resize_token_embeddings(len(tokenizer)) if model.config.decoder_start_token_id is None: raise ValueError("Make sure that `config.decoder_start_token_id` is correctly defined") # Preprocessing the datasets. # We need to generate and tokenize inputs and targets. if training_args.do_train: column_names = raw_datasets["train"].column_names elif training_args.do_eval: column_names = raw_datasets["validation"].column_names elif training_args.do_predict: column_names = raw_datasets["test"].column_names else: logger.info("There is nothing to do. Please pass `do_train`, `do_eval` and/or `do_predict`.") return # Get the column names for input/target. dataset_columns = question_answering_column_name_mapping.get(data_args.dataset_name, None) if data_args.question_column is None: question_column = dataset_columns[0] if dataset_columns is not None else column_names[0] else: question_column = data_args.question_column if question_column not in column_names: raise ValueError( f"--question_column' value '{data_args.question_column}' needs to be one of: {', '.join(column_names)}" ) if data_args.context_column is None: context_column = dataset_columns[1] if dataset_columns is not None else column_names[1] else: context_column = data_args.context_column if context_column not in column_names: raise ValueError( f"--context_column' value '{data_args.context_column}' needs to be one of: {', '.join(column_names)}" ) if data_args.answer_column is None: answer_column = dataset_columns[2] if dataset_columns is not None else column_names[2] else: answer_column = data_args.answer_column if answer_column not in column_names: raise ValueError( f"--answer_column' value '{data_args.answer_column}' needs to be one of: {', '.join(column_names)}" ) # Temporarily set max_answer_length for training. max_answer_length = data_args.max_answer_length padding = "max_length" if data_args.pad_to_max_length else False if training_args.label_smoothing_factor > 0 and not hasattr(model, "prepare_decoder_input_ids_from_labels"): logger.warning( "label_smoothing is enabled but the `prepare_decoder_input_ids_from_labels` method is not defined for" f"`{model.__class__.__name__}`. This will lead to loss being calculated twice and will take up more memory" ) if data_args.max_seq_length > tokenizer.model_max_length: logger.warning( f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." ) max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) def preprocess_squad_batch( examples, question_column: str, context_column: str, answer_column: str, ) -> Tuple[List[str], List[str]]: questions = examples[question_column] contexts = examples[context_column] answers = examples[answer_column] def generate_input(_question, _context): return " ".join(["question:", _question.lstrip(), "context:", _context.lstrip()]) inputs = [generate_input(question, context) for question, context in zip(questions, contexts)] targets = [answer["text"][0] if len(answer["text"]) > 0 else "" for answer in answers] return inputs, targets def preprocess_function(examples): inputs, targets = preprocess_squad_batch(examples, question_column, context_column, answer_column) model_inputs = tokenizer(inputs, max_length=max_seq_length, padding=padding, truncation=True) # Setup the tokenizer for targets with tokenizer.as_target_tokenizer(): labels = tokenizer(targets, max_length=max_answer_length, padding=padding, truncation=True) # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore # padding in the loss. if padding == "max_length" and data_args.ignore_pad_token_for_loss: labels["input_ids"] = [ [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"] ] model_inputs["labels"] = labels["input_ids"] return model_inputs # Validation preprocessing def preprocess_validation_function(examples): inputs, targets = preprocess_squad_batch(examples, question_column, context_column, answer_column) model_inputs = tokenizer( inputs, max_length=max_seq_length, padding=padding, truncation=True, return_overflowing_tokens=True, return_offsets_mapping=True, ) # Setup the tokenizer for targets with tokenizer.as_target_tokenizer(): labels = tokenizer(targets, max_length=max_answer_length, padding=padding, truncation=True) # Since one example might give us several features if it has a long context, we need a map from a feature to # its corresponding example. This key gives us just that. sample_mapping = model_inputs.pop("overflow_to_sample_mapping") # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the # corresponding example_id and we will store the offset mappings. model_inputs["example_id"] = [] for i in range(len(model_inputs["input_ids"])): # One example can give several spans, this is the index of the example containing this span of text. sample_index = sample_mapping[i] model_inputs["example_id"].append(examples["id"][sample_index]) # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore # padding in the loss. if padding == "max_length" and data_args.ignore_pad_token_for_loss: labels["input_ids"] = [ [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"] ] model_inputs["labels"] = labels["input_ids"] return model_inputs if training_args.do_train: if "train" not in raw_datasets: raise ValueError("--do_train requires a train dataset") train_dataset = raw_datasets["train"] if data_args.max_train_samples is not None: # We will select sample from whole data if agument is specified max_train_samples = min(len(train_dataset), data_args.max_train_samples) train_dataset = train_dataset.select(range(max_train_samples)) # Create train feature from dataset with training_args.main_process_first(desc="train dataset map pre-processing"): train_dataset = train_dataset.map( preprocess_function, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, desc="Running tokenizer on train dataset", ) if data_args.max_train_samples is not None: # Number of samples might increase during Feature Creation, We select only specified max samples max_train_samples = min(len(train_dataset), data_args.max_train_samples) train_dataset = train_dataset.select(range(max_train_samples)) if training_args.do_eval: if "validation" not in raw_datasets: raise ValueError("--do_eval requires a validation dataset") eval_examples = raw_datasets["validation"] if data_args.max_eval_samples is not None: # We will select sample from whole data max_eval_samples = min(len(eval_examples), data_args.max_eval_samples) eval_examples = eval_examples.select(range(max_eval_samples)) # Validation Feature Creation with training_args.main_process_first(desc="validation dataset map pre-processing"): eval_dataset = eval_examples.map( preprocess_validation_function, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, desc="Running tokenizer on validation dataset", ) if data_args.max_eval_samples is not None: # During Feature creation dataset samples might increase, we will select required samples again max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) eval_dataset = eval_dataset.select(range(max_eval_samples)) if training_args.do_predict: if "test" not in raw_datasets: raise ValueError("--do_predict requires a test dataset") predict_examples = raw_datasets["test"] if data_args.max_predict_samples is not None: # We will select sample from whole data predict_examples = predict_examples.select(range(data_args.max_predict_samples)) # Predict Feature Creation with training_args.main_process_first(desc="prediction dataset map pre-processing"): predict_dataset = predict_examples.map( preprocess_validation_function, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, desc="Running tokenizer on prediction dataset", ) if data_args.max_predict_samples is not None: # During Feature creation dataset samples might increase, we will select required samples again max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples) predict_dataset = predict_dataset.select(range(max_predict_samples)) # Data collator label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id data_collator = DataCollatorForSeq2Seq( tokenizer, model=model, label_pad_token_id=label_pad_token_id, pad_to_multiple_of=8 if training_args.fp16 else None, ) metric = load_metric("squad_v2" if data_args.version_2_with_negative else "squad") def compute_metrics(p: EvalPrediction): return metric.compute(predictions=p.predictions, references=p.label_ids) # Post-processing: def post_processing_function( examples: datasets.Dataset, features: datasets.Dataset, outputs: EvalLoopOutput, stage="eval" ): # Decode the predicted tokens. preds = outputs.predictions if isinstance(preds, tuple): preds = preds[0] decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True) # Build a map example to its corresponding features. example_id_to_index = {k: i for i, k in enumerate(examples["id"])} feature_per_example = {example_id_to_index[feature["example_id"]]: i for i, feature in enumerate(features)} predictions = {} # Let's loop over all the examples! for example_index, example in enumerate(examples): # This is the index of the feature associated to the current example. feature_index = feature_per_example[example_index] predictions[example["id"]] = decoded_preds[feature_index] # Format the result to the format the metric expects. if data_args.version_2_with_negative: formatted_predictions = [ {"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items() ] else: formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()] references = [{"id": ex["id"], "answers": ex[answer_column]} for ex in examples] return EvalPrediction(predictions=formatted_predictions, label_ids=references) # Initialize our Trainer trainer = QuestionAnsweringSeq2SeqTrainer( model=model, args=training_args, train_dataset=train_dataset if training_args.do_train else None, eval_dataset=eval_dataset if training_args.do_eval else None, eval_examples=eval_examples if training_args.do_eval else None, tokenizer=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics, post_process_function=post_processing_function, ) # Training if training_args.do_train: checkpoint = None if training_args.resume_from_checkpoint is not None: checkpoint = training_args.resume_from_checkpoint elif last_checkpoint is not None: checkpoint = last_checkpoint train_result = trainer.train(resume_from_checkpoint=checkpoint) trainer.save_model() # Saves the tokenizer too for easy upload metrics = train_result.metrics max_train_samples = ( data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) ) metrics["train_samples"] = min(max_train_samples, len(train_dataset)) trainer.log_metrics("train", metrics) trainer.save_metrics("train", metrics) trainer.save_state() # Evaluation results = {} max_length = ( training_args.generation_max_length if training_args.generation_max_length is not None else data_args.val_max_answer_length ) num_beams = data_args.num_beams if data_args.num_beams is not None else training_args.generation_num_beams if training_args.do_eval: logger.info("*** Evaluate ***") metrics = trainer.evaluate(max_length=max_length, num_beams=num_beams, metric_key_prefix="eval") max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) trainer.log_metrics("eval", metrics) trainer.save_metrics("eval", metrics) # Prediction if training_args.do_predict: logger.info("*** Predict ***") results = trainer.predict(predict_dataset, predict_examples) metrics = results.metrics max_predict_samples = ( data_args.max_predict_samples if data_args.max_predict_samples is not None else len(predict_dataset) ) metrics["predict_samples"] = min(max_predict_samples, len(predict_dataset)) trainer.log_metrics("predict", metrics) trainer.save_metrics("predict", metrics) if training_args.push_to_hub: kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "question-answering"} if data_args.dataset_name is not None: kwargs["dataset_tags"] = data_args.dataset_name if data_args.dataset_config_name is not None: kwargs["dataset_args"] = data_args.dataset_config_name kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" else: kwargs["dataset"] = data_args.dataset_name trainer.push_to_hub(**kwargs)
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) if (os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir): raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome." ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN, ) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", training_args.local_rank, training_args.device, training_args.n_gpu, bool(training_args.local_rank != -1), training_args.fp16, ) logger.info("Training/evaluation parameters %s", training_args) # Set seed set_seed(training_args.seed) # Load pretrained model and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, ) model = AutoModelForSeq2SeqLM.from_pretrained( model_args.model_name_or_path, from_tf=".ckpt" in model_args.model_name_or_path, config=config, cache_dir=model_args.cache_dir, ) # use task specific params use_task_specific_params(model, data_args.task) # set num_beams for evaluation if data_args.eval_beams is not None: model.config.num_beams = data_args.eval_beams assert model.config.num_beams >= 1, f"got eval_beams={model.config.num_beams}. Need an integer >= 1" # set max length for generation model.config.max_generate_length = data_args.val_max_target_length # set decoder_start_token_id for MBart if model.config.decoder_start_token_id is None and isinstance( tokenizer, MBartTokenizer): decoder_start_token_id = tokenizer.lang_code_to_id[data_args.tgt_lang] model.config.decoder_start_token_id = decoder_start_token_id def build_compute_metrics_fn( task_name: str) -> Callable[[EvalPrediction], Dict]: def non_pad_len(tokens: np.ndarray) -> int: return np.count_nonzero(tokens != tokenizer.pad_token_id) def decode_pred(pred: EvalPrediction) -> Tuple[List[str], List[str]]: pred_str = tokenizer.batch_decode(pred.predictions, skip_special_tokens=True) label_str = tokenizer.batch_decode(pred.label_ids, skip_special_tokens=True) pred_str = lmap(str.strip, pred_str) label_str = lmap(str.strip, label_str) return pred_str, label_str def summarization_metrics(pred: EvalPrediction) -> Dict: pred_str, label_str = decode_pred(pred) rouge: Dict = calculate_rouge(pred_str, label_str) summ_len = np.mean(lmap(non_pad_len, pred.predictions)) rouge.update({"gen_len": summ_len}) return rouge def translation_metrics(pred: EvalPrediction) -> Dict: pred_str, label_str = decode_pred(pred) bleu: Dict = calculate_bleu(pred_str, label_str) gen_len = np.mean(lmap(non_pad_len, pred.predictions)) bleu.update({"gen_len": gen_len}) return bleu compute_metrics_fn = summarization_metrics if "summarization" in task_name else translation_metrics return compute_metrics_fn def freeze_embeds(model: torch.nn.Module): """Freeze token embeddings and positional embeddings for bart, just token embeddings for t5.""" try: freeze_params(model.model.shared) for d in [model.model.encoder, model.model.decoder]: freeze_params(d.embed_positions) freeze_params(d.embed_tokens) except AttributeError: freeze_params(model.shared) for d in [model.encoder, model.decoder]: freeze_params(d.embed_tokens) if model_args.freeze_embeds: freeze_embeds(model) if model_args.freeze_encoder: freeze_params(model.get_encoder()) assert_all_frozen(model.get_encoder()) dataset_class = Seq2SeqDataset if hasattr( tokenizer, "prepare_seq2seq_batch") else LegacySeq2SeqDataset # Get datasets train_dataset = (dataset_class( tokenizer, type_path="train", data_dir=data_args.data_dir, n_obs=data_args.n_train, max_target_length=data_args.max_target_length, max_source_length=data_args.max_source_length, prefix=model.config.prefix or "", ) if training_args.do_train else None) eval_dataset = (dataset_class( tokenizer, type_path="val", data_dir=data_args.data_dir, n_obs=data_args.n_val, max_target_length=data_args.val_max_target_length, max_source_length=data_args.max_source_length, prefix=model.config.prefix or "", ) if training_args.do_eval else None) test_dataset = (dataset_class( tokenizer, type_path="test", data_dir=data_args.data_dir, n_obs=data_args.n_test, max_target_length=data_args.test_max_target_length, max_source_length=data_args.max_source_length, prefix=model.config.prefix or "", ) if training_args.do_predict else None) # Initialize our Trainer trainer = Seq2SeqTrainer( model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset, data_collator=Seq2SeqDataCollator(tokenizer, data_args, training_args.tpu_num_cores), compute_metrics=build_compute_metrics_fn(data_args.task) if training_args.predict_with_generate else None, ) # Training if training_args.do_train: trainer.train(model_path=model_args.model_name_or_path if os.path. isdir(model_args.model_name_or_path) else None) trainer.save_model() # For convenience, we also re-save the tokenizer to the same directory, # so that you can share your model easily on huggingface.co/models =) if trainer.is_world_process_zero(): tokenizer.save_pretrained(training_args.output_dir) # Evaluation eval_results = {} if training_args.do_eval: logger.info("*** Evaluate ***") result = trainer.evaluate() output_eval_file = os.path.join(training_args.output_dir, "eval_results.json") if trainer.is_world_process_zero(): logger.info("***** Eval results *****") for key, value in result.items(): logger.info(" %s = %s", key, value) with open(output_eval_file, "w") as f: json.dump(result, f) eval_results.update(result) if training_args.do_predict: logging.info("*** Test ***") test_output = trainer.predict(test_dataset=test_dataset) test_metrics = test_output.metrics test_metrics = { k.replace("eval", "test"): v for k, v in test_metrics.items() } output_test_file = os.path.join(training_args.output_dir, "test_results.json") if trainer.is_world_process_zero(): logger.info("***** Test results *****") for key, value in test_metrics.items(): logger.info(" %s = %s", key, value) with open(output_test_file, "w") as f: json.dump(test_metrics, f) if training_args.predict_with_generate: test_preds = tokenizer.batch_decode(test_output.predictions, skip_special_tokens=True) test_preds = lmap(str.strip, test_preds) output_test_pred_file = os.path.join(training_args.output_dir, "test_generations.txt") with open(output_test_pred_file, "w") as f: f.write("\n".join(test_preds)) return eval_results
def main(): args = parse_args() # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. send_example_telemetry("run_summarization_no_trainer", args) # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers # in the environment accelerator = (Accelerator(log_with=args.report_to, logging_dir=args.output_dir) if args.with_tracking else Accelerator()) if args.source_prefix is None and args.model_name_or_path in [ "t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b", ]: logger.warning( "You're running a t5 model but didn't provide a source prefix, which is the expected, e.g. with " "`--source_prefix 'summarize: ' `") # Make one log on every process with the configuration for debugging. logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO, ) logger.info(accelerator.state, main_process_only=False) if accelerator.is_local_main_process: datasets.utils.logging.set_verbosity_warning() transformers.utils.logging.set_verbosity_info() else: datasets.utils.logging.set_verbosity_error() transformers.utils.logging.set_verbosity_error() # If passed along, set the training seed now. if args.seed is not None: set_seed(args.seed) # Handle the repository creation if accelerator.is_main_process: if args.push_to_hub: if args.hub_model_id is None: repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token) else: repo_name = args.hub_model_id repo = Repository(args.output_dir, clone_from=repo_name) with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore: if "step_*" not in gitignore: gitignore.write("step_*\n") if "epoch_*" not in gitignore: gitignore.write("epoch_*\n") elif args.output_dir is not None: os.makedirs(args.output_dir, exist_ok=True) accelerator.wait_for_everyone() # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ # (the dataset will be downloaded automatically from the datasets Hub). # # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called # 'text' is found. You can easily tweak this behavior (see below). # # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. if args.dataset_name is not None: # Downloading and loading a dataset from the hub. raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name) else: data_files = {} if args.train_file is not None: data_files["train"] = args.train_file if args.validation_file is not None: data_files["validation"] = args.validation_file extension = args.train_file.split(".")[-1] raw_datasets = load_dataset(extension, data_files=data_files) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. # Load pretrained model and tokenizer # # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. if args.config_name: config = AutoConfig.from_pretrained(args.config_name) elif args.model_name_or_path: config = AutoConfig.from_pretrained(args.model_name_or_path) else: config = CONFIG_MAPPING[args.model_type]() logger.warning( "You are instantiating a new config instance from scratch.") if args.tokenizer_name: tokenizer = AutoTokenizer.from_pretrained( args.tokenizer_name, use_fast=not args.use_slow_tokenizer) elif args.model_name_or_path: tokenizer = AutoTokenizer.from_pretrained( args.model_name_or_path, use_fast=not args.use_slow_tokenizer) else: raise ValueError( "You are instantiating a new tokenizer from scratch. This is not supported by this script." "You can do it from another script, save it, and load it from here, using --tokenizer_name." ) if args.model_name_or_path: model = AutoModelForSeq2SeqLM.from_pretrained( args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config, ) else: logger.info("Training new model from scratch") model = AutoModelForSeq2SeqLM.from_config(config) model.resize_token_embeddings(len(tokenizer)) if model.config.decoder_start_token_id is None: raise ValueError( "Make sure that `config.decoder_start_token_id` is correctly defined" ) prefix = args.source_prefix if args.source_prefix is not None else "" # Preprocessing the datasets. # First we tokenize all the texts. column_names = raw_datasets["train"].column_names # Get the column names for input/target. dataset_columns = summarization_name_mapping.get(args.dataset_name, None) if args.text_column is None: text_column = dataset_columns[ 0] if dataset_columns is not None else column_names[0] else: text_column = args.text_column if text_column not in column_names: raise ValueError( f"--text_column' value '{args.text_column}' needs to be one of: {', '.join(column_names)}" ) if args.summary_column is None: summary_column = dataset_columns[ 1] if dataset_columns is not None else column_names[1] else: summary_column = args.summary_column if summary_column not in column_names: raise ValueError( f"--summary_column' value '{args.summary_column}' needs to be one of: {', '.join(column_names)}" ) # Temporarily set max_target_length for training. max_target_length = args.max_target_length padding = "max_length" if args.pad_to_max_length else False def preprocess_function(examples): inputs = examples[text_column] targets = examples[summary_column] inputs = [prefix + inp for inp in inputs] model_inputs = tokenizer(inputs, max_length=args.max_source_length, padding=padding, truncation=True) # Tokenize targets with the `text_target` keyword argument labels = tokenizer(text_target=targets, max_length=max_target_length, padding=padding, truncation=True) # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore # padding in the loss. if padding == "max_length" and args.ignore_pad_token_for_loss: labels["input_ids"] = [[ (l if l != tokenizer.pad_token_id else -100) for l in label ] for label in labels["input_ids"]] model_inputs["labels"] = labels["input_ids"] return model_inputs with accelerator.main_process_first(): processed_datasets = raw_datasets.map( preprocess_function, batched=True, num_proc=args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not args.overwrite_cache, desc="Running tokenizer on dataset", ) train_dataset = processed_datasets["train"] eval_dataset = processed_datasets["validation"] # Log a few random samples from the training set: for index in random.sample(range(len(train_dataset)), 1): logger.info( f"Sample {index} of the training set: {train_dataset[index]}.") label_pad_token_id = -100 if args.ignore_pad_token_for_loss else tokenizer.pad_token_id data_collator = DataCollatorForSeq2Seq( tokenizer, model=model, label_pad_token_id=label_pad_token_id, pad_to_multiple_of=8 if accelerator.use_fp16 else None, ) def postprocess_text(preds, labels): preds = [pred.strip() for pred in preds] labels = [label.strip() for label in labels] # rougeLSum expects newline after each sentence preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds] labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels] return preds, labels train_dataloader = DataLoader(train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size) eval_dataloader = DataLoader(eval_dataset, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size) # Optimizer # Split weights in two groups, one with weight decay and the other not. no_decay = ["bias", "LayerNorm.weight", "layer_norm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": args.weight_decay, }, { "params": [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0, }, ] optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate) # Scheduler and math around the number of training steps. overrode_max_train_steps = False num_update_steps_per_epoch = math.ceil( len(train_dataloader) / args.gradient_accumulation_steps) if args.max_train_steps is None: args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch overrode_max_train_steps = True lr_scheduler = get_scheduler( name=args.lr_scheduler_type, optimizer=optimizer, num_warmup_steps=args.num_warmup_steps, num_training_steps=args.max_train_steps, ) # Prepare everything with our `accelerator`. model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare( model, optimizer, train_dataloader, eval_dataloader, lr_scheduler) # We need to recalculate our total training steps as the size of the training dataloader may have changed. num_update_steps_per_epoch = math.ceil( len(train_dataloader) / args.gradient_accumulation_steps) if overrode_max_train_steps: args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch # Afterwards we recalculate our number of training epochs args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) # Figure out how many steps we should save the Accelerator states if hasattr(args.checkpointing_steps, "isdigit"): checkpointing_steps = args.checkpointing_steps if args.checkpointing_steps.isdigit(): checkpointing_steps = int(args.checkpointing_steps) else: checkpointing_steps = None # We need to initialize the trackers we use, and also store our configuration. # We initialize the trackers only on main process because `accelerator.log` # only logs on main process and we don't want empty logs/runs on other processes. if args.with_tracking: if accelerator.is_main_process: experiment_config = vars(args) # TensorBoard cannot log Enums, need the raw value experiment_config["lr_scheduler_type"] = experiment_config[ "lr_scheduler_type"].value accelerator.init_trackers("summarization_no_trainer", experiment_config) # Metric metric = evaluate.load("rouge") # Train! total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps logger.info("***** Running training *****") logger.info(f" Num examples = {len(train_dataset)}") logger.info(f" Num Epochs = {args.num_train_epochs}") logger.info( f" Instantaneous batch size per device = {args.per_device_train_batch_size}" ) logger.info( f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}" ) logger.info( f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") logger.info(f" Total optimization steps = {args.max_train_steps}") # Only show the progress bar once on each machine. progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process) completed_steps = 0 starting_epoch = 0 # Potentially load in the weights and states from a previous save if args.resume_from_checkpoint: if args.resume_from_checkpoint is not None or args.resume_from_checkpoint != "": accelerator.print( f"Resumed from checkpoint: {args.resume_from_checkpoint}") accelerator.load_state(args.resume_from_checkpoint) path = os.path.basename(args.resume_from_checkpoint) else: # Get the most recent checkpoint dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()] dirs.sort(key=os.path.getctime) path = dirs[ -1] # Sorts folders by date modified, most recent checkpoint is the last # Extract `epoch_{i}` or `step_{i}` training_difference = os.path.splitext(path)[0] if "epoch" in training_difference: starting_epoch = int(training_difference.replace("epoch_", "")) + 1 resume_step = None else: resume_step = int(training_difference.replace("step_", "")) starting_epoch = resume_step // len(train_dataloader) resume_step -= starting_epoch * len(train_dataloader) for epoch in range(starting_epoch, args.num_train_epochs): model.train() if args.with_tracking: total_loss = 0 for step, batch in enumerate(train_dataloader): # We need to skip steps until we reach the resumed step if args.resume_from_checkpoint and epoch == starting_epoch: if resume_step is not None and step < resume_step: completed_steps += 1 continue outputs = model(**batch) loss = outputs.loss # We keep track of the loss at each epoch if args.with_tracking: total_loss += loss.detach().float() loss = loss / args.gradient_accumulation_steps accelerator.backward(loss) if step % args.gradient_accumulation_steps == 0 or step == len( train_dataloader) - 1: optimizer.step() lr_scheduler.step() optimizer.zero_grad() progress_bar.update(1) completed_steps += 1 if isinstance(checkpointing_steps, int): if completed_steps % checkpointing_steps == 0: output_dir = f"step_{completed_steps }" if args.output_dir is not None: output_dir = os.path.join(args.output_dir, output_dir) accelerator.save_state(output_dir) if completed_steps >= args.max_train_steps: break model.eval() if args.val_max_target_length is None: args.val_max_target_length = args.max_target_length gen_kwargs = { "max_length": args.val_max_target_length if args is not None else config.max_length, "num_beams": args.num_beams, } samples_seen = 0 for step, batch in enumerate(eval_dataloader): with torch.no_grad(): generated_tokens = accelerator.unwrap_model(model).generate( batch["input_ids"], attention_mask=batch["attention_mask"], **gen_kwargs, ) generated_tokens = accelerator.pad_across_processes( generated_tokens, dim=1, pad_index=tokenizer.pad_token_id) labels = batch["labels"] if not args.pad_to_max_length: # If we did not pad to max length, we need to pad the labels too labels = accelerator.pad_across_processes( batch["labels"], dim=1, pad_index=tokenizer.pad_token_id) generated_tokens, labels = accelerator.gather( (generated_tokens, labels)) generated_tokens = generated_tokens.cpu().numpy() labels = labels.cpu().numpy() if args.ignore_pad_token_for_loss: # Replace -100 in the labels as we can't decode them. labels = np.where(labels != -100, labels, tokenizer.pad_token_id) if isinstance(generated_tokens, tuple): generated_tokens = generated_tokens[0] decoded_preds = tokenizer.batch_decode( generated_tokens, skip_special_tokens=True) decoded_labels = tokenizer.batch_decode( labels, skip_special_tokens=True) decoded_preds, decoded_labels = postprocess_text( decoded_preds, decoded_labels) # If we are in a multiprocess environment, the last batch has duplicates if accelerator.num_processes > 1: if step == len(eval_dataloader) - 1: decoded_preds = decoded_preds[:len(eval_dataloader. dataset) - samples_seen] decoded_labels = decoded_labels[:len(eval_dataloader. dataset) - samples_seen] else: samples_seen += len(decoded_labels) metric.add_batch( predictions=decoded_preds, references=decoded_labels, ) result = metric.compute(use_stemmer=True) result = {k: round(v * 100, 4) for k, v in result.items()} logger.info(result) if args.with_tracking: result["train_loss"] = total_loss.item() / len(train_dataloader) result["epoch"] = epoch result["step"] = completed_steps accelerator.log(result, step=completed_steps) if args.push_to_hub and epoch < args.num_train_epochs - 1: accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) unwrapped_model.save_pretrained( args.output_dir, is_main_process=accelerator.is_main_process, save_function=accelerator.save) if accelerator.is_main_process: tokenizer.save_pretrained(args.output_dir) repo.push_to_hub( commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True) if args.checkpointing_steps == "epoch": output_dir = f"epoch_{epoch}" if args.output_dir is not None: output_dir = os.path.join(args.output_dir, output_dir) accelerator.save_state(output_dir) if args.output_dir is not None: accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) unwrapped_model.save_pretrained( args.output_dir, is_main_process=accelerator.is_main_process, save_function=accelerator.save) if accelerator.is_main_process: tokenizer.save_pretrained(args.output_dir) if args.push_to_hub: repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True) with open(os.path.join(args.output_dir, "all_results.json"), "w") as f: json.dump( { "eval_rouge1": result["rouge1"], "eval_rouge2": result["rouge2"], "eval_rougeL": result["rougeL"], "eval_rougeLsum": result["rougeLsum"], }, f, )
def pipeline( task: str, model: Optional = None, tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None, qg_format: Optional[str] = "highlight", ans_model: Optional = None, ans_tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None, use_cuda: Optional[bool] = True, **kwargs, ): # Retrieve the task if task not in SUPPORTED_TASKS: raise KeyError("Unknown task {}, available tasks are {}".format(task, list(SUPPORTED_TASKS.keys()))) targeted_task = SUPPORTED_TASKS[task] task_class = targeted_task["impl"] # Use default model/config/tokenizer for the task if no model is provided if model is None: model = targeted_task["default"]["model"] # Try to infer tokenizer from model or config name (if provided as str) if tokenizer is None: if isinstance(model, str): tokenizer = model else: # Impossible to guest what is the right tokenizer here raise Exception( "Impossible to guess which tokenizer to use. " "Please provided a PretrainedTokenizer class or a path/identifier to a pretrained tokenizer." ) # Instantiate tokenizer if needed if isinstance(tokenizer, (str, tuple)): if isinstance(tokenizer, tuple): # For tuple we have (tokenizer name, {kwargs}) tokenizer = AutoTokenizer.from_pretrained(tokenizer[0], **tokenizer[1]) else: tokenizer = AutoTokenizer.from_pretrained(tokenizer) # Instantiate model if needed if isinstance(model, str): model = AutoModelForSeq2SeqLM.from_pretrained(model) if task == "question-generation": if ans_model is None: # load default ans model ans_model = targeted_task["default"]["ans_model"] ans_tokenizer = AutoTokenizer.from_pretrained(ans_model) ans_model = AutoModelForSeq2SeqLM.from_pretrained(ans_model) else: # Try to infer tokenizer from model or config name (if provided as str) if ans_tokenizer is None: if isinstance(ans_model, str): ans_tokenizer = ans_model else: # Impossible to guest what is the right tokenizer here raise Exception( "Impossible to guess which tokenizer to use. " "Please provided a PretrainedTokenizer class or a path/identifier to a pretrained tokenizer." ) # Instantiate tokenizer if needed if isinstance(ans_tokenizer, (str, tuple)): if isinstance(ans_tokenizer, tuple): # For tuple we have (tokenizer name, {kwargs}) ans_tokenizer = AutoTokenizer.from_pretrained(ans_tokenizer[0], **ans_tokenizer[1]) else: ans_tokenizer = AutoTokenizer.from_pretrained(ans_tokenizer) if isinstance(ans_model, str): ans_model = AutoModelForSeq2SeqLM.from_pretrained(ans_model) if task == "e2e-qg": return task_class(model=model, tokenizer=tokenizer, use_cuda=use_cuda) elif task == "question-generation": return task_class(model=model, tokenizer=tokenizer, ans_model=ans_model, ans_tokenizer=ans_tokenizer, qg_format=qg_format, use_cuda=use_cuda) else: return task_class(model=model, tokenizer=tokenizer, ans_model=model, ans_tokenizer=tokenizer, qg_format=qg_format, use_cuda=use_cuda)
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) check_output_dir(training_args) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN, ) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", training_args.local_rank, training_args.device, training_args.n_gpu, bool(training_args.parallel_mode == ParallelMode.DISTRIBUTED), training_args.fp16, ) # Set the verbosity to info of the Transformers logger (on main process only): if is_main_process(training_args.local_rank): transformers.utils.logging.set_verbosity_info() transformers.utils.logging.enable_default_handler() transformers.utils.logging.enable_explicit_format() logger.info("Training/evaluation parameters %s", training_args) # Set seed set_seed(training_args.seed) # Load pretrained model and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, ) extra_model_params = ("encoder_layerdrop", "decoder_layerdrop", "dropout", "attention_dropout") for p in extra_model_params: if getattr(training_args, p, None): assert hasattr( config, p ), f"({config.__class__.__name__}) doesn't have a `{p}` attribute" setattr(config, p, getattr(training_args, p)) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, ) model = AutoModelForSeq2SeqLM.from_pretrained( model_args.model_name_or_path, from_tf=".ckpt" in model_args.model_name_or_path, config=config, cache_dir=model_args.cache_dir, ) # use task specific params use_task_specific_params(model, data_args.task) # set num_beams for evaluation if data_args.eval_beams is None: data_args.eval_beams = model.config.num_beams # set decoder_start_token_id for MBart if model.config.decoder_start_token_id is None and isinstance( tokenizer, (MBartTokenizer, MBartTokenizerFast)): assert (data_args.tgt_lang is not None and data_args.src_lang is not None), "mBart requires --tgt_lang and --src_lang" if isinstance(tokenizer, MBartTokenizer): model.config.decoder_start_token_id = tokenizer.lang_code_to_id[ data_args.tgt_lang] else: model.config.decoder_start_token_id = tokenizer.convert_tokens_to_ids( data_args.tgt_lang) if model_args.freeze_embeds: freeze_embeds(model) if model_args.freeze_encoder: freeze_params(model.get_encoder()) assert_all_frozen(model.get_encoder()) dataset_class = Seq2SeqDataset # Get datasets train_dataset = (dataset_class( tokenizer, type_path="train", data_dir=data_args.data_dir, n_obs=data_args.n_train, max_target_length=data_args.max_target_length, max_source_length=data_args.max_source_length, prefix=model.config.prefix or "", ) if training_args.do_train else None) eval_dataset = (dataset_class( tokenizer, type_path="val", data_dir=data_args.data_dir, n_obs=data_args.n_val, max_target_length=data_args.val_max_target_length, max_source_length=data_args.max_source_length, prefix=model.config.prefix or "", ) if training_args.do_eval or training_args.evaluation_strategy != EvaluationStrategy.NO else None) test_dataset = (dataset_class( tokenizer, type_path="test", data_dir=data_args.data_dir, n_obs=data_args.n_test, max_target_length=data_args.test_max_target_length, max_source_length=data_args.max_source_length, prefix=model.config.prefix or "", ) if training_args.do_predict else None) # Initialize our Trainer compute_metrics_fn = (build_compute_metrics_fn(data_args.task, tokenizer) if training_args.predict_with_generate else None) trainer = Seq2SeqTrainer( model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset, data_collator=Seq2SeqDataCollator(tokenizer, data_args, model.config.decoder_start_token_id, training_args.tpu_num_cores), compute_metrics=compute_metrics_fn, tokenizer=tokenizer, ) all_metrics = {} # Training if training_args.do_train: logger.info("*** Train ***") train_result = trainer.train( model_path=model_args.model_name_or_path if os.path. isdir(model_args.model_name_or_path) else None) metrics = train_result.metrics metrics["train_n_objs"] = data_args.n_train trainer.save_model() # this also saves the tokenizer if trainer.is_world_process_zero(): handle_metrics("train", metrics, training_args.output_dir) all_metrics.update(metrics) # Need to save the state, since Trainer.save_model saves only the tokenizer with the model trainer.state.save_to_json( os.path.join(training_args.output_dir, "trainer_state.json")) # For convenience, we also re-save the tokenizer to the same directory, # so that you can share your model easily on huggingface.co/models =) tokenizer.save_pretrained(training_args.output_dir) # Evaluation if training_args.do_eval: logger.info("*** Evaluate ***") metrics = trainer.evaluate(metric_key_prefix="val", max_length=data_args.val_max_target_length, num_beams=data_args.eval_beams) metrics["val_n_objs"] = data_args.n_val metrics["val_loss"] = round(metrics["val_loss"], 4) if trainer.is_world_process_zero(): handle_metrics("val", metrics, training_args.output_dir) all_metrics.update(metrics) if training_args.do_predict: logger.info("*** Predict ***") test_output = trainer.predict( test_dataset=test_dataset, metric_key_prefix="test", max_length=data_args.val_max_target_length, num_beams=data_args.eval_beams, ) metrics = test_output.metrics metrics["test_n_objs"] = data_args.n_test if trainer.is_world_process_zero(): metrics["test_loss"] = round(metrics["test_loss"], 4) handle_metrics("test", metrics, training_args.output_dir) all_metrics.update(metrics) if training_args.predict_with_generate: test_preds = tokenizer.batch_decode( test_output.predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True) test_preds = lmap(str.strip, test_preds) write_txt_file( test_preds, os.path.join(training_args.output_dir, "test_generations.txt")) if trainer.is_world_process_zero(): save_json(all_metrics, os.path.join(training_args.output_dir, "all_results.json")) return all_metrics