def test_trainer_iterable_dataset(self):
    MODEL_ID = "sshleifer/tiny-distilbert-base-cased"
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID)
    train_dataset = SampleIterableDataset(PATH_SAMPLE_TEXT)
    training_args = TrainingArguments(output_dir="./examples", no_cuda=True)
    trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset)
    loader = trainer.get_train_dataloader()
    self.assertIsInstance(loader, torch.utils.data.DataLoader)

def test_data_is_not_parallelized_when_model_is_parallel(self):
    model = RegressionModel()
    # Make the Trainer believe it's a parallelized model
    model.is_parallelizable = True
    model.model_parallel = True
    args = TrainingArguments("./regression", per_device_train_batch_size=16, per_device_eval_batch_size=16)
    trainer = Trainer(model, args, train_dataset=RegressionDataset(), eval_dataset=RegressionDataset())
    # Check the Trainer was fooled
    self.assertTrue(trainer.is_model_parallel)
    self.assertEqual(trainer.args.n_gpu, 1)

    # The batch size of the training and evaluation dataloaders should be 16, not 16 * n_gpu
    self.assertEqual(trainer.get_train_dataloader().batch_size, 16)
    self.assertEqual(len(trainer.get_train_dataloader()), 64 // 16)
    self.assertEqual(trainer.get_eval_dataloader().batch_size, 16)
    self.assertEqual(len(trainer.get_eval_dataloader()), 64 // 16)

def test_training_iterable_dataset(self):
    config = RegressionModelConfig()
    model = RegressionPreTrainedModel(config)
    train_dataset = SampleIterableDataset()
    args = RegressionTrainingArguments(output_dir="./examples", max_steps=4)
    trainer = Trainer(model=model, args=args, train_dataset=train_dataset)
    trainer.train()
    self.assertEqual(trainer.state.global_step, 4)

    loader = trainer.get_train_dataloader()
    self.assertIsInstance(loader, torch.utils.data.DataLoader)
    self.assertIsInstance(loader.sampler, torch.utils.data.dataloader._InfiniteConstantSampler)

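# The regression fixtures used above (RegressionDataset, RegressionModel, and the
# RegressionModelConfig/RegressionPreTrainedModel pair) are not defined in this
# section. Below is a minimal sketch of the first two, assuming only the
# conventions the tests rely on (64 samples, batch-dict keys passed to forward
# as kwargs, loss returned first); the internals are illustrative, not the
# exact fixtures:
import numpy as np
import torch
from torch import nn


class RegressionDataset(torch.utils.data.Dataset):
    """64 noisy points on the line y = 2x + 3."""

    def __init__(self, a=2.0, b=3.0, length=64, seed=42):
        np.random.seed(seed)
        self.length = length
        self.x = np.random.normal(size=(length,)).astype(np.float32)
        self.ys = (a * self.x + b + np.random.normal(scale=0.1, size=(length,))).astype(np.float32)

    def __len__(self):
        return self.length

    def __getitem__(self, i):
        return {"input_x": self.x[i], "labels": self.ys[i]}


class RegressionModel(nn.Module):
    """Learns scalars a and b; returns (loss, prediction) like a HF model."""

    def __init__(self):
        super().__init__()
        self.a = nn.Parameter(torch.tensor(0.0))
        self.b = nn.Parameter(torch.tensor(0.0))

    def forward(self, input_x=None, labels=None):
        y = input_x * self.a + self.b
        if labels is None:
            return (y,)
        return (nn.functional.mse_loss(y, labels), y)
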
def test_trainer_iterable_dataset(self):
    # Simulate Language Modeling with an IterableDataset, with no __len__ method
    # Pick up a tiny model, so it works on CPU
    # See Issue #5990: https://github.com/huggingface/transformers/issues/5990
    MODEL_ID = "sshleifer/tiny-distilbert-base-cased"
    model = AutoModelForMaskedLM.from_pretrained(MODEL_ID)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    train_dataset = SampleIterableDataset(file_path=PATH_SAMPLE_TEXT, tokenizer=tokenizer)
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
    training_args = TrainingArguments(output_dir="./examples", no_cuda=True, max_steps=2)

    trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, data_collator=data_collator)
    trainer.train()

    loader = trainer.get_train_dataloader()
    self.assertIsInstance(loader, torch.utils.data.DataLoader)
    self.assertIsInstance(loader.sampler, torch.utils.data.dataloader._InfiniteConstantSampler)

    # Exception if giving iterable dataset and no max_steps
    with self.assertRaises(ValueError):
        training_args = TrainingArguments(output_dir="./examples", no_cuda=True)
        _ = Trainer(model=model, args=training_args, train_dataset=train_dataset, data_collator=data_collator)

    # Exception if eval_dataset is iterable in __init__
    with self.assertRaises(ValueError):
        training_args = TrainingArguments(output_dir="./examples", no_cuda=True, max_steps=2)
        _ = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=train_dataset,
            data_collator=data_collator,
        )

    # Exception if predicting with iterable dataset
    with self.assertRaises(ValueError):
        training_args = TrainingArguments(output_dir="./examples", no_cuda=True)
        trainer = Trainer(model=model, args=training_args, data_collator=data_collator)
        trainer.predict(train_dataset)

    # Exception if evaluating with iterable dataset
    with self.assertRaises(ValueError):
        training_args = TrainingArguments(output_dir="./examples", no_cuda=True)
        trainer = Trainer(model=model, args=training_args, data_collator=data_collator)
        trainer.evaluate(train_dataset)

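# SampleIterableDataset is a test fixture that is not defined in this section,
# and the snippets above come from different revisions of the test file (one
# constructs it with a file path and tokenizer, another with no arguments).
# A minimal sketch of the language-modeling variant, assuming only what the
# tests check: it is a torch IterableDataset without __len__, which is why
# Trainer demands max_steps and falls back to the _InfiniteConstantSampler
# asserted above. The tokenization details below are illustrative:
import torch


class SampleIterableDataset(torch.utils.data.IterableDataset):
    def __init__(self, file_path, tokenizer):
        self.file_path = file_path
        self.tokenizer = tokenizer

    def __iter__(self):
        # Stream the file line by line; no __len__ is defined on purpose.
        with open(self.file_path, encoding="utf-8") as f:
            for line in f:
                yield self.tokenizer(line.strip(), truncation=True, max_length=128)
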
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir)
            and training_args.do_train and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. "
            "Use --overwrite_output_dir to overcome.")

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if is_main_process(training_args.local_rank) else logging.WARN,
    )

    # Log a short summary on each process:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity of the Transformers logger to info (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at
    # https://huggingface.co/datasets/ (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        # **YD** force load_dataset to run locally
        raise ValueError("MUST use local dataset!")
        # datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name)
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        if data_args.test_file is not None:
            data_files["test"] = data_args.test_file
        extension = data_args.extension_file
        print("extension", extension)
        print("data_files", data_files)
        datasets = load_dataset(extension, data_files=data_files)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame,
    # etc.) at https://huggingface.co/docs/datasets/loading_datasets.html.
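    # For reference (datasets-library behavior, not part of the original script):
    # with extension "json" and the data_files mapping above, load_dataset returns
    # a DatasetDict keyed by split, e.g.
    #   DatasetDict({"train": Dataset(...), "validation": Dataset(...)})
    # where each example is one record such as
    #   {"tokens": ["EU", "rejects", ...], "ner_tags": [3, 0, ...]}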
    if training_args.do_train:
        column_names = datasets["train"].column_names
        features = datasets["train"].features
    elif training_args.do_eval:
        column_names = datasets["validation"].column_names
        features = datasets["validation"].features
    elif training_args.do_predict:
        column_names = datasets["test"].column_names
        features = datasets["test"].features
    else:
        raise ValueError("must do_train/do_eval/do_predict")

    text_column_name = "tokens" if "tokens" in column_names else column_names[0]
    label_column_name = (f"{data_args.task_name}_tags"
                         if f"{data_args.task_name}_tags" in column_names else column_names[1])

    # In the event the labels are not a `Sequence[ClassLabel]`, we will need to go through the dataset to get the
    # unique labels.
    def get_label_list(labels):
        unique_labels = set()
        for label in labels:
            unique_labels = unique_labels | set(label)
        label_list = list(unique_labels)
        label_list.sort()
        return label_list

    if isinstance(features[label_column_name].feature, ClassLabel):
        label_list = features[label_column_name].feature.names
        # No need to convert the labels since they are already ints.
        label_to_id = {i: i for i in range(len(label_list))}
    else:
        label_list = get_label_list(datasets["train"][label_column_name])
        label_to_id = {l: i for i, l in enumerate(label_list)}
    num_labels = len(label_list)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=True,
    )
    model = AutoModelForTokenClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # Preprocessing the dataset
    # Padding strategy
    padding = "max_length" if data_args.pad_to_max_length else False

    # Tokenize all texts and align the labels with them.
    def tokenize_and_align_labels(examples):
        tokenized_inputs = tokenizer(
            examples[text_column_name],
            padding=padding,
            truncation=True,
            # We use this argument because the texts in our dataset are lists of words (with a label for each word).
            is_split_into_words=True,
            return_offsets_mapping=True,
        )
        offset_mappings = tokenized_inputs.pop("offset_mapping")
        labels = []
        for label, offset_mapping in zip(examples[label_column_name], offset_mappings):
            label_index = 0
            current_label = -100
            label_ids = []
            for offset in offset_mapping:
                # We set the label for the first token of each word. Special characters will have an offset of
                # (0, 0), so the test ignores them.
                if offset[0] == 0 and offset[1] != 0:
                    current_label = label_to_id[label[label_index]]
                    label_index += 1
                    label_ids.append(current_label)
                # For special tokens, we set the label to -100 so it's automatically ignored in the loss function.
                elif offset[0] == 0 and offset[1] == 0:
                    label_ids.append(-100)
                # For the other tokens in a word, we set the label to either the current label or -100, depending
                # on the label_all_tokens flag.
                else:
                    label_ids.append(current_label if data_args.label_all_tokens else -100)
            labels.append(label_ids)
        tokenized_inputs["labels"] = labels
        return tokenized_inputs
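    # Worked example of the alignment above (offsets are illustrative; real
    # values depend on the tokenizer): for words = ["EU", "rejects"] with
    # labels = [B-ORG, O], a subword tokenizer might emit
    #   (0, 0)  [CLS]    -> -100                 (special token)
    #   (0, 2)  "EU"     -> label_to_id[B-ORG]   (first token of a word)
    #   (0, 3)  "rej"    -> label_to_id[O]       (first token of a word)
    #   (3, 7)  "##ects" -> O or -100, per data_args.label_all_tokens
    #   (0, 0)  [SEP]    -> -100                 (special token)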
    tokenized_datasets = datasets.map(
        tokenize_and_align_labels,
        batched=True,
        num_proc=data_args.preprocessing_num_workers,
        load_from_cache_file=not data_args.overwrite_cache,
    )

    # Data collator
    data_collator = DataCollatorForTokenClassification(tokenizer)

    # Debug: inspect one tokenized example and one collated batch, then stop.
    print(tokenized_datasets["train"][0])
    train_dataset = tokenized_datasets["train"]
    tmp_columns = ["attention_mask", "input_ids", "labels", "token_type_ids"]
    train_dataset.set_format(type=train_dataset.format["type"], columns=tmp_columns)

    data_loader = DataLoader(
        tokenized_datasets["train"],
        batch_size=8,
        # sampler=RandomSampler(tokenized_datasets['train']),
        sampler=None,
        collate_fn=data_collator,
        drop_last=False,
        num_workers=0,
    )
    for i, batch in enumerate(data_loader):
        print("i", i)
        print(batch)
        sys.exit()

    # Metrics
    def compute_metrics(p):
        predictions, labels = p
        predictions = np.argmax(predictions, axis=2)

        # Remove ignored index (special tokens)
        true_predictions = [
            [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]
        true_labels = [
            [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]

        return {
            "accuracy_score": accuracy_score(true_labels, true_predictions),
            "precision": precision_score(true_labels, true_predictions),
            "recall": recall_score(true_labels, true_predictions),
            "f1": f1_score(true_labels, true_predictions),
        }
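    # Note (an assumption, since the metric imports are not shown here): the
    # accuracy_score/precision_score/recall_score/f1_score calls above match
    # seqeval's entity-level metrics, which score lists of label sequences, e.g.
    #   from seqeval.metrics import f1_score
    #   f1_score([["B-ORG", "O"]], [["B-ORG", "O"]])  # -> 1.0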
    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"] if training_args.do_train else None,
        eval_dataset=tokenized_datasets["validation"] if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # Debug: inspect the Trainer's dataloader, then stop.
    train_dataloader = trainer.get_train_dataloader()
    print("train_dataloader", list(train_dataloader)[0])
    print("train_dataloader: dataset", train_dataloader.dataset.column_names)
    sys.exit()

    # Training
    if training_args.do_train:
        trainer.train(
            model_path=model_args.model_name_or_path
            if os.path.isdir(model_args.model_name_or_path) else None)
        trainer.save_model()  # Saves the tokenizer too for easy upload

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        results = trainer.evaluate()

        output_eval_file = os.path.join(training_args.output_dir, "eval_results_ner.txt")
        if trainer.is_world_process_zero():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in results.items():
                    logger.info(f"  {key} = {value}")
                    writer.write(f"{key} = {value}\n")

    # Predict
    if training_args.do_predict:
        logger.info("*** Predict ***")

        test_dataset = tokenized_datasets["test"]
        predictions, labels, metrics = trainer.predict(test_dataset)
        predictions = np.argmax(predictions, axis=2)

        # Remove ignored index (special tokens)
        true_predictions = [
            [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]

        output_test_results_file = os.path.join(training_args.output_dir, "test_results.txt")
        if trainer.is_world_process_zero():
            with open(output_test_results_file, "w") as writer:
                for key, value in metrics.items():
                    logger.info(f"  {key} = {value}")
                    writer.write(f"{key} = {value}\n")

        # Save predictions
        output_test_predictions_file = os.path.join(training_args.output_dir, "test_predictions.txt")
        if trainer.is_world_process_zero():
            with open(output_test_predictions_file, "w") as writer:
                for prediction in true_predictions:
                    writer.write(" ".join(prediction) + "\n")

    return results

def main(config):
    # Get relevant tweets
    my_tweets = dl_tweets(config.handle)
    curated_tweets = [fix_text(tweet) for tweet in my_tweets]
    clean_tweets = [clean_tweet(tweet) for tweet in curated_tweets]
    cool_tweets = [tweet for tweet in clean_tweets if not boring_tweet(tweet)]

    # Split train/validation sets
    random.shuffle(cool_tweets)
    split_train_valid = 0.9
    train_size = int(split_train_valid * len(cool_tweets))
    valid_size = len(cool_tweets) - train_size
    train_dataset, valid_dataset = torch.utils.data.random_split(cool_tweets, [train_size, valid_size])

    # Make data files
    with open('data_{}_train.txt'.format(config.handle), 'w') as f:
        data = make_dataset(train_dataset, config.epochs)
        f.write(data)
    with open('data_{}_valid.txt'.format(config.handle), 'w') as f:
        data = make_dataset(valid_dataset, 1)
        f.write(data)

    # Set up training parameters
    tokenizer = AutoTokenizer.from_pretrained('gpt2')
    model = AutoModelForCausalLM.from_pretrained('gpt2')
    block_size = tokenizer.max_len
    train_dataset = TextDataset(tokenizer=tokenizer,
                                file_path=f'data_{config.handle}_train.txt',
                                block_size=block_size,
                                overwrite_cache=True)
    valid_dataset = TextDataset(tokenizer=tokenizer,
                                file_path=f'data_{config.handle}_valid.txt',
                                block_size=block_size,
                                overwrite_cache=True)
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    seed = random.randint(0, 2**32 - 1)
    training_args = TrainingArguments(
        output_dir=f'output/{config.handle}',
        overwrite_output_dir=True,
        do_train=True,
        do_eval=True,
        evaluate_during_training=True,
        num_train_epochs=1,
        per_device_train_batch_size=1,
        logging_steps=5,
        eval_steps=5,
        save_steps=0,
        learning_rate=config.learning_rate,
        gradient_accumulation_steps=config.gradient_accumulation_steps,
        seed=seed)
    os.environ['WANDB_WATCH'] = 'false'  # used in Trainer
    trainer = Trainer(model=model,
                      args=training_args,
                      data_collator=data_collator,
                      train_dataset=train_dataset,
                      eval_dataset=valid_dataset,
                      prediction_loss_only=True)

    # Update lr scheduler
    train_dataloader = trainer.get_train_dataloader()
    num_train_steps = int(len(train_dataloader) // config.gradient_accumulation_steps)
    optimizer, _ = trainer.get_optimizers(num_train_steps)
    num_warmup_steps = int(config.percent_warmup_steps * num_train_steps)
    if config.lr_scheduler == 'constant':
        scheduler = get_constant_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps)
    elif config.lr_scheduler == 'linear':
        scheduler = get_linear_schedule_with_warmup(optimizer,
                                                    num_warmup_steps=num_warmup_steps,
                                                    num_training_steps=num_train_steps)
    elif config.lr_scheduler == 'cosine':
        scheduler = get_cosine_schedule_with_warmup(optimizer,
                                                    num_warmup_steps=num_warmup_steps,
                                                    num_training_steps=num_train_steps)
    else:
        # Fail fast instead of leaving `scheduler` undefined below.
        raise ValueError(f"Unknown lr_scheduler: {config.lr_scheduler}")
    trainer.optimizers = (optimizer, scheduler)

    # Train & evaluate
    trainer.train()
    trainer.evaluate()

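# Usage sketch (argument names are inferred from the config attributes
# referenced above; this is illustrative, not the original CLI):
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--handle", required=True)
    parser.add_argument("--epochs", type=int, default=4)
    parser.add_argument("--learning_rate", type=float, default=5e-5)
    parser.add_argument("--gradient_accumulation_steps", type=int, default=1)
    parser.add_argument("--lr_scheduler", choices=["constant", "linear", "cosine"], default="linear")
    parser.add_argument("--percent_warmup_steps", type=float, default=0.1)
    main(parser.parse_args())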