def get_eval_dataloader(self, eval_dataset: Optional[Dataset] = None) -> DataLoader:
    """
    Returns the evaluation :class:`~torch.utils.data.DataLoader`.

    Args:
        eval_dataset (:obj:`Dataset`, `optional`):
            If provided, will override `self.eval_dataset`.
    """
    if eval_dataset is None and self.eval_dataset is None:
        raise ValueError("Trainer: evaluation requires an eval_dataset.")

    eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset

    if is_torch_tpu_available():
        sampler = SequentialDistributedSampler(
            eval_dataset, num_replicas=xm.xrt_world_size(), rank=xm.get_ordinal()
        )
    elif self.args.local_rank != -1:
        sampler = SequentialDistributedSampler(eval_dataset)
    else:
        sampler = SequentialSampler(eval_dataset)

    data_loader = DataLoader(
        eval_dataset,
        # samplers are not supported for IterableDataset; note this checks the dataset
        # actually being iterated (the original checked self.train_dataset by mistake)
        sampler=sampler if not isinstance(eval_dataset, IterableDataset) else None,
        batch_size=self.args.eval_batch_size,
        collate_fn=self.data_collator,
        drop_last=self.args.dataloader_drop_last,
    )
    return data_loader
def get_test_dataloader(self, test_dataset: Dataset) -> DataLoader:
    """
    Returns the test :class:`~torch.utils.data.DataLoader`.

    Args:
        test_dataset (:obj:`Dataset`):
            The test dataset to use.
    """
    # We use the same batch_size as for eval.
    if is_torch_tpu_available():
        sampler = SequentialDistributedSampler(
            test_dataset, num_replicas=xm.xrt_world_size(), rank=xm.get_ordinal()
        )
    elif self.args.local_rank != -1:
        sampler = SequentialDistributedSampler(test_dataset)
    else:
        sampler = SequentialSampler(test_dataset)

    data_loader = DataLoader(
        test_dataset,
        # as above, check the dataset being iterated, not self.train_dataset
        sampler=sampler if not isinstance(test_dataset, IterableDataset) else None,
        batch_size=self.args.eval_batch_size,
        collate_fn=self.data_collator,
        drop_last=self.args.dataloader_drop_last,
    )
    return data_loader
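# Illustrative sketch (not from the sources above): SequentialDistributedSampler is
# used for eval/test because each rank must receive an equally sized, contiguous,
# *unshuffled* shard so predictions can be reassembled in order afterwards. The
# stand-in class below is an assumption about that behavior, shown for clarity only.
import math
from torch.utils.data import Sampler


class _SequentialShardSampler(Sampler):  # hypothetical stand-in, not the transformers class
    def __init__(self, dataset, num_replicas, rank):
        self.num_samples = math.ceil(len(dataset) / num_replicas)
        self.total_size = self.num_samples * num_replicas
        indices = list(range(len(dataset)))
        # pad with the first indices so every rank gets the same amount of work
        indices += indices[: self.total_size - len(indices)]
        self.indices = indices[rank * self.num_samples:(rank + 1) * self.num_samples]

    def __iter__(self):
        return iter(self.indices)

    def __len__(self):
        return self.num_samples

# e.g. 10 examples over 4 ranks -> shards [0..2], [3..5], [6..8], [9, 0, 1] (padded)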
def _get_world_size(self, local_rank: int):
    world_size = 1
    if is_torch_tpu_available():
        raise NotImplementedError()
        # world_size = xm.xrt_world_size()
    elif local_rank != -1:
        # noinspection PyUnresolvedReferences
        world_size = torch.distributed.get_world_size()
    return max(1, world_size)
def _gather_and_numpify(self, tensors):
    """
    Gather the value of `tensors` (a tensor or a list/tuple of nested tensors) across
    processes and convert the result to numpy before concatenation.
    """
    if tensors is None:
        return
    if is_torch_tpu_available():
        # tensors = nested_xla_mesh_reduce(tensors, name)
        raise NotImplementedError()
    elif self.local_rank != -1:
        tensors = distributed_concat(tensors)

    return nested_numpify(tensors)
def get_dataloader(self, dataset, batch_size):
    if dataset is None:
        raise ValueError("Trainer: requires a dataset.")
    if transformers.is_torch_tpu_available():
        sampler = get_tpu_sampler(dataset)
    elif not isinstance(dataset, IterableDataset):
        sampler = (RandomSampler(dataset)
                   if self.args.local_rank == -1 else DistributedSampler(dataset))
    else:
        sampler = None

    data_loader = DataLoader(
        dataset,
        batch_size=batch_size,
        sampler=sampler,
        collate_fn=self.data_collator,
    )
    return data_loader
def get_single_train_dataloader(self, task_name, train_dataset):
    """
    Create a single-task data loader that also yields task names.
    """
    if self.train_dataset is None:
        raise ValueError("Trainer: training requires a train_dataset.")
    if False and is_torch_tpu_available():  # TPU sampling deliberately disabled here
        train_sampler = get_tpu_sampler(train_dataset)
    else:
        train_sampler = (RandomSampler(train_dataset)
                         if self.args.local_rank == -1 else DistributedSampler(train_dataset))

    data_loader = DataLoaderWithTaskname(
        task_name=task_name,
        data_loader=DataLoader(
            train_dataset,
            batch_size=self.args.train_batch_size,
            sampler=train_sampler,
            collate_fn=self.data_collator.collate_batch,
        ),
    )
    return data_loader
def get_train_dataloader(self) -> DataLoader:
    """
    Returns the training :class:`~torch.utils.data.DataLoader`.
    """
    if self.train_dataset is None:
        raise ValueError("Trainer: training requires a train_dataset.")
    if is_torch_tpu_available():
        train_sampler = get_tpu_sampler(self.train_dataset)
    else:
        train_sampler = (RandomSampler(self.train_dataset)
                         if self.args.local_rank == -1 else DistributedSampler(self.train_dataset))

    data_loader = DataLoader(
        self.train_dataset,
        batch_size=self.args.train_batch_size,
        sampler=train_sampler if not isinstance(self.train_dataset, IterableDataset) else None,
        collate_fn=self.data_collator,
        drop_last=self.args.dataloader_drop_last,
    )
    return data_loader
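# Runnable aside (an assumption-level sketch, not from the sources above): the
# dataloader getters pass sampler=None for IterableDataset because DataLoader
# raises a ValueError if a sampler is combined with an iterable-style dataset.
import torch
from torch.utils.data import DataLoader, IterableDataset


class _ToyStream(IterableDataset):  # hypothetical toy stream, illustration only
    def __iter__(self):
        return iter(range(8))


loader = DataLoader(_ToyStream(), batch_size=4, sampler=None)  # sampler must stay None
print([batch.tolist() for batch in loader])  # [[0, 1, 2, 3], [4, 5, 6, 7]]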
def train(self, model_path=None, dev_objective=None):
    """
    Main training entry point.

    The training logic is directly borrowed from transformers.Trainer (version 3.0.2).
    Adds early stopping.
    """
    self.best_dir = None
    self.objective = -float("inf")
    self.dev_objective = dev_objective if dev_objective is not None else default_dev_objective

    # Data loading.
    train_dataloader = self.get_train_dataloader()
    num_update_steps_per_epoch = len(train_dataloader) // self.args.gradient_accumulation_steps
    if num_update_steps_per_epoch == 0:
        num_update_steps_per_epoch = 1
    if self.args.max_steps > 0:
        t_total = self.args.max_steps
        num_train_epochs = self.args.max_steps // num_update_steps_per_epoch + int(
            self.args.max_steps % num_update_steps_per_epoch > 0
        )
    else:
        t_total = int(len(train_dataloader) // self.args.gradient_accumulation_steps * self.args.num_train_epochs)
        num_train_epochs = self.args.num_train_epochs

    self.create_optimizer_and_scheduler(num_training_steps=t_total)
    optimizer = self.optimizer
    scheduler = self.lr_scheduler

    # Check if saved optimizer or scheduler states exist
    if (model_path is not None
            and os.path.isfile(os.path.join(model_path, "optimizer.pt"))
            and os.path.isfile(os.path.join(model_path, "scheduler.pt"))):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(
            torch.load(os.path.join(model_path, "optimizer.pt"), map_location=self.args.device)
        )
        scheduler.load_state_dict(torch.load(os.path.join(model_path, "scheduler.pt")))

    model = self.model

    if self.args.fp16 and _use_apex:
        if not transformers.is_apex_available():
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=self.args.fp16_opt_level)

    # Multi-gpu training (should be after apex fp16 initialization)
    if self.args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if self.args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[self.args.local_rank],
            output_device=self.args.local_rank,
            find_unused_parameters=True,
        )

    # Train
    if transformers.is_torch_tpu_available():
        total_train_batch_size = self.args.train_batch_size * xm.xrt_world_size()
    else:
        total_train_batch_size = (
            self.args.train_batch_size
            * self.args.gradient_accumulation_steps
            * (torch.distributed.get_world_size() if self.args.local_rank != -1 else 1)
        )
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", self.num_examples(train_dataloader))
    logger.info("  Num Epochs = %d", num_train_epochs)
    logger.info("  Instantaneous batch size per device = %d", self.args.per_device_train_batch_size)
    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d", total_train_batch_size)
    logger.info("  Gradient Accumulation steps = %d", self.args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    self.global_step = 0
    self.epoch = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if model_path is not None:
        # set global_step to global_step of last saved checkpoint from model path
        try:
            self.global_step = int(model_path.split("-")[-1].split("/")[0])
            epochs_trained = self.global_step // (len(train_dataloader) // self.args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = self.global_step % (
                len(train_dataloader) // self.args.gradient_accumulation_steps
            )
            logger.info("  Continuing training from checkpoint, will skip to saved global_step")
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d", self.global_step)
            logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
        except ValueError:
            self.global_step = 0
            logger.info("  Starting fine-tuning.")

    tr_loss = torch.tensor(0.0).to(self.args.device)
    logging_loss_scalar = 0.0
    model.zero_grad()
    train_iterator = trange(epochs_trained, int(num_train_epochs), desc="Epoch",
                            disable=not self.is_local_master())
    for epoch in train_iterator:
        if isinstance(train_dataloader, DataLoader) and isinstance(train_dataloader.sampler, DistributedSampler):
            train_dataloader.sampler.set_epoch(epoch)

        if transformers.is_torch_tpu_available():
            parallel_loader = pl.ParallelLoader(train_dataloader, [self.args.device]).per_device_loader(
                self.args.device
            )
            epoch_iterator = tqdm(parallel_loader, desc="Iteration", disable=not self.is_local_master())
        else:
            epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=True)

        # Reset the past mems state at the beginning of each epoch if necessary.
        if self.args.past_index >= 0:
            self._past = None

        for step, inputs in enumerate(epoch_iterator):
            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            tr_loss += self.training_step(model, inputs)

            if (step + 1) % self.args.gradient_accumulation_steps == 0 or (
                # last step in epoch but step is always smaller than gradient_accumulation_steps
                len(epoch_iterator) <= self.args.gradient_accumulation_steps
                and (step + 1) == len(epoch_iterator)
            ):
                if self.args.fp16 and _use_native_amp:
                    self.scaler.unscale_(optimizer)
                    norm = torch.nn.utils.clip_grad_norm_(model.parameters(), self.args.max_grad_norm)
                elif self.args.fp16:
                    norm = torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), self.args.max_grad_norm)
                else:
                    norm = torch.nn.utils.clip_grad_norm_(model.parameters(), self.args.max_grad_norm)

                if transformers.is_torch_tpu_available():
                    xm.optimizer_step(optimizer)
                elif self.args.fp16 and _use_native_amp:
                    self.scaler.step(optimizer)
                    self.scaler.update()
                else:
                    optimizer.step()

                scheduler.step()
                model.zero_grad()
                self.global_step += 1
                self.epoch = epoch + (step + 1) / len(epoch_iterator)

                if (self.args.logging_steps > 0 and self.global_step % self.args.logging_steps == 0) or (
                    self.global_step == 1 and self.args.logging_first_step
                ):
                    logs = {}
                    tr_loss_scalar = tr_loss.item()
                    logs["loss"] = (tr_loss_scalar - logging_loss_scalar) / self.args.logging_steps
                    logs["norm"] = norm.item()
                    # backward compatibility for pytorch schedulers
                    logs["learning_rate"] = (
                        scheduler.get_last_lr()[0]
                        if version.parse(torch.__version__) >= version.parse("1.4")
                        else scheduler.get_lr()[0]
                    )
                    logging_loss_scalar = tr_loss_scalar
                    self.log(logs)

                # ----------------------------------------------------------------------
                # BEGIN CHANGES.
                # ----------------------------------------------------------------------
                metrics = None
                if self.args.evaluate_during_training and self.global_step % self.args.eval_steps == 0:
                    output = self.evaluate()
                    metrics = output.metrics
                    objective = self.dev_objective(metrics)
                    if objective > self.objective:
                        logger.info("Best dev result: {}".format(objective))
                        self.objective = objective
                        self.save_model(self.args.output_dir)
                # ----------------------------------------------------------------------
                # END CHANGES.
                # ----------------------------------------------------------------------

            if self.args.max_steps > 0 and self.global_step > self.args.max_steps:
                epoch_iterator.close()
                break
        if self.args.max_steps > 0 and self.global_step > self.args.max_steps:
            train_iterator.close()
            break
        if self.args.tpu_metrics_debug or self.args.debug:
            # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.)
            xm.master_print(met.metrics_report())

    if self.args.past_index and hasattr(self, "_past"):
        # Clean the state at the end of training
        delattr(self, "_past")

    logger.info("\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n")
    return TrainOutput(self.global_step, tr_loss / self.global_step), self.objective
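# Hedged usage sketch for the train() above: dev_objective maps the evaluation
# metrics dict to a single scalar to maximize for early stopping / model selection.
# The function below is a hypothetical example; the real default_dev_objective is
# defined elsewhere in the original codebase.
def accuracy_objective(metrics):
    # prefer accuracy when available, otherwise fall back to negative eval loss
    return metrics.get("eval_acc", metrics.get("eval_accuracy", -metrics["eval_loss"]))

# trainer.train(model_path=None, dev_objective=accuracy_objective)  # usage sketch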
def train(self, model_name, batch_size, sequence_length, trace_memory=False):
    try:
        config = self.config_dict[model_name]

        if self.args.torchscript:
            config.torchscript = True

        model = MODEL_WITH_LM_HEAD_MAPPING[config.__class__](config)
        model.to(self.args.device)
        model.train()

        # encoder-decoder models save the vocab size differently
        vocab_size = config.vocab_size if hasattr(config, "vocab_size") else config.encoder.vocab_size
        input_ids = torch.randint(vocab_size, (batch_size, sequence_length),
                                  dtype=torch.long, device=self.args.device)

        if self.args.torchscript:
            raise NotImplementedError("Training for torchscript is currently not implemented")
        else:
            train_model = model

        def compute_loss_and_backprop_encoder():
            loss = train_model(input_ids, labels=input_ids)[0]
            loss.backward()
            train_model.zero_grad()

        def compute_loss_and_backprop_encoder_decoder():
            loss = train_model(input_ids, decoder_input_ids=input_ids, labels=input_ids)[0]
            loss.backward()
            train_model.zero_grad()

        _train = (compute_loss_and_backprop_encoder_decoder
                  if config.is_encoder_decoder else compute_loss_and_backprop_encoder)

        if trace_memory is True:
            if self.args.trace_memory_line_by_line:
                trace = start_memory_tracing("transformers")

            if self.args.n_gpu > 0:
                # gpu: clear the cache and reset the peak-memory counters
                torch.cuda.empty_cache()
                if hasattr(torch.cuda, "max_memory_reserved"):
                    torch.cuda.reset_peak_memory_stats()
                else:
                    logger.info("Please consider updating PyTorch to version 1.4 for more accurate GPU memory usage")
                    torch.cuda.reset_max_memory_cached()
                # calculate loss and do backpropagation
                _train()
            elif not self.args.no_tpu and is_torch_tpu_available():
                # tpu
                raise NotImplementedError(
                    "Memory Benchmarking is currently not implemented for TPU. Please disable memory benchmarking with `args.no_memory=True`"
                )
            else:
                # cpu
                memory_bytes = measure_peak_memory_cpu(_train)
                memory = Memory(memory_bytes) if isinstance(memory_bytes, int) else memory_bytes

            if self.args.trace_memory_line_by_line:
                summary = stop_memory_tracing(trace)
            else:
                summary = None

            if self.args.n_gpu > 0:
                # gpu: read back the peak memory counter
                if hasattr(torch.cuda, "max_memory_reserved"):
                    memory = Memory(torch.cuda.max_memory_reserved())
                else:
                    logger.info("Please consider updating PyTorch to version 1.4 for more accurate GPU memory usage")
                    memory = Memory(torch.cuda.max_memory_cached())

            return memory, summary
        else:
            if (not self.args.no_tpu and is_torch_tpu_available()) or self.args.torchscript:
                # run the model 5 extra times to stabilize compilation for tpu and torchscript
                logger.info("Do inference on TPU or torchscript. Running model 5 times to stabilize compilation")
                timeit.repeat(_train, repeat=1, number=5)

            # as written in https://docs.python.org/2/library/timeit.html#timeit.Timer.repeat,
            # min should be taken rather than the average
            runtimes = timeit.repeat(_train, repeat=self.args.repeat, number=10)

            if not self.args.no_tpu and is_torch_tpu_available() and self.args.tpu_print_metrics:
                import torch_xla.debug.metrics as met
                self.print_fn(met.metrics_report())

            return min(runtimes) / 10.0
    except RuntimeError as e:
        self.print_fn("Doesn't fit on GPU. {}".format(e))
        if trace_memory:
            return "N/A", None
        else:
            return "N/A"
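# Aside on the timing convention used above (toy numbers, illustration only):
# timeit.repeat returns one total per repeat, each covering `number` calls, so the
# per-call time is min(runtimes) / number. The docs recommend min over mean because
# slower runs measure interference from the rest of the system, not the code itself.
import timeit

runtimes = timeit.repeat(lambda: sum(range(1000)), repeat=3, number=10)
per_call_seconds = min(runtimes) / 10.0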
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
A subclass of `Trainer` specific to Question-Answering tasks
"""

from transformers import Trainer, is_torch_tpu_available
from transformers.trainer_utils import PredictionOutput

if is_torch_tpu_available(check_device=False):
    import torch_xla.core.xla_model as xm
    import torch_xla.debug.metrics as met


class QuestionAnsweringTrainer(Trainer):
    def __init__(self, *args, eval_examples=None, post_process_function=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.eval_examples = eval_examples
        self.post_process_function = post_process_function

    def evaluate(self,
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
    # information sent is the one passed as arguments along with your Python/PyTorch versions.
    send_example_telemetry("run_mlm", model_args, data_args)

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    log_level = training_args.get_process_log_level()
    logger.setLevel(log_level)
    datasets.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.enable_default_handler()
    transformers.utils.logging.enable_explicit_format()

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    logger.info(f"Training/evaluation parameters {training_args}")

    # Detecting last checkpoint.
    last_checkpoint = None
    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome."
            )
        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column. You can easily tweak this
    # behavior (see below).
    #
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        raw_datasets = load_dataset(
            data_args.dataset_name,
            data_args.dataset_config_name,
            cache_dir=model_args.cache_dir,
            use_auth_token=True if model_args.use_auth_token else None,
        )
        if "validation" not in raw_datasets.keys():
            raw_datasets["validation"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[:{data_args.validation_split_percentage}%]",
                cache_dir=model_args.cache_dir,
                use_auth_token=True if model_args.use_auth_token else None,
            )
            raw_datasets["train"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[{data_args.validation_split_percentage}%:]",
                cache_dir=model_args.cache_dir,
                use_auth_token=True if model_args.use_auth_token else None,
            )
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
            extension = data_args.train_file.split(".")[-1]
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
            extension = data_args.validation_file.split(".")[-1]
        if extension == "txt":
            extension = "text"
        raw_datasets = load_dataset(
            extension,
            data_files=data_files,
            cache_dir=model_args.cache_dir,
            use_auth_token=True if model_args.use_auth_token else None,
        )

        # If no validation data is there, validation_split_percentage will be used to divide the dataset.
        if "validation" not in raw_datasets.keys():
            raw_datasets["validation"] = load_dataset(
                extension,
                data_files=data_files,
                split=f"train[:{data_args.validation_split_percentage}%]",
                cache_dir=model_args.cache_dir,
                use_auth_token=True if model_args.use_auth_token else None,
            )
            raw_datasets["train"] = load_dataset(
                extension,
                data_files=data_files,
                split=f"train[{data_args.validation_split_percentage}%:]",
                cache_dir=model_args.cache_dir,
                use_auth_token=True if model_args.use_auth_token else None,
            )

    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config_kwargs = {
        "cache_dir": model_args.cache_dir,
        "revision": model_args.model_revision,
        "use_auth_token": True if model_args.use_auth_token else None,
    }
    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning("You are instantiating a new config instance from scratch.")
        if model_args.config_overrides is not None:
            logger.info(f"Overriding config: {model_args.config_overrides}")
            config.update_from_string(model_args.config_overrides)
            logger.info(f"New config: {config}")

    tokenizer_kwargs = {
        "cache_dir": model_args.cache_dir,
        "use_fast": model_args.use_fast_tokenizer,
        "revision": model_args.model_revision,
        "use_auth_token": True if model_args.use_auth_token else None,
    }
    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs)
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script. "
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    if model_args.model_name_or_path:
        model = AutoModelForMaskedLM.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
            revision=model_args.model_revision,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelForMaskedLM.from_config(config)

    model.resize_token_embeddings(len(tokenizer))

    # Preprocessing the datasets.
    # First we tokenize all the texts.
    if training_args.do_train:
        column_names = raw_datasets["train"].column_names
    else:
        column_names = raw_datasets["validation"].column_names
    text_column_name = "text" if "text" in column_names else column_names[0]

    if data_args.max_seq_length is None:
        max_seq_length = tokenizer.model_max_length
        if max_seq_length > 1024:
            logger.warning(
                f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
                "Picking 1024 instead. You can change that default value by passing --max_seq_length xxx."
            )
            max_seq_length = 1024
    else:
        if data_args.max_seq_length > tokenizer.model_max_length:
            logger.warning(
                f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the "
                f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
            )
        max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)

    if data_args.line_by_line:
        # When using line_by_line, we just tokenize each nonempty line.
        padding = "max_length" if data_args.pad_to_max_length else False

        def tokenize_function(examples):
            # Remove empty lines
            examples[text_column_name] = [
                line for line in examples[text_column_name] if len(line) > 0 and not line.isspace()
            ]
            return tokenizer(
                examples[text_column_name],
                padding=padding,
                truncation=True,
                max_length=max_seq_length,
                # We use this option because DataCollatorForLanguageModeling (see below) is more efficient when it
                # receives the `special_tokens_mask`.
                return_special_tokens_mask=True,
            )

        with training_args.main_process_first(desc="dataset map tokenization"):
            tokenized_datasets = raw_datasets.map(
                tokenize_function,
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                remove_columns=[text_column_name],
                load_from_cache_file=not data_args.overwrite_cache,
                desc="Running tokenizer on dataset line_by_line",
            )
    else:
        # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
        # We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more
        # efficient when it receives the `special_tokens_mask`.
        def tokenize_function(examples):
            return tokenizer(examples[text_column_name], return_special_tokens_mask=True)

        with training_args.main_process_first(desc="dataset map tokenization"):
            tokenized_datasets = raw_datasets.map(
                tokenize_function,
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                remove_columns=column_names,
                load_from_cache_file=not data_args.overwrite_cache,
                desc="Running tokenizer on every text in dataset",
            )

        # Main data processing function that will concatenate all texts from our dataset and generate chunks of
        # max_seq_length.
        def group_texts(examples):
            # Concatenate all texts.
            concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
            total_length = len(concatenated_examples[list(examples.keys())[0]])
            # We drop the small remainder; we could add padding if the model supported it instead of this drop. You
            # can customize this part to your needs.
            if total_length >= max_seq_length:
                total_length = (total_length // max_seq_length) * max_seq_length
            # Split by chunks of max_len.
            result = {
                k: [t[i:i + max_seq_length] for i in range(0, total_length, max_seq_length)]
                for k, t in concatenated_examples.items()
            }
            return result

        # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
        # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value
        # might be slower to preprocess.
        #
        # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
        # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
        with training_args.main_process_first(desc="grouping texts together"):
            tokenized_datasets = tokenized_datasets.map(
                group_texts,
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                load_from_cache_file=not data_args.overwrite_cache,
                desc=f"Grouping texts in chunks of {max_seq_length}",
            )

    if training_args.do_train:
        if "train" not in tokenized_datasets:
            raise ValueError("--do_train requires a train dataset")
        train_dataset = tokenized_datasets["train"]
        if data_args.max_train_samples is not None:
            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
            train_dataset = train_dataset.select(range(max_train_samples))

    if training_args.do_eval:
        if "validation" not in tokenized_datasets:
            raise ValueError("--do_eval requires a validation dataset")
        eval_dataset = tokenized_datasets["validation"]
        if data_args.max_eval_samples is not None:
            max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
            eval_dataset = eval_dataset.select(range(max_eval_samples))

        def preprocess_logits_for_metrics(logits, labels):
            if isinstance(logits, tuple):
                # Depending on the model and config, logits may contain extra tensors,
                # like past_key_values, but logits always come first
                logits = logits[0]
            return logits.argmax(dim=-1)

        metric = load_metric("accuracy")

        def compute_metrics(eval_preds):
            preds, labels = eval_preds
            # preds have the same shape as the labels, after the argmax(-1) has been calculated
            # by preprocess_logits_for_metrics
            labels = labels.reshape(-1)
            preds = preds.reshape(-1)
            mask = labels != -100
            labels = labels[mask]
            preds = preds[mask]
            return metric.compute(predictions=preds, references=labels)

    # Data collator
    # This one will take care of randomly masking the tokens.
    pad_to_multiple_of_8 = data_args.line_by_line and training_args.fp16 and not data_args.pad_to_max_length
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm_probability=data_args.mlm_probability,
        pad_to_multiple_of=8 if pad_to_multiple_of_8 else None,
    )

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics if training_args.do_eval and not is_torch_tpu_available() else None,
        preprocess_logits_for_metrics=preprocess_logits_for_metrics
        if training_args.do_eval and not is_torch_tpu_available()
        else None,
    )

    # Training
    if training_args.do_train:
        checkpoint = None
        if training_args.resume_from_checkpoint is not None:
            checkpoint = training_args.resume_from_checkpoint
        elif last_checkpoint is not None:
            checkpoint = last_checkpoint
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        trainer.save_model()  # Saves the tokenizer too for easy upload
        metrics = train_result.metrics

        max_train_samples = (
            data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
        )
        metrics["train_samples"] = min(max_train_samples, len(train_dataset))

        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        metrics = trainer.evaluate()

        max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
        metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))
        try:
            perplexity = math.exp(metrics["eval_loss"])
        except OverflowError:
            perplexity = float("inf")
        metrics["perplexity"] = perplexity

        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

    kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "fill-mask"}
    if data_args.dataset_name is not None:
        kwargs["dataset_tags"] = data_args.dataset_name
        if data_args.dataset_config_name is not None:
            kwargs["dataset_args"] = data_args.dataset_config_name
            kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
        else:
            kwargs["dataset"] = data_args.dataset_name

    if training_args.push_to_hub:
        trainer.push_to_hub(**kwargs)
    else:
        trainer.create_model_card(**kwargs)
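# Worked example for the perplexity computation in the evaluation block above
# (the loss value is made up): perplexity is the exponential of the mean
# cross-entropy loss, and overflows to inf for very large losses.
import math

eval_loss = 2.0                    # hypothetical eval_loss from trainer.evaluate()
perplexity = math.exp(eval_loss)   # ~7.389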
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if data_args.eval_data_file is None and training_args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument."
        )

    if (os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir)
            and training_args.do_train and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Model parameters %s", model_args)
    logger.info("Data parameters %s", data_args)
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    if model_args.config_name:
        config = BertConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        config = BertConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        config = BertConfig()
        logger.warning("You are instantiating a new config instance from scratch.")
    config.loss_fct = model_args.loss_fct

    if model_args.tokenizer_name:
        tokenizer = BertTokenizer.from_pretrained(model_args.tokenizer_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        tokenizer = BertTokenizer.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported, but you can do it from "
            "another script, save it, and load it from here, using --tokenizer_name"
        )

    if model_args.model_name_or_path:
        model = BertForTagGeneration.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        model = BertForTagGeneration.from_config(config)

    # add vocab for special tokens and hashtags
    special_tokens = ['<img>', '<loc>', '<time>']
    num_added_special_toks = tokenizer.add_tokens(special_tokens)
    print('We have added', num_added_special_toks, 'special tokens')
    tokenizer.img_token = '<img>'
    tokenizer.loc_token = '<loc>'
    tokenizer.time_token = '<time>'
    print(tokenizer.convert_tokens_to_ids(special_tokens))
    assert tokenizer.img_token == '<img>'
    assert tokenizer.loc_token == '<loc>'
    assert tokenizer.time_token == '<time>'

    with open(data_args.tag_list) as f:
        tag_list = f.readlines()
    tag_list = ' '.join(tag_list).replace('\n', '').split()
    num_added_toks = tokenizer.add_tokens(tag_list)
    print('tag_list:', data_args.tag_list)
    print('We have added', num_added_toks, 'tokens for hashtags')
    print('total vocab_size:', len(tokenizer))
    model.resize_token_embeddings(len(tokenizer))

    if data_args.block_size <= 0:
        data_args.block_size = tokenizer.max_len  # Our input block size will be the max possible for the model
    else:
        data_args.block_size = min(data_args.block_size, tokenizer.max_len)

    neptune_project_name = 'junmokang/bertinsta'
    neptune_experiment_name = 'bertinsta-generation'
    if not training_args.do_eval:
        if is_torch_tpu_available():
            if xm.get_ordinal() == 0:
                neptune.init(neptune_project_name)
                neptune.create_experiment(name=neptune_project_name, params=training_args.__dict__)
        else:
            neptune.init(neptune_project_name)
            neptune.create_experiment(name=neptune_project_name, params=training_args.__dict__)

    # Get datasets
    train_dataset = (get_dataset(data_args, tokenizer=tokenizer, loss_fct=model_args.loss_fct)
                     if training_args.do_train else None)
    eval_dataset = (get_dataset(data_args, tokenizer=tokenizer, evaluate=True)
                    if training_args.do_eval else None)
    data_collator = DataCollatorForTagGeneration(config.vocab_size, loss_fct=model_args.loss_fct)

    training_args.per_device_eval_batch_size = 1  # force the eval batch size to 1

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        neptune=neptune,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
    )

    # Training
    if training_args.do_train:
        model_path = (model_args.model_name_or_path
                      if model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path)
                      else None)
        trainer.train(model_path=model_path)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        dataloader = trainer.get_eval_dataloader(eval_dataset)

        # multi-gpu eval
        if training_args.n_gpu > 1:
            model = torch.nn.DataParallel(model)

        description = "Evaluation"
        batch_size = dataloader.batch_size
        logger.info("***** Running %s *****", description)
        logger.info("  Num examples = %d", len(dataloader.dataset))
        logger.info("  Batch size = %d", batch_size)

        model.eval()

        if is_torch_tpu_available():
            dataloader = pl.ParallelLoader(dataloader, [training_args.device]).per_device_loader(training_args.device)

        results = {}
        grouping_results = {}
        # interaction_matrix = np.zeros((6, 6))  # feature interaction

        beam_width = 1
        top_k = 10

        # tag to contexts mapping
        context_list = ['emotion', 'mood', 'location', 'time', 'object', 'activity', 'event', 'others']
        context2ids = {c: [] for c in context_list}
        if data_args.tag2contexts:
            with open(data_args.tag2contexts) as f:
                tag2contexts = json.load(f)
            for tag, contexts in tag2contexts.items():
                for c in contexts:
                    context2ids[c].append(tag)
            for c in context_list:
                context2ids[c] = tokenizer.convert_tokens_to_ids(context2ids[c])

        for eid, example in enumerate(tqdm(dataloader, desc=description)):
            generated_tags = beam_decode(beam_width, top_k, model, example, tokenizer,
                                         data_args.block_size, training_args.device)
            # generated_tags = beam_decode(beam_width, top_k, model, example, tokenizer,
            #                              data_args.block_size, training_args.device,
            #                              None, interaction_matrix)  # feature interaction
            results[example['pid']] = generated_tags
            grouping_results[example['pid']] = {}
            grouping_results[example['pid']]['all'] = generated_tags
            # print('all:', str(generated_tags))

            # diverse generation (according to context)
            if data_args.tag2contexts:
                for context in context_list:
                    generated_tags = beam_decode(beam_width, top_k, model, example, tokenizer,
                                                 data_args.block_size, training_args.device,
                                                 context2ids[context])
                    grouping_results[example['pid']][context] = generated_tags
                    # print(context, ':', str(generated_tags))

        # with np.printoptions(precision=2, suppress=True):  # feature interaction
        #     print(interaction_matrix)
        #     print(interaction_matrix.sum(1))
        #     print(interaction_matrix / interaction_matrix.sum(1))

        results_save_path = os.path.join(training_args.output_dir, 'results.json')
        with open(results_save_path, 'w') as f:
            logger.info("saved results.json into %s", training_args.output_dir)
            json.dump(results, f)
        grouping_results_save_path = os.path.join(training_args.output_dir, 'grouping_results.json')
        with open(grouping_results_save_path, 'w') as f:
            logger.info("saved grouping_results.json into %s", training_args.output_dir)
            json.dump(grouping_results, f)
def prediction_loop(
    self,
    dataloader: DataLoader,
    description: str,
    prediction_loss_only: Optional[bool] = None,
    ignore_keys: Optional[List[str]] = None,
    metric_key_prefix: str = "eval",
) -> PredictionOutput:
    """
    Prediction/evaluation loop, shared by :obj:`Trainer.evaluate()` and :obj:`Trainer.predict()`.

    Works both with or without labels.
    """
    if not isinstance(dataloader.dataset, collections.abc.Sized):
        raise ValueError("dataset must implement __len__")
    prediction_loss_only = (
        prediction_loss_only if prediction_loss_only is not None else self.args.prediction_loss_only
    )

    if self.args.deepspeed and not self.args.do_train:
        # no harm, but flagging to the user that deepspeed config is ignored for eval
        # flagging only for when --do_train wasn't passed as only then it's redundant
        logger.info("Detected the deepspeed argument but it will not be used for evaluation")

    model = self.model
    # multi-gpu eval
    if self.args.n_gpu > 1:
        model = torch.nn.DataParallel(model)
    # Note: in torch.distributed mode, there's no point in wrapping the model
    # inside a DistributedDataParallel as we'll be under `no_grad` anyways

    # if full fp16 is wanted on eval and this ``evaluation`` or ``predict`` isn't called while
    # ``train`` is running, half it first and then put on device

    batch_size = dataloader.batch_size
    num_examples = self.num_examples(dataloader)
    logger.info("***** Running %s *****", description)
    logger.info("  Num examples = %d", num_examples)
    logger.info("  Batch size = %d", batch_size)
    losses_host: torch.Tensor = None
    preds_host: Union[torch.Tensor, List[torch.Tensor]] = None
    labels_host: Union[torch.Tensor, List[torch.Tensor]] = None
    gumbel_host: Union[torch.Tensor, List[torch.Tensor]] = None
    sentence_labels_host: Union[torch.Tensor, List[torch.Tensor]] = None
    sentence_indicator_host: Union[torch.Tensor, List[torch.Tensor]] = None

    world_size = 1
    if is_torch_tpu_available():
        world_size = xm.xrt_world_size()
    elif self.args.local_rank != -1:
        world_size = dist.get_world_size()
    world_size = max(1, world_size)

    eval_losses_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=batch_size)
    if not prediction_loss_only:
        preds_gatherer = DistributedTensorGatherer(world_size, num_examples)
        labels_gatherer = DistributedTensorGatherer(world_size, num_examples)
        gumbel_gatherer = DistributedTensorGatherer(world_size, num_examples)
        sentence_labels_gatherer = DistributedTensorGatherer(world_size, num_examples)
        sentence_indicator_gatherer = DistributedTensorGatherer(world_size, num_examples)

    model.eval()

    if self.args.past_index >= 0:
        self._past = None

    self.callback_handler.eval_dataloader = dataloader

    for step, inputs in enumerate(dataloader):
        loss, logits, labels, gumbel_output, sentence_labels, sentence_indicator = self.prediction_step(
            model, inputs, prediction_loss_only, ignore_keys=ignore_keys
        )
        if loss is not None:
            losses = loss.repeat(batch_size)
            losses_host = losses if losses_host is None else torch.cat((losses_host, losses), dim=0)
        if logits is not None:
            preds_host = logits if preds_host is None else nested_concat(preds_host, logits, padding_index=-100)
        if labels is not None:
            labels_host = labels if labels_host is None else nested_concat(labels_host, labels, padding_index=-100)
        if gumbel_output is not None:
            gumbel_host = (
                gumbel_output
                if gumbel_host is None
                else nested_concat(gumbel_host, gumbel_output, padding_index=-1)
            )
        if sentence_labels is not None:
            sentence_labels_host = (
                sentence_labels
                if sentence_labels_host is None
                else nested_concat(sentence_labels_host, sentence_labels, padding_index=-1)
            )
        if sentence_indicator is not None:
            sentence_indicator_host = (
                sentence_indicator
                if sentence_indicator_host is None
                else nested_concat(sentence_indicator_host, sentence_indicator, padding_index=-100)
            )
        self.control = self.callback_handler.on_prediction_step(self.args, self.state, self.control)

        # Gather all tensors and put them back on the CPU if we have done enough accumulation steps.
        if self.args.eval_accumulation_steps is not None and (step + 1) % self.args.eval_accumulation_steps == 0:
            eval_losses_gatherer.add_arrays(self._gather_and_numpify(losses_host, "eval_losses"))
            if not prediction_loss_only:
                preds_gatherer.add_arrays(self._gather_and_numpify(preds_host, "eval_preds"))
                labels_gatherer.add_arrays(self._gather_and_numpify(labels_host, "eval_label_ids"))
                gumbel_gatherer.add_arrays(self._gather_and_numpify(gumbel_host, "eval_gumbel_output"))
                sentence_labels_gatherer.add_arrays(
                    self._gather_and_numpify(sentence_labels_host, "eval_sentence_idxs")
                )
                sentence_indicator_gatherer.add_arrays(
                    self._gather_and_numpify(sentence_indicator_host, "eval_sentence_indicator")
                )

            # Set back to None to begin a new accumulation
            losses_host, preds_host, labels_host, gumbel_host, sentence_labels_host, sentence_indicator_host = (
                None, None, None, None, None, None,
            )

    if self.args.past_index and hasattr(self, "_past"):
        # Clean the state at the end of the evaluation loop
        delattr(self, "_past")

    # Gather all remaining tensors and put them back on the CPU
    eval_losses_gatherer.add_arrays(self._gather_and_numpify(losses_host, "eval_losses"))
    if not prediction_loss_only:
        preds_gatherer.add_arrays(self._gather_and_numpify(preds_host, "eval_preds"))
        labels_gatherer.add_arrays(self._gather_and_numpify(labels_host, "eval_label_ids"))
        gumbel_gatherer.add_arrays(self._gather_and_numpify(gumbel_host, "eval_gumbel_output"))
        sentence_labels_gatherer.add_arrays(self._gather_and_numpify(sentence_labels_host, "eval_sentence_idxs"))
        sentence_indicator_gatherer.add_arrays(
            self._gather_and_numpify(sentence_indicator_host, "eval_sentence_indicator")
        )

    eval_loss = eval_losses_gatherer.finalize()
    preds = preds_gatherer.finalize() if not prediction_loss_only else None
    label_ids = labels_gatherer.finalize() if not prediction_loss_only else None
    gumbel_outputs = gumbel_gatherer.finalize() if not prediction_loss_only else None
    sentence_idxs = sentence_labels_gatherer.finalize() if not prediction_loss_only else None
    sentence_indicators = sentence_indicator_gatherer.finalize() if not prediction_loss_only else None

    if self.compute_metrics is not None and preds is not None and label_ids is not None:
        metrics = self.compute_metrics(preds, label_ids, gumbel_outputs, sentence_idxs, sentence_indicators)
    else:
        metrics = {}

    if eval_loss is not None:
        metrics[f"{metric_key_prefix}_loss"] = eval_loss.mean().item()

    # Prefix all keys with metric_key_prefix + '_'
    for key in list(metrics.keys()):
        if not key.startswith(f"{metric_key_prefix}_"):
            metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key)

    return PredictionOutput(predictions=preds, label_ids=label_ids, metrics=metrics)
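# Illustrative sketch (an assumption, NOT the transformers implementation): each
# DistributedTensorGatherer used above accumulates per-process numpy chunks and
# finalize() trims the padding added so that every process contributed equally
# sized shards, leaving exactly num_samples rows.
import numpy as np


class _ToyGatherer:  # hypothetical stand-in for DistributedTensorGatherer
    def __init__(self, num_samples):
        self.num_samples = num_samples
        self.chunks = []

    def add_arrays(self, arrays):
        if arrays is not None:
            self.chunks.append(arrays)

    def finalize(self):
        if not self.chunks:
            return None
        return np.concatenate(self.chunks, axis=0)[: self.num_samples]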
def prediction_loop(
    self,
    dataloader: DataLoader,
    description: str,
    prediction_loss_only: Optional[bool] = None,
    use_tqdm: Optional[bool] = True,
    reduce_other_outputs: Callable[[Tuple[torch.Tensor]], Any] = None,
) -> PredictionOutput:
    """
    Prediction/evaluation loop, shared by :obj:`Trainer.evaluate()` and :obj:`Trainer.predict()`.

    Works both with or without labels.
    """
    if hasattr(self, "_prediction_loop"):
        warnings.warn(
            "The `_prediction_loop` method is deprecated and won't be called in a future version, define `prediction_loop` in your subclass.",
            FutureWarning,
        )
        return self._prediction_loop(dataloader, description, prediction_loss_only=prediction_loss_only)

    prediction_loss_only = prediction_loss_only if prediction_loss_only is not None else self.prediction_loss_only

    model = self.model
    # multi-gpu eval
    if self.args.n_gpu > 1:
        model = torch.nn.DataParallel(model)
    else:
        model = self.model
    # Note: in torch.distributed mode, there's no point in wrapping the model
    # inside a DistributedDataParallel as we'll be under `no_grad` anyways.

    batch_size = dataloader.batch_size
    logger.info("***** Running %s *****", description)
    logger.info("  Num examples = %d", self.num_examples(dataloader))
    logger.info("  Batch size = %d", batch_size)
    eval_losses: List[float] = []
    preds: torch.Tensor = None
    label_ids: torch.Tensor = None
    other_outputs: Tuple[torch.Tensor] = None
    model.eval()

    if is_torch_tpu_available():
        dataloader = pl.ParallelLoader(dataloader, [self.args.device]).per_device_loader(self.args.device)

    if self.args.past_index >= 0:
        self._past = None

    data_iterators = tqdm(dataloader, desc=description) if use_tqdm else dataloader
    reduce_other_outputs = reduce_other_outputs if reduce_other_outputs is not None else self.reduce_other_outputs
    for inputs in data_iterators:
        loss, logits, labels, other_outputs_ = self.prediction_step(model, inputs, prediction_loss_only)
        if loss is not None:
            eval_losses.append(loss)
        if logits is not None:
            preds = logits if preds is None else torch.cat((preds, logits), dim=0)
        if labels is not None:
            label_ids = labels if label_ids is None else torch.cat((label_ids, labels), dim=0)
        if other_outputs_ is not None:
            # print(list(o.size() for o in other_outputs_))
            # if other_outputs is not None:
            #     print(list(o.size() for o in others))
            if reduce_other_outputs is not None:
                other_outputs = other_outputs_ if other_outputs is None else tuple(
                    reduce_other_outputs(output, output_)
                    for output, output_ in zip(other_outputs, other_outputs_)
                )  # without tuple(), a bare (...) would be a generator expression

    if self.args.past_index and hasattr(self, "_past"):
        # Clean the state at the end of the evaluation loop
        delattr(self, "_past")

    if self.args.local_rank != -1:
        # In distributed mode, concatenate all results from all nodes:
        if preds is not None:
            preds = self.distributed_concat(preds, num_total_examples=self.num_examples(dataloader))
        if label_ids is not None:
            label_ids = self.distributed_concat(label_ids, num_total_examples=self.num_examples(dataloader))
        # if other_outputs is not None:
        #     # [TODO] may be wrong -- not familiar with distributed training
        #     other_outputs = tuple(self.distributed_concat(o, num_total_examples=self.num_examples(dataloader))
        #                           for o in other_outputs)
    elif is_torch_tpu_available():
        # tpu-comment: Get all predictions and labels from all worker shards of eval dataset
        if preds is not None:
            preds = xm.mesh_reduce("eval_preds", preds, torch.cat)
        if label_ids is not None:
            label_ids = xm.mesh_reduce("eval_label_ids", label_ids, torch.cat)
        # if others is not None:
        #     # [TODO] may be wrong -- not familiar with TPU training, so it is not handled here
        #     others = tuple(xm.mesh_reduce("eval_label_ids", label_ids, torch.cat))

    # Finally, turn the aggregated tensors into numpy arrays.
    if preds is not None:
        preds = preds.cpu().numpy()
    if label_ids is not None:
        label_ids = label_ids.cpu().numpy()
    if other_outputs is not None:
        other_outputs = other_outputs  # assume everything was already handled in self.reduce_other_outputs

    if self.compute_metrics is not None and preds is not None and label_ids is not None:
        metrics = self.compute_metrics(EvalPrediction(predictions=preds, label_ids=label_ids))
    else:
        metrics = {}
    if len(eval_losses) > 0:
        metrics[f"{description}_loss"] = np.mean(eval_losses)

    # Prefix all keys with the description (keys beginning with "_" keep their
    # leading underscore so they can still be filtered for tqdm display)
    for key in list(metrics.keys()):
        if not key.startswith(description):
            tqdm_prefix = ""
            new_key = key
            if key[0] == "_":
                tqdm_prefix = "_"
                new_key = key[1:]
            metrics[tqdm_prefix + description + "_" + new_key] = metrics.pop(key)

    return PredictionOutput(predictions=preds, label_ids=label_ids, metrics=metrics, other_outputs=other_outputs)
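# Hedged usage sketch for reduce_other_outputs in the prediction_loop above: it is
# applied pairwise to the accumulated tuple and each new batch's tuple, so a
# simple batch-concatenating reducer looks like this hypothetical example.
import torch


def concat_reducer(accumulated: torch.Tensor, new: torch.Tensor) -> torch.Tensor:
    # stack successive batches along the example dimension
    return torch.cat((accumulated, new), dim=0)

# output = trainer.prediction_loop(loader, "eval", reduce_other_outputs=concat_reducer)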
def train(self, model_path: Optional[str] = None): """ Main training entry point. Args: model_path (:obj:`str`, `optional`): Local path to the model if the model to train has been instantiated from a local path. If present, training will resume from the optimizer/scheduler states loaded here. """ if not self.call_once_called and self.auto_tb_writer: self.call_once() self.call_once_called = True train_dataloader = self.get_train_dataloader() if self.args.max_steps > 0: t_total = self.args.max_steps num_train_epochs = ( self.args.max_steps // (len(train_dataloader) // self.args.gradient_accumulation_steps) + 1 ) else: t_total = int(len(train_dataloader) // self.args.gradient_accumulation_steps * self.args.num_train_epochs) num_train_epochs = self.args.num_train_epochs optimizer, scheduler = self.get_optimizers(num_training_steps=t_total) # Check if saved optimizer or scheduler states exist if ( model_path is not None and os.path.isfile(os.path.join(model_path, "optimizer.pt")) and os.path.isfile(os.path.join(model_path, "scheduler.pt")) ): # Load in optimizer and scheduler states optimizer.load_state_dict( torch.load(os.path.join(model_path, "optimizer.pt"), map_location=self.args.device) ) scheduler.load_state_dict(torch.load(os.path.join(model_path, "scheduler.pt"))) model = self.model if self.args.fp16: if not is_apex_available(): raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") model, optimizer = amp.initialize(model, optimizer, opt_level=self.args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if self.args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if self.args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[self.args.local_rank], output_device=self.args.local_rank, find_unused_parameters=True, ) if self.tb_writer is not None: self.tb_writer.add_text("args", self.args.to_json_string()) self.tb_writer.add_hparams(self.args.to_sanitized_dict(), metric_dict={}) # Train! if is_torch_tpu_available(): total_train_batch_size = self.args.train_batch_size * xm.xrt_world_size() else: total_train_batch_size = ( self.args.train_batch_size * self.args.gradient_accumulation_steps * (torch.distributed.get_world_size() if self.args.local_rank != -1 else 1) ) logger.info("***** Running training *****") logger.info(" Num examples = %d", self.num_examples(train_dataloader)) logger.info(" Num Epochs = %d", num_train_epochs) logger.info(" Instantaneous batch size per device = %d", self.args.per_device_train_batch_size) logger.info(" Total train batch size (w. 
                    parallel, distributed & accumulation) = %d", total_train_batch_size)
        logger.info("  Gradient Accumulation steps = %d", self.args.gradient_accumulation_steps)
        logger.info("  Total optimization steps = %d", t_total)

        self.global_step = 0
        self.epoch = 0
        epochs_trained = 0
        steps_trained_in_current_epoch = 0
        # Check if continuing training from a checkpoint
        if model_path is not None:
            # set global_step to global_step of last saved checkpoint from model path
            try:
                self.global_step = int(model_path.split("-")[-1].split("/")[0])
                epochs_trained = self.global_step // (len(train_dataloader) // self.args.gradient_accumulation_steps)
                steps_trained_in_current_epoch = self.global_step % (
                    len(train_dataloader) // self.args.gradient_accumulation_steps
                )
                logger.info("  Continuing training from checkpoint, will skip to saved global_step")
                logger.info("  Continuing training from epoch %d", epochs_trained)
                logger.info("  Continuing training from global step %d", self.global_step)
                logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
            except ValueError:
                self.global_step = 0
                logger.info("  Starting fine-tuning.")

        tr_loss = 0.0
        logging_loss = 0.0
        model.zero_grad()
        train_iterator = trange(
            epochs_trained, int(num_train_epochs), desc="Epoch", disable=not self.is_local_master()
        )
        for epoch in train_iterator:
            if isinstance(train_dataloader, DataLoader) and isinstance(train_dataloader.sampler, DistributedSampler):
                train_dataloader.sampler.set_epoch(epoch)

            if is_torch_tpu_available():
                parallel_loader = pl.ParallelLoader(train_dataloader, [self.args.device]).per_device_loader(
                    self.args.device
                )
                epoch_iterator = tqdm(parallel_loader, desc="Iteration", disable=not self.is_local_master())
            else:
                epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=not self.is_local_master())

            # Reset the past mems state at the beginning of each epoch if necessary.
            if self.args.past_index >= 0:
                self._past = None

            tqdm_dict = {}
            for step, inputs in enumerate(epoch_iterator):
                # Skip past any already trained steps if resuming training
                if steps_trained_in_current_epoch > 0:
                    steps_trained_in_current_epoch -= 1
                    continue

                tr_loss += self.training_step(model, inputs, optimizer)

                if (step + 1) % self.args.gradient_accumulation_steps == 0 or (
                    # last step in epoch but step is always smaller than gradient_accumulation_steps
                    len(epoch_iterator) <= self.args.gradient_accumulation_steps
                    and (step + 1) == len(epoch_iterator)
                ):
                    if self.args.fp16:
                        torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), self.args.max_grad_norm)
                    else:
                        torch.nn.utils.clip_grad_norm_(model.parameters(), self.args.max_grad_norm)

                    if is_torch_tpu_available():
                        xm.optimizer_step(optimizer)
                    else:
                        optimizer.step()

                    scheduler.step()
                    model.zero_grad()
                    self.global_step += 1
                    self.epoch = epoch + (step + 1) / len(epoch_iterator)

                    if (self.args.logging_steps > 0 and self.global_step % self.args.logging_steps == 0) or (
                        self.global_step == 1 and self.args.logging_first_step
                    ):
                        logs: Dict[str, float] = {}
                        logs["loss"] = (tr_loss - logging_loss) / self.args.logging_steps
                        # backward compatibility for pytorch schedulers
                        logs["learning_rate"] = (
                            scheduler.get_last_lr()[0]
                            if version.parse(torch.__version__) >= version.parse("1.4")
                            else scheduler.get_lr()[0]
                        )
                        logging_loss = tr_loss
                        tqdm_dict["train_loss"] = logs["loss"]
                        self.log(logs)

                    if self.args.evaluate_during_training and self.global_step % self.args.eval_steps == 0:
                        eval_metrics = self.evaluate(use_tqdm=self.use_tqdm_in_eval, log=True, description="eval")
                        # also save the metrics
                        for k, v in eval_metrics.items():
                            if k[0] == "_":
                                tqdm_dict[k[1:]] = v

                    if (
                        self.args.evaluate_trainset_during_training
                        and self.global_step % self.args.eval_trainset_steps == 0
                    ):
                        train_metrics = self.evaluate(
                            eval_dataloader=self.eval_train_dataloader,
                            use_tqdm=self.use_tqdm_in_eval,
                            log=True,
                            description="eval_train",
                        )
                        for k, v in train_metrics.items():
                            if k[0] == "_":
                                tqdm_dict[k[1:]] = v

                    if self.args.save_steps > 0 and self.global_step % self.args.save_steps == 0:
                        # In all cases (even distributed/parallel), self.model is always a reference
                        # to the model we want to save.
                        if hasattr(model, "module"):
                            assert model.module is self.model
                        else:
                            assert model is self.model
                        # Save model checkpoint
                        output_dir = os.path.join(
                            self.args.output_dir, self.exp_dir, f"{PREFIX_CHECKPOINT_DIR}-{self.global_step}"
                        )
                        self.save_model(output_dir)
                        if self.is_world_master():
                            self._rotate_checkpoints()

                        if is_torch_tpu_available():
                            xm.rendezvous("saving_optimizer_states")
                            xm.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                            xm.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                        elif self.is_world_master():
                            torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                            torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))

                epoch_iterator.set_postfix(**tqdm_dict)
                train_iterator.set_postfix(**tqdm_dict)
                if self.args.max_steps > 0 and self.global_step > self.args.max_steps:
                    epoch_iterator.close()
                    break
            if self.args.max_steps > 0 and self.global_step > self.args.max_steps:
                train_iterator.close()
                break
            if self.args.tpu_metrics_debug or self.args.debug:
                # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.)
                xm.master_print(met.metrics_report())

        if self.tb_writer:
            self.tb_writer.close()
            log_dirs = os.listdir(self.log_dir)
            for name in log_dirs:
                if re.match(r"\d{10}.*", name):  # unknown dir generated by tensorboardX
                    shutil.rmtree(os.path.join(self.log_dir, name))
        if self.compare_datasets:
            self.tb_writer_for_train.close()
            self.tb_writer_for_eval.close()
        if self.args.past_index and hasattr(self, "_past"):
            # Clean the state at the end of training
            delattr(self, "_past")

        logger.info("\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n")
        return TrainOutput(self.global_step, tr_loss / self.global_step)
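
# A minimal standalone sketch (not part of the trainer above) of the resume
# arithmetic used at the top of train(): the global step is parsed from a
# "checkpoint-<step>" path, then split into whole epochs and leftover steps.
# The path and sizes below are hypothetical.
def _resume_arithmetic_demo():
    model_path = "output/checkpoint-1500"           # hypothetical checkpoint dir
    steps_per_epoch = 400                           # stands in for len(train_dataloader)
    gradient_accumulation_steps = 2

    global_step = int(model_path.split("-")[-1].split("/")[0])           # -> 1500
    updates_per_epoch = steps_per_epoch // gradient_accumulation_steps   # -> 200
    epochs_trained = global_step // updates_per_epoch                    # -> 7
    steps_trained_in_current_epoch = global_step % updates_per_epoch     # -> 100
    assert (epochs_trained, steps_trained_in_current_epoch) == (7, 100)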
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
A subclass of `Trainer` specific to Question-Answering tasks
"""

from transformers import Trainer, is_datasets_available, is_torch_tpu_available
from transformers.trainer_utils import PredictionOutput


if is_datasets_available():
    import datasets

if is_torch_tpu_available():
    import torch_xla.core.xla_model as xm
    import torch_xla.debug.metrics as met


class QuestionAnsweringTrainer(Trainer):
    def __init__(self, *args, eval_examples=None, post_process_function=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.eval_examples = eval_examples
        self.post_process_function = post_process_function

    def evaluate(self,
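
# A hedged usage sketch for the constructor above. `my_post_process` and the
# model/dataset variables are hypothetical placeholders; the exact signature
# post_process_function must satisfy depends on how the (truncated) evaluate()
# calls it, so treat this only as an illustration of the wiring.
def my_post_process(examples, features, predictions):
    # e.g. map start/end logits back to answer strings; identity placeholder here
    return predictions

qa_trainer = QuestionAnsweringTrainer(
    model=model,                          # assumed defined elsewhere
    args=training_args,                   # assumed defined elsewhere
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    eval_examples=raw_eval_examples,      # un-tokenized examples kept for post-processing
    post_process_function=my_post_process,
)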
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )

    log_level = training_args.get_process_log_level()
    logger.setLevel(log_level)
    datasets.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.enable_default_handler()
    transformers.utils.logging.enable_explicit_format()

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    logger.info(f"Training/evaluation parameters {training_args}")

    # Detecting last checkpoint.
    last_checkpoint = None
    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome."
            )
        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        raw_datasets = load_dataset(
            data_args.dataset_name,
            data_args.dataset_config_name,
            cache_dir=model_args.cache_dir,
            use_auth_token=True if model_args.use_auth_token else None,
        )
        if "validation" not in raw_datasets.keys():
            raw_datasets["validation"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[:{data_args.validation_split_percentage}%]",
                cache_dir=model_args.cache_dir,
                use_auth_token=True if model_args.use_auth_token else None,
            )
            raw_datasets["train"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[{data_args.validation_split_percentage}%:]",
                cache_dir=model_args.cache_dir,
                use_auth_token=True if model_args.use_auth_token else None,
            )
    else:
        data_files = {}
        dataset_args = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        extension = (
            data_args.train_file.split(".")[-1]
            if data_args.train_file is not None
            else data_args.validation_file.split(".")[-1]
        )
        if extension == "txt":
            extension = "text"
            dataset_args["keep_linebreaks"] = data_args.keep_linebreaks
        raw_datasets = load_dataset(
            extension,
            data_files=data_files,
            cache_dir=model_args.cache_dir,
            use_auth_token=True if model_args.use_auth_token else None,
            **dataset_args,
        )
        # If no validation data is there, validation_split_percentage will be used to divide the dataset.
        if "validation" not in raw_datasets.keys():
            raw_datasets["validation"] = load_dataset(
                extension,
                data_files=data_files,
                split=f"train[:{data_args.validation_split_percentage}%]",
                cache_dir=model_args.cache_dir,
                use_auth_token=True if model_args.use_auth_token else None,
                **dataset_args,
            )
            raw_datasets["train"] = load_dataset(
                extension,
                data_files=data_files,
                split=f"train[{data_args.validation_split_percentage}%:]",
                cache_dir=model_args.cache_dir,
                use_auth_token=True if model_args.use_auth_token else None,
                **dataset_args,
            )

    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc)
    # at https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config_kwargs = {
        "cache_dir": model_args.cache_dir,
        "revision": model_args.model_revision,
        "use_auth_token": True if model_args.use_auth_token else None,
    }
    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning("You are instantiating a new config instance from scratch.")
        if model_args.config_overrides is not None:
            logger.info(f"Overriding config: {model_args.config_overrides}")
            config.update_from_string(model_args.config_overrides)
            logger.info(f"New config: {config}")

    tokenizer_kwargs = {
        "cache_dir": model_args.cache_dir,
        "use_fast": model_args.use_fast_tokenizer,
        "revision": model_args.model_revision,
        "use_auth_token": True if model_args.use_auth_token else None,
    }
    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs)
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script. "
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    if model_args.model_name_or_path:
        model = AutoModelForCausalLM.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
            revision=model_args.model_revision,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    else:
        model = AutoModelForCausalLM.from_config(config)
        n_params = sum(dict((p.data_ptr(), p.numel()) for p in model.parameters()).values())
        logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params")

    model.resize_token_embeddings(len(tokenizer))

    # Preprocessing the datasets.
    # First we tokenize all the texts.
    if training_args.do_train:
        column_names = raw_datasets["train"].column_names
    else:
        column_names = raw_datasets["validation"].column_names
    text_column_name = "text" if "text" in column_names else column_names[0]

    # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function
    tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base")

    def tokenize_function(examples):
        with CaptureLogger(tok_logger) as cl:
            output = tokenizer(examples[text_column_name])
        # clm input could be much much longer than block_size
        if "Token indices sequence length is longer than the" in cl.out:
            tok_logger.warning(
                "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits"
                " before being passed to the model."
            )
        return output

    with training_args.main_process_first(desc="dataset map tokenization"):
        tokenized_datasets = raw_datasets.map(
            tokenize_function,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=column_names,
            load_from_cache_file=not data_args.overwrite_cache,
            desc="Running tokenizer on dataset",
        )

    if data_args.block_size is None:
        block_size = tokenizer.model_max_length
        if block_size > 1024:
            logger.warning(
                f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
                "Picking 1024 instead. You can change that default value by passing --block_size xxx."
            )
            block_size = 1024
    else:
        if data_args.block_size > tokenizer.model_max_length:
            logger.warning(
                f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model "
                f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}."
            )
        block_size = min(data_args.block_size, tokenizer.model_max_length)

    # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
    def group_texts(examples):
        # Concatenate all texts.
        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
        total_length = len(concatenated_examples[list(examples.keys())[0]])
        # We drop the small remainder; we could add padding if the model supported it instead of this drop. You can
        # customize this part to your needs.
        if total_length >= block_size:
            total_length = (total_length // block_size) * block_size
        # Split by chunks of max_len.
        result = {
            k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
            for k, t in concatenated_examples.items()
        }
        result["labels"] = result["input_ids"].copy()
        return result

    # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder
    # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower
    # to preprocess.
    #
    # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
    # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
    with training_args.main_process_first(desc="grouping texts together"):
        lm_datasets = tokenized_datasets.map(
            group_texts,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            load_from_cache_file=not data_args.overwrite_cache,
            desc=f"Grouping texts in chunks of {block_size}",
        )

    if training_args.do_train:
        if "train" not in tokenized_datasets:
            raise ValueError("--do_train requires a train dataset")
        train_dataset = lm_datasets["train"]
        if data_args.max_train_samples is not None:
            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
            train_dataset = train_dataset.select(range(max_train_samples))

    if training_args.do_eval:
        if "validation" not in tokenized_datasets:
            raise ValueError("--do_eval requires a validation dataset")
        eval_dataset = lm_datasets["validation"]
        if data_args.max_eval_samples is not None:
            max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
            eval_dataset = eval_dataset.select(range(max_eval_samples))

        def preprocess_logits_for_metrics(logits, labels):
            if isinstance(logits, tuple):
                # Depending on the model and config, logits may contain extra tensors,
                # like past_key_values, but logits always come first
                logits = logits[0]
            return logits.argmax(dim=-1)

        metric = load_metric("accuracy")

        def compute_metrics(eval_preds):
            preds, labels = eval_preds
            # preds have the same shape as the labels, after the argmax(-1) has been calculated
            # by preprocess_logits_for_metrics but we need to shift the labels
            labels = labels[:, 1:].reshape(-1)
            preds = preds[:, :-1].reshape(-1)
            return metric.compute(predictions=preds, references=labels)

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        tokenizer=tokenizer,
        # Data collator will default to DataCollatorWithPadding, so we change it.
        data_collator=default_data_collator,
        compute_metrics=compute_metrics if training_args.do_eval and not is_torch_tpu_available() else None,
        preprocess_logits_for_metrics=preprocess_logits_for_metrics
        if training_args.do_eval and not is_torch_tpu_available()
        else None,
    )

    # Training
    if training_args.do_train:
        checkpoint = None
        if training_args.resume_from_checkpoint is not None:
            checkpoint = training_args.resume_from_checkpoint
        elif last_checkpoint is not None:
            checkpoint = last_checkpoint
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        trainer.save_model()  # Saves the tokenizer too for easy upload

        metrics = train_result.metrics
        max_train_samples = (
            data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
        )
        metrics["train_samples"] = min(max_train_samples, len(train_dataset))

        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        metrics = trainer.evaluate()

        max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
        metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))
        try:
            perplexity = math.exp(metrics["eval_loss"])
        except OverflowError:
            perplexity = float("inf")
        metrics["perplexity"] = perplexity

        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

    kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-generation"}
    if data_args.dataset_name is not None:
        kwargs["dataset_tags"] = data_args.dataset_name
        if data_args.dataset_config_name is not None:
            kwargs["dataset_args"] = data_args.dataset_config_name
            kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
        else:
            kwargs["dataset"] = data_args.dataset_name

    if training_args.push_to_hub:
        trainer.push_to_hub(**kwargs)
    else:
        trainer.create_model_card(**kwargs)
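
# A standalone toy check of the group_texts logic above, with block_size fixed
# at 4 purely for illustration (the script derives block_size from the
# tokenizer and --block_size). Two examples of lengths 5 and 4 concatenate to
# 9 tokens; chunking into blocks of 4 yields two chunks and drops the ninth.
from itertools import chain

def group_texts_demo(examples, block_size=4):
    concatenated = {k: list(chain(*examples[k])) for k in examples.keys()}
    total_length = (len(concatenated["input_ids"]) // block_size) * block_size
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

out = group_texts_demo({"input_ids": [[1, 2, 3, 4, 5], [6, 7, 8, 9]]})
assert out["input_ids"] == [[1, 2, 3, 4], [5, 6, 7, 8]]  # token 9 is dropped
assert out["labels"] == out["input_ids"]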
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if data_args.eval_data_file is None and training_args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument.")

    if (os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir)
            and training_args.do_train and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. "
            "Use --overwrite_output_dir to overcome.")

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    if model_args.config_name:
        config = BertConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        config = BertConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning("You are instantiating a new config instance from scratch.")

    if model_args.tokenizer_name:
        tokenizer = BertTokenizer.from_pretrained(model_args.tokenizer_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        tokenizer = BertTokenizer.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported, but you can do it from "
            "another script, save it, and load it from here, using --tokenizer_name")

    if model_args.model_name_or_path:
        model = BertForTagRankingLate.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        model = BertForTagRanking.from_config(config)

    # add vocab for special tokens and hashtags
    special_tokens = ['<img>', '<loc>', '<time>']
    num_added_special_toks = tokenizer.add_tokens(special_tokens)
    print('We have added', num_added_special_toks, 'special tokens')
    tokenizer.img_token = '<img>'
    tokenizer.loc_token = '<loc>'
    tokenizer.time_token = '<time>'
    print(tokenizer.convert_tokens_to_ids(special_tokens))
    assert tokenizer.img_token == '<img>'
    assert tokenizer.loc_token == '<loc>'
    assert tokenizer.time_token == '<time>'

    with open(data_args.tag_list) as f:
        tag_list = f.readlines()
    tag_list = ' '.join(tag_list).replace('\n', '').split()
    num_added_toks = tokenizer.add_tokens(tag_list)
    print('tag_list:', data_args.tag_list)
    print('We have added', num_added_toks, 'tokens for hashtags')
    print('total vocab_size:', len(tokenizer))
    model.resize_token_embeddings(len(tokenizer))

    if data_args.block_size <= 0:
        data_args.block_size = tokenizer.max_len  # Our input block size will be the max possible for the model
    else:
        data_args.block_size = min(data_args.block_size, tokenizer.max_len)

    # Get datasets
    train_dataset = get_dataset(data_args, tokenizer=tokenizer) if training_args.do_train else None
    eval_dataset = get_dataset(data_args, tokenizer=tokenizer, evaluate=True) if training_args.do_eval else None
    data_collator = DataCollatorForTagGeneration(config.vocab_size)
    training_args.per_device_eval_batch_size = 1  # force eval batch size to 1

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
    )

    # Training
    if training_args.do_train:
        model_path = (
            model_args.model_name_or_path
            if model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path)
            else None
        )
        trainer.train(model_path=model_path)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        dataloader = trainer.get_eval_dataloader(eval_dataset)

        # multi-gpu eval
        if training_args.n_gpu > 1:
            model = torch.nn.DataParallel(model)

        description = "Evaluation"
        batch_size = dataloader.batch_size
        logger.info("***** Running %s *****", description)
        logger.info("  Num examples = %d", len(dataloader.dataset))
        logger.info("  Batch size = %d", batch_size)
        model.eval()

        if is_torch_tpu_available():
            dataloader = pl.ParallelLoader(dataloader, [training_args.device]).per_device_loader(training_args.device)

        results = {}
        for eid, example in enumerate(tqdm(dataloader, desc=description)):
            feature = convert_example_to_feature(example, tokenizer, data_args.block_size)
            image_ids = torch.tensor([feature['image_ids']], dtype=torch.long).to(training_args.device)
            location_ids = torch.tensor([feature['location_ids']], dtype=torch.long).to(training_args.device)
            time_ids = torch.tensor([feature['time_ids']], dtype=torch.long).to(training_args.device)
            text_ids = torch.tensor([feature['text_ids']], dtype=torch.long).to(training_args.device)
            pid = feature['pid']
            inputs = {
                'image_ids': image_ids,
                'location_ids': location_ids,
                'time_ids': time_ids,
                'text_ids': text_ids,
            }

            with torch.no_grad():
                outputs = model(**inputs)
                logits = outputs[0]

            logit_for_cls = logits[0]
            orig_vocab_size = 30522
            added_special_toks_size = 3  # <img>, <loc>, <time>
            # mask out everything except the hashtag vocabulary
            logit_for_cls[:orig_vocab_size + added_special_toks_size] = -float('inf')
            probabilities = F.softmax(logit_for_cls, 0).detach().cpu()
            probs, predicted_indices = torch.topk(probabilities, k=10)
            predicted_tokens = tokenizer.convert_ids_to_tokens(predicted_indices)
            while pid in results:
                pid = pid + '_'
            results[pid] = predicted_tokens

        results_save_path = os.path.join(training_args.output_dir, 'results.json')
        with open(results_save_path, 'w') as f:
            logger.info("saved results.json into %s", training_args.output_dir)
            json.dump(results, f)
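
# A standalone sketch of the decoding step in the eval loop above: logits for
# the original BERT vocabulary plus the three added special tokens are masked
# to -inf so that only hashtag ids survive the softmax, and the top-k remaining
# ids are kept. The sizes here are toy stand-ins for 30522 + 3 and the real
# hashtag vocabulary.
import torch
import torch.nn.functional as F

orig_vocab_size, n_special, n_tags = 6, 2, 4
logits = torch.randn(orig_vocab_size + n_special + n_tags)
logits[:orig_vocab_size + n_special] = -float('inf')
probs, predicted_indices = torch.topk(F.softmax(logits, 0), k=2)
# every surviving index points into the hashtag region of the vocabulary
assert all(i >= orig_vocab_size + n_special for i in predicted_indices.tolist())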