def load_args():
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments))
    wandb = False  # default, overridden below when a config enables it
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    elif len(sys.argv) == 1:
        # If we pass no args to the script, load args from .yaml files.
        with open('config/seq2seq/seq2seq_base.yaml', 'r') as f:
            all_args = yaml.load(f, Loader=yaml.FullLoader)
        # Also load user-specified args and override the base args.
        with open('config/seq2seq/train.yaml', 'r') as f:
            user_args = yaml.load(f, Loader=yaml.FullLoader)
        all_args.update(user_args)
        wandb = all_args.pop('wandb')
        model_args, data_args, training_args = parser.parse_dict(all_args)
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if 'tmp' in training_args.output_dir:
        training_args.overwrite_output_dir = True
        wandb = False
    training_args.learning_rate = float(training_args.learning_rate)
    os.environ["WANDB_DISABLED"] = "" if wandb else "true"
    return model_args, data_args, training_args
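# A minimal, self-contained sketch of the base/override YAML merge that
# `load_args` performs above. The file contents here are illustrative
# assumptions, not the real seq2seq_base.yaml/train.yaml; only the merge and
# the custom `wandb` flag handling mirror the code.
import yaml

base_yaml = """
output_dir: outputs/tmp
learning_rate: 3e-5
wandb: false
"""
override_yaml = """
learning_rate: 1e-4
"""

all_args = yaml.safe_load(base_yaml)
all_args.update(yaml.safe_load(override_yaml))  # user args win
use_wandb = all_args.pop('wandb')               # custom key, not a dataclass field
# Note: PyYAML typically reads "1e-4" as a string (no decimal point), which is
# why load_args casts learning_rate to float after parsing.
print(all_args, use_wandb)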
def train(
    self,
    examples: List[InputExample],
    epochs=3,
    batch_size=16,
    seed=42,
):
    features, f_word_maps = TokenClassificationTask.parse_examples(
        examples,
        tokenizer=self.tokenizer,
        label2id=self.label2id,
        model_type=self.config.model_type,
        max_seq_length=self.max_seq_length,
        ignore_sub_tokens_labes=self.ignore_sub_tokens_labes,
        spliting_strategy=self.spliting_strategy,
        sentence_strategy=self.sentence_strategy,
    )
    train_dataset = TokenClassificationDataset(features)

    parser = HfArgumentParser(TrainingArguments)
    training_args = parser.parse_dict({
        "output_dir": self.output_dir,
        "num_train_epochs": epochs,
        "per_device_train_batch_size": batch_size,
        "seed": seed,
        "save_total_limit": 0,
    })[0]

    trainer = Trainer(
        model=self.model,
        args=training_args,
        train_dataset=train_dataset,
        compute_metrics=self.metric_function(),
    )
    training_result = trainer.train()
    logger.debug(training_result)

    trainer.save_model()
    self.tokenizer.save_pretrained(self.output_dir)
    with open(f'{self.output_dir}/settings.json', 'w') as outfile:
        json.dump(
            {
                "labels": self.labels,
                "ignore_sub_tokens_labes": self.ignore_sub_tokens_labes,
                "spliting_strategy": self.spliting_strategy,
                "sentence_strategy": self.sentence_strategy,
                "prediction_strategy": self.prediction_strategy,
            },
            outfile)
    self.model.to('cpu')
    return training_result
def test_parse_dict(self):
    parser = HfArgumentParser(BasicExample)

    args_dict = {
        "foo": 12,
        "bar": 3.14,
        "baz": "42",
        "flag": True,
    }

    parsed_args = parser.parse_dict(args_dict)[0]
    args = BasicExample(**args_dict)
    self.assertEqual(parsed_args, args)
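# The test above assumes a `BasicExample` dataclass defined elsewhere; a
# minimal stand-in (field names and types inferred from args_dict) that makes
# the round trip self-contained and runnable:
from dataclasses import dataclass

from transformers import HfArgumentParser


@dataclass
class BasicExample:
    foo: int
    bar: float
    baz: str
    flag: bool


parser = HfArgumentParser(BasicExample)
parsed = parser.parse_dict({"foo": 12, "bar": 3.14, "baz": "42", "flag": True})[0]
assert parsed == BasicExample(foo=12, bar=3.14, baz="42", flag=True)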
def convert_to_prunable_checkpoint(checkpoint_folder, experiment):
    """
    Loads a dense model's weights and a prunable model of similar architecture
    (one with SparseWeightsBase layers), copies the weights of the former into
    the latter, and then saves a new checkpoint at `{checkpoint_folder}_prunable`.

    :param checkpoint_folder: path to dense checkpoint
    :param experiment: name of experiment config with a prunable architecture
    """
    # We'll use `sparsity=0` to ensure it's a dense but prunable model.
    exp_config = CONFIGS[experiment]
    exp_config["config_kwargs"]["sparsity"] = 0

    exp_parser = HfArgumentParser(ModelArguments)
    model_args = exp_parser.parse_dict(exp_config)[0]

    # Initialize the prunable model and the dense model.
    config = init_config(model_args)
    tokenizer = init_tokenizer(model_args)
    prunable_model = AutoModelForMaskedLM.from_config(config)
    prunable_model.resize_token_embeddings(len(tokenizer))
    dense_model = AutoModelForMaskedLM.from_pretrained(checkpoint_folder)

    # Determine which parameters belong to SparseWeightsBase classes.
    sparse_params = filter_params(prunable_model, include_modules=[SparseWeightsBase])
    sparse_dataptrs = [p.data_ptr() for p in sparse_params.values()]

    # Load the dense params into the prunable params.
    for n2, p2 in prunable_model.named_parameters():
        # e.g. replace `linear.module.weight` with `linear.weight` when appropriate.
        if p2.data_ptr() in sparse_dataptrs:
            n1 = n2.replace(".module", "")
        else:
            n1 = n2
        p1 = get_module_attr(dense_model, n1)
        p2.data[:] = p1

    # Save the prunable model.
    new_folder_name = checkpoint_folder + "_prunable"
    prunable_model.save_pretrained(new_folder_name)
    print(f"Saved prunable model to:\n{new_folder_name}")
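# `get_module_attr` is used above but not defined in this snippet. A minimal
# sketch of the assumed behavior (resolve a dotted parameter name such as
# "bert.encoder.layer.0.output.dense.weight" via repeated getattr); the real
# helper may differ:
from functools import reduce


def get_module_attr(module, dotted_name):
    """Resolve a dotted attribute path on a torch module."""
    return reduce(getattr, dotted_name.split("."), module)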
def train(self):
    """
    Train the model (this method must be implemented).
    :return:
    """
    config = HyperParametersConfig(epochs=self.args.EPOCHS, batch_size=self.args.BATCH)
    parser = HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
    # config_dict = HyperParametersConfig().__dict__
    # print(config_dict)
    model_args, data_args, training_args = parser.parse_dict(config.__dict__)

    logger.info("Load pre-trained model.")
    tokenizer = BertTokenizer.from_pretrained(model_args.model_name_or_path)
    model = CustomGPTGeneration.from_pretrained(model_args.model_name_or_path)

    # Get datasets
    logger.info("Loading dataset.")
    train_dataset = PsychologicalQADataset(
        data_args.dataset_path,
        tokenizer=tokenizer,
        max_sequence_len=data_args.max_sequence_len)

    logger.info("Initialize Trainer.")
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=None,
    )

    logger.info("Training start.")
    if training_args.do_train:
        trainer.train()
        trainer.save_model()
        if trainer.is_world_process_zero():
            tokenizer.save_pretrained(training_args.output_dir)
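# `HyperParametersConfig` (used here and in the GPT main() further below) is
# assumed to be a plain object whose __dict__ holds only keys that map onto
# the ModelArguments/DataArguments/TrainingArguments fields, so parse_dict can
# split it. A minimal illustrative sketch; all attribute values are made up:
class HyperParametersConfig:
    def __init__(self, epochs=3, batch_size=16):
        self.model_name_or_path = "gpt2-chinese"   # hypothetical checkpoint
        self.dataset_path = "data/dataset.pt"
        self.max_sequence_len = 512
        self.output_dir = "outputs/gpt2"
        self.do_train = True
        self.do_eval = True
        self.num_train_epochs = epochs
        self.per_device_train_batch_size = batch_size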
def __init__(self, param_dict):
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    arguments = copy.deepcopy(self.ARGUMENTS)
    self.arguments_names = list(arguments.keys())
    parser = HfArgumentParser(arguments.values())
    parse_results = parser.parse_dict(param_dict)  # , strict=True)

    assert self.arguments_names[0] == "model"
    assert self.arguments_names[1] == "data"
    assert self.arguments_names[2] == "training"

    # Assign these args explicitly so the IDE does not flag the members as unknown.
    self.model_args = parse_results[0]
    self.data_args = parse_results[1]
    self.training_args = parse_results[2]

    for i, (k, v) in enumerate(arguments.items()):
        if i < 3:
            continue
        setattr(self, k + "_args", parse_results[i])
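# A sketch of the `ARGUMENTS` class attribute this __init__ assumes: an
# ordered mapping whose first three entries are named model/data/training
# (per the assertions above); any further entries become `<name>_args`
# attributes automatically. The first two dataclasses are illustrative
# stand-ins for the repo's real argument classes:
from collections import OrderedDict
from dataclasses import dataclass

from transformers import TrainingArguments


@dataclass
class ModelArguments:
    model_name_or_path: str = "bert-base-uncased"


@dataclass
class DataArguments:
    max_seq_length: int = 128


ARGUMENTS = OrderedDict(
    model=ModelArguments,
    data=DataArguments,
    training=TrainingArguments,
)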
def main():
    cmd_parser = argparse.ArgumentParser()
    cmd_parser.add_argument("experiments", nargs="+",
                            choices=list(CONFIGS.keys()),
                            help="Available experiments")
    cmd_parser.add_argument("--local_rank", default=None,
                            help="added by torch.distributed.launch")
    cmd_args = cmd_parser.parse_args()

    for experiment in cmd_args.experiments:
        config_dict = CONFIGS[experiment]
        local_rank = int(cmd_args.local_rank or -1)
        config_dict["local_rank"] = local_rank

        # See all possible arguments in transformers/training_args.py and ./run_args.py
        exp_parser = HfArgumentParser(
            (ModelArguments, DataTrainingArguments, CustomTrainingArguments))
        model_args, data_args, training_args = exp_parser.parse_dict(config_dict)

        # Overrides the default behavior of TrainingArguments, which sets run_name
        # equal to output_dir when no run_name is given.
        if training_args.run_name == training_args.output_dir:
            training_args.run_name = experiment

        # The run name (or experiment name) is appended to the output_dir.
        training_args.output_dir = os.path.join(training_args.output_dir,
                                                training_args.run_name)

        # Initialize wandb now to include the logs that follow.
        # For now, only support early wandb logging when running one experiment.
        distributed_initialized = torch.distributed.is_initialized()
        rank = -1 if not distributed_initialized else torch.distributed.get_rank()
        if is_wandb_available() and len(cmd_args.experiments) == 1:
            CustomWandbCallback.early_init(training_args, rank)

        # Detecting last checkpoint.
        last_checkpoint = None
        if (os.path.isdir(training_args.output_dir) and training_args.do_train
                and not training_args.overwrite_output_dir):
            last_checkpoint = get_last_checkpoint(training_args.output_dir)
            logging.warning(f"Loading from checkpoint: {last_checkpoint} ")
            if (last_checkpoint is None
                    and len(os.listdir(training_args.output_dir)) > 0):
                raise ValueError(
                    f"Output directory ({training_args.output_dir}) already exists and "
                    "is not empty. Use --overwrite_output_dir to overcome.")
            elif last_checkpoint is not None:
                logging.info(
                    f"Checkpoint detected, resuming training at {last_checkpoint}. To "
                    "avoid this behavior, change the `--output_dir` or add "
                    "`--overwrite_output_dir` to train from scratch.")

        # Setup logging
        logging.basicConfig(
            format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
            datefmt="%m/%d/%Y %H:%M:%S",
            handlers=[logging.StreamHandler(sys.stdout)],
            level=(logging.INFO if is_main_process(training_args.local_rank)
                   else logging.WARN))

        # Log config.
        logging.info(f"Running with config:\n{pformat(config_dict, indent=4)}")

        # Log on each process a small summary:
        logging.warning(
            f"Process rank: {training_args.local_rank}, "
            f"device: {training_args.device}, n_gpu: {training_args.n_gpu} "
            f"distributed training: {bool(training_args.local_rank != -1)}, "
            f"16-bits training: {training_args.fp16}")

        # Set the verbosity of the Transformers logging to info (on the main process only):
        if is_main_process(training_args.local_rank):
            transformers.utils.logging.set_verbosity_info()
            transformers.utils.logging.enable_default_handler()
            transformers.utils.logging.enable_explicit_format()
        logging.info("Training/evaluation parameters %s", training_args)
        logging.info("Model parameters: %s", model_args)
        logging.info("Data parameters: %s", data_args)

        # Set seed before initializing model.
        set_seed(training_args.seed)
        logging.info(f"Seed to reproduce: {training_args.seed}")

        if model_args.finetuning:
            run_finetuning_multiple_tasks(model_args, data_args, training_args,
                                          last_checkpoint=last_checkpoint)
        else:
            run_pretraining(model_args, data_args, training_args,
                            last_checkpoint=last_checkpoint)

        # Destroy the process group before launching another experiment.
        if cmd_args.local_rank:
            torch.distributed.destroy_process_group()
def run(args=None, training_args=None):
    if args is not None and training_args is not None:
        parser = HfArgumentParser(TrainScriptArguments)
        args = parser.parse_dict(args)[0]
        parser = HfArgumentParser(TrainingArguments)
        training_args = parser.parse_dict(training_args)[0]
    else:
        parser = HfArgumentParser((TrainScriptArguments, TrainingArguments))
        args, training_args = parser.parse_args_into_dataclasses()

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    set_seed(training_args.seed)

    # Setup wandb
    os.environ["WANDB_PROJECT"] = args.wandb_project
    if args.is_dryrun:
        os.environ["WANDB_MODE"] = "dryrun"
    logging.getLogger('wandb.run_manager').setLevel(logging.WARNING)

    tokenizer_name = args.model_name if args.tokenizer_name is None else args.tokenizer_name

    # TODO: Fix this hard-coded stuff
    if args.model_name == "bert-base-cased":
        model = EncoderDecoderModel.from_encoder_decoder_pretrained(
            args.model_name, args.model_name)
    else:
        model = NAME_TO_MODEL[args.model_name].from_pretrained(args.model_name)
    tokenizer = NAME_TO_TOK[tokenizer_name].from_pretrained(tokenizer_name)

    logger.info(f"Path {args.train_data_path}")
    train_data = torch.load(args.train_data_path) if training_args.do_train else None
    test_data = torch.load(args.test_data_path) if training_args.do_eval else None

    # Resize the train set to args.data_size percent of the original.
    logger.info(f"Train-data pre-size: {train_data.num_rows}")
    if args.absolute_data_size:
        num_rows = args.absolute_data_size
    else:
        num_rows = int(train_data.num_rows * (args.data_size / 100))

    # The select takes time even when selecting all rows, so do this check first.
    if args.absolute_data_size or args.data_size < 100:
        # Shuffle train_data before re-sizing, controlled by the seed.
        # Make sure to override the cache file, as it does not care about data size.
        train_data = train_data.shuffle(seed=training_args.seed,
                                        keep_in_memory=True,
                                        load_from_cache_file=False)
        train_data = train_data.select(torch.arange(0, num_rows),
                                       keep_in_memory=True,
                                       load_from_cache_file=False)

    # For some reason, Dataset.select() and Dataset.shuffle() reset the format.
    fields = ["source_ids", "target_ids", "attention_mask"]
    train_data.set_format(type="torch", columns=fields)
    test_data.set_format(type="torch", columns=fields)
    logger.info(f"Train-data size: {train_data.num_rows}")

    collator = DataCollator(tokenizer=tokenizer,
                            is_training=training_args.do_train,
                            tpu=training_args.tpu_num_cores is not None)
    trainer = Trainer(model=model,
                      args=training_args,
                      train_dataset=train_data,
                      eval_dataset=test_data,
                      data_collator=collator,
                      prediction_loss_only=True)

    if training_args.do_train:
        trainer.train()
        trainer.save_model()
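# `NAME_TO_MODEL` and `NAME_TO_TOK` are referenced above but not defined in
# this snippet; a minimal sketch of the assumed registries (the checkpoint
# names and classes here are illustrative, not taken from the real script):
from transformers import (BartForConditionalGeneration, BartTokenizer,
                          T5ForConditionalGeneration, T5Tokenizer)

NAME_TO_MODEL = {
    "facebook/bart-base": BartForConditionalGeneration,
    "t5-small": T5ForConditionalGeneration,
}
NAME_TO_TOK = {
    "facebook/bart-base": BartTokenizer,
    "t5-small": T5Tokenizer,
}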
def main():
    # See all possible arguments in src/transformers/training_args.py or by passing
    # the --help flag to this script. We now keep distinct sets of args, for a cleaner
    # separation of concerns.
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments,
         AdapterTrainingArguments))

    # When running on multiple GPUs with torch.distributed.launch, a local_rank
    # parameter is added. To let the parser still use the config file, we add the
    # local_rank to the parsed dict.
    if len(sys.argv) == 3 and sys.argv[1].startswith(
            "--local_rank") and sys.argv[2].endswith(".json"):
        args_dict = json.loads(Path(sys.argv[2]).read_text())
        args_dict.update({'local_rank': int(sys.argv[1].split('=')[-1])})
        model_args, data_args, training_args, adapter_args = parser.parse_dict(args_dict)
    elif len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args, adapter_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args, adapter_args = \
            parser.parse_args_into_dataclasses()
    check_output_dir(training_args)

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = T5Config.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    extra_model_params = ("encoder_layerdrop", "decoder_layerdrop", "dropout",
                          "attention_dropout", "fixed_length_emb",
                          "encoder_projection", "encoder_pooling",
                          "projection_length", "only_projection_bottleneck",
                          "concat_projection_token", "train_adapters")
    for p in extra_model_params:
        if getattr(training_args, p, None):
            assert hasattr(config, p), \
                f"({config.__class__.__name__}) doesn't have a `{p}` attribute"
            setattr(config, p, getattr(training_args, p))

    # Gets the adapter config and updates the specified parameters.
    if training_args.train_adapters:
        adapter_config = AutoAdapterConfig.get(adapter_args.adapter_config_name)
        adapter_config.input_dim = config.d_model
        adapter_config.tasks = data_args.tasks
        extra_adapter_params = (
            "task_embedding_dir", "task_embedding_dim",
            "add_layer_norm_before_adapter", "add_layer_norm_after_adapter",
            "reduction_factor", "hidden_dim", "non_linearity",
            "train_task_embeddings", "projected_task_embedding_dim",
            "add_adapters_in_decoder", "add_adapter_in_feed_forward",
            "add_adapter_in_self_attention", "task_hidden_dim",
            "conditional_layer_norm", "one_layer_adapter_hyper_net",
            "adapter_hyper_net_with_bias",
            "one_layer_adapter_hyper_net_with_linear",
            "parametric_task_embedding", "conditional_layer_norm_for_T5",
            "train_adapters_blocks", "remove_original_layer_norms",
            "unique_hyper_net", "unique_hyper_net_layer_norm")
        for p in extra_adapter_params:
            if hasattr(adapter_args, p) and hasattr(adapter_config, p):
                setattr(adapter_config, p, getattr(adapter_args, p))
            else:
                logger.warning(
                    f"({adapter_config.__class__.__name__}) doesn't have a `{p}` attribute")
        adapter_config.device = training_args.device
    else:
        adapter_config = None

    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    if model_args.not_load_t5_checkpoint:
        model = T5ForConditionalGeneration(config=config, adapter_config=adapter_config)
    else:
        model = T5ForConditionalGeneration.from_pretrained(
            model_args.model_name_or_path,
            from_tf=".ckpt" in model_args.model_name_or_path,
            config=config,
            cache_dir=model_args.cache_dir,
            adapter_config=adapter_config)

    # Set num_beams for evaluation.
    if data_args.eval_beams is None:
        data_args.eval_beams = model.config.num_beams

    # Freeze parameters as configured.
    if training_args.do_train:
        freezing_params(model, training_args, model_args, adapter_args)

    if training_args.print_num_parameters:
        logger.info(model)
        for name, param in model.named_parameters():
            if param.requires_grad:
                logger.info("Parameter name %s", name)
        total_trainable_params = sum(
            p.numel() for p in model.parameters() if p.requires_grad)
        logger.info("Total trainable parameters %s", total_trainable_params)

    # Gets the training/test/validation datasets.
    dataset_class = AutoTask
    if training_args.do_train:
        train_datasets = [
            dataset_class.get(task, seed=data_args.data_seed).get_dataset(
                split="train",
                n_obs=data_args.n_train,
                add_prefix=not training_args.train_adapters)
            for task in data_args.tasks
        ]
        dataset_sizes = [len(train_dataset) for train_dataset in train_datasets]
        train_dataset = datasets.concatenate_datasets(train_datasets)
    training_args.remove_unused_columns = False
    eval_datasets = ({
        task: dataset_class.get(task, seed=data_args.data_seed).get_dataset(
            split="validation",
            n_obs=data_args.n_val,
            add_prefix=not training_args.train_adapters,
            split_validation_test=training_args.split_validation_test)
        for task in data_args.eval_tasks
    } if training_args.do_eval
        or training_args.evaluation_strategy != EvaluationStrategy.NO else None)
    test_dataset = ({
        task: dataset_class.get(task, seed=data_args.data_seed).get_dataset(
            split="test",
            n_obs=data_args.n_test,
            add_prefix=not training_args.train_adapters,
            split_validation_test=training_args.split_validation_test)
        for task in data_args.eval_tasks
    } if training_args.do_test else None)

    # Defines the metrics for evaluation.
    compute_metrics_fn = (build_compute_metrics_fn(data_args.eval_tasks, tokenizer)
                          if training_args.predict_with_generate else None)

    # Defines the trainer.
    trainer = T5Trainer(
        model=model,
        config=config,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_datasets,
        data_collator=TaskCollator(tokenizer, data_args,
                                   tpu_num_cores=training_args.tpu_num_cores),
        compute_metrics=None,
        multi_task_compute_metrics=compute_metrics_fn,
        data_args=data_args,
        dataset_sizes=dataset_sizes if training_args.do_train else None,
        callbacks=[T5CheckpointCallback()],
        adapter_config=adapter_config)

    if trainer.is_world_process_zero():
        arguments = get_training_args(
            [model_args, data_args, training_args, adapter_args])
        handle_metrics("arguments", arguments, training_args.output_dir,
                       training_args.gcs_bucket)

    # Trains the model.
    if training_args.do_train:
        trainer.train(
            model_path=get_last_checkpoint_path(training_args.output_dir)
            if (os.path.isdir(training_args.output_dir)
                and not training_args.optimize_from_scratch) else None,
        )
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_process_zero():
            trainer.state.save_to_json(
                os.path.join(training_args.output_dir, "trainer_state.json"))
            tokenizer.save_pretrained(training_args.output_dir)
            if training_args.save_task_embeddings:
                for task, task_embedding in \
                        model.task_embedding_controller.task_to_embeddings.items():
                    create_dir(training_args.save_task_embeddings_dir)
                    np.save(
                        os.path.join(training_args.save_task_embeddings_dir,
                                     '{}.npy'.format(task)),
                        task_embedding.data.detach().cpu().numpy())

    # Evaluation
    all_metrics = {}
    if training_args.do_eval or training_args.do_test:
        if trainer.is_world_process_zero():
            # By default we load the model from the last checkpoint path. When
            # saving the model with the best metrics, make sure to set
            # save_total_limit=1 so the best model is loaded here. If no
            # checkpoint exists, this returns the path to the output_dir.
            last_checkpoint_path = get_last_checkpoint_path(training_args.output_dir)
            config = T5Config.from_pretrained(last_checkpoint_path,
                                              cache_dir=model_args.cache_dir)
            model = T5ForConditionalGeneration.from_pretrained(
                last_checkpoint_path,
                from_tf=".ckpt" in training_args.output_dir,
                config=config,
                cache_dir=model_args.cache_dir,
                adapter_config=adapter_config)
            # NOTE: if the trainer is not re-defined here, a bug in the code
            # means the Hugging Face Trainer does not use the best checkpoint.
            trainer = T5Trainer(
                model=model,
                config=config,
                args=training_args,
                train_dataset=train_dataset if training_args.do_train else None,
                eval_dataset=eval_datasets,
                data_collator=TaskCollator(
                    tokenizer, data_args,
                    tpu_num_cores=training_args.tpu_num_cores),
                compute_metrics=None,
                multi_task_compute_metrics=compute_metrics_fn,
                data_args=data_args,
                dataset_sizes=dataset_sizes if training_args.do_train else None,
                callbacks=[T5CheckpointCallback()],
                adapter_config=adapter_config)

            if training_args.train_adapters:
                if adapter_args.adapter_config_name == "adapter" and data_args.adapters is not None:
                    task_to_adapter = {
                        eval_task: adapter
                        for eval_task, adapter in zip(data_args.eval_tasks,
                                                      data_args.adapters)
                    }
                    for _, sub_module in model.named_modules():
                        if isinstance(sub_module, AdapterController):
                            sub_module.set_task_to_adapter_map(task_to_adapter)
                if adapter_args.adapter_config_name in ["meta-adapter"]:
                    # If this is parametric, then the evaluation task should be
                    # part of tasks and the embeddings need to be trained.
                    if not adapter_args.parametric_task_embedding:
                        model.task_embedding_controller.set_task_embeddings(
                            eval_datasets.keys(),
                            parametric=adapter_args.parametric_task_embedding)

        if training_args.do_eval:
            metrics = trainer.evaluate(metric_key_prefix="val")
            if trainer.is_world_process_zero():
                handle_metrics("val", metrics, training_args.output_dir,
                               training_args.gcs_bucket)
                all_metrics.update(metrics)

        if training_args.do_test:
            metrics = trainer.evaluate(test_dataset, metric_key_prefix="test")
            if trainer.is_world_process_zero():
                handle_metrics("test", metrics, training_args.output_dir,
                               training_args.gcs_bucket)
                all_metrics.update(metrics)

    return all_metrics
# os.environ["WANDB_DISABLED"] = "false" if args.is_tensorboard else "true" os.environ["TRANSFORMERS_CACHE"] = "../huggingface_cache/" # if cache does not exist, create one if not os.path.exists(os.environ["TRANSFORMERS_CACHE"]): os.makedirs(os.environ["TRANSFORMERS_CACHE"]) training_args = TrainingArguments("tmp_trainer") training_args.no_cuda = args.no_cuda training_args.per_device_eval_batch_size = args.per_device_eval_batch_size training_args.per_gpu_eval_batch_size = args.per_device_eval_batch_size training_args_dict = training_args.to_dict() _n_gpu = training_args_dict["_n_gpu"] del training_args_dict["_n_gpu"] training_args_dict["n_gpu"] = _n_gpu HfParser = HfArgumentParser((TrainingArguments)) training_args = HfParser.parse_dict(training_args_dict)[0] TASK_CONFIG = {"classification": ("text", None)} # Load pretrained model and tokenizer NUM_LABELS = 3 MAX_SEQ_LEN = 128 config = AutoConfig.from_pretrained(args.model_type, num_labels=3, finetuning_task=args.task_name, cache_dir=args.cache_dir) tokenizer = AutoTokenizer.from_pretrained(args.model_type, use_fast=False, cache_dir=args.cache_dir) model = AutoModelForSequenceClassification.from_pretrained( args.model_path,
def main():
    config = HyperParametersConfig()
    # set_os_environ()
    # if config.do_train and torch.cuda.device_count() > 1:
    #     # Distributed initialization
    #     torch.distributed.init_process_group(backend="nccl", rank=0, world_size=1,
    #                                          init_method='tcp://localhost:7002')
    # set_seed(config.seed)  # Already handled inside Trainer

    parser = HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
    # config_dict = HyperParametersConfig().__dict__
    # print(config_dict)
    model_args, data_args, training_args = parser.parse_dict(config.__dict__)

    logger.info("Load pre-trained model.")
    tokenizer = BertTokenizer.from_pretrained(model_args.model_name_or_path)
    model = CustomGPTGeneration.from_pretrained(model_args.model_name_or_path)

    # Get datasets
    logger.info("Loading dataset.")
    data = torch.load(data_args.dataset_path)
    train_dataset = ChineseMedicalDataset(
        data=data["train"],
        tokenizer=tokenizer,
        max_sequence_len=data_args.max_sequence_len,
        max_condition_len=data_args.max_condition_len,
        max_target_len=data_args.max_target_len,
        is_right_pad=data_args.is_right_pad,
        is_condition_first=data_args.is_condition_first,
        is_unilm_mask=data_args.is_unilm_mask
    ) if training_args.do_train else None
    valid_dataset = ChineseMedicalDataset(
        data=data["valid"],
        tokenizer=tokenizer,
        max_sequence_len=data_args.max_sequence_len,
        max_condition_len=data_args.max_condition_len,
        max_target_len=data_args.max_target_len,
        is_right_pad=data_args.is_right_pad,
        is_condition_first=data_args.is_condition_first,
        is_unilm_mask=data_args.is_unilm_mask
    ) if training_args.do_eval else None

    logger.info("Initialize Trainer.")
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=valid_dataset,
    )

    logger.info("Training start.")
    if training_args.do_train:
        logger.info("local rank value: {}".format(training_args.local_rank))
        trainer.train(model_path=model_args.model_name_or_path
                      if os.path.isdir(model_args.model_name_or_path) else None)
        trainer.save_model()
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval and training_args.local_rank in [-1, 0]:
        logger.info("*** Evaluate ***")
        eval_output = trainer.evaluate()
        output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Evaluate results *****")
            for key in sorted(eval_output.keys()):
                logger.info("{} = {}".format(key, str(eval_output[key])))
                writer.write("{} = {}\n".format(key, str(eval_output[key])))
        results.update(eval_output)

    return results
def train_model(dict_args):
    # Parse the args dict.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_dict(dict_args)

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. "
            "Use --overwrite_output_dir to overcome.")

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if is_main_process(training_args.local_rank) else logging.WARN,
    )
    # Log on each process a small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, "
        f"n_gpu: {training_args.n_gpu}, "
        f"distributed training: {bool(training_args.local_rank != -1)}, "
        f"16-bits training: {training_args.fp16}")

    # Set the verbosity of the Transformers logger to info (on the main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info("Training/evaluation parameters %s", training_args)
    logger.info("Data Arguments %s", data_args)
    logger.info("Model Arguments %s", model_args)

    # Set seed before initializing the model.
    set_seed(training_args.seed)

    # Load datasets.
    assert os.path.exists(data_args.train_file) and os.path.exists(
        data_args.validation_file) and os.path.exists(data_args.test_file)
    datasets = load_dataset("csv",
                            data_files={
                                "train": data_args.train_file,
                                "validation": data_args.validation_file,
                                "test": data_args.test_file
                            },
                            delimiter="\t",
                            cache_dir=model_args.cache_dir)
    logger.info("Datasets %s", datasets)
    logger.info("Column names %s", datasets["train"].column_names)
    logger.info("Sample example %s", datasets["train"][0])

    # Get label information.
    text_column_name = "text"
    label_column_name = "labels"
    bbox_column_name = "bbox"
    num_labels, label_to_id, id_to_label = get_label_info(
        datasets["train"][label_column_name], data_args.task_name)
    logger.info("num_labels %s", num_labels)
    logger.info("label_to_id %s", label_to_id)
    logger.info("id_to_label %s", id_to_label)

    # Load config, tokenizer and pre-trained model.
    # For distributed training: the .from_pretrained methods guarantee that only
    # one local process can concurrently download model & vocab.
    if data_args.task_name == "regression":
        config = AutoConfig.from_pretrained(
            model_args.config_name if model_args.config_name else model_args.model_name_or_path,
            num_labels=num_labels,
            finetuning_task=data_args.task_name,
            cache_dir=model_args.cache_dir
            # for longformer: may adapt the attention_window=512 (default) in config
        )
    else:
        config = AutoConfig.from_pretrained(
            model_args.config_name if model_args.config_name else model_args.model_name_or_path,
            num_labels=num_labels,
            id2label=id_to_label,
            label2id=label_to_id,
            finetuning_task=data_args.task_name,
            cache_dir=model_args.cache_dir
            # for longformer: may adapt the attention_window=512 (default) in config
        )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=True,
        add_prefix_space=True  # for the roberta tokenizer
    )
    if data_args.task_name == "ner":
        model = AutoModelForTokenClassification.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        model = AutoModelForSequenceClassification.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir)

    # Tokenizer check: this script requires a fast tokenizer.
    if not isinstance(tokenizer, PreTrainedTokenizerFast):
        raise ValueError(
            "This example script only works for models that have a fast tokenizer. Check out the big table of models "
            "at https://huggingface.co/transformers/index.html#bigtable to find the model types that meet this "
            "requirement")

    # Pre-process the datasets (tokenize words and align labels/bboxes if needed).
    padding = "max_length" if data_args.pad_to_max_length else False
    use_bbox = data_args.use_bbox
    tokenized_datasets = datasets.map(
        lambda x: preprocess_dataset(x, tokenizer, label_to_id,
                                     data_args.label_all_tokens, padding,
                                     use_bbox, data_args.task_name),
        remove_columns=[label_column_name],
        batched=True,
        num_proc=data_args.preprocessing_num_workers,
        load_from_cache_file=not data_args.overwrite_cache,
    )
    logger.info("Tokenized datasets %s", tokenized_datasets)
    logger.info("Column names %s", tokenized_datasets["train"].column_names)
    logger.info("Sample example %s", tokenized_datasets["train"][0])

    # Data collator: pads the inputs of a single batch to the max size of that batch.
    # Not needed if padding has already been done (when pad_to_max_length is true),
    # in which case default_data_collator is used. Batch-level padding does not work
    # with bboxes, hence pad_to_max_length is always True when using bboxes.
    if data_args.pad_to_max_length:
        data_collator = default_data_collator
    else:
        if data_args.task_name == "ner":
            data_collator = DataCollatorForTokenClassification(tokenizer)
        else:
            data_collator = None  # will default to DataCollatorWithPadding
    logger.info("Data Collator used %s", data_collator)

    # Initialize our Trainer.
    if data_args.task_name != "multilabel-classif":
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_datasets["train"] if training_args.do_train else None,
            eval_dataset=tokenized_datasets["validation"] if training_args.do_eval else None,
            tokenizer=tokenizer,
            data_collator=data_collator,
            compute_metrics=lambda x: compute_metrics(x, id_to_label, data_args.task_name),
        )
    else:
        trainer = MultilabelClassificationTrainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_datasets["train"] if training_args.do_train else None,
            eval_dataset=tokenized_datasets["validation"] if training_args.do_eval else None,
            tokenizer=tokenizer,
            data_collator=data_collator,
            compute_metrics=lambda x: compute_metrics(x, id_to_label, data_args.task_name),
        )

    # Training
    if training_args.do_train:
        train_result = trainer.train(
            model_path=model_args.model_name_or_path
            if os.path.isdir(model_args.model_name_or_path) else None)

        # We save the final model (last or best) to the SageMaker output folder;
        # this saves the tokenizer too, for easy upload.
        trainer.save_model(output_dir=data_args.sagemaker_output_path)

        output_train_file = os.path.join(data_args.sagemaker_output_path,
                                         "train_results.txt")
        if trainer.is_world_process_zero():
            with open(output_train_file, "w") as writer:
                logger.info("***** Train results *****")
                for key, value in sorted(train_result.metrics.items()):
                    logger.info(f"  {key} = {value}")
                    writer.write(f"{key} = {value}\n")

            # Need to save the state, since Trainer.save_model saves only the
            # tokenizer with the model.
            trainer.state.save_to_json(
                os.path.join(data_args.sagemaker_output_path, "trainer_state.json"))

            # We also save the model_args and data_args for future use
            # (training_args are already saved).
            torch.save(asdict(model_args),
                       os.path.join(data_args.sagemaker_output_path, "model_args.bin"))
            torch.save(asdict(data_args),
                       os.path.join(data_args.sagemaker_output_path, "data_args.bin"))

    # Evaluation (this will evaluate the final/best model on the dev set and
    # write the results).
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate best/final model on dev set ***")
        results = trainer.evaluate()
        output_eval_file = os.path.join(data_args.sagemaker_output_path,
                                        "eval_results.txt")
        if trainer.is_world_process_zero():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in results.items():
                    logger.info(f"  {key} = {value}")
                    writer.write(f"{key} = {value}\n")

    # Evaluate and predict on the test set.
    if training_args.do_predict:
        logger.info("*** Predict on test set ***")
        test_dataset = tokenized_datasets["test"]
        predictions, labels, metrics = trainer.predict(test_dataset,
                                                       metric_key_prefix="test")
        if data_args.task_name == "classif":
            true_predictions = [id_to_label[p] for p in np.argmax(predictions, axis=1)]
        elif data_args.task_name == "multilabel-classif":
            predictions = 1 / (1 + np.exp(-predictions))  # sigmoid
            predictions = (predictions > 0.5)  # threshold
            true_predictions = [[id_to_label[i] for i in np.where(p == 1)[0]]
                                for p in predictions]
        elif data_args.task_name == "regression":
            true_predictions = np.squeeze(predictions)
        elif data_args.task_name == "ner":
            predictions = np.argmax(predictions, axis=2)
            true_predictions = [[
                id_to_label[p] for (p, l) in zip(prediction, label) if l != -100
            ] for prediction, label in zip(predictions, labels)]

        output_test_results_file = os.path.join(data_args.sagemaker_output_path,
                                                "test_results.txt")
        if trainer.is_world_process_zero():
            with open(output_test_results_file, "w") as writer:
                for key, value in sorted(metrics.items()):
                    logger.info(f"  {key} = {value}")
                    writer.write(f"{key} = {value}\n")

        output_test_predictions_file = os.path.join(data_args.sagemaker_output_path,
                                                    "test_predictions.txt")
        if trainer.is_world_process_zero():
            with open(output_test_predictions_file, "w") as writer:
                for prediction in true_predictions:
                    if data_args.task_name == "ner":
                        writer.write(" ".join(prediction) + "\n")
                    else:
                        writer.write(str(prediction) + "\n")

    return results
def setup(argc=None, **kwargs):
    if argc is None:
        argc = sys.argv[1:]

    parser = HfArgumentParser((
        ModelArguments,
        DataTrainingArguments,
        DirArguments,
        TrainingArguments,
        WindowArguments
    ))
    if (
        isinstance(argc, list)
        and len(argc) == 1
        and argc[0].endswith('.json')
    ):
        model_args, data_args, dir_args, training_args, window_args = (
            parser.parse_json_file(argc[0])
        )
    elif isinstance(argc, dict):
        model_args, data_args, dir_args, training_args, window_args = (
            parser.parse_dict(argc)
        )
    else:
        model_args, data_args, dir_args, training_args, window_args = (
            parser.parse_args_into_dataclasses()
        )

    if (
        os.path.exists(training_args.output_dir)
        and [f for f in os.listdir(training_args.output_dir) if f != '.gitignore']
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is "
            "not empty. Use --overwrite_output_dir to overcome."
        )

    all_args = {
        'model_args': model_args,
        'data_args': data_args,
        'dir_args': dir_args,
        'training_args': training_args,
        'window_args': window_args,
    }

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    try:
        processor = processors[data_args.task_name]()
        label_list = processor.get_labels()
        num_labels = len(label_list)
    except KeyError:
        raise ValueError("Task not found: %s" % (data_args.task_name))

    config_kwargs = kwargs.pop('config_kwargs', {})
    tokenizer_kwargs = kwargs.pop('tokenizer_kwargs', {})
    model_kwargs = kwargs.pop('model_kwargs', {})

    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
        **config_kwargs,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        **tokenizer_kwargs,
    )
    model = AutoModelForMultipleChoice.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
        **model_kwargs,
    )

    return all_args, processor, config, tokenizer, model
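# Usage sketch for the dict path of `setup` above; the keys are illustrative
# and must correspond to fields on the five dataclasses being parsed:
all_args, processor, config, tokenizer, model = setup({
    "model_name_or_path": "bert-base-uncased",
    "task_name": "swag",            # hypothetical key in `processors`
    "output_dir": "outputs/swag",
    "do_train": True,
})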
def calculate_sparsity_param(sparsity_desired, parameters_desired, experiment,
                             test_sparsity=False):
    """
    :param sparsity_desired: desired sparsity of the model
    :param parameters_desired: desired number of on-params; can't be used
                               together with sparsity_desired
    :param experiment: name of experiment config with a sparse architecture
    :param test_sparsity: whether to test the calculated sparsity param; this
                          test loads the model and calculates the resulting
                          sparsity.
    """
    # Ensure sparsity_desired or parameters_desired is specified, but not both.
    assert not (sparsity_desired is None and parameters_desired is None)
    assert sparsity_desired is None or parameters_desired is None

    print(bold("Initializing model... ") + "(this may take a minute)")
    print(f"   experiment: {experiment}")

    # Load and parse model args from config.
    exp_config = CONFIGS[experiment]
    exp_parser = HfArgumentParser(ModelArguments)
    model_args = exp_parser.parse_dict(exp_config)[0]
    model_args = replace(model_args, cache_dir=None)  # enable to run locally
    print(bold("\n\nModel parameters:\n") + pdict(model_args.__dict__))
    print()

    # Initialize model.
    config = init_config(model_args)
    tokenizer = init_tokenizer(model_args)
    model = AutoModelForMaskedLM.from_config(config)
    model.resize_token_embeddings(len(tokenizer))

    print(bold("Calculating target sparsity..."))

    # Get sparse modules and calculate the total number of sparsifiable params.
    sparse_modules = filter_modules(model.bert, include_modules=[SparseWeightsBase])
    sparsifiable_params = 0
    for _, m in sparse_modules.items():
        sparsifiable_params += m.zero_mask.numel()

    # Calculate the total number of params and the needed sparsity.
    total_params, _ = count_nonzero_params(model.bert)
    if parameters_desired is None:
        parameters_desired = total_params * (1 - sparsity_desired)
    elif sparsity_desired is None:
        sparsity_desired = 1 - parameters_desired / total_params

    dense_params = total_params - sparsifiable_params
    target_sparsity = 1 - (parameters_desired - dense_params) / sparsifiable_params

    print(f"   sparsity_desired: {sparsity_desired}")
    print(f"   parameters_desired: {parameters_desired}")
    print(f"   sparsifiable_params: {sparsifiable_params}")
    print(f"   total_params: {total_params}")
    print(f"   target_sparsity: {target_sparsity} (set your sparsity to this)")
    print()

    if not test_sparsity:
        return

    print(bold("Testing target sparsity..."))

    # Edit the config to use the new sparsity param (sparsity=target_sparsity).
    exp_config["config_kwargs"]["sparsity"] = target_sparsity
    exp_parser = HfArgumentParser(ModelArguments)
    model_args = exp_parser.parse_dict(exp_config)[0]
    model_args = replace(model_args, cache_dir=None)  # enable to run locally

    # Initialize the model, this time with the new sparsity param.
    config = init_config(model_args)
    tokenizer = init_tokenizer(model_args)
    model = AutoModelForMaskedLM.from_config(config)
    model.resize_token_embeddings(len(tokenizer))

    # Set all on-weights to one to make sure none are randomly off.
    sparse_modules = filter_modules(model.bert, include_modules=[SparseWeightsBase])
    for _, m in sparse_modules.items():
        m.weight.data[:] = 1
    model.apply(rezero_weights)  # set off-weights to zero

    resulting_sparsity = calc_model_sparsity(model.bert)
    _, nz_params = count_nonzero_params(model.bert)
    print(
        f"   Resulting sparsity of model.bert using sparsity={target_sparsity}\n"
        f"      actual_sparsity={resulting_sparsity}\n"
        f"      num_nonzero_params={nz_params}\n")
    print("   Note this may not be exactly as desired, as there are "
          "discrete levels of allowable sparsity.")
    print()
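# A worked example of the target-sparsity arithmetic above, with made-up
# parameter counts. Dense (non-sparsifiable) params can't be pruned, so the
# sparsifiable layers must be pruned harder than the overall target:
total_params = 110_000_000          # all params in model.bert
sparsifiable_params = 84_000_000    # params inside SparseWeightsBase layers
dense_params = total_params - sparsifiable_params    # 26_000_000 stay dense
parameters_desired = 33_000_000     # i.e. 70% overall sparsity

target_sparsity = 1 - (parameters_desired - dense_params) / sparsifiable_params
print(round(target_sparsity, 4))    # 0.9167: sparse layers need ~92% sparsity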
def generate_training_args(args, inoculation_step):
    training_args = TrainingArguments("tmp_trainer")
    training_args.no_cuda = args.no_cuda
    training_args.seed = args.seed
    training_args.do_train = args.do_train
    training_args.do_eval = args.do_eval
    training_args.output_dir = os.path.join(args.output_dir,
                                            str(inoculation_step) + "-sample")
    # Evaluation is done after each epoch.
    training_args.evaluation_strategy = args.evaluation_strategy
    training_args.metric_for_best_model = args.metric_for_best_model
    training_args.greater_is_better = args.greater_is_better
    training_args.logging_dir = args.logging_dir
    training_args.task_name = args.task_name
    training_args.learning_rate = args.learning_rate
    training_args.per_device_train_batch_size = args.per_device_train_batch_size
    training_args.per_device_eval_batch_size = args.per_device_eval_batch_size
    # This is the maximum num_train_epochs; we set it to 100.
    training_args.num_train_epochs = args.num_train_epochs
    training_args.eval_steps = args.eval_steps
    training_args.logging_steps = args.logging_steps
    training_args.load_best_model_at_end = args.load_best_model_at_end
    if args.save_total_limit != -1:
        # Only set if it is specified.
        training_args.save_total_limit = args.save_total_limit

    import datetime
    date_time = "{}-{}".format(datetime.datetime.now().month,
                               datetime.datetime.now().day)
    run_name = "{0}_{1}_{2}_{3}_mlen_{4}_lr_{5}_seed_{6}_metrics_{7}".format(
        args.run_name, args.task_name, args.model_type, date_time,
        args.max_seq_length, args.learning_rate, args.seed,
        args.metric_for_best_model)
    training_args.run_name = run_name

    training_args_dict = training_args.to_dict()
    # Rename the private `_n_gpu` field before feeding the dict back to parse_dict.
    _n_gpu = training_args_dict["_n_gpu"]
    del training_args_dict["_n_gpu"]
    training_args_dict["n_gpu"] = _n_gpu
    HfParser = HfArgumentParser(TrainingArguments)
    training_args = HfParser.parse_dict(training_args_dict)[0]

    if args.model_path == "":
        args.model_path = args.model_type
        if args.model_type == "":
            assert False  # you have to provide one of them

    # Set seed before initializing the model.
    set_seed(training_args.seed)

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if is_main_process(training_args.local_rank) else logging.WARN,
    )
    # Log on each process a small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, "
        f"n_gpu: {training_args.n_gpu}, "
        f"distributed training: {bool(training_args.local_rank != -1)}, "
        f"16-bits training: {training_args.fp16}")

    # Set the verbosity of the Transformers logger to info (on the main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info(f"Training/evaluation parameters {training_args}")

    return training_args
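# A minimal sketch of building TrainingArguments from a plain dict with
# parse_dict, as done above. The keys here are ordinary TrainingArguments
# fields; the `_n_gpu`/`n_gpu` rename above works around to_dict() emitting
# the private `_n_gpu` field, which is not an init argument of the dataclass:
from transformers import HfArgumentParser, TrainingArguments

training_args = HfArgumentParser(TrainingArguments).parse_dict({
    "output_dir": "tmp_trainer",
    "seed": 42,
    "per_device_train_batch_size": 8,
})[0]
print(training_args.output_dir, training_args.seed)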
def main(args_dict=None):
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    if args_dict is not None:
        model_args, data_args, training_args = parser.parse_dict(args_dict)
    elif len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is "
            "not empty. Use --overwrite_output_dir to overcome.")

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Set project name
    os.environ["WANDB_PROJECT"] = "multilingual_zeroshot"

    num_labels = 3
    labels = ['entailment', 'neutral', 'contradiction']

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = MBartConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        dropout=model_args.dropout,
        attention_dropout=model_args.attention_dropout,
        finetuning_task="mnli",
        cache_dir=model_args.cache_dir,
    )
    tokenizer = MBartTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    model = MBartForSequenceClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # Get datasets
    columns = ['input_ids', 'attention_mask', 'labels']
    map_fn = get_mnli_map_fn(data_args.lang, data_args.max_seq_length, tokenizer)
    train_dataset = nlp.load_dataset("multi_nli", split="train")
    train_dataset = train_dataset.map(map_fn, batched=True, batch_size=512)
    train_dataset.set_format(type='torch', columns=columns)

    eval_dataset = (nlp.load_dataset("multi_nli", split="validation_matched")
                    if training_args.do_eval else None)
    eval_dataset = eval_dataset.map(map_fn, batched=True, batch_size=512)
    eval_dataset.set_format(type='torch', columns=columns)

    def compute_metrics_fn(p: EvalPrediction):
        preds = np.argmax(p.predictions, axis=1)
        return glue_compute_metrics("classification", preds, p.label_ids)

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics_fn,
        data_collator=DataCollator(tokenizer),
    )

    # Disable wandb console logs.
    logging.getLogger('wandb.run_manager').setLevel(logging.WARNING)

    # Training
    if training_args.do_train:
        trainer.train(model_path=model_args.model_name_or_path
                      if os.path.isdir(model_args.model_name_or_path) else None)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    eval_results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        # Loop to handle MNLI double evaluation (matched, mismatched).
        mis_matched_dataset = nlp.load_dataset("multi_nli",
                                               split="validation_mismatched")
        mis_matched_dataset = mis_matched_dataset.map(map_fn, batched=True,
                                                      batch_size=512)
        mis_matched_dataset.set_format(type='torch', columns=columns)
        eval_datasets = [eval_dataset, mis_matched_dataset]

        for eval_dataset in eval_datasets:
            trainer.compute_metrics = compute_metrics_fn
            eval_result = trainer.evaluate(eval_dataset=eval_dataset)
            output_eval_file = os.path.join(training_args.output_dir,
                                            "eval_results.txt")
            if trainer.is_world_master():
                with open(output_eval_file, "w") as writer:
                    logger.info("***** Eval results *****")
                    for key, value in eval_result.items():
                        logger.info("  %s = %s", key, value)
                        writer.write("%s = %s\n" % (key, value))
            eval_results.update(eval_result)
def __init__(self, model_type, model_name_or_path, output_dir, cache_dir,
             data_dir, train_file_path, predict_file_path, aug_file_path,
             do_aug, do_alum, alpha, eps, eta, sigma, do_train, do_adv_eval,
             do_eval, per_device_train_batch_size, per_device_eval_batch_size,
             gradient_accumulation_steps, eval_all_checkpoints,
             num_train_epochs, max_steps, save_steps, seed, fp16):
    args = {
        "model_type": model_type,
        "model_name_or_path": model_name_or_path,
        "output_dir": output_dir,
        "cache_dir": cache_dir,
        "data_dir": data_dir,
        "train_file_path": train_file_path,
        "predict_file_path": predict_file_path,
        "aug_file_path": aug_file_path,
        "do_aug": do_aug,
        "do_alum": do_alum,
        "alpha": alpha,
        "eps": eps,
        "eta": eta,
        "sigma": sigma,
        "do_train": do_train,
        "do_adv_eval": do_adv_eval,
        "do_eval": do_eval,
        "per_device_train_batch_size": per_device_train_batch_size,
        "per_device_eval_batch_size": per_device_eval_batch_size,
        "gradient_accumulation_steps": gradient_accumulation_steps,
        "eval_all_checkpoints": eval_all_checkpoints,
        "num_train_epochs": num_train_epochs,
        "max_steps": max_steps,
        "save_steps": save_steps,
        "seed": seed,
        "fp16": fp16,
    }
    parser = HfArgumentParser(dataclass_types=[ModelArguments, TrainingArguments])
    self.model_args, self.training_args = parser.parse_dict(args)

    # Load model and tokenizer.
    config, self.model_cls, tokenizer_cls = MODEL_CLASSES[self.model_args.model_type]
    self.tokenizer = tokenizer_cls.from_pretrained(
        self.model_args.tokenizer_name_or_path
        if self.model_args.tokenizer_name_or_path
        else self.model_args.model_name_or_path,
        cache_dir=self.model_args.cache_dir,
    )
    model = self.model_cls.from_pretrained(
        self.model_args.model_name_or_path,
        cache_dir=self.model_args.cache_dir,
    )

    # Load the training dataset.
    if self.training_args.do_train:
        train_dataset = load_and_cache_examples(self.model_args, self.tokenizer)
    else:
        train_dataset = None

    # Initialize the Trainer.
    self.trainer = Trainer(
        model_args=self.model_args,
        data_collator=None,
        model=model,
        tokenizer=self.tokenizer,
        args=self.training_args,
        train_dataset=train_dataset,
        prediction_loss_only=True,
    )