def train(): """Trains a BERT ethicality classifer.""" args = transformers.TrainingArguments( "saved_models", evaluation_strategy="epoch", learning_rate=config['learning_rate'], per_device_train_batch_size=config['batch_size'], per_device_eval_batch_size=config['batch_size'], num_train_epochs=config['num_epochs'], weight_decay=config['weight_decay'], load_best_model_at_end=True, metric_for_best_model="f1") train, val, test = get_train_val_test_datasets() trainer = transformers.Trainer(model=get_model(), args=args, train_dataset=train, eval_dataset=val, compute_metrics=metrics) # Train the model. trainer.train() # Display model eval statistics. print(trainer.evaluate()) # Test dataset metrics. trainer.predict(test).metrics
def __init__(self):
    """Initializes an Inference object."""
    # self.model = get_pretrained_model()
    self.tokenizer = get_tokenizer()
    self.model = transformers.Trainer(model=get_pretrained_model())
    self.summarizer = pipeline(
        "summarization")  # ~1.2 GB download the first time this is run.
def _train_model(self, model, tokenizer, train_dataset, val_dataset,
                 **train_kwargs):
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                    mlm=True)
    train_args = self._get_train_args(**train_kwargs)
    trainer = transformers.Trainer(
        model=model,
        args=train_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
    )
    trainer.train()
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = transformers.HfArgumentParser(
        (ModelArguments, ynt.GenernalDataTrainingArguments,
         transformers.TrainingArguments))

    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)
    logger.info(f"Model arguments: {model_args}")
    logger.info(f"Data Training arguments: {data_args}")

    # Set seed
    transformers.set_seed(training_args.seed)

    if data_args.task_name in ynt.genernal_tasks_num_labels:
        num_labels = ynt.genernal_tasks_num_labels[data_args.task_name]
        output_mode = ynt.genernal_output_modes[data_args.task_name]
    elif data_args.task_name in transformers.glue_tasks_num_labels:
        num_labels = transformers.glue_tasks_num_labels[data_args.task_name]
        output_mode = transformers.glue_output_modes[data_args.task_name]
    else:
        raise ValueError("Task not found: %s" % data_args.task_name)

    # Load pretrained model and tokenizer
    config = transformers.AutoConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = transformers.AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    model = transformers.AutoModelForSequenceClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    if data_args.task_name in ynt.genernal_tasks_num_labels:
        train_dataset = (ynt.GenernalDataset(data_args,
                                             tokenizer=tokenizer,
                                             mode='train',
                                             cache_dir=model_args.cache_dir)
                         if training_args.do_train else None)
        eval_dataset = (ynt.GenernalDataset(data_args,
                                            tokenizer=tokenizer,
                                            mode='dev',
                                            cache_dir=model_args.cache_dir)
                        if training_args.do_eval and not data_args.online else None)
        test_dataset = (ynt.GenernalDataset(data_args,
                                            tokenizer=tokenizer,
                                            mode='test',
                                            cache_dir=model_args.cache_dir)
                        if training_args.do_predict and not data_args.online else None)
    elif data_args.task_name in transformers.glue_tasks_num_labels:
        # Get datasets
        train_dataset = (transformers.GlueDataset(
            data_args, tokenizer=tokenizer, cache_dir=model_args.cache_dir)
                         if training_args.do_train else None)
        eval_dataset = (transformers.GlueDataset(data_args,
                                                 tokenizer=tokenizer,
                                                 mode="dev",
                                                 cache_dir=model_args.cache_dir)
                        if training_args.do_eval else None)
        test_dataset = (transformers.GlueDataset(data_args,
                                                 tokenizer=tokenizer,
                                                 mode="test",
                                                 cache_dir=model_args.cache_dir)
                        if training_args.do_predict else None)

    def build_compute_metrics_fn(
            task_name: str) -> Callable[[transformers.EvalPrediction], Dict]:

        def compute_metrics_fn(p: transformers.EvalPrediction):
            if output_mode == "classification":
                preds = np.argmax(p.predictions, axis=1)
            elif output_mode == "regression":
                preds = np.squeeze(p.predictions)
            if task_name in ynt.genernal_tasks_num_labels:
                return ynt.genernal_compute_metrics(task_name, preds, p.label_ids)
            elif task_name in transformers.glue_tasks_num_labels:
                return transformers.glue_compute_metrics(task_name, preds,
                                                         p.label_ids)

        return compute_metrics_fn

    # Initialize our Trainer
    trainer = transformers.Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=build_compute_metrics_fn(data_args.task_name),
    )

    # Training
    if training_args.do_train:
        trainer.train(model_path=model_args.model_name_or_path
                      if os.path.isdir(model_args.model_name_or_path) else None)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    eval_results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        # Loop to handle MNLI double evaluation (matched, mis-matched)
        eval_datasets = [eval_dataset]
        if data_args.task_name == "mnli":
            mnli_mm_data_args = dataclasses.replace(data_args,
                                                    task_name="mnli-mm")
            eval_datasets.append(
                transformers.GlueDataset(mnli_mm_data_args,
                                         tokenizer=tokenizer,
                                         mode="dev",
                                         cache_dir=model_args.cache_dir))

        for eval_dataset in eval_datasets:
            trainer.compute_metrics = build_compute_metrics_fn(
                eval_dataset.args.task_name)
            eval_result = trainer.evaluate(eval_dataset=eval_dataset)

            output_eval_file = os.path.join(
                training_args.output_dir,
                f"eval_results_{eval_dataset.args.task_name}.txt")
            if trainer.is_world_master():
                with open(output_eval_file, "w") as writer:
                    logger.info("***** Eval results {} *****".format(
                        eval_dataset.args.task_name))
                    for key, value in eval_result.items():
                        logger.info("  %s = %s", key, value)
                        writer.write("%s = %s\n" % (key, value))

            eval_results.update(eval_result)

    if training_args.do_predict:
        logging.info("*** Test ***")
        test_datasets = [test_dataset]
        if data_args.task_name == "mnli":
            mnli_mm_data_args = dataclasses.replace(data_args,
                                                    task_name="mnli-mm")
            test_datasets.append(
                transformers.GlueDataset(mnli_mm_data_args,
                                         tokenizer=tokenizer,
                                         mode="test",
                                         cache_dir=model_args.cache_dir))

        for test_dataset in test_datasets:
            predictions = trainer.predict(test_dataset=test_dataset).predictions
            if output_mode == "classification":
                predictions = np.argmax(predictions, axis=1)

            output_test_file = os.path.join(
                training_args.output_dir,
                f"test_results_{test_dataset.args.task_name}.txt")
            if trainer.is_world_master():
                with open(output_test_file, "w") as writer:
                    logger.info("***** Test results {} *****".format(
                        test_dataset.args.task_name))
                    writer.write("index\tprediction\n")
                    for index, item in enumerate(predictions):
                        if output_mode == "regression":
                            writer.write("%d\t%3.3f\n" % (index, item))
                        else:
                            item = test_dataset.get_labels()[item]
                            writer.write("%d\t%s\n" % (index, item))

    return eval_results
logging_dir=f"{_dir}/logging", logging_steps=256, dataloader_num_workers=64, evaluation_strategy="steps", eval_steps=256, save_steps=256, fp16=True, fp16_opt_level="O3", learning_rate=5e-4, run_name=_dir, ) model = transformers.AlbertForSequenceClassification.from_pretrained( "albert-large-v2", num_labels=2) tokenizer = transformers.AlbertTokenizerFast.from_pretrained("albert-large-v2") data_collator = transformers.DataCollatorWithPadding(tokenizer=tokenizer, pad_to_multiple_of=32) trainer = transformers.Trainer( args=args, model=model, tokenizer=tokenizer, data_collator=data_collator, train_dataset=train_dataset, eval_dataset=test_dataset, compute_metrics=compute_metrics, ) # In[ ]: trainer.train()
import argparse

import transformers

parser = argparse.ArgumentParser()
parser.add_argument('--vocab', type=str)
parser.add_argument('--model', type=str)
parser.add_argument('--data', type=str)
args = parser.parse_args()

tokenizer = transformers.BertTokenizer(vocab_file=args.vocab,
                                       do_lower_case=False,
                                       do_basic_tokenize=True)
model = transformers.BertForMaskedLM.from_pretrained(args.model)
dataset = transformers.LineByLineTextDataset(tokenizer=tokenizer,
                                             file_path=args.data,
                                             block_size=128)
data_collator = transformers.DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

train_args = transformers.TrainingArguments(
    per_device_eval_batch_size=16,
    output_dir=f"/tmp/echau18/{args.model}")
trainer = transformers.Trainer(model=model,
                               eval_dataset=dataset,
                               data_collator=data_collator,
                               prediction_loss_only=True,
                               args=train_args)
eval_output = trainer.evaluate()
print(eval_output)
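# Optional follow-up (a sketch, not part of the original script): for a masked-LM
# evaluation like this, the reported "eval_loss" can be converted to perplexity,
# mirroring the language-modeling example later in this collection.
import math

print("perplexity:", math.exp(eval_output["eval_loss"]))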
def main(
    mode: str,
    num_examples_to_test: int = 5,
    num_repetitions: int = 4,
) -> List[Dict[str, Any]]:

    if mode not in ["only-correct", "only-incorrect"]:
        raise ValueError(f"Unrecognized mode {mode}")

    task_tokenizer, task_model = misc_utils.create_tokenizer_and_model(
        constants.MNLI_MODEL_PATH)
    train_dataset, eval_dataset = misc_utils.create_datasets(
        task_name="mnli", tokenizer=task_tokenizer)
    eval_instance_data_loader = misc_utils.get_dataloader(dataset=eval_dataset,
                                                          batch_size=1,
                                                          random=False)

    output_mode = glue_output_modes["mnli"]

    def build_compute_metrics_fn(task_name: str):

        def compute_metrics_fn(p):
            if output_mode == "classification":
                preds = np.argmax(p.predictions, axis=1)
            elif output_mode == "regression":
                preds = np.squeeze(p.predictions)
            return glue_compute_metrics(task_name, preds, p.label_ids)

        return compute_metrics_fn

    # Most of these arguments are placeholders and are not really used at all,
    # so ignore the exact values.
    trainer = transformers.Trainer(
        model=task_model,
        args=TrainingArguments(output_dir="./tmp-output",
                               per_device_train_batch_size=128,
                               per_device_eval_batch_size=128,
                               learning_rate=5e-5,
                               logging_steps=100),
        data_collator=default_data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=build_compute_metrics_fn("mnli"),
    )

    task_model.cuda()
    num_examples_tested = 0
    output_collections = []
    for test_index, test_inputs in enumerate(eval_instance_data_loader):
        if num_examples_tested >= num_examples_to_test:
            break

        # Skip when we only want cases of correct prediction but the
        # prediction is incorrect, or vice versa.
        prediction_is_correct = misc_utils.is_prediction_correct(
            trainer=trainer, model=task_model, inputs=test_inputs)
        if mode == "only-correct" and prediction_is_correct is False:
            continue
        if mode == "only-incorrect" and prediction_is_correct is True:
            continue

        for k, v in test_inputs.items():
            if isinstance(v, torch.Tensor):
                test_inputs[k] = v.to(torch.device("cuda"))

        # With batch-size 128, 1500 iterations is enough.
        for num_samples in range(700, 1300 + 1, 100):  # 7 choices
            for batch_size in [1, 2, 4, 8, 16, 32, 64, 128]:  # 8 choices
                for repetition in range(num_repetitions):
                    print(f"Running #{test_index} "
                          f"N={num_samples} "
                          f"B={batch_size} "
                          f"R={repetition} takes ...", end=" ")
                    with Timer() as timer:
                        s_test = one_experiment(
                            model=task_model,
                            train_dataset=train_dataset,
                            test_inputs=test_inputs,
                            batch_size=batch_size,
                            random=True,
                            n_gpu=1,
                            device=torch.device("cuda"),
                            damp=constants.DEFAULT_INFLUENCE_HPARAMS["mnli"]["mnli"]["damp"],
                            scale=constants.DEFAULT_INFLUENCE_HPARAMS["mnli"]["mnli"]["scale"],
                            num_samples=num_samples)

                    time_elapsed = timer.elapsed
                    print(f"{time_elapsed:.2f} seconds")

                    outputs = {
                        "test_index": test_index,
                        "num_samples": num_samples,
                        "batch_size": batch_size,
                        "repetition": repetition,
                        "s_test": s_test,
                        "time_elapsed": time_elapsed,
                        "correct": prediction_is_correct,
                    }
                    output_collections.append(outputs)
                    remote_utils.save_and_mirror_scp_to_remote(
                        object_to_save=outputs,
                        file_name=f"stest.{mode}.{num_examples_to_test}."
                                  f"{test_index}.{num_samples}."
                                  f"{batch_size}.{repetition}.pth")

        num_examples_tested += 1

    return output_collections
def main(
    train_task_name: str,
    train_heuristic: str,
    eval_heuristics: Optional[List[str]] = None,
    num_replicas: Optional[int] = None,
    use_parallel: bool = True,
    version: Optional[str] = None,
) -> Dict[str, List[Dict[str, Any]]]:

    if train_task_name not in ["mnli-2", "hans"]:
        raise ValueError

    if eval_heuristics is None:
        eval_heuristics = DEFAULT_EVAL_HEURISTICS

    if num_replicas is None:
        num_replicas = DEFAULT_NUM_REPLICAS

    if version not in ["new-only-z", "new-only-ztest", "new-z-and-ztest"]:
        raise ValueError

    task_tokenizer, task_model = misc_utils.create_tokenizer_and_model(
        constants.MNLI2_MODEL_PATH)

    (mnli_train_dataset,
     mnli_eval_dataset) = misc_utils.create_datasets(task_name="mnli-2",
                                                     tokenizer=task_tokenizer)
    (hans_train_dataset,
     hans_eval_dataset) = misc_utils.create_datasets(task_name="hans",
                                                     tokenizer=task_tokenizer)

    if train_task_name == "mnli-2":
        train_dataset = mnli_train_dataset

    if train_task_name == "hans":
        train_dataset = hans_train_dataset

    (s_test_damp, s_test_scale,
     s_test_num_samples) = influence_helpers.select_s_test_config(
         trained_on_task_name="mnli-2",
         train_task_name=train_task_name,
         eval_task_name="hans",
     )

    hans_helper = HansHelper(hans_train_dataset=hans_train_dataset,
                             hans_eval_dataset=hans_eval_dataset)

    # We will be running a model trained on MNLI-2,
    # but calculate influences on the HANS dataset.
    faiss_index = influence_helpers.load_faiss_index(
        trained_on_task_name="mnli-2", train_task_name=train_task_name)

    output_mode = glue_output_modes["mnli-2"]

    def build_compute_metrics_fn(task_name: str):

        def compute_metrics_fn(p):
            if output_mode == "classification":
                preds = np.argmax(p.predictions, axis=1)
            elif output_mode == "regression":
                preds = np.squeeze(p.predictions)
            return glue_compute_metrics(task_name, preds, p.label_ids)

        return compute_metrics_fn

    # Most of these arguments are placeholders and are not really used at all,
    # so ignore the exact values.
    trainer = transformers.Trainer(
        model=task_model,
        args=TrainingArguments(output_dir="./tmp-output",
                               per_device_train_batch_size=128,
                               per_device_eval_batch_size=128,
                               learning_rate=5e-5,
                               logging_steps=100),
    )

    output_collections: Dict[str, List] = defaultdict(list)

    if version == "old":
        raise ValueError("Deprecated")
    else:
        NUM_STEPS = 10
        num_total_experiments = (len(EXPERIMENT_TYPES) * num_replicas *
                                 len(VERSION_2_NUM_DATAPOINTS_CHOICES) *
                                 len(VERSION_2_LEARNING_RATE_CHOICES) *
                                 NUM_STEPS)

        with tqdm(total=num_total_experiments) as pbar:
            for experiment_type in EXPERIMENT_TYPES:
                for replica_index in range(num_replicas):
                    (hans_eval_heuristic_inputs,
                     hans_eval_heuristic_raw_inputs
                     ) = hans_helper.sample_batch_of_heuristic(
                         mode="eval",
                         heuristic=train_heuristic,
                         size=EVAL_HEURISTICS_SAMPLE_BATCH_SIZE,
                         return_raw_data=True)

                    misc_utils.move_inputs_to_device(
                        inputs=hans_eval_heuristic_inputs,
                        device=task_model.device)

                    for version_2_num_datapoints in VERSION_2_NUM_DATAPOINTS_CHOICES:
                        for version_2_learning_rate in VERSION_2_LEARNING_RATE_CHOICES:
                            # The model will be used for multiple steps,
                            # so `deepcopy` it here.
                            _model = deepcopy(task_model)
                            for step in range(NUM_STEPS):
                                outputs_one_experiment, _model = one_experiment(
                                    use_parallel=use_parallel,
                                    train_heuristic=train_heuristic,
                                    eval_heuristics=eval_heuristics,
                                    experiment_type=experiment_type,
                                    hans_helper=hans_helper,
                                    train_dataset=train_dataset,
                                    task_model=_model,
                                    faiss_index=faiss_index,
                                    s_test_damp=s_test_damp,
                                    s_test_scale=s_test_scale,
                                    s_test_num_samples=s_test_num_samples,
                                    trainer=trainer,
                                    version=version,
                                    version_2_num_datapoints=version_2_num_datapoints,
                                    version_2_learning_rate=version_2_learning_rate,
                                    hans_eval_heuristic_inputs=hans_eval_heuristic_inputs,
                                    hans_eval_heuristic_raw_inputs=hans_eval_heuristic_raw_inputs,
                                )

                                output_collections[
                                    f"{experiment_type}-"
                                    f"{replica_index}-"
                                    f"{version_2_num_datapoints}-"
                                    f"{version_2_learning_rate}-"].append(
                                        outputs_one_experiment)
                                pbar.update(1)
                                pbar.set_description(
                                    f"{experiment_type} #{replica_index}")

    torch.save(
        output_collections,
        f"hans-augmentation-{version}."
        f"{train_task_name}."
        f"{train_heuristic}."
        f"{num_replicas}."
        f"{use_parallel}.pth")

    return output_collections
writer = SummaryWriter()

training_args = transformers.TrainingArguments(
    output_dir="models/gpt2/",
    do_train=True,
    do_eval=True,
    evaluate_during_training=True,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=1,
    logging_first_step=True,
    save_steps=2000,
    save_total_limit=2,
)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_set,
    eval_dataset=valid_set,
    prediction_loss_only=True,
    tb_writer=writer,
)

trainer.train()

# Save model.
trainer.save_model("models/gpt2/")
def train(args):
    logging.basicConfig(level=logging.INFO)
    tokenizer = transformers.AlbertTokenizer.from_pretrained(
        'albert-base-v2', cache_dir=cache_dir)
    albert_for_math_config = transformers.AlbertConfig(
        hidden_size=768,
        num_attention_heads=12,
        intermediate_size=3072,
    )

    if args['--load']:
        model = transformers.AlbertForMaskedLM.from_pretrained(
            args['--load-from'])
        training_args = transformers.TrainingArguments(
            output_dir=args['--save-to'],
            overwrite_output_dir=True,
            num_train_epochs=int(args['--max-epoch']),
            per_gpu_train_batch_size=int(args['--batch-size']),
            per_gpu_eval_batch_size=int(args['--batch-size']),
            logging_steps=int(args['--log-every']),
            save_steps=int(args['--save-every']),
            save_total_limit=10,
            learning_rate=float(args['--lr']),
            seed=int(args['--seed']),
        )
    else:
        model = transformers.AlbertForMaskedLM(albert_for_math_config)
        training_args = transformers.TrainingArguments(
            output_dir=args['--save-to'],
            num_train_epochs=int(args['--max-epoch']),
            per_gpu_train_batch_size=int(args['--batch-size']),
            per_gpu_eval_batch_size=int(args['--batch-size']),
            logging_steps=int(args['--log-every']),
            save_steps=int(args['--save-every']),
            save_total_limit=10,
            learning_rate=float(args['--lr']),
            seed=int(args['--seed']),
        )

    # Load datasets.
    print('Loading Data...')
    train_data = torch.load(
        './data/train_data_train-easy_algebra__linear_1d.pt')
    dev_data = torch.load('./data/dev_data_train-easy_algebra__linear_1d.pt')
    print('Finished loading data')

    device = torch.device("cuda:0" if args['--cuda'] else "cpu")
    model.to(device)

    trainer = transformers.Trainer(
        model=model,
        args=training_args,
        data_collator=AnswerMaskDataCollator(tokenizer),
        train_dataset=train_data,
        eval_dataset=dev_data,
        prediction_loss_only=True,
    )

    if args['--load']:
        trainer.train(model_path=args['--load-from'])
    else:
        trainer.train()
import transformers

from cfg import config
from data import get_train_val_test_datasets
from models import get_model
from utils import metrics

args = transformers.TrainingArguments(
    "saved_models",
    evaluation_strategy="epoch",
    learning_rate=config['learning_rate'],
    per_device_train_batch_size=config['batch_size'],
    per_device_eval_batch_size=config['batch_size'],
    num_train_epochs=config['num_epochs'],
    weight_decay=config['weight_decay'],
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

train, val, test = get_train_val_test_datasets()
trainer = transformers.Trainer(model=get_model(),
                               args=args,
                               train_dataset=train,
                               eval_dataset=val,
                               compute_metrics=metrics)

# Train the model.
trainer.train()

# Display model eval statistics.
print(trainer.evaluate())

# Test dataset metrics.
trainer.predict(test).metrics
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = transformers.HfArgumentParser(
        (ModelArguments, DataTrainingArguments, transformers.TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if data_args.eval_data_file is None and training_args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument.")

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    transformers.set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    if model_args.config_name:
        config = transformers.AutoConfig.from_pretrained(
            model_args.config_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        config = transformers.AutoConfig.from_pretrained(
            model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        config = transformers.CONFIG_MAPPING[model_args.model_type]()
        logger.warning(
            "You are instantiating a new config instance from scratch.")

    if model_args.tokenizer_name:
        tokenizer = transformers.AutoTokenizer.from_pretrained(
            model_args.tokenizer_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        tokenizer = transformers.AutoTokenizer.from_pretrained(
            model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported, "
            "but you can do it from another script, save it, and load it from here, "
            "using --tokenizer_name")

    if model_args.model_name_or_path:
        model = transformers.AutoModelForPreTraining.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        model = transformers.AutoModelWithLMHead.from_config(config)

    model.resize_token_embeddings(len(tokenizer))

    if config.model_type in ["bert", "roberta", "distilbert", "camembert"
                             ] and not data_args.mlm:
        raise ValueError(
            "BERT and RoBERTa-like models do not have LM heads but masked LM heads. "
            "They must be run using the --mlm flag (masked language modeling).")

    if data_args.block_size <= 0:
        # Our input block size will be the max possible for the model.
        data_args.block_size = tokenizer.max_len
    else:
        data_args.block_size = min(data_args.block_size, tokenizer.max_len)

    # Get datasets
    train_dataset = get_dataset(
        data_args, tokenizer=tokenizer) if training_args.do_train else None
    eval_dataset = get_dataset(
        data_args, tokenizer=tokenizer,
        evaluate=True) if training_args.do_eval else None
    data_collator = transformers.DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=data_args.mlm,
        mlm_probability=data_args.mlm_probability)

    # Initialize our Trainer
    trainer = transformers.Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        prediction_loss_only=True,
    )

    # Training
    if training_args.do_train:
        model_path = (model_args.model_name_or_path
                      if model_args.model_name_or_path is not None
                      and os.path.isdir(model_args.model_name_or_path) else None)
        trainer.train(model_path=model_path)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()

        perplexity = math.exp(eval_output["eval_loss"])
        result = {"perplexity": perplexity}

        output_eval_file = os.path.join(training_args.output_dir,
                                        "eval_results_lm.txt")
        if trainer.is_world_master():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))

        results.update(result)

    return results
# Load data
dlnd_train_dset, dlnd_valid_dset, dlnd_test_dset = DlndData().return_datasets()

# Load model
model = create_model()

# Training
training_args = transformers.TrainingArguments(
    evaluation_strategy='epoch',
    load_best_model_at_end=True,
    logging_dir='training_logs',
    logging_first_step=True,
    logging_steps=10,
    num_train_epochs=10,
    output_dir='training_results',
    per_device_eval_batch_size=BATCH_SIZE,
    per_device_train_batch_size=BATCH_SIZE,
    weight_decay=0.01,
    metric_for_best_model='accuracy',
    disable_tqdm=True,
)
trainer = transformers.Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=dlnd_train_dset,
    eval_dataset=dlnd_valid_dset,
    callbacks=[LogCallback],
)
trainer.train()
trainer.evaluate()
def run_training(args, train_data):
    ## Checkpoint Loading ########################################################
    if args.load:
        if '2700' in args.load:
            model = transformers.GPTNeoForCausalLM.from_pretrained(args.load)
        else:
            model = transformers.GPT2LMHeadModel.from_pretrained(args.load)
        print(f"Loaded model from {args.load}")
    else:
        if "EleutherAI" in args.arch:
            model = transformers.GPTNeoForCausalLM.from_pretrained(args.arch)
        else:
            model = transformers.GPT2LMHeadModel.from_pretrained(args.arch)

    if args.resume:
        raise NotImplementedError
        model = transformers.GPT2LMHeadModel.from_pretrained(args.resume)
        print(f"Loaded model from {args.resume}")
        start_epoch = 0
        start_iteration = int(args.resume.split("-")[-1])
        print("start_iteration = ", start_iteration)
    else:
        start_iteration = 0

    ## Dataloading ########################################################
    train_data.start_iteration = start_iteration

    ## Start Loop ########################################################
    print(f"Starting main loop")

    training_args = transformers.TrainingArguments(
        output_dir=args.save_dir,
        overwrite_output_dir=False,
        do_train=True,
        do_eval=False,
        do_predict=True,
        evaluation_strategy='no',
        eval_steps=0,
        num_train_epochs=args.epochs,
        per_device_train_batch_size=args.batch_size_per_replica,
        gradient_accumulation_steps=args.grad_acc_steps,
        learning_rate=args.lr,
        weight_decay=0.05,
        # warmup_steps=args.lr_warmup_steps,
        # max_grad_norm=100000.0,
        logging_dir=args.save_dir,
        logging_first_step=True,
        logging_steps=args.log_freq,
        save_steps=args.save_freq,
        save_total_limit=2,
        dataloader_drop_last=True,
        dataloader_num_workers=3,
        local_rank=args.local_rank,
        deepspeed=args.deepspeed,
        fp16=args.fp16,
    )

    trainer = transformers.Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data,
    )
    trainer.remove_callback(transformers.integrations.TensorBoardCallback)
    trainer.add_callback(CustomTensorBoardCallback())

    trainer.train()

    if args.local_rank == 0:
        model.save_pretrained(os.path.join(args.save_dir, "final_checkpoint"))
def run_full_influence_functions(
        mode: str,
        num_examples_to_test: int,
        s_test_num_samples: int = 1000) -> Dict[int, Dict[str, Any]]:

    if mode not in ["only-correct", "only-incorrect"]:
        raise ValueError(f"Unrecognized mode {mode}")

    tokenizer, model = misc_utils.create_tokenizer_and_model(
        constants.MNLI_MODEL_PATH)

    (mnli_train_dataset,
     mnli_eval_dataset) = misc_utils.create_datasets(task_name="mnli",
                                                     tokenizer=tokenizer)

    batch_train_data_loader = misc_utils.get_dataloader(mnli_train_dataset,
                                                        batch_size=128,
                                                        random=True)
    instance_train_data_loader = misc_utils.get_dataloader(mnli_train_dataset,
                                                           batch_size=1,
                                                           random=False)
    eval_instance_data_loader = misc_utils.get_dataloader(
        dataset=mnli_eval_dataset, batch_size=1, random=False)

    output_mode = glue_output_modes["mnli"]

    def build_compute_metrics_fn(task_name: str):

        def compute_metrics_fn(p):
            if output_mode == "classification":
                preds = np.argmax(p.predictions, axis=1)
            elif output_mode == "regression":
                preds = np.squeeze(p.predictions)
            return glue_compute_metrics(task_name, preds, p.label_ids)

        return compute_metrics_fn

    # Most of these arguments are placeholders and are not really used at all,
    # so ignore the exact values.
    trainer = transformers.Trainer(
        model=model,
        args=TrainingArguments(output_dir="./tmp-output",
                               per_device_train_batch_size=128,
                               per_device_eval_batch_size=128,
                               learning_rate=5e-5,
                               logging_steps=100),
        data_collator=default_data_collator,
        train_dataset=mnli_train_dataset,
        eval_dataset=mnli_eval_dataset,
        compute_metrics=build_compute_metrics_fn("mnli"),
    )

    params_filter = [
        n for n, p in model.named_parameters() if not p.requires_grad
    ]
    weight_decay_ignores = ["bias", "LayerNorm.weight"] + [
        n for n, p in model.named_parameters() if not p.requires_grad
    ]

    model.cuda()
    num_examples_tested = 0
    outputs_collections = {}
    for test_index, test_inputs in enumerate(eval_instance_data_loader):
        if num_examples_tested >= num_examples_to_test:
            break

        # Skip when we only want cases of correct prediction but the
        # prediction is incorrect, or vice versa.
        prediction_is_correct = misc_utils.is_prediction_correct(
            trainer=trainer, model=model, inputs=test_inputs)
        if mode == "only-correct" and prediction_is_correct is False:
            continue
        if mode == "only-incorrect" and prediction_is_correct is True:
            continue

        with Timer() as timer:
            influences, _, s_test = nn_influence_utils.compute_influences(
                n_gpu=1,
                device=torch.device("cuda"),
                batch_train_data_loader=batch_train_data_loader,
                instance_train_data_loader=instance_train_data_loader,
                model=model,
                test_inputs=test_inputs,
                params_filter=params_filter,
                weight_decay=constants.WEIGHT_DECAY,
                weight_decay_ignores=weight_decay_ignores,
                s_test_damp=5e-3,
                s_test_scale=1e4,
                s_test_num_samples=s_test_num_samples,
                train_indices_to_include=None,
                s_test_iterations=1,
                precomputed_s_test=None)

        outputs = {
            "test_index": test_index,
            "influences": influences,
            "s_test": s_test,
            "time": timer.elapsed,
            "correct": prediction_is_correct,
        }

        num_examples_tested += 1
        outputs_collections[test_index] = outputs

        remote_utils.save_and_mirror_scp_to_remote(
            object_to_save=outputs,
            file_name=f"KNN-recall.{mode}.{num_examples_to_test}.{test_index}.pth")

        print(f"Status: #{test_index} | "
              f"{num_examples_tested} / {num_examples_to_test}")

    return outputs_collections
training_arg = transformers.TrainingArguments(num_train_epochs=8,
                                              learning_rate=5e-5,
                                              output_dir='scratch/adv312/',
                                              evaluation_strategy="epoch",
                                              per_device_train_batch_size=8)

## TODO: Initialize a transformers.Trainer object and run a Bayesian
## hyperparameter search for at least 5 trials (but not too many) on the
## learning rate. Hint: use the model_init() and compute_metrics() methods
## from finetuning_utils.py as arguments to Trainer().
trainer = transformers.Trainer(
    model_init=finetuning_utils.model_init,
    args=training_arg,
    compute_metrics=finetuning_utils.compute_metrics,
    train_dataset=train_data)

## Use the hp_space parameter in hyperparameter_search() to specify your
## hyperparameter search space. (Note that this parameter takes a function
## as its value.)
bestrun = trainer.hyperparameter_search(
    hp_space=lambda _: {"learning_rate": tune.uniform(1e-5, 5e-5)},
    n_trials=3,
    search_alg=BayesOptSearch(),
    metric='eval_loss',
    mode='min')

## Also print out the run ID, objective value
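## A minimal sketch for the TODO above: hyperparameter_search() returns a
## BestRun namedtuple exposing run_id, objective, and hyperparameters
## (attribute names from transformers.trainer_utils.BestRun).
print("Best run ID:", bestrun.run_id)
print("Objective (eval_loss):", bestrun.objective)
print("Hyperparameters:", bestrun.hyperparameters)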
def imitator_main(mode: str, num_examples_to_test: int) -> List[Dict[str, Any]]:
    if mode not in ["only-correct", "only-incorrect"]:
        raise ValueError(f"Unrecognized mode {mode}")

    task_tokenizer, task_model = misc_utils.create_tokenizer_and_model(
        constants.MNLI_MODEL_PATH)
    imitator_tokenizer, imitator_model = misc_utils.create_tokenizer_and_model(
        constants.MNLI_IMITATOR_MODEL_PATH)

    (mnli_train_dataset,
     mnli_eval_dataset) = misc_utils.create_datasets(task_name="mnli",
                                                     tokenizer=task_tokenizer)

    task_model.cuda()
    imitator_model.cuda()
    if task_model.training is True or imitator_model.training is True:
        raise ValueError("One of the models is in training mode")
    print(task_model.device, imitator_model.device)

    # Most of these arguments are placeholders and are not really used at all,
    # so ignore the exact values.
    trainer = transformers.Trainer(
        model=task_model,
        args=TrainingArguments(output_dir="./tmp-output",
                               per_device_train_batch_size=128,
                               per_device_eval_batch_size=128,
                               learning_rate=5e-5,
                               logging_steps=100),
    )

    eval_instance_data_loader = misc_utils.get_dataloader(
        mnli_eval_dataset, batch_size=1, data_collator=default_data_collator)

    train_inputs_collections = torch.load(
        constants.MNLI_TRAIN_INPUT_COLLECTIONS_PATH)

    inputs_by_label: Dict[str, List[int]] = defaultdict(list)
    for i in range(len(train_inputs_collections)):
        label = mnli_train_dataset.label_list[
            train_inputs_collections[i]["labels"]]
        inputs_by_label[label].append(i)

    outputs_collections = []
    for i, test_inputs in enumerate(eval_instance_data_loader):
        if mode == "only-correct" and i not in CORRECT_INDICES[:num_examples_to_test]:
            continue
        if mode == "only-incorrect" and i not in INCORRECT_INDICES[:num_examples_to_test]:
            continue

        start_time = time.time()
        for using_ground_truth in [True, False]:
            outputs = run_one_imitator_experiment(
                task_model=task_model,
                imitator_model=imitator_model,
                test_inputs=test_inputs,
                trainer=trainer,
                train_dataset=mnli_train_dataset,
                train_inputs_collections=train_inputs_collections,
                inputs_by_label=inputs_by_label,
                finetune_using_ground_truth_label=using_ground_truth)
            outputs["index"] = i
            outputs_collections.append(outputs)

        end_time = time.time()
        print(f"#{len(outputs_collections)}/{len(outputs_collections)}: "
              f"Elapsed {(end_time - start_time) / 60:.2f}")

    torch.save(outputs_collections,
               f"imiator_experiments.{mode}.{num_examples_to_test}.pt")

    return outputs_collections
def train_bert(corpus_path, hebrew_model=False):
    """
    Bert model training
    :param corpus_path: Corpus to train Bert on
    :param hebrew_model: Model in Hebrew or not
    :return: The name of the newly trained model
    """
    language = 'hebrew' if hebrew_model else 'english'
    df = pd.read_csv(corpus_path)
    corpus_name = get_corpus_name(corpus_path)

    print("Preprocess...")
    if hebrew_model:
        model_name, vocab, raw_text_file = preprocess_hebrew(df, corpus_name)
    else:
        model_name, vocab, raw_text_file = preprocess_english(df, corpus_name)

    print("Cuda availability :", torch.cuda.is_available())

    print("Getting tokenizer...")
    tokenizer = transformers.AutoTokenizer.from_pretrained(
        conf.bert_model[language], use_fast=True)
    model = transformers.AutoModelForMaskedLM.from_pretrained(
        conf.bert_model[language]).to('cuda')
    tokenizer.add_tokens(vocab)
    model.resize_token_embeddings(len(tokenizer))

    model_dir = conf.MODELS_PATH + conf.TYPE_MODEL_PATH['bert'] + model_name
    if os.path.exists(model_dir):
        shutil.rmtree(model_dir)
    os.mkdir(model_dir)
    tokenizer.save_pretrained(model_dir)

    print("Tokenizing...")
    dataset = transformers.LineByLineTextDataset(
        tokenizer=tokenizer,
        file_path=raw_text_file,
        block_size=128,
    )
    data_collator = transformers.DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

    training_args = transformers.TrainingArguments(
        output_dir=model_dir,
        overwrite_output_dir=True,
        num_train_epochs=20,
        per_device_train_batch_size=16,
        save_steps=300,
        logging_steps=100,
        save_total_limit=3,
    )
    trainer = transformers.Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
    )

    print("Begin training...")
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    trainer.train()

    trainer.save_model(model_dir)
    print('The model has been saved under : ', model_dir)
    return model_dir
def testModel(self,
              train_val_split_iterator: typing.Iterator = [
                  sklearn.model_selection.train_test_split
              ],
              **kwargs):
    logger.info("Starting testing of RobertaModel")
    num_epochs = kwargs['epochs']
    batch_size = kwargs['batch_size']

    for i, train_test_split in enumerate(train_val_split_iterator):
        logger.debug(
            f'{i}-th enumeration of train_val split iterator under cross validation'
        )
        self.model = self.createModel()
        # optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
        # loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

        if callable(getattr(self.model, 'compile', None)):  # if tf model
            train_dataset, val_dataset = self.pipeLine.getEncodedDataset(
                train_test_split, batch_size=batch_size)
            # self.model.compile(optimizer=optimizer, loss=loss, metrics=self._registeredMetrics)
            # self.model.fit(train_dataset, epochs=num_epochs)
            training_args = transformers.TFTrainingArguments(
                output_dir=f'./results/{self._modelName}',  # output directory
                num_train_epochs=num_epochs,  # total number of training epochs
                per_device_train_batch_size=batch_size,  # batch size per device during training
                per_device_eval_batch_size=batch_size,  # batch size for evaluation
                warmup_steps=kwargs['warmup_steps'],  # number of warmup steps for the learning rate scheduler
                weight_decay=kwargs['weight_decay'],  # strength of weight decay
                logging_dir='./logs',  # directory for storing logs
            )
            trainer = transformers.TFTrainer(
                model=self.model,  # the instantiated 🤗 Transformers model to be trained
                args=training_args,  # training arguments, defined above
                train_dataset=train_dataset,  # tensorflow_datasets training dataset
                eval_dataset=val_dataset,  # tensorflow_datasets evaluation dataset
                compute_metrics=get_compute_metrics(
                    self._registeredMetrics)  # metrics to compute while training
            )
        else:  # if pytorch model
            train_dataset, val_dataset = self.pipeLine.getEncodedDataset(
                train_test_split,
                batch_size=batch_size,
                tfOrPyTorch=torchOrTFEnum.TORCH)
            training_args = transformers.TrainingArguments(
                output_dir=f'./results/{self._modelName}',  # output directory
                num_train_epochs=num_epochs,  # total number of training epochs
                per_device_train_batch_size=batch_size,  # batch size per device during training
                per_device_eval_batch_size=batch_size,  # batch size for evaluation
                warmup_steps=kwargs['warmup_steps'],  # number of warmup steps for the learning rate scheduler
                weight_decay=kwargs['weight_decay'],  # strength of weight decay
                logging_dir='./logs',  # directory for storing logs
                logging_steps=10,
            )
            trainer = transformers.Trainer(
                model=self.model,  # the instantiated 🤗 Transformers model to be trained
                args=training_args,  # training arguments, defined above
                train_dataset=train_dataset,  # training dataset
                eval_dataset=val_dataset,  # evaluation dataset
                compute_metrics=get_compute_metrics(
                    self._registeredMetrics)  # metrics to compute while training
            )

        trainer.train()