# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

# Import external libraries.
import onnxruntime
import pytest
import torch
from torch.nn.parameter import Parameter

# Import ORT modules.
from _test_helpers import *
from onnxruntime.training.ortmodule import ORTModule

torch.manual_seed(1)
onnxruntime.set_seed(1)


def test_GeLU():
    # Tanh approximation of GeLU applied to (bias + y); 0.79788456 approximates sqrt(2/pi).
    @torch.jit.script
    def bias_gelu(bias, y):
        x = bias + y
        return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)))

    # Gradient of the tanh-approximated GeLU with respect to x, scaled by the incoming gradient g.
    @torch.jit.script
    def bias_gelu_backward(g, bias, y):
        x = bias + y
        tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))
        ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out)
        # Chain rule: local derivative times upstream gradient.
        return ff * g
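    # Illustrative sanity check (not part of the original test): since 0.79788456 approximates
    # sqrt(2/pi), bias_gelu should match PyTorch's built-in tanh-approximated GeLU.
    # Assumes torch >= 1.12 for the `approximate` argument of torch.nn.functional.gelu.
    bias = torch.randn(8)
    y = torch.randn(8)
    expected = torch.nn.functional.gelu(bias + y, approximate='tanh')
    assert torch.allclose(bias_gelu(bias, y), expected, atol=1e-5)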
"--log-interval", type=int, default=200, metavar="N", help= "how many batches to wait before logging training status (default: 200)", ) # Basic setup args = parser.parse_args() if not args.no_cuda and torch.cuda.is_available(): device = "cuda" else: device = "cpu" torch.manual_seed(args.seed) onnxruntime.set_seed(args.seed) # Model optim_config = onnxruntime.training.optim.SGDConfig(lr=args.lr) model_desc = transformer_model_description_dynamic_axes() model = TransformerModel(28785, 200, 2, 200, 2, 0.2).to(device) # Preparing data train_data, val_data, test_data = prepare_data(device, args.batch_size, args.test_batch_size) trainer = onnxruntime.training.ORTTrainer(model, model_desc, optim_config, loss_fn=my_loss) # Train
def main():
    # 1. Basic setup
    parser = argparse.ArgumentParser(description='PyTorch BERT Example')
    parser.add_argument('--pytorch-only', action='store_true', default=False, help='disables ONNX Runtime training')
    parser.add_argument('--batch-size', type=int, default=32, metavar='N', help='input batch size for training (default: 32)')
    parser.add_argument('--test-batch-size', type=int, default=64, metavar='N', help='input batch size for testing (default: 64)')
    parser.add_argument('--view-graphs', action='store_true', default=False, help='views forward and backward graphs')
    parser.add_argument('--no-cuda', action='store_true', default=False, help='disables CUDA training')
    parser.add_argument('--epochs', type=int, default=4, metavar='N', help='number of epochs to train (default: 4)')
    parser.add_argument('--seed', type=int, default=42, metavar='S', help='random seed (default: 42)')
    parser.add_argument('--log-interval', type=int, default=40, metavar='N', help='how many batches to wait before logging training status (default: 40)')
    parser.add_argument('--train-steps', type=int, default=-1, metavar='N', help='number of steps to train. Set -1 to run through whole dataset (default: -1)')
    parser.add_argument('--log-level', choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], default='WARNING', help='Log level (default: WARNING)')
    parser.add_argument('--num-hidden-layers', type=int, default=1, metavar='H', help='Number of hidden layers for the BERT model. A vanilla BERT has 12 hidden layers (default: 1)')
    parser.add_argument('--data-dir', type=str, default='./cola_public/raw', help='Path to the bert data directory')
    args = parser.parse_args()

    # Device (CPU vs CUDA)
    if torch.cuda.is_available() and not args.no_cuda:
        device = torch.device("cuda")
        print('There are %d GPU(s) available.' % torch.cuda.device_count())
        print('We will use the GPU:', torch.cuda.get_device_name(0))
    else:
        print('No GPU available, using the CPU instead.')
        device = torch.device("cpu")

    # Set log level
    numeric_level = getattr(logging, args.log_level.upper(), None)
    if not isinstance(numeric_level, int):
        raise ValueError('Invalid log level: %s' % args.log_level)
    logging.basicConfig(level=numeric_level)

    # 2. Dataloader
    train_dataloader, validation_dataloader = load_dataset(args)

    # 3. Modeling
    # Load BertForSequenceClassification, the pretrained BERT model with a single
    # linear classification layer on top.
    config = AutoConfig.from_pretrained(
        "bert-base-uncased",
        num_labels=2,
        num_hidden_layers=args.num_hidden_layers,
        output_attentions=False,     # Whether the model returns attentions weights.
        output_hidden_states=False,  # Whether the model returns all hidden-states.
    )
    model = BertForSequenceClassification.from_pretrained(
        "bert-base-uncased",  # Use the 12-layer BERT model, with an uncased vocab.
        config=config,
    )

    if not args.pytorch_only:
        model = ORTModule(model)
        # TODO: change it to False to stop saving ONNX models
        model._save_onnx = True
        model._save_onnx_prefix = 'BertForSequenceClassification'

    # Tell pytorch to run this model on the GPU.
    if torch.cuda.is_available() and not args.no_cuda:
        model.cuda()

    # Note: AdamW is a class from the huggingface library (as opposed to pytorch)
    optimizer = AdamW(model.parameters(),
                      lr=2e-5,   # args.learning_rate - default is 5e-5, our notebook had 2e-5
                      eps=1e-8)  # args.adam_epsilon - default is 1e-8.

    # Authors recommend between 2 and 4 epochs.
    # Total number of training steps is number of batches * number of epochs.
    total_steps = len(train_dataloader) * args.epochs

    # Create the learning rate scheduler.
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0,  # Default value in run_glue.py
                                                num_training_steps=total_steps)

    # Seed
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    onnxruntime.set_seed(args.seed)
    if torch.cuda.is_available() and not args.no_cuda:
        torch.cuda.manual_seed_all(args.seed)

    # 4. Train loop (fine-tune)
    total_training_time, total_test_time, epoch_0_training, validation_accuracy = 0, 0, 0, 0
    for epoch_i in range(0, args.epochs):
        total_training_time += train(model, optimizer, scheduler, train_dataloader, epoch_i, device, args)
        if not args.pytorch_only and epoch_i == 0:
            epoch_0_training = total_training_time
        test_time, validation_accuracy = test(model, validation_dataloader, device, args)
        total_test_time += test_time

    assert validation_accuracy > 0.5

    print('\n======== Global stats ========')
    if not args.pytorch_only:
        estimated_export = 0
        if args.epochs > 1:
            estimated_export = epoch_0_training - (total_training_time - epoch_0_training) / (args.epochs - 1)
            print(" Estimated ONNX export took: {:.4f}s".format(estimated_export))
        else:
            print(" Estimated ONNX export took: Estimate available when epochs > 1 only")
        print(" Accumulated training without export took: {:.4f}s".format(total_training_time - estimated_export))
    print(" Accumulated training took: {:.4f}s".format(total_training_time))
    print(" Accumulated validation took: {:.4f}s".format(total_test_time))
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N', help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N', help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=10, metavar='N', help='number of epochs to train (default: 10)')
    parser.add_argument('--lr', type=float, default=0.01, metavar='LR', help='learning rate (default: 0.01)')
    parser.add_argument('--no-cuda', action='store_true', default=False, help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1, metavar='S', help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N', help='how many batches to wait before logging training status')

    # Basic setup
    args = parser.parse_args()
    if not args.no_cuda and torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"

    torch.manual_seed(args.seed)
    onnxruntime.set_seed(args.seed)

    # Data loader
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('./data', train=True, download=True,
                       transform=transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])),
        batch_size=args.batch_size, shuffle=True)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('./data', train=False,
                       transform=transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])),
        batch_size=args.test_batch_size, shuffle=True)

    # Modeling
    model = NeuralNet(784, 500, 10)
    model_desc = mnist_model_description()
    optim_config = optim.SGDConfig(lr=args.lr)
    opts = ORTTrainerOptions({'device': {'id': device}})
    trainer = ORTTrainer(model, model_desc, optim_config, loss_fn=my_loss, options=opts)

    # Train loop
    for epoch in range(1, args.epochs + 1):
        train_with_trainer(args.log_interval, trainer, device, train_loader, epoch)
        test_with_trainer(trainer, device, test_loader)
def testToyBERTModelLegacyExperimentalLRScheduler(initial_lr, lr_scheduler, legacy_lr_scheduler):
    ############################################################################
    # These tests require hard-coded values for 'total_steps' and 'initial_lr' #
    ############################################################################

    # Common setup
    total_steps = 10
    device = 'cuda'
    seed = 1

    # EXPERIMENTAL API
    model_desc = bert_model_description()
    model = load_bert_onnx_model()
    torch.manual_seed(seed)
    onnxruntime.set_seed(seed)
    optim_config = optim.AdamConfig(lr=initial_lr)
    opts = orttrainer.ORTTrainerOptions({
        'debug': {
            'deterministic_compute': True
        },
        'device': {
            'id': device,
        },
        'lr_scheduler': lr_scheduler(total_steps=total_steps, warmup=0.5)
    })
    trainer = orttrainer.ORTTrainer(model, model_desc, optim_config, options=opts)
    experimental_losses = []
    for i in range(total_steps):
        sample_input = generate_random_input_from_model_desc(model_desc, i)
        experimental_losses.append(trainer.train_step(*sample_input).cpu().item())
        assert_allclose(trainer.options.lr_scheduler.get_last_lr()[0], legacy_lr_scheduler(i))

    # LEGACY IMPLEMENTATION
    torch.manual_seed(seed)
    onnxruntime.set_seed(seed)
    device = torch.device(device)
    legacy_model_desc, learning_rate_description, learning_rate = legacy_model_params(initial_lr)
    legacy_trainer = Legacy_ORTTrainer(model, None, legacy_model_desc, "AdamOptimizer", None,
                                       learning_rate_description, device,
                                       _use_deterministic_compute=True,
                                       get_lr_this_step=legacy_lr_scheduler)
    legacy_losses = []
    for i in range(total_steps):
        sample_input = generate_random_input_from_model_desc(model_desc, i)
        leg_loss = legacy_trainer.train_step(*sample_input)
        legacy_losses.append(leg_loss.cpu().item())

    # Check results
    _test_helpers.assert_model_outputs(experimental_losses, legacy_losses)
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--train-steps', type=int, default=-1, metavar='N', help='number of steps to train. Set -1 to run through whole dataset (default: -1)')
    parser.add_argument('--lr', type=float, default=0.01, metavar='LR', help='learning rate (default: 0.01)')
    parser.add_argument('--batch-size', type=int, default=32, metavar='N', help='input batch size for training (default: 32)')
    parser.add_argument('--test-batch-size', type=int, default=64, metavar='N', help='input batch size for testing (default: 64)')
    parser.add_argument('--no-cuda', action='store_true', default=False, help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=42, metavar='S', help='random seed (default: 42)')
    parser.add_argument('--pytorch-only', action='store_true', default=False, help='disables ONNX Runtime training')
    parser.add_argument('--log-interval', type=int, default=300, metavar='N', help='how many batches to wait before logging training status (default: 300)')
    parser.add_argument('--view-graphs', action='store_true', default=False, help='views forward and backward graphs')
    parser.add_argument('--epochs', type=int, default=5, metavar='N', help='number of epochs to train (default: 5)')
    parser.add_argument('--log-level', choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], default='WARNING', help='Log level (default: WARNING)')
    parser.add_argument('--data-dir', type=str, default='./mnist', help='Path to the mnist data directory')
    args = parser.parse_args()

    # Common setup
    torch.manual_seed(args.seed)
    onnxruntime.set_seed(args.seed)

    if not args.no_cuda and torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"

    ## Data loader
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST(args.data_dir, train=True, download=True,
                       transform=transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])),
        batch_size=args.batch_size, shuffle=True)
    test_loader = None
    if args.test_batch_size > 0:
        test_loader = torch.utils.data.DataLoader(
            datasets.MNIST(args.data_dir, train=False,
                           transform=transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])),
            batch_size=args.test_batch_size, shuffle=True)

    # Model architecture
    model = NeuralNet(input_size=784, hidden_size=500, num_classes=10).to(device)
    if not args.pytorch_only:
        print('Training MNIST on ORTModule....')

        # Just for future debugging
        debug_options = DebugOptions(save_onnx=False, onnx_prefix='MNIST')

        model = ORTModule(model, debug_options)

        # Set log level
        numeric_level = getattr(logging, args.log_level.upper(), None)
        if not isinstance(numeric_level, int):
            raise ValueError('Invalid log level: %s' % args.log_level)
        logging.basicConfig(level=numeric_level)
    else:
        print('Training MNIST on vanilla PyTorch....')
    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr)

    # Train loop
    total_training_time, total_test_time, epoch_0_training, validation_accuracy = 0, 0, 0, 0
    for epoch in range(0, args.epochs):
        total_training_time += train(args, model, device, optimizer, my_loss, train_loader, epoch)
        if not args.pytorch_only and epoch == 0:
            epoch_0_training = total_training_time
        if args.test_batch_size > 0:
            test_time, validation_accuracy = test(args, model, device, my_loss, test_loader)
            total_test_time += test_time

    assert validation_accuracy > 0.92

    print('\n======== Global stats ========')
    if not args.pytorch_only:
        estimated_export = 0
        if args.epochs > 1:
            estimated_export = epoch_0_training - (total_training_time - epoch_0_training) / (args.epochs - 1)
            print(" Estimated ONNX export took: {:.4f}s".format(estimated_export))
        else:
            print(" Estimated ONNX export took: Estimate available when epochs > 1 only")
        print(" Accumulated training without export took: {:.4f}s".format(total_training_time - estimated_export))
    print(" Accumulated training took: {:.4f}s".format(total_training_time))
    print(" Accumulated validation took: {:.4f}s".format(total_test_time))
def runBertTrainingTest(gradient_accumulation_steps, use_mixed_precision, allreduce_post_accumulation,
                        use_simple_model_desc=True, use_internel_loss_scale=False):
    torch.manual_seed(1)
    onnxruntime.set_seed(1)

    loss_scaler = LossScaler("ort_test_input_loss_scalar", True) if use_internel_loss_scale else None

    model, model_desc, device = create_ort_trainer(gradient_accumulation_steps, use_mixed_precision,
                                                   allreduce_post_accumulation, use_simple_model_desc, loss_scaler)

    if loss_scaler is None:
        loss_scaler = LossScaler(model.loss_scale_input_name, True)

    input_ids_batches = []
    segment_ids_batches = []
    input_mask_batches = []
    masked_lm_labels_batches = []
    next_sentence_labels_batches = []
    batch_size = 16
    num_batches = 8
    for batch in range(num_batches):
        input_ids_batches = [*input_ids_batches, generate_sample_batch(model_desc.inputs_[0], batch_size, device)]
        segment_ids_batches = [*segment_ids_batches, generate_sample_batch(model_desc.inputs_[1], batch_size, device)]
        input_mask_batches = [*input_mask_batches, generate_sample_batch(model_desc.inputs_[2], batch_size, device)]
        masked_lm_labels_batches = [*masked_lm_labels_batches, generate_sample_batch(model_desc.inputs_[3], batch_size, device)]
        next_sentence_labels_batches = [*next_sentence_labels_batches, generate_sample_batch(model_desc.inputs_[4], batch_size, device)]

    lr_batch_list = [
        0.0000000e+00, 4.6012269e-07, 9.2024538e-07, 1.3803681e-06, 1.8404908e-06,
        2.3006135e-06, 2.7607362e-06, 3.2208588e-06, 3.6809815e-06
    ]

    actual_losses = []
    actual_all_finites = []

    for batch_count in range(num_batches):
        input_ids = generate_sample_batch(model_desc.inputs_[0], batch_size, device)
        segment_ids = generate_sample_batch(model_desc.inputs_[1], batch_size, device)
        input_mask = generate_sample_batch(model_desc.inputs_[2], batch_size, device)
        masked_lm_labels = generate_sample_batch(model_desc.inputs_[3], batch_size, device)
        next_sentence_labels = generate_sample_batch(model_desc.inputs_[4], batch_size, device)
        lr = lr_batch_list[batch_count]

        learning_rate = torch.tensor([lr]).to(device)
        training_args = [input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels, learning_rate]
        if use_mixed_precision:
            if not use_internel_loss_scale:
                loss_scale = torch.tensor([loss_scaler.loss_scale_]).to(device)
                training_args.append(loss_scale)
            actual_loss = model.train_step(*training_args)
            if isinstance(actual_loss, (list, tuple)):
                assert len(actual_loss) == 2
                actual_loss, actual_all_finite = actual_loss
                if not use_internel_loss_scale:
                    loss_scaler.update_loss_scale(actual_all_finite.item())
                    actual_all_finites = [*actual_all_finites, actual_all_finite.cpu().numpy().item(0)]

            actual_losses = [*actual_losses, actual_loss.cpu().numpy().item(0)]
        else:
            loss = model(*training_args)
            actual_losses = [*actual_losses, loss.cpu().numpy().item(0)]

        if batch_count == num_batches - 1:
            # Test the eval_step api with fetches at the end of the training.
            # If eval_step is called during the training, it will affect the actual training loss
            # (the training session is stateful).
            eval_loss = model.eval_step(input_ids, segment_ids, input_mask, masked_lm_labels,
                                        next_sentence_labels, fetches=['loss'])
            eval_loss = eval_loss.cpu().numpy().item(0)

    # If using internal loss scale, all_finites are handled internally too.
    if use_mixed_precision and not use_internel_loss_scale:
        return actual_losses, actual_all_finites, eval_loss
    else:
        return actual_losses, eval_loss
def testToyBERTModelMixedPrecisionLossScalerLegacyExperimental(loss_scaler, legacy_loss_scaler):
    # Common setup
    total_steps = 10
    device = "cuda"
    seed = 1

    # EXPERIMENTAL IMPLEMENTATION
    torch.manual_seed(seed)
    onnxruntime.set_seed(seed)
    model_desc = bert_model_description()
    model = load_bert_onnx_model()
    optim_config = optim.LambConfig()
    opts = orttrainer.ORTTrainerOptions({
        'debug': {
            'deterministic_compute': True
        },
        'device': {
            'id': device,
        },
        'mixed_precision': {
            'enabled': True,
            'loss_scaler': loss_scaler
        }
    })
    trainer = orttrainer.ORTTrainer(model, model_desc, optim_config, options=opts)
    experimental_losses = []
    for i in range(total_steps):
        sample_input = generate_random_input_from_model_desc(model_desc, i)
        experimental_losses.append(trainer.train_step(*sample_input).cpu().item())

    # LEGACY IMPLEMENTATION
    device = torch.device(device)
    legacy_model_desc, learning_rate_description, learning_rate = legacy_model_params(optim_config.lr)
    torch.manual_seed(seed)
    onnxruntime.set_seed(seed)
    legacy_trainer = Legacy_ORTTrainer(model, None, legacy_model_desc, "LambOptimizer", None,
                                       learning_rate_description, device,
                                       _use_deterministic_compute=True,
                                       use_mixed_precision=True,
                                       loss_scaler=legacy_loss_scaler)
    legacy_losses = []
    for i in range(total_steps):
        sample_input = generate_random_input_from_model_desc(model_desc, i)
        leg_loss = legacy_trainer.train_step(*sample_input, learning_rate)
        legacy_losses.append(leg_loss.cpu().item())

    # Check results
    _test_helpers.assert_model_outputs(experimental_losses, legacy_losses, rtol=1e-5)
def testToyBERTModelLegacyExperimentalLRScheduler(initial_lr, lr_scheduler, legacy_lr_scheduler):
    ############################################################################
    # These tests require hard-coded values for 'total_steps' and 'initial_lr' #
    ############################################################################

    # Common setup
    total_steps = 128
    device = 'cuda'
    seed = 1
    warmup = 0.05
    cycles = 0.5
    power = 1.
    lr_end = 1e-7

    # Setup both Experimental and Legacy LR Schedulers before the experimental loop
    if legacy_lr_scheduler == _test_commons.legacy_constant_lr_scheduler or legacy_lr_scheduler == _test_commons.legacy_linear_lr_scheduler:
        legacy_lr_scheduler = partial(legacy_lr_scheduler, initial_lr=initial_lr, total_steps=total_steps, warmup=warmup)
    elif legacy_lr_scheduler == _test_commons.legacy_cosine_lr_scheduler:
        legacy_lr_scheduler = partial(legacy_lr_scheduler, initial_lr=initial_lr, total_steps=total_steps, warmup=warmup, cycles=cycles)
    elif legacy_lr_scheduler == _test_commons.legacy_poly_lr_scheduler:
        legacy_lr_scheduler = partial(legacy_lr_scheduler, initial_lr=initial_lr, total_steps=total_steps, warmup=warmup, power=power, lr_end=lr_end)
    else:
        raise RuntimeError("Invalid legacy_lr_scheduler")

    if lr_scheduler == optim.lr_scheduler.ConstantWarmupLRScheduler or lr_scheduler == optim.lr_scheduler.LinearWarmupLRScheduler:
        lr_scheduler = lr_scheduler(total_steps=total_steps, warmup=warmup)
    elif lr_scheduler == optim.lr_scheduler.CosineWarmupLRScheduler:
        lr_scheduler = lr_scheduler(total_steps=total_steps, warmup=warmup, cycles=cycles)
    elif lr_scheduler == optim.lr_scheduler.PolyWarmupLRScheduler:
        lr_scheduler = lr_scheduler(total_steps=total_steps, warmup=warmup, power=power, lr_end=lr_end)
    else:
        raise RuntimeError("Invalid lr_scheduler")

    # EXPERIMENTAL API
    model_desc = bert_model_description()
    model = load_bert_onnx_model()
    torch.manual_seed(seed)
    onnxruntime.set_seed(seed)
    optim_config = optim.AdamConfig(lr=initial_lr)
    opts = orttrainer.ORTTrainerOptions({
        'debug': {
            'deterministic_compute': True
        },
        'device': {
            'id': device,
        },
        'lr_scheduler': lr_scheduler
    })
    trainer = orttrainer.ORTTrainer(model, model_desc, optim_config, options=opts)
    experimental_losses = []
    for i in range(total_steps):
        sample_input = generate_random_input_from_model_desc(model_desc, i)
        experimental_losses.append(trainer.train_step(*sample_input).cpu().item())
        assert_allclose(trainer.options.lr_scheduler.get_last_lr()[0], legacy_lr_scheduler(i))

    # LEGACY IMPLEMENTATION
    torch.manual_seed(seed)
    onnxruntime.set_seed(seed)
    device = torch.device(device)
    model = load_bert_onnx_model()
    legacy_model_desc, learning_rate_description, learning_rate = legacy_model_params(initial_lr)
    legacy_trainer = Legacy_ORTTrainer(model, None, legacy_model_desc, "AdamOptimizer", None,
                                       learning_rate_description, device,
                                       _use_deterministic_compute=True,
                                       get_lr_this_step=legacy_lr_scheduler)
    legacy_losses = []
    for i in range(total_steps):
        sample_input = generate_random_input_from_model_desc(model_desc, i)
        leg_loss = legacy_trainer.train_step(*sample_input)
        legacy_losses.append(leg_loss.cpu().item())

    # Check results
    _test_helpers.assert_model_outputs(experimental_losses, legacy_losses)
def run_multiple_choice(self, model_name, task_name, fp16, use_new_api):
    model_args = ModelArguments(model_name_or_path=model_name, cache_dir=self.cache_dir)
    data_args = DataTrainingArguments(task_name=task_name, data_dir=self.data_dir, max_seq_length=self.max_seq_length)
    training_args = TrainingArguments(output_dir=os.path.join(self.output_dir, task_name),
                                      do_train=True,
                                      do_eval=True,
                                      per_gpu_train_batch_size=self.train_batch_size,
                                      per_gpu_eval_batch_size=self.eval_batch_size,
                                      learning_rate=self.learning_rate,
                                      num_train_epochs=self.num_train_epochs,
                                      local_rank=self.local_rank,
                                      overwrite_output_dir=self.overwrite_output_dir,
                                      gradient_accumulation_steps=self.gradient_accumulation_steps,
                                      fp16=fp16,
                                      logging_steps=self.logging_steps)

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    set_seed(training_args.seed)
    onnxruntime.set_seed(training_args.seed)

    try:
        processor = SwagProcessor()
        label_list = processor.get_labels()
        num_labels = len(label_list)
    except KeyError:
        raise ValueError("Task not found: %s" % (data_args.task_name))

    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    model = AutoModelForMultipleChoice.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # Get datasets
    train_dataset = (
        MultipleChoiceDataset(
            data_dir=data_args.data_dir,
            tokenizer=tokenizer,
            task=data_args.task_name,
            processor=processor,
            max_seq_length=data_args.max_seq_length,
            overwrite_cache=data_args.overwrite_cache,
            mode=Split.train,
        )
        if training_args.do_train
        else None
    )
    eval_dataset = (
        MultipleChoiceDataset(
            data_dir=data_args.data_dir,
            tokenizer=tokenizer,
            task=data_args.task_name,
            processor=processor,
            max_seq_length=data_args.max_seq_length,
            overwrite_cache=data_args.overwrite_cache,
            mode=Split.dev,
        )
        if training_args.do_eval
        else None
    )

    def compute_metrics(p: EvalPrediction) -> Dict:
        preds = np.argmax(p.predictions, axis=1)
        return {"acc": simple_accuracy(preds, p.label_ids)}

    if model_name.startswith('bert'):
        model_desc = ModelDescription([
            IODescription('input_ids', ['batch', num_labels, 'max_seq_len_in_batch']),
            IODescription('attention_mask', ['batch', num_labels, 'max_seq_len_in_batch']),
            IODescription('token_type_ids', ['batch', num_labels, 'max_seq_len_in_batch']),
            IODescription('labels', ['batch', num_labels])], [
            IODescription('loss', []),
            IODescription('reshaped_logits', ['batch', num_labels])])
        new_model_desc = {
            'inputs': [
                ('input_ids', ['batch', num_labels, 'max_seq_len_in_batch'],),
                ('attention_mask', ['batch', num_labels, 'max_seq_len_in_batch'],),
                ('token_type_ids', ['batch', num_labels, 'max_seq_len_in_batch'],),
                ('labels', ['batch', num_labels],)],
            'outputs': [('loss', [], True),
                        ('reshaped_logits', ['batch', num_labels])]}
    else:
        model_desc = ModelDescription([
            IODescription('input_ids', ['batch', num_labels, 'max_seq_len_in_batch']),
            IODescription('attention_mask', ['batch', num_labels, 'max_seq_len_in_batch']),
            IODescription('labels', ['batch', num_labels])], [
            IODescription('loss', []),
            IODescription('reshaped_logits', ['batch', num_labels])])
        new_model_desc = {
            'inputs': [
                ('input_ids', ['batch', num_labels, 'max_seq_len_in_batch'],),
                ('attention_mask', ['batch', num_labels, 'max_seq_len_in_batch'],),
                ('labels', ['batch', num_labels],)],
            'outputs': [('loss', [], True),
                        ('reshaped_logits', ['batch', num_labels])]}

    # Initialize the ORTTrainer within ORTTransformerTrainer
    trainer = ORTTransformerTrainer(
        model=model,
        model_desc=model_desc,
        new_model_desc=new_model_desc,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
        use_new_api=use_new_api,
    )

    # Training
    if training_args.do_train:
        trainer.train()
        trainer.save_model()

    # Evaluation
    results = {}
    if training_args.do_eval and training_args.local_rank in [-1, 0]:
        logger.info("*** Evaluate ***")

        result = trainer.evaluate()

        logger.info("***** Eval results {} *****".format(data_args.task_name))
        for key, value in result.items():
            logger.info(" %s = %s", key, value)

        results.update(result)

    return results
) from e

# Verify whether PyTorch C++ extensions are already compiled
# TODO: detect when installed extensions are outdated and need reinstallation. Hash? Version file?
if not is_torch_cpp_extensions_installed(ORTMODULE_TORCH_CPP_DIR) and "-m" not in sys.argv:
    _FALLBACK_INIT_EXCEPTION = wrap_exception(
        ORTModuleInitException,
        RuntimeError(
            f"ORTModule's extensions were not detected at '{ORTMODULE_TORCH_CPP_DIR}' folder. "
            "Run `python -m torch_ort.configure` before using `ORTModule` frontend."
        ),
    )

# Initialize ORT's random seed with pytorch's initial seed
# in case user has set pytorch seed before importing ORTModule
set_seed((torch.initial_seed() % sys.maxsize))


# Override torch.manual_seed and torch.cuda.manual_seed
def override_torch_manual_seed(seed):
    set_seed(int(seed % sys.maxsize))
    return torch_manual_seed(seed)


torch_manual_seed = torch.manual_seed
torch.manual_seed = override_torch_manual_seed


def override_torch_cuda_manual_seed(seed):
    set_seed(int(seed % sys.maxsize))
    return torch_cuda_manual_seed(seed)
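# Illustrative usage note (an assumption about downstream behavior, not part of this file):
# once onnxruntime.training.ortmodule has been imported, seeding PyTorch also seeds ONNX Runtime
# through the overrides above, so a single user call keeps both RNGs aligned, e.g.:
#
#     import torch
#     from onnxruntime.training.ortmodule import ORTModule
#
#     torch.manual_seed(42)       # also invokes onnxruntime set_seed(42) via override_torch_manual_seed
#     torch.cuda.manual_seed(42)  # likewise routed through override_torch_cuda_manual_seed, assuming the
#                                 # corresponding torch.cuda.manual_seed assignment follows in this module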
def do_pretrain(args):
    if is_main_process(args) and args.tensorboard_dir:
        tb_writer = SummaryWriter(log_dir=args.tensorboard_dir)
        tb_writer.add_text("args", args.to_json_string())
        tb_writer.add_hparams(args.to_sanitized_dict(), metric_dict={})
    else:
        tb_writer = None

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    ort.set_seed(args.seed)

    device, args = setup_training(args)

    model = prepare_model(args, device)

    logger.info("Running training: Batch size = %d, initial LR = %f", args.train_batch_size, args.learning_rate)

    most_recent_ckpts_paths = []
    average_loss = 0.0
    epoch = 0
    training_steps = 0

    pool = ProcessPoolExecutor(1)
    while True:
        files = [os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir)
                 if os.path.isfile(os.path.join(args.input_dir, f)) and 'training' in f]
        files.sort()
        random.shuffle(files)

        f_id = 0
        train_dataloader, data_file = create_pretraining_dataset(
            get_data_file(f_id, args.world_rank, args.world_size, files),
            args.max_predictions_per_seq, args)

        for f_id in range(1, len(files)):
            logger.info("data file %s" % (data_file))

            dataset_future = pool.submit(
                create_pretraining_dataset,
                get_data_file(f_id, args.world_rank, args.world_size, files),
                args.max_predictions_per_seq, args)

            train_iter = tqdm(train_dataloader, desc="Iteration") if is_main_process(args) else train_dataloader
            for step, batch in enumerate(train_iter):
                training_steps += 1
                batch = [t.to(device) for t in batch]
                input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels = batch

                loss, _, _ = model.train_step(input_ids, input_mask, segment_ids, masked_lm_labels, next_sentence_labels)
                average_loss += loss.item()

                global_step = model._train_step_info.optimization_step
                if training_steps % (args.log_freq * args.gradient_accumulation_steps) == 0:
                    if is_main_process(args):
                        divisor = args.log_freq * args.gradient_accumulation_steps
                        if tb_writer:
                            lr = model.options.lr_scheduler.get_last_lr()[0]
                            tb_writer.add_scalar('train/summary/scalar/Learning_Rate', lr, global_step)
                            if args.fp16:
                                tb_writer.add_scalar('train/summary/scalar/loss_scale_25', loss, global_step)
                                # TODO: ORTTrainer to expose all_finite
                                # tb_writer.add_scalar('train/summary/scalar/all_fp16_gradients_finite_859', all_finite, global_step)
                            tb_writer.add_scalar('train/summary/total_loss', average_loss / divisor, global_step)
                        print("Step:{} Average Loss = {}".format(global_step, average_loss / divisor))

                    if global_step >= args.max_steps or global_step >= force_to_stop_max_steps:
                        if tb_writer:
                            tb_writer.close()

                        final_loss = average_loss / (args.log_freq * args.gradient_accumulation_steps)
                        return final_loss

                    average_loss = 0

            del train_dataloader

            train_dataloader, data_file = dataset_future.result(timeout=None)

        epoch += 1
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='ONNX Runtime MNIST Example')
    parser.add_argument('--train-steps', type=int, default=-1, metavar='N', help='number of steps to train. Set -1 to run through whole dataset (default: -1)')
    parser.add_argument('--batch-size', type=int, default=20, metavar='N', help='input batch size for training (default: 20)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N', help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=1, metavar='N', help='number of epochs to train (default: 1)')
    parser.add_argument('--lr', type=float, default=0.01, metavar='LR', help='learning rate (default: 0.01)')
    parser.add_argument('--no-cuda', action='store_true', default=False, help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1, metavar='S', help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N', help='how many batches to wait before logging training status')
    parser.add_argument('--save-path', type=str, default='', help='Path for Saving the current Model state')

    # Basic setup
    args = parser.parse_args()
    if not args.no_cuda and torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"

    torch.manual_seed(args.seed)
    onnxruntime.set_seed(args.seed)

    # Data loader
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('./data', train=True, download=True,
                       transform=transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])),
        batch_size=args.batch_size, shuffle=True)
    if args.test_batch_size > 0:
        test_loader = torch.utils.data.DataLoader(
            datasets.MNIST('./data', train=False,
                           transform=transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])),
            batch_size=args.test_batch_size, shuffle=True)

    # Modeling
    model = NeuralNet(784, 500, 10)
    model_desc = mnist_model_description()
    optim_config = optim.SGDConfig(lr=args.lr)
    opts = {'device': {'id': device}}
    opts = ORTTrainerOptions(opts)
    trainer = ORTTrainer(model, model_desc, optim_config, loss_fn=my_loss, options=opts)

    # Train loop
    for epoch in range(1, args.epochs + 1):
        train(args.log_interval, trainer, device, train_loader, epoch, args.train_steps)
        if args.test_batch_size > 0:
            test(trainer, device, test_loader)

    # Save model
    if args.save_path:
        torch.save(model.state_dict(), os.path.join(args.save_path, "mnist_cnn.pt"))
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--train-steps', type=int, default=-1, metavar='N', help='number of steps to train. Set -1 to run through whole dataset (default: -1)')
    parser.add_argument('--lr', type=float, default=0.01, metavar='LR', help='learning rate (default: 0.01)')
    parser.add_argument('--batch-size', type=int, default=32, metavar='N', help='input batch size for training (default: 32)')
    parser.add_argument('--test-batch-size', type=int, default=64, metavar='N', help='input batch size for testing (default: 64)')
    parser.add_argument('--no-cuda', action='store_true', default=False, help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=42, metavar='S', help='random seed (default: 42)')
    parser.add_argument('--pytorch-only', action='store_true', default=False, help='disables ONNX Runtime training')
    parser.add_argument('--log-interval', type=int, default=300, metavar='N', help='how many batches to wait before logging training status (default: 300)')
    parser.add_argument('--view-graphs', action='store_true', default=False, help='views forward and backward graphs')
    parser.add_argument('--epochs', type=int, default=10, metavar='N', help='number of epochs to train (default: 10)')
    parser.add_argument('--log-level', choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], default='WARNING', help='Log level (default: WARNING)')
    parser.add_argument('--data-dir', type=str, default='./mnist', help='Path to the mnist data directory')

    # DeepSpeed-related settings
    parser.add_argument('--local_rank', type=int, required=True, help='local rank passed from distributed launcher')
    parser = deepspeed.add_config_arguments(parser)

    args = parser.parse_args()

    # Common setup
    torch.manual_seed(args.seed)
    onnxruntime.set_seed(args.seed)

    # TODO: CUDA support is broken due to copying from PyTorch into ORT
    if not args.no_cuda and torch.cuda.is_available():
        device = "cuda:" + str(args.local_rank)
    else:
        device = "cpu"

    ## Data loader
    dist.init_process_group(backend='nccl')
    if args.local_rank == 0:
        # download only once on rank 0
        datasets.MNIST(args.data_dir, download=True)
    dist.barrier()
    train_set = datasets.MNIST(args.data_dir, train=True,
                               transform=transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]))
    test_loader = None
    if args.test_batch_size > 0:
        test_loader = torch.utils.data.DataLoader(
            datasets.MNIST(args.data_dir, train=False,
                           transform=transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])),
            batch_size=args.test_batch_size, shuffle=True)

    # Model architecture
    model = NeuralNet(input_size=784, hidden_size=500, num_classes=10).to(device)
    if not args.pytorch_only:
        print('Training MNIST on ORTModule....')
        model = ORTModule(model)

        # TODO: change it to False to stop saving ONNX models
        model._save_onnx = True
        model._save_onnx_prefix = 'MNIST'

        # Set log level
        numeric_level = getattr(logging, args.log_level.upper(), None)
        if not isinstance(numeric_level, int):
            raise ValueError('Invalid log level: %s' % args.log_level)
        logging.basicConfig(level=numeric_level)
    else:
        print('Training MNIST on vanilla PyTorch....')

    model, optimizer, train_loader, _ = deepspeed.initialize(
        args=args, model=model,
        model_parameters=[p for p in model.parameters() if p.requires_grad],
        training_data=train_set)

    # Train loop
    total_training_time, total_test_time, epoch_0_training = 0, 0, 0
    for epoch in range(0, args.epochs):
        total_training_time += train(args, model, device, optimizer, my_loss, train_loader, epoch)
        if not args.pytorch_only and epoch == 0:
            epoch_0_training = total_training_time
        if args.test_batch_size > 0:
            total_test_time += test(args, model, device, my_loss, test_loader)

    print('\n======== Global stats ========')
    if not args.pytorch_only:
        estimated_export = 0
        if args.epochs > 1:
            estimated_export = epoch_0_training - (total_training_time - epoch_0_training) / (args.epochs - 1)
            print(" Estimated ONNX export took: {:.4f}s".format(estimated_export))
        else:
            print(" Estimated ONNX export took: Estimate available when epochs > 1 only")
        print(" Accumulated training without export took: {:.4f}s".format(total_training_time - estimated_export))
    print(" Accumulated training took: {:.4f}s".format(total_training_time))
    print(" Accumulated validation took: {:.4f}s".format(total_test_time))