def testOptimizerConfigSGD():
    '''Test initialization of SGD'''
    rtol = 1e-05

    # Default construction
    cfg = optim.SGDConfig()
    assert cfg.name == 'SGDOptimizer'
    assert_allclose(0.001, cfg.lr, rtol=rtol, err_msg="lr mismatch")

    # Custom learning rate
    cfg = optim.SGDConfig(lr=0.002)
    assert_allclose(0.002, cfg.lr, rtol=rtol, err_msg="lr mismatch")

    # SGD does not support params
    with pytest.raises(AssertionError) as e:
        params = [{'params': ['layer1.weight'], 'lr': 0.1}]
        optim.SGDConfig(params=params, lr=0.002)
    assert_allclose(0.002, cfg.lr, rtol=rtol, err_msg="lr mismatch")
    assert str(e.value) == "'params' must be an empty list for SGD optimizer"
def testToyBertStateDictWrapModelLossFn():
    # Common setup
    seed = 1
    torch.manual_seed(seed)
    onnxruntime.set_seed(seed)

    # Modeling
    class LinearModel(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.linear = torch.nn.Linear(2, 4)

        def forward(self, y=None, x=None):
            out = self.linear(x)
            return out + y if y is not None else out + torch.ones(2, 4)

    pt_model = LinearModel()
    model_desc = {
        'inputs': [('x', [2, 2]), ('label', [
            2,
        ])],
        'outputs': [('loss', [], True), ('output', [2, 4])]
    }
    optim_config = optim.SGDConfig(lr=0.02)

    def loss_fn(x, label):
        return F.nll_loss(F.log_softmax(x, dim=1), label)

    trainer = orttrainer.ORTTrainer(pt_model, model_desc, optim_config, loss_fn=loss_fn)

    # Before training, the trainer has no state to export
    state_dict = checkpoint.experimental_state_dict(trainer)
    assert state_dict == {}

    # Executing train_step() once
    data = torch.randn(2, 2)
    label = torch.tensor([0, 1], dtype=torch.int64)
    trainer.train_step(x=data, label=label)

    # After one step, the state dict exposes the wrapped model's parameters
    state_dict = checkpoint.experimental_state_dict(trainer)
    assert state_dict.keys() == {'linear.bias', 'linear.weight'}
def testLRSchedulerUpdateImpl(lr_scheduler, expected_values):
    # Test tolerance
    rtol = 1e-04

    # Initial state
    initial_lr = 1
    total_steps = 10
    warmup = 0.5
    optimizer_config = optim.SGDConfig(lr=initial_lr)
    lr_scheduler = lr_scheduler(total_steps, warmup)

    # First half is warmup
    for step in range(total_steps):
        # Emulate ORTTrainer.train_step() call that updates its train_step_info
        info = TrainStepInfo(optimizer_config=optimizer_config, optimization_step=step)
        lr_scheduler._step(info)
        last_lrs = lr_scheduler.get_last_lr()
        assert len(last_lrs) == 1
        assert_allclose(last_lrs[0], expected_values[step],
                        rtol=rtol, err_msg="lr mismatch")
def train_ort_model(epoch=1):
    """Run one epoch of transformer LM training through ORTTrainer, logging progress.

    Args:
        epoch (int): epoch number, used only in the progress log line.
    """
    device = "cuda"
    ntokens = 28785     # vocabulary size
    bptt = 35           # sequence length per training batch
    batch_size = 20
    initial_lr = 0.001

    # NOTE(review): second/third args look like train/eval batch sizes — both
    # were hard-coded 20, same as batch_size; confirm against prepare_data().
    train_data, val_data, test_data = prepare_data(device, batch_size, batch_size)

    # Load the PyTorch model definition from a sibling source file
    pt_model_path = os.path.join('pt_model.py')
    pt_model = _utils.import_module_from_file(pt_model_path)
    model = pt_model.TransformerModel(ntokens, 200, 2, 200, 2, 0.2).to(device)

    # Use the named constants (not repeated literals) so shapes stay in sync
    model_desc = {'inputs': [('input1', [bptt, batch_size]),
                             ('label', [bptt * batch_size])],
                  'outputs': [('loss', [], True),
                              ('predictions', [bptt, batch_size, ntokens])]}
    opts = orttrainer.ORTTrainerOptions({'device': {'id': device}})
    optim_config = optim.SGDConfig(lr=initial_lr)
    trainer = orttrainer.ORTTrainer(model, model_desc, optim_config,
                                    loss_fn=my_loss, options=opts)

    total_loss = 0.
    start_time = time.time()
    for batch, i in enumerate(range(0, train_data.size(0) - bptt, bptt)):
        data, targets = get_batch(train_data, i)
        output = trainer.train_step(data, targets)
        total_loss += output[0].item()

        log_interval = 200
        if batch % log_interval == 0 and batch > 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            print('| {} | epoch {:3d} | {:5d}/{:5d} batches | '
                  'lr {:02.3f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                      device, epoch, batch, len(train_data) // bptt, initial_lr,
                      elapsed * 1000 / log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=10, metavar='N',
                        help='number of epochs to train (default: 10)')
    parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')

    # Basic setup
    args = parser.parse_args()
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    device = "cuda" if use_cuda else "cpu"

    torch.manual_seed(args.seed)
    onnxruntime.set_seed(args.seed)

    # Data loaders share the same normalization pipeline
    mnist_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307, ), (0.3081, ))
    ])
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('./data', train=True, download=True, transform=mnist_transform),
        batch_size=args.batch_size,
        shuffle=True)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('./data', train=False, transform=mnist_transform),
        batch_size=args.test_batch_size,
        shuffle=True)

    # Modeling
    model = NeuralNet(784, 500, 10)
    model_desc = mnist_model_description()
    optim_config = optim.SGDConfig(lr=args.lr)
    opts = ORTTrainerOptions({'device': {'id': device}})
    trainer = ORTTrainer(model, model_desc, optim_config, loss_fn=my_loss, options=opts)

    # Train loop: train then evaluate once per epoch
    for epoch in range(1, args.epochs + 1):
        train_with_trainer(args.log_interval, trainer, device, train_loader, epoch)
        test_with_trainer(trainer, device, test_loader)