def run(self, hparams):
    transform = T.Compose([T.ToTensor(), T.Normalize((0.1307,), (0.3081,))])
    if self.is_global_zero:
        MNIST("./data", download=True)
    self.barrier()
    train_dataset = MNIST("./data", train=True, transform=transform)
    test_dataset = MNIST("./data", train=False, transform=transform)
    train_loader = torch.utils.data.DataLoader(train_dataset, hparams.batch_size)
    test_loader = torch.utils.data.DataLoader(test_dataset, hparams.test_batch_size)

    train_loader, test_loader = self.setup_dataloaders(train_loader, test_loader)

    model = Net()
    optimizer = optim.Adadelta(model.parameters(), lr=hparams.lr)

    model, optimizer = self.setup(model, optimizer)

    scheduler = StepLR(optimizer, step_size=1, gamma=hparams.gamma)

    MainLoop(self, hparams, model, optimizer, scheduler, train_loader, test_loader).run()

    if hparams.save_model and self.is_global_zero:
        self.save(model.state_dict(), "mnist_cnn.pt")
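# A possible launcher for the snippet above (a sketch, not part of the original example):
# it assumes `run` is a method of a `Lite(LightningLite)` subclass and that `hparams` is an
# argparse-style namespace carrying the fields referenced above, plus whatever `MainLoop`
# reads (e.g. epochs / log_interval / dry_run).
if __name__ == "__main__":
    from argparse import Namespace

    hparams = Namespace(
        batch_size=64, test_batch_size=1000, lr=1.0, gamma=0.7,
        epochs=2, log_interval=10, dry_run=False, save_model=True, seed=42,
    )
    # single-process CPU run; e.g. Lite(accelerator="gpu", devices=2, strategy="ddp") for multi-GPU
    Lite(accelerator="cpu", devices=1).run(hparams)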
def run(self, hparams):
    self.hparams = hparams
    seed_everything(hparams.seed)  # instead of torch.manual_seed(...)

    transform = T.Compose([T.ToTensor(), T.Normalize((0.1307,), (0.3081,))])

    # This is meant to ensure the data are downloaded by only one process.
    if self.is_global_zero:
        MNIST("./data", download=True)
    self.barrier()

    train_dataset = MNIST("./data", train=True, transform=transform)
    test_dataset = MNIST("./data", train=False, transform=transform)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=hparams.batch_size)
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=hparams.batch_size)

    # don't forget to call `setup_dataloaders` to prepare the dataloaders for distributed training.
    train_loader, test_loader = self.setup_dataloaders(train_loader, test_loader)

    model = Net()  # remove the call to .to(device)
    optimizer = optim.Adadelta(model.parameters(), lr=hparams.lr)

    # don't forget to call `setup` to prepare the model / optimizer for distributed training.
    # the model is moved automatically to the right device.
    model, optimizer = self.setup(model, optimizer)

    scheduler = StepLR(optimizer, step_size=1, gamma=hparams.gamma)

    # use torchmetrics instead of manually computing the accuracy
    test_acc = Accuracy().to(self.device)

    # EPOCH LOOP
    for epoch in range(1, hparams.epochs + 1):

        # TRAINING LOOP
        model.train()
        for batch_idx, (data, target) in enumerate(train_loader):
            # NOTE: no need to call `.to(device)` on the data, target
            optimizer.zero_grad()
            output = model(data)
            loss = F.nll_loss(output, target)
            self.backward(loss)  # instead of loss.backward()
            optimizer.step()
            if (batch_idx == 0) or ((batch_idx + 1) % hparams.log_interval == 0):
                print(
                    "Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format(
                        epoch,
                        batch_idx * len(data),
                        len(train_loader.dataset),
                        100.0 * batch_idx / len(train_loader),
                        loss.item(),
                    )
                )
                if hparams.dry_run:
                    break

        scheduler.step()

        # TESTING LOOP
        model.eval()
        test_loss = 0
        with torch.no_grad():
            for data, target in test_loader:
                # NOTE: no need to call `.to(device)` on the data, target
                output = model(data)
                test_loss += F.nll_loss(output, target, reduction="sum").item()

                # WITHOUT TorchMetrics
                # pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
                # correct += pred.eq(target.view_as(pred)).sum().item()

                # WITH TorchMetrics
                test_acc(output, target)

                if hparams.dry_run:
                    break

        # all_gather is used to aggregate the value across processes
        test_loss = self.all_gather(test_loss).sum() / len(test_loader.dataset)

        print(f"\nTest set: Average loss: {test_loss:.4f}, Accuracy: ({100 * test_acc.compute():.0f}%)\n")
        test_acc.reset()

        if hparams.dry_run:
            break

    # When using distributed training, use `self.save`
    # to ensure the current process is allowed to save a checkpoint.
    if hparams.save_model:
        self.save(model.state_dict(), "mnist_cnn.pt")
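# All of these snippets assume a `Net` model that outputs log-probabilities (its outputs are fed
# to `F.nll_loss`). A minimal MNIST CNN along the lines of the classic PyTorch example could look
# like this; a sketch only, the actual `Net` used by the example may differ.
import torch
import torch.nn as nn
import torch.nn.functional as F


class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.dropout1 = nn.Dropout(0.25)
        self.dropout2 = nn.Dropout(0.5)
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = self.dropout2(x)
        x = self.fc2(x)
        # log-probabilities, as expected by F.nll_loss in the training loops above
        return F.log_softmax(x, dim=1)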
def __init__(self, model=None, lr=1.0, gamma=0.7, batch_size=32):
    super().__init__()
    self.save_hyperparameters(ignore="model")
    self.model = model or Net()
    self.test_acc = Accuracy()
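# The constructor above reads like a `LightningModule` constructor (note `save_hyperparameters`
# and the `Accuracy` metric). Once the module also defines `training_step` and
# `configure_optimizers`, it can be handed to the Trainer. A sketch, with `ImageClassifier`
# used as a hypothetical name for that module:
from pytorch_lightning import Trainer

transform = T.Compose([T.ToTensor(), T.Normalize((0.1307,), (0.3081,))])
train_loader = torch.utils.data.DataLoader(MNIST("./data", train=True, download=True, transform=transform), batch_size=32)
test_loader = torch.utils.data.DataLoader(MNIST("./data", train=False, download=True, transform=transform), batch_size=32)

model = ImageClassifier(lr=1.0, gamma=0.7, batch_size=32)  # hypothetical module name
trainer = Trainer(max_epochs=2, accelerator="cpu", devices=1)
trainer.fit(model, train_dataloaders=train_loader)
trainer.test(model, dataloaders=test_loader)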
def run(self, hparams):
    self.hparams = hparams
    seed_everything(hparams.seed)  # instead of torch.manual_seed(...)

    self.model = Net()
    [optimizer], [scheduler] = self.configure_optimizers()
    model, optimizer = self.setup(self.model, optimizer)

    if self.is_global_zero:
        # In multi-device training, this code will only run on the first process / GPU
        self.prepare_data()

    train_loader, test_loader = self.setup_dataloaders(self.train_dataloader(), self.test_dataloader())

    self.test_acc = Accuracy().to(self.device)

    # EPOCH LOOP
    for epoch in range(1, hparams.epochs + 1):

        # TRAINING LOOP
        self.model.train()
        for batch_idx, batch in enumerate(train_loader):
            optimizer.zero_grad()
            loss = self.training_step(batch, batch_idx)
            self.backward(loss)
            optimizer.step()
            if (batch_idx == 0) or ((batch_idx + 1) % hparams.log_interval == 0):
                print(
                    "Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format(
                        epoch,
                        (batch_idx + 1) * self.hparams.batch_size,
                        len(train_loader.dataset),
                        100.0 * batch_idx / len(train_loader),
                        loss.item(),
                    )
                )
                if hparams.dry_run:
                    break

        scheduler.step()

        # TESTING LOOP
        self.model.eval()
        test_loss = 0
        with torch.no_grad():
            for batch_idx, batch in enumerate(test_loader):
                test_loss += self.test_step(batch, batch_idx)
                if hparams.dry_run:
                    break

        test_loss = self.all_gather(test_loss).sum() / len(test_loader.dataset)

        print(f"\nTest set: Average loss: {test_loss:.4f}, Accuracy: ({100 * self.test_acc.compute():.0f}%)\n")
        self.test_acc.reset()

        if hparams.dry_run:
            break

    if hparams.save_model:
        self.save(model.state_dict(), "mnist_cnn.pt")
class Lite(LightningLite):
    """Lite is starting to look like a LightningModule."""

    def run(self, hparams):
        self.hparams = hparams
        seed_everything(hparams.seed)  # instead of torch.manual_seed(...)

        self.model = Net()
        [optimizer], [scheduler] = self.configure_optimizers()
        model, optimizer = self.setup(self.model, optimizer)

        if self.is_global_zero:
            # In multi-device training, this code will only run on the first process / GPU
            self.prepare_data()

        train_loader, test_loader = self.setup_dataloaders(self.train_dataloader(), self.test_dataloader())

        self.test_acc = Accuracy().to(self.device)

        # EPOCH LOOP
        for epoch in range(1, hparams.epochs + 1):

            # TRAINING LOOP
            self.model.train()
            for batch_idx, batch in enumerate(train_loader):
                optimizer.zero_grad()
                loss = self.training_step(batch, batch_idx)
                self.backward(loss)
                optimizer.step()
                if (batch_idx == 0) or ((batch_idx + 1) % hparams.log_interval == 0):
                    print(
                        "Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format(
                            epoch,
                            (batch_idx + 1) * self.hparams.batch_size,
                            len(train_loader.dataset),
                            100.0 * batch_idx / len(train_loader),
                            loss.item(),
                        )
                    )
                    if hparams.dry_run:
                        break

            scheduler.step()

            # TESTING LOOP
            self.model.eval()
            test_loss = 0
            with torch.no_grad():
                for batch_idx, batch in enumerate(test_loader):
                    test_loss += self.test_step(batch, batch_idx)
                    if hparams.dry_run:
                        break

            test_loss = self.all_gather(test_loss).sum() / len(test_loader.dataset)

            print(f"\nTest set: Average loss: {test_loss:.4f}, Accuracy: ({100 * self.test_acc.compute():.0f}%)\n")
            self.test_acc.reset()

            if hparams.dry_run:
                break

        if hparams.save_model:
            self.save(model.state_dict(), "mnist_cnn.pt")

    # Methods for the `LightningModule` conversion

    def forward(self, x):
        return self.model(x)

    def training_step(self, batch, batch_idx):
        """Here you compute and return the training loss and compute extra training metrics."""
        x, y = batch
        logits = self.forward(x)
        loss = F.nll_loss(logits, y.long())
        return loss

    def test_step(self, batch, batch_idx):
        """Here you compute and return the testing loss and compute extra testing metrics."""
        x, y = batch
        logits = self.forward(x)
        loss = F.nll_loss(logits, y.long())
        self.test_acc(logits, y.long())
        return loss

    def configure_optimizers(self):
        optimizer = optim.Adadelta(self.model.parameters(), lr=self.hparams.lr)
        return [optimizer], [StepLR(optimizer, step_size=1, gamma=self.hparams.gamma)]

    # Methods for the `LightningDataModule` conversion

    @property
    def transform(self):
        return T.Compose([T.ToTensor(), T.Normalize((0.1307,), (0.3081,))])

    def prepare_data(self) -> None:
        MNIST("./data", download=True)

    def train_dataloader(self):
        train_dataset = MNIST("./data", train=True, download=False, transform=self.transform)
        return torch.utils.data.DataLoader(train_dataset, batch_size=self.hparams.batch_size)

    def test_dataloader(self):
        test_dataset = MNIST("./data", train=False, download=False, transform=self.transform)
        return torch.utils.data.DataLoader(test_dataset, batch_size=self.hparams.batch_size)
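# The last group of methods above is labelled "Methods for the `LightningDataModule` conversion".
# One possible destination for those hooks is a standalone datamodule; a sketch assuming the same
# MNIST setup (the real conversion may organise this differently):
from pytorch_lightning import LightningDataModule


class MNISTDataModule(LightningDataModule):
    def __init__(self, batch_size=32):
        super().__init__()
        self.batch_size = batch_size
        self.transform = T.Compose([T.ToTensor(), T.Normalize((0.1307,), (0.3081,))])

    def prepare_data(self) -> None:
        # the Trainer calls this on a single process, so downloading here is safe
        MNIST("./data", download=True)

    def train_dataloader(self):
        train_dataset = MNIST("./data", train=True, download=False, transform=self.transform)
        return torch.utils.data.DataLoader(train_dataset, batch_size=self.batch_size)

    def test_dataloader(self):
        test_dataset = MNIST("./data", train=False, download=False, transform=self.transform)
        return torch.utils.data.DataLoader(test_dataset, batch_size=self.batch_size)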