def dcgan_train(config, checkpoint_dir=None):
    """Ray Tune function-API trainable for the DCGAN example.

    Builds the generator/discriminator pair, optionally restores state from
    ``checkpoint_dir``, then trains forever, checkpointing and reporting
    metrics to Tune after every training step.

    Args:
        config: Tune trial config. Keys used here: ``use_gpu``, ``lr``
            (default 0.01), ``netD_lr``/``netG_lr`` (per-optimizer LR
            overrides applied on restore, e.g. after a PBT perturbation),
            and ``mnist_model_ref`` (handle to the scoring model passed
            through to ``train``).
        checkpoint_dir: Directory holding a previous checkpoint, or None
            to start from scratch.
    """
    step = 0
    use_cuda = config.get("use_gpu") and torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    netD = Discriminator().to(device)
    netD.apply(weights_init)
    netG = Generator().to(device)
    netG.apply(weights_init)
    criterion = nn.BCELoss()
    optimizerD = optim.Adam(
        netD.parameters(), lr=config.get("lr", 0.01), betas=(beta1, 0.999))
    optimizerG = optim.Adam(
        netG.parameters(), lr=config.get("lr", 0.01), betas=(beta1, 0.999))

    # Serialize the dataset download across concurrently starting trials.
    with FileLock(os.path.expanduser("~/.data.lock")):
        dataloader = get_data_loader()

    if checkpoint_dir is not None:
        path = os.path.join(checkpoint_dir, "checkpoint")
        # FIX: map_location keeps restore working when the checkpoint was
        # written on a different device (e.g. saved on GPU, restored on a
        # CPU-only worker).
        checkpoint = torch.load(path, map_location=device)
        netD.load_state_dict(checkpoint["netDmodel"])
        netG.load_state_dict(checkpoint["netGmodel"])
        optimizerD.load_state_dict(checkpoint["optimD"])
        optimizerG.load_state_dict(checkpoint["optimG"])
        step = checkpoint["step"]

        # Apply LR overrides after restoring optimizer state so the
        # perturbed values actually take effect.
        if "netD_lr" in config:
            for param_group in optimizerD.param_groups:
                param_group["lr"] = config["netD_lr"]
        if "netG_lr" in config:
            for param_group in optimizerG.param_groups:
                param_group["lr"] = config["netG_lr"]

    while True:
        lossG, lossD, is_score = train(
            netD,
            netG,
            optimizerG,
            optimizerD,
            criterion,
            dataloader,
            step,
            device,
            config["mnist_model_ref"],
        )
        step += 1
        # Renamed with-target so it no longer shadows the `checkpoint_dir`
        # parameter.
        with tune.checkpoint_dir(step=step) as ckpt_dir:
            path = os.path.join(ckpt_dir, "checkpoint")
            torch.save(
                {
                    "netDmodel": netD.state_dict(),
                    "netGmodel": netG.state_dict(),
                    "optimD": optimizerD.state_dict(),
                    "optimG": optimizerG.state_dict(),
                    "step": step,
                },
                path,
            )
        tune.report(lossg=lossG, lossd=lossD, is_score=is_score)
def setup(self, config):
    """Initialize networks, loss, optimizers, and data loader for a trial.

    Reads ``use_gpu``, ``lr``, ``data_dir``, and ``mnist_model_ref`` from
    the trial config and stores everything on ``self`` for later steps.
    """
    # Only probe CUDA availability when the trial actually asked for a GPU.
    wants_gpu = config.get("use_gpu") and torch.cuda.is_available()
    self.device = torch.device("cuda" if wants_gpu else "cpu")

    # Build and weight-initialize both GAN networks on the target device.
    self.netD = Discriminator().to(self.device)
    self.netD.apply(weights_init)
    self.netG = Generator().to(self.device)
    self.netG.apply(weights_init)

    self.criterion = nn.BCELoss()

    learning_rate = config.get("lr", 0.01)
    adam_betas = (beta1, 0.999)
    self.optimizerD = optim.Adam(
        self.netD.parameters(), lr=learning_rate, betas=adam_betas)
    self.optimizerG = optim.Adam(
        self.netG.parameters(), lr=learning_rate, betas=adam_betas)

    # File lock serializes the dataset download when several trials start
    # at once on the same machine.
    lock_file = os.path.expanduser("~/.data.lock")
    with FileLock(lock_file):
        self.dataloader = get_data_loader(config.get("data_dir", "~/data"))

    self.mnist_model_ref = config["mnist_model_ref"]
class PytorchTrainable(tune.Trainable):
    """Class-API Tune trainable wrapping DCGAN training.

    Implements the Trainable lifecycle hooks (``setup``/``step``/
    ``save_checkpoint``/``load_checkpoint``), supports PBT via
    ``reset_config`` (in-place learning-rate updates without restarting the
    actor), and can export trained weights via ``_export_model``.
    """

    def setup(self, config):
        """Create networks, loss, optimizers, and the data loader.

        Reads ``use_gpu``, ``lr``, and ``mnist_model_ref`` from the trial
        config.
        """
        use_cuda = config.get("use_gpu") and torch.cuda.is_available()
        self.device = torch.device("cuda" if use_cuda else "cpu")
        self.netD = Discriminator().to(self.device)
        self.netD.apply(weights_init)
        self.netG = Generator().to(self.device)
        self.netG.apply(weights_init)
        self.criterion = nn.BCELoss()
        self.optimizerD = optim.Adam(
            self.netD.parameters(),
            lr=config.get("lr", 0.01),
            betas=(beta1, 0.999))
        self.optimizerG = optim.Adam(
            self.netG.parameters(),
            lr=config.get("lr", 0.01),
            betas=(beta1, 0.999))
        # Serialize the dataset download across concurrently starting trials.
        with FileLock(os.path.expanduser("~/.data.lock")):
            self.dataloader = get_data_loader()
        self.mnist_model_ref = config["mnist_model_ref"]

    def step(self):
        """Run one training iteration; return losses and inception score."""
        lossG, lossD, is_score = train(
            self.netD, self.netG, self.optimizerG, self.optimizerD,
            self.criterion, self.dataloader, self._iteration, self.device,
            self.mnist_model_ref)
        return {"lossg": lossG, "lossd": lossD, "is_score": is_score}

    def save_checkpoint(self, checkpoint_dir):
        """Persist model/optimizer state.

        The training iteration itself is tracked by Tune, so it is not
        stored here.
        """
        path = os.path.join(checkpoint_dir, "checkpoint")
        torch.save(
            {
                "netDmodel": self.netD.state_dict(),
                "netGmodel": self.netG.state_dict(),
                "optimD": self.optimizerD.state_dict(),
                "optimG": self.optimizerG.state_dict(),
            }, path)
        return checkpoint_dir

    def load_checkpoint(self, checkpoint_dir):
        """Restore state previously written by ``save_checkpoint``."""
        path = os.path.join(checkpoint_dir, "checkpoint")
        # FIX: map_location makes restore robust to device changes (e.g. a
        # checkpoint written on GPU being restored on a CPU-only worker).
        checkpoint = torch.load(path, map_location=self.device)
        self.netD.load_state_dict(checkpoint["netDmodel"])
        self.netG.load_state_dict(checkpoint["netGmodel"])
        self.optimizerD.load_state_dict(checkpoint["optimD"])
        self.optimizerG.load_state_dict(checkpoint["optimG"])

    def reset_config(self, new_config):
        """Apply a perturbed config in place (PBT fast path).

        Returning True tells Tune the actor can be reused instead of being
        torn down and recreated.
        """
        if "netD_lr" in new_config:
            for param_group in self.optimizerD.param_groups:
                param_group["lr"] = new_config["netD_lr"]
        if "netG_lr" in new_config:
            for param_group in self.optimizerG.param_groups:
                param_group["lr"] = new_config["netG_lr"]
        self.config = new_config
        return True

    def _export_model(self, export_formats, export_dir):
        """Export both networks' weights when ``ExportFormat.MODEL`` is asked."""
        if export_formats == [ExportFormat.MODEL]:
            path = os.path.join(export_dir, "exported_models")
            torch.save(
                {
                    "netDmodel": self.netD.state_dict(),
                    "netGmodel": self.netG.state_dict(),
                }, path)
            return {ExportFormat.MODEL: path}
        else:
            raise ValueError("unexpected formats: " + str(export_formats))
def dcgan_train(config):
    """Ray AIR session-API trainable for the DCGAN example.

    Builds the generator/discriminator pair, optionally restores state from
    the session's checkpoint, then trains forever, saving a directory
    checkpoint and reporting metrics via ``session.report`` every step.

    Args:
        config: Trial config. Keys used here: ``use_gpu``, ``lr`` (default
            0.01), ``netD_lr``/``netG_lr`` (per-optimizer LR overrides
            applied on restore, e.g. after a PBT perturbation), and
            ``mnist_model_ref`` (handle to the scoring model passed through
            to ``train``).
    """
    step = 0
    use_cuda = config.get("use_gpu") and torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    netD = Discriminator().to(device)
    netD.apply(weights_init)
    netG = Generator().to(device)
    netG.apply(weights_init)
    criterion = nn.BCELoss()
    optimizerD = optim.Adam(
        netD.parameters(), lr=config.get("lr", 0.01), betas=(beta1, 0.999))
    optimizerG = optim.Adam(
        netG.parameters(), lr=config.get("lr", 0.01), betas=(beta1, 0.999))

    # Serialize the dataset download across concurrently starting trials.
    with FileLock(os.path.expanduser("~/.data.lock")):
        dataloader = get_data_loader()

    # Fetch the checkpoint once instead of calling the getter twice.
    loaded_checkpoint = session.get_checkpoint()
    if loaded_checkpoint:
        with loaded_checkpoint.as_directory() as loaded_checkpoint_dir:
            path = os.path.join(loaded_checkpoint_dir, "checkpoint.pt")
            # FIX: map_location keeps restore working when the checkpoint
            # was written on a different device (e.g. GPU -> CPU worker).
            checkpoint = torch.load(path, map_location=device)
            netD.load_state_dict(checkpoint["netDmodel"])
            netG.load_state_dict(checkpoint["netGmodel"])
            optimizerD.load_state_dict(checkpoint["optimD"])
            optimizerG.load_state_dict(checkpoint["optimG"])
            step = checkpoint["step"]

        # Apply LR overrides after restoring optimizer state so the
        # perturbed values actually take effect.
        if "netD_lr" in config:
            for param_group in optimizerD.param_groups:
                param_group["lr"] = config["netD_lr"]
        if "netG_lr" in config:
            for param_group in optimizerG.param_groups:
                param_group["lr"] = config["netG_lr"]

    while True:
        lossG, lossD, is_score = train(
            netD,
            netG,
            optimizerG,
            optimizerD,
            criterion,
            dataloader,
            step,
            device,
            config["mnist_model_ref"],
        )
        step += 1
        os.makedirs("my_model", exist_ok=True)
        torch.save(
            {
                "netDmodel": netD.state_dict(),
                "netGmodel": netG.state_dict(),
                "optimD": optimizerD.state_dict(),
                "optimG": optimizerG.state_dict(),
                "step": step,
            },
            "my_model/checkpoint.pt",
        )
        session.report(
            {
                "lossg": lossG,
                "lossd": lossD,
                "is_score": is_score
            },
            checkpoint=Checkpoint.from_directory("my_model"),
        )