def set_scheduler(optimizer, scheduler_name, **kwargs):
    """Set the learning rate scheduler for the optimizer.

    Reference:
        https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
    """
    if scheduler_name == "LambdaLR":
        scheduler = lr_scheduler.LambdaLR(optimizer, **kwargs)
    elif scheduler_name == "MultiplicativeLR":
        scheduler = lr_scheduler.MultiplicativeLR(optimizer, **kwargs)
    elif scheduler_name == "StepLR":
        scheduler = lr_scheduler.StepLR(optimizer, **kwargs)
    elif scheduler_name == "MultiStepLR":
        scheduler = lr_scheduler.MultiStepLR(optimizer, **kwargs)
    elif scheduler_name == "ExponentialLR":
        scheduler = lr_scheduler.ExponentialLR(optimizer, **kwargs)
    elif scheduler_name == "CosineAnnealingLR":
        scheduler = lr_scheduler.CosineAnnealingLR(optimizer, **kwargs)
    elif scheduler_name == "ReduceLROnPlateau":
        scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, **kwargs)
    elif scheduler_name == "CyclicLR":
        scheduler = lr_scheduler.CyclicLR(optimizer, **kwargs)
    elif scheduler_name == "OneCycleLR":
        scheduler = lr_scheduler.OneCycleLR(optimizer, **kwargs)
    elif scheduler_name == "CosineAnnealingWarmRestarts":
        scheduler = lr_scheduler.CosineAnnealingWarmRestarts(optimizer, **kwargs)
    else:
        msg = ("Unknown scheduler name {}, should be one of"
               " {{LambdaLR, MultiplicativeLR, StepLR, MultiStepLR,"
               " ExponentialLR, CosineAnnealingLR, ReduceLROnPlateau,"
               " CyclicLR, OneCycleLR, CosineAnnealingWarmRestarts}}.")
        raise NotImplementedError(msg.format(scheduler_name))
    return scheduler
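# A minimal usage sketch of the set_scheduler factory above, assuming it and
# torch.optim.lr_scheduler are in scope. The Linear model, the SGD optimizer,
# and the StepLR keyword arguments are illustrative assumptions, not part of
# the original snippet.
from torch import nn, optim

model = nn.Linear(10, 2)
optimizer = optim.SGD(model.parameters(), lr=0.1)

# Decay the learning rate by 10x every 30 epochs.
scheduler = set_scheduler(optimizer, "StepLR", step_size=30, gamma=0.1)

for epoch in range(90):
    # ... one epoch of training would go here ...
    scheduler.step()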
def main(args):
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Lambda(lambda x: distributions.Bernoulli(probs=x).sample())
    ])
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('./data', train=True, download=True, transform=transform),
        batch_size=args.batch_size, shuffle=True)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('./data', train=False, download=True, transform=transform),
        batch_size=args.batch_size)

    model = MODEL_MAP[args.model](in_channels=1)
    optimizer = optim.Adam(model.parameters())
    scheduler = lr_scheduler.MultiplicativeLR(optimizer, lambda _: 0.9984)
    criterion = nn.BCELoss(reduction='none')

    def loss_fn(x, _, preds):
        batch_size = x.shape[0]
        x, preds = x.view((batch_size, -1)), preds.view((batch_size, -1))
        return criterion(preds, x).sum(dim=1).mean()

    trainer = pg.trainer.Trainer(model, loss_fn, optimizer, train_loader, test_loader,
                                 lr_scheduler=scheduler, log_dir=args.log_dir,
                                 save_checkpoint_epochs=1)
    trainer.interleaved_train_and_eval(n_epochs=args.n_epochs)
def reproduce(n_epochs=457, batch_size=256, log_dir="/tmp/run", device="cuda", debug_loader=None): """Training script with defaults to reproduce results. The code inside this function is self contained and can be used as a top level training script, e.g. by copy/pasting it into a Jupyter notebook. Args: n_epochs: Number of epochs to train for. batch_size: Batch size to use for training and evaluation. log_dir: Directory where to log trainer state and TensorBoard summaries. device: Device to train on (either 'cuda' or 'cpu'). debug_loader: Debug DataLoader which replaces the default training and evaluation loaders if not 'None'. Do not use unless you're writing unit tests. """ from torch import optim from torch.nn import functional as F from torch.optim import lr_scheduler from pytorch_generative import datasets from pytorch_generative import models from pytorch_generative import trainer train_loader, test_loader = debug_loader, debug_loader if train_loader is None: train_loader, test_loader = datasets.get_mnist_loaders( batch_size, dynamically_binarize=True) model = models.PixelCNN( in_channels=1, out_channels=1, n_residual=15, residual_channels=16, head_channels=32, ) optimizer = optim.Adam(model.parameters(), lr=1e-3) scheduler = lr_scheduler.MultiplicativeLR(optimizer, lr_lambda=lambda _: 0.999977) def loss_fn(x, _, preds): batch_size = x.shape[0] x, preds = x.view((batch_size, -1)), preds.view((batch_size, -1)) loss = F.binary_cross_entropy_with_logits(preds, x, reduction="none") return loss.sum(dim=1).mean() trainer = trainer.Trainer( model=model, loss_fn=loss_fn, optimizer=optimizer, train_loader=train_loader, eval_loader=test_loader, lr_scheduler=scheduler, log_dir=log_dir, device=device, ) trainer.interleaved_train_and_eval(n_epochs)
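# The factor 0.999977 used above compounds once per scheduler step. A minimal
# sketch of the resulting decay, assuming the scheduler is stepped once per
# epoch for the same 457 epochs and using a throwaway Linear module in place
# of the PixelCNN; these stand-ins are not part of the original script.
from torch import nn, optim
from torch.optim import lr_scheduler

optimizer = optim.Adam(nn.Linear(4, 4).parameters(), lr=1e-3)
scheduler = lr_scheduler.MultiplicativeLR(optimizer, lr_lambda=lambda _: 0.999977)

for epoch in range(457):
    # ... one epoch of training would go here ...
    scheduler.step()

# After 457 steps: lr = 1e-3 * 0.999977**457 ≈ 9.90e-4, i.e. roughly a 1% decay.
print(optimizer.param_groups[0]["lr"])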
def main(args):
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Lambda(lambda x: distributions.Bernoulli(probs=x).sample())
    ])

    ###################################
    ##### Load MNIST ####
    # train_loader = torch.utils.data.DataLoader(
    #     datasets.MNIST('./data', train=True, download=True, transform=transform),
    #     batch_size=args.batch_size, shuffle=True)
    # test_loader = torch.utils.data.DataLoader(
    #     datasets.MNIST('./data', train=False, download=True, transform=transform),
    #     batch_size=args.batch_size)

    ##### Load ImageNet ####
    path_train = "/home/dsi/eyalbetzalel/pytorch-generative-v2/pytorch-generative-v2/imagenet64/train"
    datasetTrain = datasets.ImageFolder(path_train, transform=transform)
    path_test = "/home/dsi/eyalbetzalel/pytorch-generative-v2/pytorch-generative-v2/imagenet64/test"
    datasetTest = datasets.ImageFolder(path_test, transform=transform)

    print("Loading ImageNet dataset (slow)")
    train_loader = torch.utils.data.DataLoader(datasetTrain, batch_size=args.batch_size,
                                               shuffle=True)
    test_loader = torch.utils.data.DataLoader(datasetTest, batch_size=args.batch_size)
    print("Finished loading ImageNet dataset")
    ###################################

    model = MODEL_MAP[args.model](in_channels=1)
    optimizer = optim.Adam(model.parameters())
    scheduler = lr_scheduler.MultiplicativeLR(optimizer, lambda _: 0.9984)
    criterion = nn.BCELoss(reduction='none')

    def loss_fn(x, _, preds):
        batch_size = x.shape[0]
        x, preds = x.view((batch_size, -1)), preds.view((batch_size, -1))
        return criterion(preds, x).sum(dim=1).mean()

    trainer = pg.trainer.Trainer(model, loss_fn, optimizer, train_loader, test_loader,
                                 lr_scheduler=scheduler, log_dir=args.log_dir,
                                 save_checkpoint_epochs=1)
    trainer.interleaved_train_and_eval(n_epochs=args.n_epochs)
def __init__(
    self,
    lr_lambda: Union[Callable[[int], float], List[Callable[[int], float]]],
    last_epoch: int = -1,
    step_on_batch: bool = False,
):
    """Constructor for MultiplicativeLR."""
    super().__init__(
        lambda opt: _schedulers.MultiplicativeLR(
            opt, lr_lambda, last_epoch=last_epoch),
        step_on_batch=step_on_batch,
    )
def __init__(self,
             lr_lambda: Union[Callable[[int], float], List[Callable[[int], float]]],
             last_epoch: int = -1,
             step_on_iteration: bool = False):
    from distutils.version import LooseVersion
    if LooseVersion(torch.__version__) >= LooseVersion("1.4.0"):
        super().__init__(
            lambda opt: _scheduler.MultiplicativeLR(
                opt, lr_lambda, last_epoch=last_epoch),
            step_on_iteration=step_on_iteration)
    else:
        raise ImportError("Update torch>=1.4.0 to use 'MultiplicativeLR'")
def __init__(
    self,
    lr_lambda: Union[Callable[[int], float], List[Callable[[int], float]]],
    last_epoch: int = -1,
    step_on_batch: bool = False,
):
    """Constructor for MultiplicativeLR.

    Args:
        lr_lambda (function or list of functions): A function which computes a
            multiplicative factor given an integer epoch parameter, or a list
            of such functions, one for each group in optimizer.param_groups.
        last_epoch (int): The index of the last epoch. Default: -1.
        step_on_batch (bool): Step on each training iteration rather than each
            epoch. Defaults to False.
    """
    super().__init__(
        lambda opt: _schedulers.MultiplicativeLR(
            opt, lr_lambda, last_epoch=last_epoch),
        step_on_batch=step_on_batch,
    )
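# Because this wrapper can step either per epoch or per batch, the decay factor
# has to match the stepping granularity. A hedged sketch of converting an
# epoch-level factor to a batch-level one; the 0.98 target, the 500 batches per
# epoch, and the use of the plain torch.optim scheduler (instead of the wrapper
# above) are all assumptions for illustration.
import math

from torch import nn, optim
from torch.optim import lr_scheduler

batches_per_epoch = 500                        # assumed dataset / batch size
per_batch_factor = 0.98 ** (1 / batches_per_epoch)

optimizer = optim.SGD(nn.Linear(8, 8).parameters(), lr=0.1)
scheduler = lr_scheduler.MultiplicativeLR(optimizer, lr_lambda=lambda _: per_batch_factor)

for _ in range(batches_per_epoch):
    # ... one training step would go here ...
    scheduler.step()

# After one epoch of batch-level steps the lr matches a single 0.98 epoch-level step.
assert math.isclose(optimizer.param_groups[0]["lr"], 0.1 * 0.98, rel_tol=1e-6)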
def reproduce(n_epochs=457, batch_size=128, log_dir="/tmp/run", device="cuda", debug_loader=None): """Training script with defaults to reproduce results. The code inside this function is self contained and can be used as a top level training script, e.g. by copy/pasting it into a Jupyter notebook. Args: n_epochs: Number of epochs to train for. batch_size: Batch size to use for training and evaluation. log_dir: Directory where to log trainer state and TensorBoard summaries. device: Device to train on (either 'cuda' or 'cpu'). debug_loader: Debug DataLoader which replaces the default training and evaluation loaders if not 'None'. Do not use unless you're writing unit tests. """ from torch import optim from torch.nn import functional as F from torch.optim import lr_scheduler from torch.utils import data from torchvision import datasets from torchvision import transforms from pytorch_generative import trainer from pytorch_generative import models transform = transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), ]) train_loader = data.DataLoader( datasets.CIFAR10("tmp/data", train=True, download=True, transform=transform), batch_size=batch_size, shuffle=True, pin_memory=True, num_workers=2, ) test_loader = data.DataLoader( datasets.CIFAR10("tmp/data", train=False, download=True, transform=transform), batch_size=batch_size, pin_memory=True, num_workers=2, ) model = models.VQVAE( in_channels=3, out_channels=3, hidden_channels=128, residual_channels=32, n_residual_blocks=2, n_embeddings=512, embedding_dim=64, ) optimizer = optim.Adam(model.parameters(), lr=2e-4) scheduler = lr_scheduler.MultiplicativeLR(optimizer, lr_lambda=lambda _: 0.999977) def loss_fn(x, _, preds): preds, vq_loss = preds recon_loss = F.mse_loss(preds, x) loss = recon_loss + vq_loss return { "vq_loss": vq_loss, "reconstruction_loss": recon_loss, "loss": loss, } model_trainer = trainer.Trainer( model=model, loss_fn=loss_fn, optimizer=optimizer, train_loader=train_loader, eval_loader=test_loader, lr_scheduler=scheduler, log_dir=log_dir, device=device, ) model_trainer.interleaved_train_and_eval(n_epochs)
def reproduce( n_epochs=457, batch_size=64, log_dir="/tmp/run", n_gpus=1, device_id=0, debug_loader=None, ): """Training script with defaults to reproduce results. The code inside this function is self contained and can be used as a top level training script, e.g. by copy/pasting it into a Jupyter notebook. Args: n_epochs: Number of epochs to train for. batch_size: Batch size to use for training and evaluation. log_dir: Directory where to log trainer state and TensorBoard summaries. n_gpus: Number of GPUs to use for training the model. If 0, uses CPU. device_id: The device_id of the current GPU when training on multiple GPUs. debug_loader: Debug DataLoader which replaces the default training and evaluation loaders if not 'None'. Do not use unless you're writing unit tests. """ from torch import optim from torch.nn import functional as F from torch.optim import lr_scheduler from pytorch_generative import datasets from pytorch_generative import models from pytorch_generative import trainer train_loader, test_loader = debug_loader, debug_loader if train_loader is None: train_loader, test_loader = datasets.get_mnist_loaders( batch_size, dynamically_binarize=True) model = models.ImageGPT( in_channels=1, out_channels=1, in_size=28, n_transformer_blocks=8, n_attention_heads=2, n_embedding_channels=64, ) optimizer = optim.Adam(model.parameters(), lr=5e-3) scheduler = lr_scheduler.MultiplicativeLR(optimizer, lr_lambda=lambda _: 0.999977) def loss_fn(x, _, preds): batch_size = x.shape[0] x, preds = x.view((batch_size, -1)), preds.view((batch_size, -1)) loss = F.binary_cross_entropy_with_logits(preds, x, reduction="none") return loss.sum(dim=1).mean() model_trainer = trainer.Trainer( model=model, loss_fn=loss_fn, optimizer=optimizer, train_loader=train_loader, eval_loader=test_loader, lr_scheduler=scheduler, log_dir=log_dir, n_gpus=n_gpus, device_id=device_id, ) model_trainer.interleaved_train_and_eval(n_epochs)
                             shuffle=False, num_workers=0)

print('Creating eval loader')
eval_set = ElisaDataset('elisadata/standard', 'EVALUATE')
eval_loader = data.DataLoader(dataset=eval_set, batch_size=args.eval_batch_size,
                              shuffle=False, num_workers=0)

elisa_net = network.ElisaNet(args.c_feat).cuda()
params = [{'params': elisa_net.parameters()}]
solver = optim.Adam(params, lr=args.lr)

lmda = lambda x: 0.5  # TODO: can change this based on bad_epochs
scheduler = LS.MultiplicativeLR(solver, lr_lambda=lmda)
es = EarlyStopping(mode=args.es_mode, min_delta=args.loss_delta, patience=args.patience)

epoch = 0
if args.resume_epoch != 0:
    load_weights([elisa_net], solver, args.resume_epoch, args)
    epoch = args.resume_epoch
    solver = lr_resume(solver, args.lr_resume)
    print('Loaded weights from epoch {}'.format(args.resume_epoch))

while epoch < args.epochs and not args.eval:
    epoch += 1
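# The TODO above suggests making the factor depend on how many epochs have
# passed without improvement. A hypothetical sketch of that idea; the
# bad_epochs counter, the threshold of 2, and the 0.5/1.0 factors are
# assumptions, not part of the original script.
from torch import nn, optim
from torch.optim import lr_scheduler as LS

solver = optim.Adam(nn.Linear(16, 16).parameters(), lr=1e-3)

bad_epochs = 0  # hypothetical counter, incremented when validation loss stalls

def lmda(_):
    # Halve the learning rate only after two consecutive bad epochs,
    # otherwise leave it unchanged.
    return 0.5 if bad_epochs >= 2 else 1.0

scheduler = LS.MultiplicativeLR(solver, lr_lambda=lmda)

for epoch in range(10):
    # ... train, evaluate, and update bad_epochs here ...
    scheduler.step()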
def reproduce(n_epochs=457, batch_size=64, log_dir="/tmp/run", device="cuda", debug_loader=None): """Training script with defaults to reproduce results. The code inside this function is self contained and can be used as a top level training script, e.g. by copy/pasting it into a Jupyter notebook. Args: n_epochs: Number of epochs to train for. batch_size: Batch size to use for training and evaluation. log_dir: Directory where to log trainer state and TensorBoard summaries. device: Device to train on (either 'cuda' or 'cpu'). debug_loader: Debug DataLoader which replaces the default training and evaluation loaders if not 'None'. Do not use unless you're writing unit tests. """ from torch import optim from torch.nn import functional as F from torch.optim import lr_scheduler from torch.utils import data from torchvision import datasets from torchvision import transforms from pytorch_generative import trainer from pytorch_generative import models transform = transforms.Compose([ transforms.ToTensor(), lambda x: distributions.Bernoulli(probs=x).sample() ]) train_loader = debug_loader or data.DataLoader( datasets.MNIST( "/tmp/data", train=True, download=True, transform=transform), batch_size=batch_size, shuffle=True, num_workers=8, ) test_loader = debug_loader or data.DataLoader( datasets.MNIST( "/tmp/data", train=False, download=True, transform=transform), batch_size=batch_size, num_workers=8, ) model = models.ImageGPT( in_channels=1, out_channels=1, in_size=28, n_transformer_blocks=8, n_attention_heads=2, n_embedding_channels=64, ) optimizer = optim.Adam(model.parameters(), lr=5e-3) scheduler = lr_scheduler.MultiplicativeLR(optimizer, lr_lambda=lambda _: 0.999977) bce_loss_fn = nn.BCEWithLogitsLoss(reduction="none") def loss_fn(x, _, preds): batch_size = x.shape[0] x, preds = x.view((batch_size, -1)), preds.view((batch_size, -1)) loss = F.binary_cross_entropy_with_logits(preds, x, reduction="none") return loss.sum(dim=1).mean() model_trainer = trainer.Trainer( model=model, loss_fn=loss_fn, optimizer=optimizer, train_loader=train_loader, eval_loader=test_loader, lr_scheduler=scheduler, log_dir=log_dir, device=device, ) model_trainer.interleaved_train_and_eval(n_epochs)
def reproduce(n_epochs=457, batch_size=128, log_dir="/tmp/run", device="cuda", debug_loader=None): """Training script with defaults to reproduce results. The code inside this function is self contained and can be used as a top level training script, e.g. by copy/pasting it into a Jupyter notebook. Args: n_epochs: Number of epochs to train for. batch_size: Batch size to use for training and evaluation. log_dir: Directory where to log trainer state and TensorBoard summaries. device: Device to train on (either 'cuda' or 'cpu'). debug_loader: Debug DataLoader which replaces the default training and evaluation loaders if not 'None'. Do not use unless you're writing unit tests. """ from torch import optim from torch.nn import functional as F from torch.optim import lr_scheduler from torch.utils import data from torchvision import datasets from torchvision import transforms from pytorch_generative import trainer from pytorch_generative import models transform = transforms.ToTensor() train_loader = debug_loader or data.DataLoader( datasets.MNIST( "/tmp/data", train=True, download=True, transform=transform), batch_size=batch_size, shuffle=True, num_workers=8, ) test_loader = debug_loader or data.DataLoader( datasets.MNIST( "/tmp/data", train=False, download=True, transform=transform), batch_size=batch_size, num_workers=8, ) model = models.VAE( in_channels=1, out_channels=1, in_size=28, latent_dim=10, hidden_channels=32, n_residual_blocks=2, residual_channels=16, ) optimizer = optim.Adam(model.parameters(), lr=1e-3) scheduler = lr_scheduler.MultiplicativeLR(optimizer, lr_lambda=lambda _: 0.999977) def loss_fn(x, _, preds): preds, vae_loss = preds recon_loss = F.binary_cross_entropy_with_logits(preds, x) loss = recon_loss * 100 + vae_loss return { "recon_loss": recon_loss, "vae_loss": vae_loss, "loss": loss, } def sample_fn(model): return torch.sigmoid(model.sample(n_images=64)) model_trainer = trainer.Trainer( model=model, loss_fn=loss_fn, optimizer=optimizer, train_loader=train_loader, eval_loader=test_loader, lr_scheduler=scheduler, sample_epochs=5, sample_fn=sample_fn, log_dir=log_dir, device=device, ) model_trainer.interleaved_train_and_eval(n_epochs)
def __init__( self, model=None, device=None, hparams=dict(), name='', ): # reproducibility torch.manual_seed(config.seed) np.random.seed(config.seed) torch.set_default_tensor_type('torch.FloatTensor') context = dict() context['hparams'] = hparams context['max_epoch'] = hparams.get('max_epoch', config.max_epoch) context['normal_classes'] = hparams.get('normal_classes', config.normal_classes) # acquiring device cuda if available context[constants.DEVICE] = device or torch.device("cuda" if torch.cuda.is_available() else "cpu") print("device:", context[constants.DEVICE]) transform_train = transforms.Compose([ transforms.RandomResizedCrop(size=32, scale=(0.2, 1.)), transforms.ColorJitter(0.4, 0.4, 0.4, 0.4), transforms.RandomGrayscale(p=0.2), # transforms.RandomHorizontalFlip(), transforms.ToTensor(), transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), ]) transform_test = transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), ]) print('loading training data') context['train_data'] = DatasetSelection( hparams.get('dataset', config.dataset), classes=context['normal_classes'], train=True, return_indexes=True, transform=transform_train) print('loading test data') context['test_data'] = DatasetSelection(hparams.get('dataset', config.dataset), train=False, transform=transform_test) context['input_shape'] = context['train_data'].input_shape() print('initializing data loaders') context['train_loader'] = context['train_data'].get_dataloader( shuffle=True, batch_size=hparams.get('train_batch_size', config.train_batch_size)) context['test_loader'] = context['test_data'].get_dataloader( shuffle=False, batch_size=hparams.get('test_batch_size', config.test_batch_size)) print('initializing models') context['models'] = model or DeepSVDD( train_data=context['train_data'], latent_size=hparams.get('latent_size', model_config.latent_size), nce_t=hparams.get('nce_t', model_config.nce_t), nce_k=hparams.get('nce_k', model_config.nce_k), nce_m=hparams.get('nce_m', model_config.nce_m), device=device ).to(context[constants.DEVICE]) context["models"].resnet = context["models"].resnet.to(context[constants.DEVICE]) print('initializing center - ', end='') context["models"].init_center( context[constants.DEVICE], init_zero=hparams.get('zero_centered', False)) print(context["models"].center.mean()) checkpoint = hparams.get('checkpoint', config.checkpoint_drmade) if checkpoint: context["models"].load(checkpoint, context[constants.DEVICE]) print(f'models: {context["models"].name} was initialized') base_lr = hparams.get('base_lr', model_config.deepsvdd_sgd_base_lr) lr_decay = hparams.get('lr_decay', model_config.deepsvdd_sgd_lr_decay) lr_schedule = hparams.get('lr_schedule', model_config.deepsvdd_sgd_schedule) sgd_momentum = hparams.get('sgd_momentum', model_config.deepsvdd_sgd_momentum) sgd_weight_decay = hparams.get('sgd_weight_deecay', model_config.deepsvdd_sgd_weight_decay) pgd_eps = hparams.get('pgd/eps', model_config.deepsvdd_pgd_eps) pgd_iterations = hparams.get('pgd/iterations', model_config.deepsvdd_pgd_iterations) pgd_alpha = hparams.get('pgd/alpha', model_config.deepsvdd_pgd_alpha) pgd_randomize = hparams.get('pgd/randomize', model_config.deepsvdd_pgd_randomize) radius_factor = hparams.get('radius_factor', model_config.radius_factor) nce_factor = hparams.get('nce_factor', model_config.nce_factor) print(f'initializing optimizer SGD - base_lr:{base_lr}') optimizer = SGD( context['models'].resnet.parameters(), lr=base_lr, 
momentum=sgd_momentum, weight_decay=sgd_weight_decay, ) context['optimizers'] = [optimizer] context['optimizer/sgd'] = optimizer print(f'initializing learning rate scheduler - lr_decay:{lr_decay} half_schedule:{lr_schedule}') context['lr_multiplicative_factor_lambda'] = lambda epoch: 0.1 \ if (epoch + 1) % lr_schedule == 0 else lr_decay scheduler = lr_scheduler.MultiplicativeLR( optimizer, lr_lambda=context['lr_multiplicative_factor_lambda'], last_epoch=-1) context['schedulers'] = [scheduler] context['scheduler/sgd'] = scheduler # setting up tensorboard data summerizer context['name'] = name or '{}{}-{}{}{}{}|SGDm{}wd{}-baselr{}-decay{}-0.1schedule{}'.format( hparams.get('dataset', config.dataset).__name__, '{}'.format( '' if not context['normal_classes'] else '[' + ','.join( str(i) for i in hparams.get('normal_classes', config.normal_classes)) + ']' ), context['models'].name, f'|NCE{nce_factor}' if nce_factor else '', f'|Radius{radius_factor}' if radius_factor else '', '' if not pgd_eps else '|pgd-eps{}-iterations{}alpha{}{}'.format( pgd_eps, pgd_iterations, pgd_alpha, 'randomized' if pgd_randomize else '', ), sgd_momentum, sgd_weight_decay, base_lr, lr_decay, lr_schedule, ) super(DeepSVDDTrainer, self).__init__(context['name'], context, ) attacker = PGDAttackAction( Radius('radius'), eps=pgd_eps, iterations=pgd_iterations, randomize=pgd_randomize, alpha=pgd_alpha) train_loop = RobustNCEDeepSVDDLoop( name='train', data_loader=context['train_loader'], device=context[constants.DEVICE], optimizers=('sgd',), attacker=attacker, log_interval=hparams.get('log_interval', config.log_data_feed_loop_interval), ) self.context['loops'] = [train_loop] print('setting up writer') self.setup_writer() print('trainer', context['name'], 'is ready!')
def reproduce(n_epochs=457, batch_size=128, log_dir="/tmp/run", device="cuda", debug_loader=None): """Training script with defaults to reproduce results. The code inside this function is self contained and can be used as a top level training script, e.g. by copy/pasting it into a Jupyter notebook. Args: n_epochs: Number of epochs to train for. batch_size: Batch size to use for training and evaluation. log_dir: Directory where to log trainer state and TensorBoard summaries. device: Device to train on (either 'cuda' or 'cpu'). debug_loader: Debug DataLoader which replaces the default training and evaluation loaders if not 'None'. Do not use unless you're writing unit tests. """ from torch import optim from torch.nn import functional as F from torch.optim import lr_scheduler from pytorch_generative import datasets from pytorch_generative import models from pytorch_generative import trainer train_loader, test_loader = debug_loader, debug_loader if train_loader is None: train_loader, test_loader = datasets.get_mnist_loaders(batch_size) model = models.VAE( in_channels=1, out_channels=1, latent_channels=2, hidden_channels=128, residual_channels=32, ) optimizer = optim.Adam(model.parameters(), lr=1e-3) scheduler = lr_scheduler.MultiplicativeLR(optimizer, lr_lambda=lambda _: 0.999977) def loss_fn(x, _, preds): preds, vae_loss = preds recon_loss = F.binary_cross_entropy_with_logits(preds, x, reduction="none") recon_loss = recon_loss.mean(dim=(1, 2, 3)) loss = recon_loss + vae_loss return { "recon_loss": recon_loss.mean(), "vae_loss": vae_loss.mean(), "loss": loss.mean(), } def sample_fn(model): return torch.sigmoid(model.sample(n_samples=16)) model_trainer = trainer.Trainer( model=model, loss_fn=loss_fn, optimizer=optimizer, train_loader=train_loader, eval_loader=test_loader, lr_scheduler=scheduler, sample_epochs=1, sample_fn=sample_fn, log_dir=log_dir, device=device, ) model_trainer.interleaved_train_and_eval(n_epochs)
def __init__(self, hparams: dict = None, name=None, drmade=None, device=None, checkpoint_path=None): if checkpoint_path: self.load_checkpoint(checkpoint_path) return super().__init__(hparams=hparams, name=name, drmade=drmade, device=device) hparams = self.get(constants.HPARAMS_DICT) # pgd encoder made inputs input_limits = self.get('drmade').decoder.output_limits pgd_eps = hparams.get('pgd/eps', model_config.pretrain_encoder_made_pgd_eps) pgd_iterations = hparams.get('pgd/iterations', model_config.pretrain_encoder_made_pgd_iterations) pgd_alpha = hparams.get('pgd/alpha', model_config.pretrain_encoder_made_pgd_alpha) pgd_randomize = hparams.get('pgd/randomize', model_config.pretrain_encoder_made_pgd_randomize) pgd_input = {'eps': pgd_eps, 'iterations': pgd_iterations, 'alpha': pgd_alpha, 'randomize': pgd_randomize, 'input_limits': input_limits} # pgd latent latent_input_limits = self.get('drmade').encoder.output_limits pgd_latent_eps = hparams.get('pgd_latent/eps', model_config.pretrain_made_pgd_eps) pgd_latent_iterations = hparams.get('pgd_latent/iterations', model_config.pretrain_made_pgd_iterations) pgd_latent_alpha = hparams.get('pgd_latent/alpha', model_config.pretrain_made_pgd_alpha) pgd_latent_randomize = hparams.get('pgd_latent/randomize', model_config.pretrain_made_pgd_randomize) pgd_latent = {'eps': pgd_latent_eps, 'iterations': pgd_latent_iterations, 'alpha': pgd_latent_alpha, 'randomize': pgd_latent_randomize, 'input_limits': latent_input_limits} lr_decay = hparams.get('lr_decay', model_config.lr_decay) lr_schedule = hparams.get('lr_schedule', model_config.lr_schedule) # freezing unnecessary model layers print('freezing decoder') for parameter in self.get('drmade').decoder.parameters(): parameter.requires_grad = False print('unfreezing encoder') for parameter in self.get('drmade').encoder.parameters(): parameter.requires_grad = True freeze_encoder = hparams.get('freeze_encoder', False) made_only, freeze_encoder_name = self.get('drmade').encoder.freeze(freeze_encoder) if made_only: freeze_encoder_name = 'freezed' # turning off unnecessary evaluational functions hparams['track_extreme_reconstructions'] = hparams.get('track_extreme_reconstructions', 0) hparams['embedding_interval'] = hparams.get('embedding_interval', 0) hparams['submit_latent_interval'] = hparams.get('submit_latent_interval', 0) hparams['track_jacobian_interval'] = hparams.get('track_jacobian_interval', 0) print('unfreezing made') for parameter in self.get('drmade').made.parameters(): parameter.requires_grad = True # optimizers and schedulers def lr_multiplicative_function(epoch): return 0.5 if lr_schedule and ((epoch + 1) % lr_schedule) == 0 else lr_decay self.set('lr_multiplicative_factor_lambda', lr_multiplicative_function) print(f'initializing learning rate scheduler - lr_decay:{lr_decay} schedule:{lr_schedule}') optimizer = hparams.get('optimizer', Adam) optimizer_hparams = hparams.get('optimizer_hparams', {'lr': model_config.base_lr, }) print(f'initializing optimizer {optimizer.__name__} -', ",".join(f"{i}:{str(j)}" for i, j in optimizer_hparams.items())) made_optimizer = optimizer(self.get('drmade').made.parameters(), **optimizer_hparams) self.add_optimizer('made', made_optimizer) made_scheduler = lr_scheduler.MultiplicativeLR( made_optimizer, lr_lambda=lr_multiplicative_function, last_epoch=-1) self.add_scheduler('made', made_scheduler) if not made_only: encoder_optimizer = optimizer(self.get('drmade').encoder.parameters(), **optimizer_hparams) self.add_optimizer('encoder', encoder_optimizer) 
encoder_scheduler = lr_scheduler.MultiplicativeLR( encoder_optimizer, lr_lambda=lr_multiplicative_function, last_epoch=-1) self.add_scheduler('encoder', encoder_scheduler) # iterative trainingفهم iterative = hparams.get('iterative', False) assert not iterative or (iterative and not made_only), \ 'cannot perform iterative training with fixed encoder' self.set(constants.TRAINER_NAME, name or '{}-{}{}:{}|{}{}{}{}{}|schedule{}-decay{}'.format( self.get(constants.TRAINER_NAME), self.get('drmade').encoder.name, f'({freeze_encoder_name})' if freeze_encoder_name else '', self.get('drmade').made.name, '' if not pgd_eps else 'pgd-eps{}-iterations{}alpha{}{}|'.format( pgd_eps, pgd_iterations, pgd_alpha, 'randomized' if pgd_randomize else '', ), '' if not pgd_latent_eps else 'pgd-latent-eps{}-iterations{}alpha{}{}|'.format( pgd_latent_eps, pgd_latent_iterations, pgd_latent_alpha, 'randomized' if pgd_randomize else '', ), 'iterative|' if iterative else '', optimizer.__name__, '-{}'.format('-'.join(f"{i}{j}" for i, j in optimizer_hparams.items())), lr_schedule, lr_decay, ), replace=True) print("Trainer: ", self.get(constants.TRAINER_NAME)) self.add_loop(RobustMadeFeedLoop( name='train-made' if iterative else 'train', data_loader=self.get('train_loader'), device=self.get(constants.DEVICE), optimizers=('made',) if iterative or made_only else ('made', 'encoder'), pgd_input=pgd_input, pgd_latent=pgd_latent, log_interval=hparams.get('log_interval', config.log_data_feed_loop_interval))) if iterative: self.add_loop(RobustMadeFeedLoop( name='train-encoder', data_loader=self.get('train_loader'), device=self.get(constants.DEVICE), optimizers=('encoder',), pgd_input=pgd_input, pgd_latent=pgd_latent, log_interval=hparams.get('log_interval', config.log_data_feed_loop_interval))) self.add_loop(RobustMadeFeedLoop( name='validation', data_loader=self.get('validation_loader'), device=self.get(constants.DEVICE), optimizers=tuple(), pgd_input=pgd_input, pgd_latent=pgd_latent, interval=hparams.get('validation_interval', model_config.validation_interval), log_interval=hparams.get('log_interval', config.log_data_feed_loop_interval)))
def multiplicative(optimizer: Optimizer) -> _LRScheduler:
    return lr_scheduler.MultiplicativeLR(
        optimizer, lr_lambda=lambda epoch: 0.5)  # type: ignore
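# A short usage sketch of the multiplicative() factory above, assuming it is in
# scope; the SGD optimizer and the three epochs of stepping are illustrative,
# not from the original module.
from torch import nn
from torch.optim import SGD

optimizer = SGD(nn.Linear(4, 4).parameters(), lr=1.0)
scheduler = multiplicative(optimizer)

for _ in range(3):
    scheduler.step()

# Each step multiplies the lr by 0.5: 1.0 -> 0.5 -> 0.25 -> 0.125.
print(optimizer.param_groups[0]["lr"])  # 0.125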
def reproduce(n_epochs=457, batch_size=128, log_dir="/tmp/run", device="cuda", debug_loader=None): """Training script with defaults to reproduce results. The code inside this function is self contained and can be used as a top level training script, e.g. by copy/pasting it into a Jupyter notebook. Args: n_epochs: Number of epochs to train for. batch_size: Batch size to use for training and evaluation. log_dir: Directory where to log trainer state and TensorBoard summaries. device: Device to train on (either 'cuda' or 'cpu'). debug_loader: Debug DataLoader which replaces the default training and evaluation loaders if not 'None'. Do not use unless you're writing unit tests. """ from torch import optim from torch.nn import functional as F from torch.optim import lr_scheduler from pytorch_generative import datasets from pytorch_generative import models from pytorch_generative import trainer train_loader, test_loader = debug_loader, debug_loader if train_loader is None: train_loader, test_loader = datasets.get_cifar10_loaders( batch_size, normalize=True) model = models.VQVAE2( in_channels=3, out_channels=3, hidden_channels=128, residual_channels=64, n_residual_blocks=2, n_embeddings=512, embedding_dim=64, ) optimizer = optim.Adam(model.parameters(), lr=2e-4) scheduler = lr_scheduler.MultiplicativeLR(optimizer, lr_lambda=lambda _: 0.999977) def loss_fn(x, _, preds): preds, vq_loss = preds recon_loss = F.mse_loss(preds, x) loss = recon_loss + 0.25 * vq_loss return { "vq_loss": vq_loss, "reconstruction_loss": recon_loss, "loss": loss, } model_trainer = trainer.Trainer( model=model, loss_fn=loss_fn, optimizer=optimizer, train_loader=train_loader, eval_loader=test_loader, lr_scheduler=scheduler, log_dir=log_dir, device=device, ) model_trainer.interleaved_train_and_eval(n_epochs)
fmnist_train = FashionMNIST(args.save_dir, train=True, transform=train_tfm, download=True)
fmnist_test = FashionMNIST(args.save_dir, train=False, transform=test_tfm, download=True)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
clf = ResNet18(nc=1)
clf.to(device)

optimizer = optim.SGD(clf.parameters(), lr=args.lr, weight_decay=args.wd, momentum=args.mom)
criterion = nn.CrossEntropyLoss()

# Multiplies the LR by 0.1 at epochs 100 and 150, as mentioned in the paper.
lmd = lambda x: 0.1 if x in [100, 150] else 1
scheduler = lr_scheduler.MultiplicativeLR(optimizer, lr_lambda=lmd)

trainloader = DataLoader(fmnist_train, batch_size=args.batch_size, shuffle=True)
testloader = DataLoader(fmnist_test, batch_size=args.batch_size, shuffle=False)

best_loss = np.inf
for epoch in range(args.epochs):
    t_loss, t_acc = train(epoch, trainloader, clf, criterion, optimizer,
                          scheduler=None, msda=args.msda)
    lr = optimizer.param_groups[0]['lr']
    print('Epoch {}/{} (train) || Loss: {:.4f} Acc: {:.4f} LR: {:.5f}'.format(
        epoch + 1, args.epochs, t_loss, t_acc, lr))
    test_loss, test_acc = test(epoch, testloader, clf, criterion)
    print('Epoch {}/{} (test) || Loss: {:.4f} Acc: {:.4f}'.format(
        epoch + 1, args.epochs, test_loss, test_acc))
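# The fixed-milestone factor above expresses the same schedule that MultiStepLR
# provides directly. A small equivalent sketch, assuming the scheduler is
# stepped once per epoch; the Linear module and base lr are placeholders.
from torch import nn, optim
from torch.optim import lr_scheduler

optimizer = optim.SGD(nn.Linear(4, 4).parameters(), lr=0.1)

# Equivalent to MultiplicativeLR with lambda x: 0.1 if x in [100, 150] else 1.
scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=[100, 150], gamma=0.1)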
def train_discriminator(self, config, real_train_set, gen_train_set, real_val_set, gen_val_set): self.disc.train() # Training objects optimizer = networks.optimizers[config["disc_optimizer"]]( self.disc.parameters(), lr=config["disc_lr"]) mult_func = (lambda x: config["disc_lr_decay"]) scheduler = schedulers.MultiplicativeLR(optimizer, lr_lambda=mult_func) real_tab_dataset = TabularDataset(real_train_set) gen_tab_dataset = TabularDataset(gen_train_set) real_loader = torch_data.DataLoader(real_tab_dataset, batch_size=config["batch_size"], shuffle=True, num_workers=constants.LOAD_WORKERS) gen_loader = torch_data.DataLoader(gen_tab_dataset, batch_size=config["batch_size"], shuffle=True, num_workers=constants.LOAD_WORKERS) # Highest divergence (on val.-set) so far best_divergence = None div_save_path = os.path.join(wandb.run.dir, "best_{}_params.pt".format( self.divergence)) # Path to save best params to for epoch_i in range(config["epochs"]): epoch_loss = [] # Train one epoch for (x_real, real_batch), (x_gen, gen_batch) in zip(real_loader, gen_loader): optimizer.zero_grad() # Concat. and move all to correct device real_input = torch.cat((x_real, real_batch), dim=1).to(self.device) gen_input = torch.cat((x_gen, gen_batch), dim=1).to(self.device) real_logits = self.disc(real_input) gen_logits = self.disc(gen_input) loss = self.disc_loss(real_logits, gen_logits) loss.backward() if config["clip_grad"]: nn.utils.clip_grad_norm_(self.disc.parameters(), config["clip_grad"]) optimizer.step() epoch_loss.append(loss.item()) wandb.log({ "{}_epoch_i".format(self.divergence): epoch_i, "{}_train_divergence".format(self.divergence): (-1.) * np.mean(epoch_loss), }) # Evaluate if (epoch_i + 1) % config["val_interval"] == 0: val_divergence = self.compute_divergence( real_val_set, gen_val_set, config["eval_batch_size"]) wandb.log({ "{}_val_divergence".format(self.divergence): val_divergence, }) if (best_divergence == None) or (val_divergence > best_divergence): # Best divergence so far, save parameters model_params = { "disc": self.disc.state_dict(), } torch.save(model_params, div_save_path) best_divergence = val_divergence scheduler.step() # Restore best parameters (early stopping) self.load_params_from_file(div_save_path)
def train(): global model validation_losses = [] train_losses = [] print('starting training') # starting up data loaders print("loading training data") dataset_train = DatasetSelection(train=True, classes=config.normal_classes) print('loading validation data') dataset_validation = DatasetSelection(train=False, classes=config.normal_classes) print('loading test data') dataset_test = DatasetSelection(train=False, classes=config.test_classes) train_sampler = None validation_sampler = None test_sampler = None if config.use_tpu: print('creating tpu sampler') train_sampler = torch.utils.data.distributed.DistributedSampler( dataset_train, num_replicas=xm.xrt_world_size(), rank=xm.get_ordinal(), shuffle=True ) validation_sampler = torch.utils.data.distributed.DistributedSampler( dataset_validation, num_replicas=xm.xrt_world_size(), rank=xm.get_ordinal(), shuffle=True ) test_sampler = torch.utils.data.distributed.DistributedSampler( dataset_test, num_replicas=xm.xrt_world_size(), rank=xm.get_ordinal(), shuffle=False ) print('tpu samplers created') train_loader = dataset_train.get_dataloader(sampler=train_sampler, shuffle=not config.use_tpu) validation_loader = dataset_validation.get_dataloader(sampler=validation_sampler, shuffle=not config.use_tpu, ) test_loader = dataset_test.get_dataloader(sampler=test_sampler, shuffle=False, ) input_shape = dataset_validation.input_shape() loss_function = get_loss_function(input_shape) # setting up tensorboard data summerizer writer = SummaryWriter(log_dir=os.path.join(config.log_dir, config.model_name)) # initializing model model = init_model(input_shape) print("initializing optimizer & scheduler") optimizer = Adam(model.parameters(), lr=config.lr) scheduler = lr_scheduler.MultiplicativeLR(optimizer, lr_lambda=config.lr_multiplicative_factor_lambda, last_epoch=config.start_epoch - 1) def train_loop(data_loader, writes=0): if torch.cuda.is_available(): torch.cuda.synchronize(device=config.device) train_loss = 0. last_train_loss = 0. new_writes = 0 time_ = time.time() if config.use_tpu: tracker = xm.RateTracker() model.train() for batch_idx, (input, _) in enumerate(data_loader): input = input.to(config.device, non_blocking=True) if config.noising_factor is not None: false_input = input + config.noising_factor * config.noise_function(input.shape) false_input.clamp_(min=-1, max=1) output = model(false_input) else: output = model(input) loss = loss_function(input, output) optimizer.zero_grad() loss.backward() if config.use_tpu: xm.optimizer_step(optimizer) tracker.add(config.batch_size) else: optimizer.step() train_loss += loss if config.print_every and (batch_idx + 1) % config.print_every == 0 : deno = config.print_every * config.batch_size * np.prod(input_shape) * np.log(2.) if not config.use_tpu: writer.add_scalar('train/bpd', (train_loss / deno), writes + new_writes) print('\t{:3d}/{:3d} - loss : {:.4f}, time : {:.3f}s'.format( batch_idx // config.print_every + 1, len(train_loader) // config.print_every, (train_loss / deno), (time.time() - time_) )) last_train_loss = train_loss train_loss = 0. new_writes += 1 time_ = time.time() del input, _, loss, output return new_writes, (last_train_loss / deno) def validation_loop(data_loader, writes=0): if torch.cuda.is_available(): torch.cuda.synchronize(device=config.device) model.eval() test_loss = 0. 
with torch.no_grad(): for batch_idx, (input, _) in enumerate(data_loader): input = input.to(config.device, non_blocking=True) output = model(input) loss = loss_function(input, output) test_loss += loss del loss, output deno = batch_idx * config.batch_size * np.prod(input_shape) * np.log(2.) writer.add_scalar('validation/bpd', (test_loss / deno), writes) print('\t{}epoch {:4} validation loss : {:.4f}'.format( '' if not config.use_tpu else xm.get_ordinal(), epoch, (test_loss / deno) ), flush=True ) if config.save_interval and (epoch + 1) % config.save_interval == 0: torch.save(model.state_dict(), config.models_dir + '/{}_{}.pth'.format(config.model_name, epoch)) print('\tsampling epoch {:4}'.format( epoch )) sample_t = sample(model, input_shape) sample_t = rescaling_inv(sample_t) utils.save_image(sample_t, config.samples_dir + '/{}_{}.png'.format(config.model_name, epoch), nrow=5, padding=0) return test_loss / deno try: writes = 0 for epoch in range(config.start_epoch, config.max_epochs): print('epoch {:4} - lr: {}'.format(epoch, optimizer.param_groups[0]["lr"])) if config.use_tpu: para_loader = pl.ParallelLoader(train_loader, [config.device]) train_loop(para_loader.per_device_loader(config.device), writes) xm.master_print("\tFinished training epoch {}".format(epoch)) else: new_writes, train_loss = train_loop(train_loader, writes) train_losses.append(train_loss) writes += new_writes # learning rate schedule scheduler.step(epoch) if config.use_tpu: para_loader = pl.ParallelLoader(validation_loader, [config.device]) validation_loop(para_loader.per_device_loader(config.device), writes) else: validation_loss = validation_loop(validation_loader, writes) validation_losses.append(validation_loss) model_name = f'{"DCNNpp" if config.noising_factor is not None else "PCNNpp"}-E{epoch}' # evaluation and loss tracking if config.plot_every and (epoch + 1) % config.plot_every == 0: plot_loss( train_losses, validation_losses, model_name=f'{"DCNNpp" if config.noising_factor is not None else "PCNNpp"}-{optimizer.param_groups[0]["lr"]:.7f}' , save_path=config.losses_dir + f'/Losses{model_name}.png', ) if config.evaluate_every and (epoch + 1) % config.evaluate_every == 0: eval_data = evaluate(model, dataset_test, test_loader) plot_evaluation( eval_data, model_name=f'{"DCNNpp" if config.noising_factor is not None else "PCNNpp"}-E{epoch}', save_path=config.evaluation_dir + f'/EvalPlot{model_name}.png' ) show_extreme_cases( eval_data, model_name=model_name, save_dir=config.extreme_cases_dir ) writes += 1 except KeyboardInterrupt: pass return model, train_losses, validation_losses
def reproduce(n_epochs=457, batch_size=128, log_dir="/tmp/run", device="cuda", n_channels=1, n_pixel_snail_blocks=1, n_residual_blocks=1, attention_value_channels=1, attention_key_channels=1, evalFlag=False, evaldir="/tmp/run", sampling_part=1): """Training script with defaults to reproduce results. The code inside this function is self contained and can be used as a top level training script, e.g. by copy/pasting it into a Jupyter notebook. Args: n_epochs: Number of epochs to train for. batch_size: Batch size to use for training and evaluation. log_dir: Directory where to log trainer state and TensorBoard summaries. device: Device to train on (either 'cuda' or 'cpu'). debug_loader: Debug DataLoader which replaces the default training and evaluation loaders if not 'None'. Do not use unless you're writing unit tests. """ from torch import optim from torch.nn import functional as F from torch.optim import lr_scheduler from torch.utils import data from torchvision import datasets from torchvision import transforms from pytorch_generative import trainer from pytorch_generative import models #################################################################################################################### #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~EB~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # Load ImageGPT Data : import gmpm train = gmpm.train test = gmpm.test train_loader = data.DataLoader( data.TensorDataset(torch.Tensor(train), torch.rand(len(train))), batch_size=batch_size, shuffle=True, num_workers=8, ) test_loader = data.DataLoader( data.TensorDataset(torch.Tensor(test), torch.rand(len(test))), batch_size=batch_size, num_workers=8, ) attention_value_channels = attention_value_channels attention_key_channels = attention_key_channels model = models.PixelSNAIL( #################################################################################################################### # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~EB~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # Change Input / Output size : # 3 channels - image after clusters mapping function as input to NN : in_channels=3, # 512 channels - each pixel get probability to get value from 0 to 511 out_channels=512, #################################################################################################################### n_channels=n_channels, n_pixel_snail_blocks=n_pixel_snail_blocks, n_residual_blocks=n_residual_blocks, attention_value_channels=attention_value_channels, attention_key_channels=attention_key_channels, ) optimizer = optim.Adam(model.parameters(), lr=1e-4) scheduler = lr_scheduler.MultiplicativeLR(optimizer, lr_lambda=lambda _: 0.999977) def loss_fn(x, _, preds): #################################################################################################################### # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~EB~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # Update loss function to CrossEntropyLoss : x = x.long() criterion = nn.NLLLoss() B, C, D = preds.size() preds_2d = preds.view(B, C, D, -1) x_2d = x.view(B, D, -1) loss = criterion(preds_2d, x_2d.long()) #################################################################################################################### return loss _model = model.to(device) trainer = trainer.Trainer( model=model, loss_fn=loss_fn, optimizer=optimizer, train_loader=train_loader, eval_loader=test_loader, lr_scheduler=scheduler, log_dir=log_dir, device=device, sample_epochs=5, 
sample_fn=None, n_channels=n_channels, n_pixel_snail_blocks=n_pixel_snail_blocks, n_residual_blocks=n_residual_blocks, attention_value_channels=attention_value_channels, attention_key_channels=attention_key_channels, evalFlag=evalFlag, evaldir=evaldir, sampling_part=sampling_part) trainer.interleaved_train_and_eval(n_epochs)
def train(self, train_set, config, val_func=None): tab_dataset = TabularDataset(train_set) n_samples = len(tab_dataset) train_loader = torch_data.DataLoader(tab_dataset, batch_size=config["batch_size"], shuffle=self.shuffle, num_workers=constants.LOAD_WORKERS) # Set network to train mode self.net.train() # Keep track of best so far best_ll = None # best validation score best_mae = None best_epoch_i = None best_save_path = os.path.join(wandb.run.dir, constants.BEST_PARAMS_FILE ) # Path to save best params to # Optimizer opt = networks.optimizers[config["optimizer"]]( self.net.parameters(), lr=config["lr"], weight_decay=config["l2_reg"]) mult_func = (lambda x: config["lr_decay"]) scheduler = schedulers.MultiplicativeLR(opt, lr_lambda=mult_func) for epoch_i in range(config["epochs"]): epoch_loss = [] for batch_i, (x_batch, y_batch) in enumerate(train_loader): batch_size = x_batch.shape[0] # Send to correct device x_batch = x_batch.to(config["device"]) y_batch = y_batch.to(config["device"]) opt.zero_grad() # Train network net_inputs = self.process_net_input(x_batch, y_batch=y_batch) logits = self.net(net_inputs) loss = self.loss(logits, y_batch, x_batch=x_batch, batch_i=batch_i) loss.backward() opt.step() # Store loss epoch_loss.append(loss.item()) # Log epoch stats wandb.log({ "epoch": epoch_i, "loss": np.mean(epoch_loss) }) if val_func and ((epoch_i+1) % config["val_interval"] == 0): evaluation_vals = val_func(self, epoch_i) if (best_epoch_i == None) or (evaluation_vals["ll"] > best_ll): best_ll = evaluation_vals["ll"] best_mae = evaluation_vals["mae"] best_epoch_i = epoch_i # Save model parameters for best epoch only model_params = self.net.state_dict() torch.save(model_params, best_save_path) scheduler.step() # Perform possible additional training self.post_training(train_set, config) wandb.run.summary["best_epoch"] = best_epoch_i # Save best epoch index to wandb wandb.run.summary["log_likelihood"] = best_ll wandb.run.summary["mae"] = best_mae # Restore best parameters to model (for future testing etc.) self.load_params_from_file(best_save_path)
def test_MultiplicativeLR(self, debug=True):
    """
    Usage:
        python template_lib/modelarts/scripts/copy_tool.py \
          -s s3://bucket-7001/ZhouPeng/pypi/torch1_7_0 -d /cache/pypi -t copytree
        for filename in /cache/pypi/*.whl; do
            pip install $filename
        done
        proj_root=moco-exp
        python template_lib/modelarts/scripts/copy_tool.py \
          -s s3://bucket-7001/ZhouPeng/codes/$proj_root -d /cache/$proj_root -t copytree -b /cache/$proj_root/code.zip
        cd /cache/$proj_root
        pip install -r requirements.txt

        export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
        export TIME_STR=1
        export PYTHONPATH=./exp:./stylegan2-pytorch:./
        python -c "from exp.tests.test_styleganv2 import Testing_stylegan2;\
          Testing_stylegan2().test_train_ffhq_128()"
    :return:
    """
    if 'CUDA_VISIBLE_DEVICES' not in os.environ:
        os.environ['CUDA_VISIBLE_DEVICES'] = '0'
    if 'TIME_STR' not in os.environ:
        os.environ['TIME_STR'] = '0' if utils.is_debugging() else '0'
    from template_lib.v2.config_cfgnode.argparser import (
        get_command_and_outdir, setup_outdir_and_yaml, get_append_cmd_str, start_cmd_run)

    tl_opts = ' '.join(sys.argv[sys.argv.index('--tl_opts') + 1:]) if '--tl_opts' in sys.argv else ''
    print(f'tl_opts:\n {tl_opts}')

    command, outdir = get_command_and_outdir(
        self, func_name=sys._getframe().f_code.co_name, file=__file__)
    argv_str = f"""
                --tl_config_file none
                --tl_command none
                --tl_outdir {outdir}
                """
    args = setup_outdir_and_yaml(argv_str, return_cfg=True)

    import torch.nn as nn
    from torch.optim import lr_scheduler
    from matplotlib import pyplot as plt

    def plot_lr(scheduler, title='', labels=['base'], nrof_epoch=100):
        lr_li = [[] for _ in range(len(labels))]
        epoch_li = list(range(nrof_epoch))
        for epoch in epoch_li:
            scheduler.step()  # compute and apply the current epoch's lr to the optimizer's param groups
            lr = scheduler.get_last_lr()  # get the lr for the current epoch
            for i in range(len(labels)):
                lr_li[i].append(lr[i])
        for lr, label in zip(lr_li, labels):
            plt.plot(epoch_li, lr, label=label)
        plt.grid()
        plt.xlabel('epoch')
        plt.ylabel('lr')
        plt.title(title)
        plt.legend()
        plt.show()

    ## Visualize the MultiplicativeLR learning rate schedule
    base = nn.Linear(3, 32)
    fc = nn.Linear(32, 10)
    optimizer = SGD(
        [
            {'params': base.parameters()},
            {'params': fc.parameters(), 'lr': 0.05}  # use a different lr for the fc parameters
        ],
        lr=0.1, momentum=0.9)
    lambda_base = lambda epoch: 0.5 if epoch % 10 == 0 else 1
    lambda_fc = lambda epoch: 0.8 if epoch % 10 == 0 else 1
    scheduler = lr_scheduler.MultiplicativeLR(optimizer, [lambda_base, lambda_fc])
    plot_lr(scheduler, title='MultiplicativeLR', labels=['base', 'fc'])
    pass
N_EPOCHS = 427
IN_CHANNELS = 1
N_CHANNELS = 64
N_PIXEL_SNAIL_BLOCKS = 8
N_RESIDUAL_BLOCKS = 2
ATTENTION_VALUE_CHANNELS = N_CHANNELS // 2
ATTENTION_KEY_CHANNELS = ATTENTION_VALUE_CHANNELS // 8

torch.cuda.empty_cache()

model = PixelSNAIL(IN_CHANNELS,
                   N_CHANNELS,
                   N_PIXEL_SNAIL_BLOCKS,
                   N_RESIDUAL_BLOCKS,
                   ATTENTION_KEY_CHANNELS,
                   ATTENTION_VALUE_CHANNELS).to(torch.device("cuda"))
optimizer = optim.Adam(model.parameters(), lr=1e-3)
scheduler = lr_scheduler.MultiplicativeLR(optimizer, lr_lambda=lambda _: .999977)
bce_loss_fn = nn.BCELoss(reduction='none')


def loss_fn(x, _, preds):
    batch_size = x.shape[0]
    x, preds = x.view((batch_size, -1)), preds.view((batch_size, -1))
    return bce_loss_fn(preds, x).sum(dim=1).mean()


trainer = pg.trainer.Trainer(model=model,
                             loss_fn=loss_fn,
                             optimizer=optimizer,
                             train_loader=train_loader,
                             eval_loader=test_loader,
                             lr_scheduler=scheduler,
def __init__( self, hparams=None, name=None, model=None, device=None, ): super(RobustAutoEncoderPreTrainer, self).__init__( hparams, name, model, device, ) hparams = self.get(constants.HPARAMS_DICT) # pgd encoder decoder inputs input_limits = self.get('drmade').decoder.output_limits pgd_eps = hparams.get('pgd/eps', model_config.pretrain_ae_pgd_eps) pgd_iterations = hparams.get('pgd/iterations', model_config.pretrain_ae_pgd_iterations) pgd_alpha = hparams.get('pgd/alpha', model_config.pretrain_ae_pgd_alpha) pgd_randomize = hparams.get('pgd/randomize', model_config.pretrain_ae_pgd_randomize) pgd_input = { 'eps': pgd_eps, 'iterations': pgd_iterations, 'alpha': pgd_alpha, 'randomize': pgd_randomize, 'input_limits': input_limits } # pgd decoder inputs latent_input_limits = self.get('drmade').encoder.output_limits pgd_latent_eps = hparams.get('pgd_latent/eps', model_config.pretrain_ae_latent_pgd_eps) pgd_latent_iterations = hparams.get( 'pgd_latent/iterations', model_config.pretrain_ae_latent_pgd_iterations) pgd_latent_alpha = hparams.get( 'pgd_latent/alpha', model_config.pretrain_ae_latent_pgd_alpha) pgd_latent_randomize = hparams.get( 'pgd_latent/randomize', model_config.pretrain_ae_latent_pgd_randomize) pgd_latent = { 'eps': pgd_latent_eps, 'iterations': pgd_latent_iterations, 'alpha': pgd_latent_alpha, 'randomize': pgd_latent_randomize, 'input_limits': latent_input_limits } base_lr = hparams.get('base_lr', model_config.base_lr) lr_decay = hparams.get('lr_decay', model_config.lr_decay) lr_schedule = hparams.get('lr_schedule', model_config.lr_schedule) print('freezing made') for parameter in self.get('drmade').made.parameters(): parameter.requires_grad = False print('unfreezing encoder & decoder') for parameter in self.get('drmade').encoder.parameters(): parameter.requires_grad = True for parameter in self.get('drmade').decoder.parameters(): parameter.requires_grad = True print(f'initializing optimizer Adam - base_lr:{base_lr}') optimizer = Adam([{ 'params': self.get('drmade').encoder.parameters() }, { 'params': self.get('drmade').decoder.parameters() }], lr=base_lr) self.add_optimizer('ae', optimizer) print( f'initializing learning rate scheduler - lr_decay:{lr_decay} schedule:{lr_schedule}' ) self.set( 'lr_multiplicative_factor_lambda', lambda epoch: 0.5 if (epoch + 1) % lr_schedule == 0 else lr_decay) self.add_scheduler( 'ae', lr_scheduler.MultiplicativeLR( optimizer, lr_lambda=self.get('lr_multiplicative_factor_lambda'), last_epoch=-1)) self.set( constants.TRAINER_NAME, name or 'PreTrain-{}-{}:{}|{}{}Adam-lr{}-schedule{}-decay{}'.format( self.get(constants.TRAINER_NAME), self.get('drmade').encoder.name, self.get('drmade').decoder.name, '' if not pgd_eps else 'pgd-eps{}-iterations{}alpha{}{}|'.format( pgd_eps, pgd_iterations, pgd_alpha, 'randomized' if pgd_randomize else '', ), '' if not pgd_latent_eps else 'pgd-latent-eps{}-iterations{}alpha{}{}|'.format( pgd_latent_eps, pgd_latent_iterations, pgd_latent_alpha, 'randomized' if pgd_latent_randomize else '', ), base_lr, lr_schedule, lr_decay), replace=True) print("Pre Trainer: ", self.get(constants.TRAINER_NAME)) self.add_loop( RobustAEFeedLoop(name='train', data_loader=self.context['train_loader'], device=self.context[constants.DEVICE], optimizers=('ae', ), pgd_input=pgd_input, pgd_latent=pgd_latent, log_interval=hparams.get( 'log_interval', config.log_data_feed_loop_interval))) self.add_loop( RobustAEFeedLoop( name='validation', data_loader=self.context['validation_loader'], device=self.context[constants.DEVICE], optimizers=tuple(), 
pgd_input=pgd_input, pgd_latent=pgd_latent, interval=hparams.get('validation_interval', model_config.validation_interval), log_interval=hparams.get('log_interval', config.log_data_feed_loop_interval))) self.setup_writer()
def train(self, train_set, config, val_func=None): tab_dataset = TabularDataset(train_set) n_samples = len(tab_dataset) train_loader = torch_data.DataLoader( tab_dataset, batch_size=config["batch_size"], shuffle=True, num_workers=constants.LOAD_WORKERS) # Set models to train mode self.gen.train() self.disc.train() # Keep track of best so far best_ll = None # best validation log-likelihood best_mae = None # MAE where LL is best (not necessarily best MAE) best_epoch_i = None best_save_path = os.path.join( wandb.run.dir, constants.BEST_PARAMS_FILE) # Path to save best params to # Optimizers (see GAN-hacks) gen_opt = networks.optimizers[config["gen_optimizer"]]( self.gen.parameters(), lr=config["gen_lr"]) disc_opt = networks.optimizers[config["disc_optimizer"]]( self.disc.parameters(), lr=config["disc_lr"]) # returns multiplicative factor, not new learning rate gen_mult_func = (lambda x: config["gen_lr_decay"]) disc_mult_func = (lambda x: config["disc_lr_decay"]) gen_scheduler = schedulers.MultiplicativeLR(gen_opt, lr_lambda=gen_mult_func) disc_scheduler = schedulers.MultiplicativeLR(disc_opt, lr_lambda=disc_mult_func) for epoch_i in range(config["epochs"]): epoch_disc_loss = [] epoch_gen_loss = [] epoch_fooling = [] for batch_i, (x_batch, data_batch) in enumerate(train_loader): batch_size = data_batch.shape[0] # Send to correct device x_batch = x_batch.to(config["device"]) data_batch = data_batch.to(config["device"]) disc_opt.zero_grad() # Sample noise noise_batch = self.noise_dist.sample([batch_size ]).to(config["device"]) # Sample from generator gen_input = torch.cat((x_batch, noise_batch), dim=1) gen_batch = self.gen(gen_input) # Train discriminator data_logits = self.disc(torch.cat((x_batch, data_batch), dim=1)) gen_logits = self.disc(torch.cat((x_batch, gen_batch), dim=1)) disc_loss = self.disc_loss(data_logits, gen_logits) disc_loss.backward() if config["clip_grad"]: nn.utils.clip_grad_norm_(self.disc.parameters(), config["clip_grad"]) disc_opt.step() gen_opt.zero_grad() # Train generator ("new_" here just means part of G training steps) n_gen_samples = batch_size * config["gen_samples"] new_noise_batch = self.noise_dist.sample([n_gen_samples]).to( config["device"]) if config["gen_samples"] > 1: # Repeat each x sample an amount of times # to get multiple generator samples for it x_batch_repeated = torch.repeat_interleave( x_batch, config["gen_samples"], dim=0) else: x_batch_repeated = x_batch new_gen_input = torch.cat((x_batch_repeated, new_noise_batch), dim=1) new_gen_batch = self.gen(new_gen_input) new_gen_logits = self.disc( torch.cat((x_batch_repeated, new_gen_batch), dim=1)) gen_loss = self.gen_loss(new_gen_logits) gen_loss.backward() if config["clip_grad"]: nn.utils.clip_grad_norm_(self.gen.parameters(), config["clip_grad"]) gen_opt.step() # Store loss batch_fooling = torch.mean(torch.sigmoid(new_gen_logits)) epoch_fooling.append(batch_fooling.item()) epoch_disc_loss.append(disc_loss.item()) epoch_gen_loss.append(gen_loss.item()) # Log stats for epoch wandb.log({ "epoch": epoch_i, "discriminator_loss": np.mean(epoch_disc_loss), "generator_loss": np.mean(epoch_gen_loss), "fooling": np.mean(epoch_fooling), }) if val_func and ((epoch_i + 1) % config["val_interval"] == 0): # Validate evaluation_vals = val_func(self, epoch_i) if (best_epoch_i == None) or (evaluation_vals["ll"] > best_ll): best_ll = evaluation_vals["ll"] best_mae = evaluation_vals["mae"] best_epoch_i = epoch_i # Save model parameters for best epoch only model_params = { "gen": self.gen.state_dict(), "disc": 
self.disc.state_dict(), } torch.save(model_params, best_save_path) gen_scheduler.step() disc_scheduler.step() wandb.run.summary[ "best_epoch"] = best_epoch_i # Save best epoch index to wandb wandb.run.summary["log_likelihood"] = best_ll wandb.run.summary["mae"] = best_mae # Restore best parameters to model (for future testing etc.) self.load_params_from_file(best_save_path)