def __init__(self, image_size, latent_dim=512, style_depth=8, network_capacity=16,
             transparent=False, fp16=False, cl_reg=False, steps=1, lr=1e-4,
             fq_layers=None, fq_dict_size=256, attn_layers=None):
    """Build the StyleGAN training wrapper.

    Creates the style mapper (S), generator (G) and discriminator (D),
    plus frozen EMA copies (SE/GE) used for evaluation, and one DiffGrad
    optimizer per adversarial side.

    Bug fix: `fq_layers` and `attn_layers` previously used mutable list
    defaults (`[]`), which Python shares across all calls; `None` is now
    the sentinel and is normalized to a fresh list below. Callers passing
    explicit lists are unaffected.
    """
    super().__init__()
    # Normalize the mutable-default arguments.
    fq_layers = [] if fq_layers is None else fq_layers
    attn_layers = [] if attn_layers is None else attn_layers

    self.lr = lr
    self.steps = steps
    self.ema_updater = EMA(0.995)

    # Trainable networks.
    self.S = StyleVectorizer(latent_dim, style_depth)
    self.G = Generator(image_size, latent_dim, network_capacity, transparent=transparent)
    self.D = Discriminator(image_size, network_capacity, fq_layers=fq_layers,
                           fq_dict_size=fq_dict_size, attn_layers=attn_layers,
                           transparent=transparent)

    # EMA shadow copies; never optimized directly.
    self.SE = StyleVectorizer(latent_dim, style_depth)
    self.GE = Generator(image_size, latent_dim, network_capacity, transparent=transparent)

    # experimental contrastive loss discriminator regularization
    self.D_cl = ContrastiveLearner(self.D, image_size, hidden_layer='flatten') if cl_reg else None

    set_requires_grad(self.SE, False)
    set_requires_grad(self.GE, False)

    # Generator-side optimizer covers both the synthesis net and the mapper.
    generator_params = list(self.G.parameters()) + list(self.S.parameters())
    self.G_opt = DiffGrad(generator_params, lr=self.lr, betas=(0.5, 0.9))
    self.D_opt = DiffGrad(self.D.parameters(), lr=self.lr, betas=(0.5, 0.9))

    self._init_weights()
    self.reset_parameter_averaging()

    self.cuda()

    if fp16:
        # apex mixed-precision wraps models and optimizers together.
        (self.S, self.G, self.D, self.SE, self.GE), (self.G_opt, self.D_opt) = amp.initialize(
            [self.S, self.G, self.D, self.SE, self.GE],
            [self.G_opt, self.D_opt], opt_level='O2')
def __init__(self, image_size, latent_dim=512, style_depth=8, network_capacity=16, steps=1, lr=1e-4):
    """Assemble mapper (S), generator (G), discriminator (D), their frozen
    EMA shadows (SE, GE), and a DiffGrad optimizer for each side."""
    super().__init__()
    self.lr = lr
    self.steps = steps
    self.ema_updater = EMA(0.99)

    # Networks that receive gradient updates.
    self.S = StyleVectorizer(latent_dim, style_depth)
    self.G = Generator(image_size, latent_dim, network_capacity)
    self.D = Discriminator(image_size, network_capacity)

    # Exponential-moving-average shadows — evaluation only, never trained.
    self.SE = StyleVectorizer(latent_dim, style_depth)
    self.GE = Generator(image_size, latent_dim, network_capacity)
    set_requires_grad(self.SE, False)
    set_requires_grad(self.GE, False)

    # The generator-side optimizer updates the mapper as well.
    trainable_g = [*self.G.parameters(), *self.S.parameters()]
    self.G_opt = DiffGrad(trainable_g, lr=self.lr, betas=(0.5, 0.9))
    self.D_opt = DiffGrad(self.D.parameters(), lr=self.lr, betas=(0.5, 0.9))

    self._init_weights()
    self.reset_parameter_averaging()
def __init__(self, image_size, label_dim, latent_dim=LATENT_DIM, style_depth=STYLE_DEPTH,
             network_capacity=NETWORK_CAPACITY, steps=1, lr=LEARNING_RATE, channels=CHANNELS,
             condition_on_mapper=CONDITION_ON_MAPPER, use_biases=USE_BIASES,
             label_epsilon=LABEL_EPSILON):
    """Label-conditional variant: every network also receives `label_dim`,
    and conditioning may be applied at the mapper (`condition_on_mapper`)."""
    super().__init__()
    self.condition_on_mapper = condition_on_mapper
    self.lr = lr
    self.steps = steps
    self.ema_updater = EMA(0.99)

    # Keyword bundles shared by the trainable nets and their EMA shadows.
    mapper_kwargs = dict(condition_on_mapper=self.condition_on_mapper, use_biases=use_biases)

    # Trainable networks.
    self.S = StyleVectorizer(latent_dim, label_dim, style_depth, **mapper_kwargs)
    self.G = Generator(image_size, latent_dim, label_dim, network_capacity,
                       channels=channels, **mapper_kwargs)
    self.D = Discriminator(image_size, label_dim, network_capacity=network_capacity,
                           channels=channels, label_epsilon=label_epsilon)

    # Frozen EMA shadows of the mapper and generator.
    self.SE = StyleVectorizer(latent_dim, label_dim, style_depth, **mapper_kwargs)
    self.GE = Generator(image_size, latent_dim, label_dim, network_capacity,
                        channels=channels, **mapper_kwargs)
    set_requires_grad(self.SE, False)
    set_requires_grad(self.GE, False)

    # One optimizer per adversarial side; generator side includes the mapper.
    g_side = [*self.G.parameters(), *self.S.parameters()]
    self.G_opt = DiffGrad(g_side, lr=self.lr, betas=(0.5, 0.9))
    self.D_opt = DiffGrad(self.D.parameters(), lr=self.lr, betas=(0.5, 0.9))

    self.use_biases = use_biases
    self._init_weights()
    self.reset_parameter_averaging()
def __init__(self, image_size, latent_dim=512, style_depth=8, network_capacity=16,
             transparent=False, fp16=False, steps=1, lr=1e-4):
    """Build the trainer: mapper/generator/discriminator, frozen EMA copies,
    DiffGrad optimizers, then move everything to GPU (optionally fp16 via apex)."""
    super().__init__()
    self.lr = lr
    self.steps = steps
    self.ema_updater = EMA(0.995)

    # Trainable networks.
    self.S = StyleVectorizer(latent_dim, style_depth)
    self.G = Generator(image_size, latent_dim, network_capacity, transparent=transparent)
    self.D = Discriminator(image_size, network_capacity, transparent=transparent)

    # EMA shadows — evaluation copies, excluded from autograd.
    self.SE = StyleVectorizer(latent_dim, style_depth)
    self.GE = Generator(image_size, latent_dim, network_capacity, transparent=transparent)
    set_requires_grad(self.SE, False)
    set_requires_grad(self.GE, False)

    # Generator optimizer also trains the style mapper.
    g_side_params = [*self.G.parameters(), *self.S.parameters()]
    self.G_opt = DiffGrad(g_side_params, lr=self.lr, betas=(0.5, 0.9))
    self.D_opt = DiffGrad(self.D.parameters(), lr=self.lr, betas=(0.5, 0.9))

    self._init_weights()
    self.reset_parameter_averaging()
    self.cuda()

    if fp16:
        # apex wraps the module list and optimizer list in one call.
        models = [self.S, self.G, self.D, self.SE, self.GE]
        opts = [self.G_opt, self.D_opt]
        (self.S, self.G, self.D, self.SE, self.GE), (self.G_opt, self.D_opt) = \
            amp.initialize(models, opts, opt_level='O2')
def __init__(self, image_size, latent_dim=512, noise_dim=100, style_depth=8,
             network_capacity=16, transparent=False, steps=1, lr=2e-4):
    """Variant with a noise vectorizer (N) and an extraction network (E),
    each with its own DiffGrad optimizer alongside G_opt/D_opt."""
    super().__init__()
    self.lr = lr
    self.steps = steps
    self.ema_updater = EMA(0.995)

    # Trainable networks.
    self.S = StyleVectorizer(latent_dim, style_depth)
    self.N = NoiseVectorizer(noise_dim)
    self.G = Generator(image_size, latent_dim, network_capacity, transparent=transparent)
    self.D = Discriminator(image_size, network_capacity, transparent=transparent)

    # Extraction network (fixed capacity of 64).
    self.E = ExtractNetSimilarE(64)

    # Frozen EMA shadows of S, N and G.
    self.SE = StyleVectorizer(latent_dim, style_depth)
    self.NE = NoiseVectorizer(noise_dim)
    self.GE = Generator(image_size, latent_dim, network_capacity, transparent=transparent)
    for shadow in (self.SE, self.NE, self.GE):
        set_requires_grad(shadow, False)

    # Generator side trains G, the mapper and the noise vectorizer together.
    g_side = [*self.G.parameters(), *self.S.parameters(), *self.N.parameters()]
    self.G_opt = DiffGrad(g_side, lr=self.lr, betas=(0.5, 0.9))
    self.D_opt = DiffGrad(self.D.parameters(), lr=self.lr, betas=(0.5, 0.9))

    # Dedicated optimizers for the extractor and the noise vectorizer.
    self.E_opt = DiffGrad(list(self.E.parameters()), lr=self.lr, betas=(0.5, 0.9))
    self.N_opt = DiffGrad(list(self.N.parameters()), lr=self.lr, betas=(0.5, 0.9))

    self._init_weights()
    self.reset_parameter_averaging()
def __init__(self, image_size, latent_dim=512, style_depth=8, network_capacity=16,
             transparent=False, fp16=False, cl_reg=False, steps=1, lr=1e-4,
             fq_layers=None, fq_dict_size=256, freeze_g=False, freeze_d=False):
    """Build generator/discriminator with optional per-network freezing.

    Optimizers are attached to the networks themselves (``self.G.opt`` /
    ``self.D.opt``) rather than stored on the trainer.

    Bug fix: ``fq_layers`` previously defaulted to a mutable ``[]`` shared
    across all calls; ``None`` is now the sentinel, normalized below.

    NOTE(review): ``style_depth``, ``fp16`` and ``cl_reg`` are accepted but
    unused in this variant — kept for signature compatibility with the
    other trainers; confirm against callers before removing.
    """
    super().__init__()
    fq_layers = [] if fq_layers is None else fq_layers  # avoid shared mutable default

    self.lr = lr
    self.steps = steps
    self.ema_decay = 0.995

    self.G = Generator(image_size, latent_dim, network_capacity, transparent=transparent)
    self.D = Discriminator(image_size, network_capacity, fq_layers=fq_layers,
                           fq_dict_size=fq_dict_size, transparent=transparent)

    # EMA shadow of the generator; never optimized directly.
    self.GE = Generator(image_size, latent_dim, network_capacity, transparent=transparent)
    set_requires_grad(self.GE, False)

    # Optional transfer-learning style freezing.
    if freeze_g:
        self.G.freeze_()
    if freeze_d:
        self.D.freeze_()

    self.G.opt = DiffGrad(self.G.parameters(), lr=self.lr, betas=(0.5, 0.9))
    self.D.opt = DiffGrad(self.D.parameters(), lr=self.lr, betas=(0.5, 0.9))

    self._init_weights()
    self.reset_parameter_averaging()
def forward(self):
    """Run the full imagining loop: optionally fit to a starting image first,
    then train for `self.epochs` x `self.iterations` steps via train_step."""
    # Phase 1: if a start image was supplied, pre-optimize the model toward it.
    if exists(self.start_image):
        tqdm.write('Preparing with initial image...')
        optim = DiffGrad(self.model.parameters(), lr=self.start_image_lr)
        pbar = trange(self.start_image_train_iters, desc='iteration')
        for _ in pbar:
            loss = self.model.model(self.start_image)
            loss.backward()
            pbar.set_description(f'loss: {loss.item():.2f}')
            optim.step()
            optim.zero_grad()
            # NOTE(review): `terminate` is not defined in this block — presumably a
            # module-level flag set by a signal handler; confirm it exists at module scope.
            if terminate:
                print('interrupted by keyboard, gracefully exiting')
                return exit()
        # Free the warm-up state before the main loop.
        del self.start_image
        del optim

    tqdm.write(f'Imagining "{self.text}" from the depths of my weights...')

    # Open the output folder once, on the first run only.
    if self.open_folder:
        open_folder('./')
        self.open_folder = False

    # Phase 2: main training loop; bail out between steps if interrupted.
    for epoch in trange(self.epochs, desc='epochs'):
        pbar = trange(self.iterations, desc='iteration')
        for i in pbar:
            loss = self.train_step(epoch, i)
            pbar.set_description(f'loss: {loss.item():.2f}')
            if terminate:
                print('interrupted by keyboard, gracefully exiting')
                return
def forward(self):
    """Full imagining loop: optional start-image warm-up, a CLIP warm-up pass,
    then the epoch/iteration training loop, with a final image save and
    optional gif/video generation. KeyboardInterrupt exits gracefully."""
    # Phase 1: optional pre-optimization toward a user-supplied start image.
    if exists(self.start_image):
        tqdm.write('Preparing with initial image...')
        optim = DiffGrad(self.model.model.parameters(), lr=self.start_image_lr)
        pbar = trange(self.start_image_train_iters, desc='iteration')
        try:
            for _ in pbar:
                loss = self.model.model(self.start_image)
                loss.backward()
                pbar.set_description(f'loss: {loss.item():.2f}')
                optim.step()
                optim.zero_grad()
        except KeyboardInterrupt:
            print('interrupted by keyboard, gracefully exiting')
            return exit()
        # Release warm-up state before the main loop.
        del self.start_image
        del optim

    tqdm.write(
        f'Imagining "{self.textpath}" from the depths of my weights...')

    with torch.no_grad():
        self.model(
            self.clip_encoding, dry_run=True
        )  # do one warmup step due to potential issue with CLIP and CUDA

    # Open the output folder once, on the first run only.
    if self.open_folder:
        if self.output_folder:
            open_folder(self.output_folder)
        else:
            open_folder('./')
        self.open_folder = False

    # Phase 2: main training loop.
    try:
        for epoch in trange(self.epochs, desc='epochs'):
            pbar = trange(self.iterations, desc='iteration')
            for i in pbar:
                _, loss = self.train_step(epoch, i)
                pbar.set_description(f'loss: {loss.item():.2f}')

            # Update clip_encoding per epoch if we are creating a story
            if self.create_story:
                self.clip_encoding = self.update_story_encoding(epoch, i)
    except KeyboardInterrupt:
        print('interrupted by keyboard, gracefully exiting')
        return

    self.save_image(epoch, i)  # one final save at end

    if (self.save_gif or self.save_video) and self.save_progress:
        self.generate_gif()
def forward(self):
    """Imagining loop: optional start-image warm-up, a dry-run warm-up pass,
    the main epoch/iteration loop, and a final image save."""
    # Phase 1: optional pre-optimization toward a user-supplied start image.
    if exists(self.start_image):
        tqdm.write('Preparing with initial image...')
        optim = DiffGrad(self.model.parameters(), lr = self.start_image_lr)
        pbar = trange(self.start_image_train_iters, desc='iteration')
        for _ in pbar:
            loss = self.model.model(self.start_image)
            loss.backward()
            pbar.set_description(f'loss: {loss.item():.2f}')
            optim.step()
            optim.zero_grad()
            # NOTE(review): `terminate` is not defined in this block — presumably a
            # module-level flag flipped by a signal handler; confirm at module scope.
            if terminate:
                print('interrupted by keyboard, gracefully exiting')
                return sys.exit()
        # Free the warm-up state before the main loop.
        del self.start_image
        del optim

    tqdm.write(f'Imagining "{self.textpath}" from the depths of my weights...')

    self.model(self.clip_encoding, dry_run = True) # do one warmup step due to potential issue with CLIP and CUDA

    # Open the output folder once, on the first run only.
    if self.open_folder:
        open_folder('./')
        self.open_folder = False

    # Phase 2: main training loop; bail out between steps if interrupted.
    for epoch in trange(self.epochs, desc='epochs'):
        pbar = trange(self.iterations, desc='iteration')
        for i in pbar:
            loss = self.train_step(epoch, i)
            pbar.set_description(f'loss: {loss.item():.2f}')
            if terminate:
                print('interrupted by keyboard, gracefully exiting')
                return

    self.save_image(self.epochs, self.iterations) # one final save at end
def __init__(
        self,
        *,
        text=None,
        img=None,
        clip_encoding=None,
        lr=1e-5,
        batch_size=4,
        gradient_accumulate_every=4,
        save_every=100,
        image_width=512,
        num_layers=16,
        epochs=20,
        iterations=1050,
        save_progress=True,
        seed=None,
        open_folder=True,
        save_date_time=False,
        start_image_path=None,
        start_image_train_iters=10,
        start_image_lr=3e-4,
        theta_initial=None,
        theta_hidden=None,
        model_name="ViT-B/32",
        lower_bound_cutout=0.1,  # should be smaller than 0.8
        upper_bound_cutout=1.0,
        saturate_bound=False,
        averaging_weight=0.3,
        create_story=False,
        story_start_words=5,
        story_words_per_epoch=5,
        story_separator=None,
        gauss_sampling=False,
        gauss_mean=0.6,
        gauss_std=0.2,
        do_cutout=True,
        center_bias=False,
        center_focus=2,
        optimizer="AdamP",
        jit=True,
        hidden_size=256,
        save_gif=False,
        save_video=False,
):
    """Set up the Imagine trainer: seeding, story bookkeeping, CLIP loading,
    the DeepDaze siren model, its optimizer, and (optionally) a start image.

    Bug fix: the assertion message for a missing start image referenced the
    never-assigned ``self.start_image_path``, which raised AttributeError
    instead of showing the intended message; it now uses the local
    ``start_image_path`` argument.
    """
    super().__init__()

    # Full determinism when a seed is given.
    if exists(seed):
        tqdm.write(f'setting seed: {seed}')
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)
        random.seed(seed)
        torch.backends.cudnn.deterministic = True

    # fields for story creation:
    self.create_story = create_story
    self.words = None
    self.separator = str(story_separator) if story_separator is not None else None
    if self.separator is not None and text is not None:
        # exit if text is just the separator
        if str(text).replace(' ', '').replace(self.separator, '') == '':
            print(
                'Exiting because the text only consists of the separator! Needs words or phrases that are separated by the separator.'
            )
            exit()
        # adds a space to each separator and removes double spaces that might be generated
        text = text.replace(self.separator, self.separator + ' ').replace('  ', ' ').strip()
    self.all_words = text.split(" ") if text is not None else None
    self.num_start_words = story_start_words
    self.words_per_epoch = story_words_per_epoch
    if create_story:
        assert text is not None, "We need text input to create a story..."
        # overwrite epochs to match story length
        num_words = len(self.all_words)
        self.epochs = 1 + (num_words - self.num_start_words) / self.words_per_epoch
        # add one epoch if not divisible
        self.epochs = int(self.epochs) if int(self.epochs) == self.epochs else int(self.epochs) + 1
        if self.separator is not None:
            if self.separator not in text:
                print("Separator '" + self.separator + "' will be ignored since not in text!")
                self.separator = None
            else:
                # one epoch per separator-delimited phrase
                self.epochs = len(list(filter(None, text.split(self.separator))))
        print("Running for", self.epochs,
              "epochs" + (" (split with '" + self.separator + "' as the separator)"
                          if self.separator is not None else ""))
    else:
        self.epochs = epochs

    # jit models only compatible with version 1.7.1
    if "1.7.1" not in torch.__version__:
        if jit:
            print("Setting jit to False because torch version is not 1.7.1.")
        jit = False

    # Load CLIP
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    clip_perceptor, norm = load(model_name, jit=jit, device=self.device)
    self.perceptor = clip_perceptor.eval()
    for param in self.perceptor.parameters():
        param.requires_grad = False
    # Non-jit and jit models expose the input resolution differently.
    if not jit:
        input_res = clip_perceptor.visual.input_resolution
    else:
        input_res = clip_perceptor.input_resolution.item()
    self.clip_transform = create_clip_img_transform(input_res)

    self.iterations = iterations
    self.image_width = image_width
    total_batches = self.epochs * self.iterations * batch_size * gradient_accumulate_every
    model = DeepDaze(
        self.perceptor,
        norm,
        input_res,
        total_batches,
        batch_size=batch_size,
        image_width=image_width,
        num_layers=num_layers,
        theta_initial=theta_initial,
        theta_hidden=theta_hidden,
        lower_bound_cutout=lower_bound_cutout,
        upper_bound_cutout=upper_bound_cutout,
        saturate_bound=saturate_bound,
        gauss_sampling=gauss_sampling,
        gauss_mean=gauss_mean,
        gauss_std=gauss_std,
        do_cutout=do_cutout,
        center_bias=center_bias,
        center_focus=center_focus,
        hidden_size=hidden_size,
        averaging_weight=averaging_weight,
    ).to(self.device)
    self.model = model
    self.scaler = GradScaler()

    # Only the inner siren network is optimized.
    siren_params = model.model.parameters()
    if optimizer == "AdamP":
        self.optimizer = AdamP(siren_params, lr)
    elif optimizer == "Adam":
        self.optimizer = torch.optim.Adam(siren_params, lr)
    elif optimizer == "DiffGrad":
        self.optimizer = DiffGrad(siren_params, lr)

    self.gradient_accumulate_every = gradient_accumulate_every
    self.save_every = save_every
    self.save_date_time = save_date_time
    self.open_folder = open_folder
    self.save_progress = save_progress
    self.text = text
    self.image = img
    self.textpath = create_text_path(self.perceptor.context_length,
                                     text=text,
                                     img=img,
                                     encoding=clip_encoding,
                                     separator=story_separator)
    self.filename = self.image_output_path()

    # create coding to optimize for
    self.clip_encoding = self.create_clip_encoding(text=text, img=img, encoding=clip_encoding)

    # Optional starting image, resized/cropped to the output resolution.
    self.start_image = None
    self.start_image_train_iters = start_image_train_iters
    self.start_image_lr = start_image_lr
    if exists(start_image_path):
        file = Path(start_image_path)
        # Fixed: use the local argument, not an unset attribute.
        assert file.exists(), f'file does not exist at given starting image path {start_image_path}'
        image = Image.open(str(file))
        start_img_transform = T.Compose([
            T.Resize(image_width),
            T.CenterCrop((image_width, image_width)),
            T.ToTensor()
        ])
        image_tensor = start_img_transform(image).unsqueeze(0).to(self.device)
        self.start_image = image_tensor

    self.save_gif = save_gif
    self.save_video = save_video
def __init__(
        self,
        *,
        text=None,                 # prompt text
        img=None,                  # reference image to imagine from
        lr=1e-5,                   # learning rate
        batch_size=4,
        gradient_accumulate_every=4,  # raise to drop loss faster on small epoch counts
        save_every=100,            # save an image every 100 iterations
        image_width=200,           # max 400; num_layers max ~14 at that size
        num_layers=8,
        epochs=3,
        iterations=1050,
        save_progress=True,
        open_folder=True,
        theta_initial=None,        # colour space of the initial siren layer
        theta_hidden=None,         # colour space of the hidden siren layers
        model_name="ViT-B/32",     # small CLIP ViT-B model
        lower_bound_cutout=0.1,    # should be smaller than 0.8
        upper_bound_cutout=1.0,
        averaging_weight=0.3,
        do_cutout=True,
        center_bias=False,
        center_focus=2,
        optimizer="AdamP",
        jit=True,
        hidden_size=256,
        save_gif=True,
        save_video=True,
):
    """Set up the ShallowDaze trainer: load CLIP, build the siren model,
    pick an optimizer, and record the run configuration."""
    super().__init__()
    self.epochs = epochs

    # jit models only compatible with torch 1.7.1
    if "1.7.1" not in torch.__version__:
        if jit:
            print("Setting jit to False because torch version is not 1.7.1.")
            jit = False

    # Pick GPU when available, otherwise CPU, and load the CLIP perceptor
    # (returns the model plus its image-normalization transform).
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    clip_perceptor, norm = load(model_name, jit=jit, device=self.device)

    # Evaluation mode and frozen weights: CLIP is a fixed critic here.
    self.perceptor = clip_perceptor.eval()
    for param in self.perceptor.parameters():
        param.requires_grad = False

    # jit and non-jit CLIP builds expose the input resolution differently.
    input_res = (clip_perceptor.input_resolution.item() if jit
                 else clip_perceptor.visual.input_resolution)
    self.clip_transform = create_clip_img_transform(input_res)

    self.iterations = iterations
    self.image_width = image_width

    # Total batch count = epochs * iterations * batch size * accumulation steps.
    total_batches = self.epochs * self.iterations * batch_size * gradient_accumulate_every

    # Build the ShallowDaze model and place it on the chosen device.
    model = ShallowDaze(
        self.perceptor,            # CLIP model
        norm,                      # CLIP normalization transform
        input_res,                 # CLIP input resolution
        total_batches,
        batch_size=batch_size,
        image_width=image_width,
        num_layers=num_layers,
        theta_initial=theta_initial,
        theta_hidden=theta_hidden,
        lower_bound_cutout=lower_bound_cutout,
        upper_bound_cutout=upper_bound_cutout,
        do_cutout=do_cutout,
        center_bias=center_bias,
        center_focus=center_focus,
        hidden_size=hidden_size,
        averaging_weight=averaging_weight,
    ).to(self.device)
    self.model = model

    # Loss scaling guards fp16 gradients against underflow.
    self.scaler = GradScaler()

    # Three supported optimizers over the inner siren parameters; AdamP default.
    siren_params = model.model.parameters()
    if optimizer == "AdamP":
        self.optimizer = AdamP(siren_params, lr)
    elif optimizer == "Adam":
        self.optimizer = torch.optim.Adam(siren_params, lr)
    elif optimizer == "DiffGrad":
        self.optimizer = DiffGrad(siren_params, lr)

    # Run configuration.
    self.gradient_accumulate_every = gradient_accumulate_every
    self.save_every = save_every
    self.open_folder = open_folder
    self.save_progress = save_progress
    self.text = text
    self.image = img
    self.textpath = create_text_path(self.perceptor.context_length, text=text, img=img)
    self.filename = self.image_output_path()

    # Target encoding to optimize toward (text and/or image).
    self.clip_encoding = self.create_clip_encoding(text=text, img=img)

    self.save_gif = save_gif
    self.save_video = save_video
def __init__(
    self,
    image_size,
    latent_dim=512,
    style_depth=8,
    network_capacity=16,
    transparent=False,
    fp16=False,
    cl_reg=False,
    augment_fn=None,
    steps=1,
    lr=1e-4,
    fq_layers=None,
    fq_dict_size=256,
    attn_layers=None,
):
    """Build the trainer with an augmentation pipeline for the contrastive
    discriminator regularizer.

    Bug fix: ``fq_layers`` and ``attn_layers`` previously defaulted to
    mutable ``[]`` lists shared across all calls; ``None`` sentinels are
    now normalized to fresh lists. Callers passing explicit lists are
    unaffected.
    """
    super().__init__()
    # Normalize the mutable-default arguments.
    fq_layers = [] if fq_layers is None else fq_layers
    attn_layers = [] if attn_layers is None else attn_layers

    self.lr = lr
    self.steps = steps
    self.ema_updater = EMA(0.995)

    # Trainable networks.
    self.S = StyleVectorizer(latent_dim, style_depth)
    self.G = Generator(image_size, latent_dim, network_capacity,
                       transparent=transparent, attn_layers=attn_layers)
    self.D = Discriminator(
        image_size,
        network_capacity,
        fq_layers=fq_layers,
        fq_dict_size=fq_dict_size,
        attn_layers=attn_layers,
        transparent=transparent,
    )

    # Frozen EMA shadows.
    self.SE = StyleVectorizer(latent_dim, style_depth)
    self.GE = Generator(image_size, latent_dim, network_capacity,
                        transparent=transparent, attn_layers=attn_layers)
    set_requires_grad(self.SE, False)
    set_requires_grad(self.GE, False)

    # Generator side also trains the mapper.
    generator_params = list(self.G.parameters()) + list(self.S.parameters())
    self.G_opt = DiffGrad(generator_params, lr=self.lr, betas=(0.5, 0.9))
    self.D_opt = DiffGrad(self.D.parameters(), lr=self.lr, betas=(0.5, 0.9))

    self._init_weights()
    self.reset_parameter_averaging()
    self.cuda()

    if fp16:
        (self.S, self.G, self.D, self.SE, self.GE), (self.G_opt, self.D_opt) = amp.initialize(
            [self.S, self.G, self.D, self.SE, self.GE],
            [self.G_opt, self.D_opt],
            opt_level="O2")

    # experimental contrastive loss discriminator regularization
    if augment_fn is not None:
        self.augment_fn = augment_fn
    else:
        # Default SimCLR-style augmentation stack.
        self.augment_fn = nn.Sequential(
            nn.ReflectionPad2d(int((sqrt(2) - 1) * image_size / 4)),
            RandomApply(augs.ColorJitter(0.8, 0.8, 0.8, 0.2), p=0.7),
            augs.RandomGrayscale(p=0.2),
            augs.RandomHorizontalFlip(),
            RandomApply(augs.RandomAffine(degrees=0,
                                          translate=(0.25, 0.25),
                                          shear=(15, 15)),
                        p=0.3),
            RandomApply(nn.Sequential(
                augs.RandomRotation(180),
                augs.CenterCrop(size=(image_size, image_size))),
                        p=0.2),
            augs.RandomResizedCrop(size=(image_size, image_size)),
            RandomApply(filters.GaussianBlur2d((3, 3), (1.5, 1.5)), p=0.1),
            RandomApply(augs.RandomErasing(), p=0.1),
        )

    self.D_cl = (ContrastiveLearner(self.D,
                                    image_size,
                                    augment_fn=self.augment_fn,
                                    fp16=fp16,
                                    hidden_layer="flatten")
                 if cl_reg else None)
def __init__(self, image_size, latent_dim=512, noise_dim=100, style_depth=8,
             network_capacity=16, transparent=False, steps=1, lr=2e-4):
    """Variant with extractor (E) and noise vectorizer (N): the extractor
    optimizer also covers N and G, and gets a step-decay LR schedule."""
    super().__init__()
    self.lr = lr
    self.steps = steps
    self.ema_updater = EMA(0.995)

    # Trainable networks.
    self.S = StyleVectorizer(latent_dim, style_depth)
    self.N = NoiseVectorizer(noise_dim)
    self.G = Generator(image_size, latent_dim, network_capacity, transparent=transparent)
    self.D = Discriminator(image_size, network_capacity, transparent=transparent)

    # Extraction network (fixed capacity of 64).
    self.E = ExtractNetSimilarE(64)

    # Frozen EMA shadows of S, N and G.
    self.SE = StyleVectorizer(latent_dim, style_depth)
    self.NE = NoiseVectorizer(noise_dim)
    self.GE = Generator(image_size, latent_dim, network_capacity, transparent=transparent)
    for shadow in (self.SE, self.NE, self.GE):
        set_requires_grad(shadow, False)

    # Note: unlike the sibling trainer, G_opt covers only the generator here —
    # the mapper and noise vectorizer are trained through E_opt instead.
    self.G_opt = DiffGrad(list(self.G.parameters()), lr=self.lr, betas=(0.5, 0.9))
    self.D_opt = DiffGrad(self.D.parameters(), lr=self.lr, betas=(0.5, 0.9))

    # Extractor optimizer jointly updates E, N and G, with step-decay LR.
    extractor_params = [*self.E.parameters(), *self.N.parameters(), *self.G.parameters()]
    self.E_opt = DiffGrad(extractor_params, lr=self.lr, betas=(0.5, 0.9))
    self.E_opt_scheduler = torch.optim.lr_scheduler.StepLR(self.E_opt,
                                                           step_size=200,
                                                           gamma=0.1)

    # Dedicated optimizer for the noise vectorizer alone.
    self.N_opt = DiffGrad(list(self.N.parameters()), lr=self.lr, betas=(0.5, 0.9))

    self._init_weights()
    self.reset_parameter_averaging()
def get_optimizer(
    model: nn.Module,
    optimizer_name: str,
    learning_rate: float,
    weight_decay: float = 1e-5,
    no_weight_decay_on_bias: bool = False,
    eps: float = 1e-5,
    **kwargs,
) -> Optimizer:
    """
    Construct an Optimizer for given model

    Args:
        model: Model to optimize. Only parameters that require_grad will be used
        optimizer_name: Name of the optimizer. Case-insensitive
        learning_rate: Target learning rate (regardless of the scheduler)
        weight_decay: Target weight decay
        no_weight_decay_on_bias: Whether to disable weight decay on bias parameters
        eps: Default epsilon for Adam-like optimizers.
        **kwargs: Additional parameters for optimizer

    Returns:
        The constructed Optimizer instance.

    Raises:
        KeyError: If ``optimizer_name`` is not recognized.
    """
    from torch.optim import ASGD, SGD, Adam, RMSprop, AdamW
    from torch_optimizer import RAdam, Lamb, DiffGrad, NovoGrad, Ranger

    # Split trainable parameters into bias and non-bias groups.
    non_bias_params, bias_params = [], []
    for param_name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        (bias_params if param_name.endswith(".bias") else non_bias_params).append(param)

    # When decay is disabled on biases, the main group excludes them;
    # they are re-added with zero decay at the end.
    parameters = non_bias_params if no_weight_decay_on_bias else non_bias_params + bias_params

    name = optimizer_name.lower()
    optimizer: Optimizer = None

    if name == "sgd":
        optimizer = SGD(parameters, lr=learning_rate, momentum=0.9, nesterov=True,
                        weight_decay=weight_decay, **kwargs)
    elif name == "asgd":
        optimizer = ASGD(parameters, lr=learning_rate, weight_decay=weight_decay, **kwargs)
    elif name == "adam":
        optimizer = Adam(parameters, lr=learning_rate, weight_decay=weight_decay,
                         eps=eps, **kwargs)
    elif name == "rms":
        optimizer = RMSprop(parameters, learning_rate, weight_decay=weight_decay, **kwargs)
    elif name == "adamw":
        optimizer = AdamW(parameters, lr=learning_rate, weight_decay=weight_decay,
                          eps=eps, **kwargs)
    elif name == "radam":
        optimizer = RAdam(parameters, lr=learning_rate, weight_decay=weight_decay,
                          eps=eps, **kwargs)
    elif name == "ranger":
        optimizer = Ranger(parameters, lr=learning_rate, eps=eps,
                           weight_decay=weight_decay, **kwargs)
    elif name == "lamb":
        optimizer = Lamb(parameters, lr=learning_rate, eps=eps,
                         weight_decay=weight_decay, **kwargs)
    elif name == "diffgrad":
        optimizer = DiffGrad(parameters, lr=learning_rate, eps=eps,
                             weight_decay=weight_decay, **kwargs)
    elif name == "novograd":
        optimizer = NovoGrad(parameters, lr=learning_rate, eps=eps,
                             weight_decay=weight_decay, **kwargs)
    elif name == "fused_lamb":
        # Apex fused optimizers are imported lazily — apex is optional.
        from apex.optimizers import FusedLAMB

        optimizer = FusedLAMB(parameters, learning_rate, eps=eps,
                              weight_decay=weight_decay, **kwargs)
    elif name == "fused_sgd":
        from apex.optimizers import FusedSGD

        optimizer = FusedSGD(parameters, learning_rate, momentum=0.9, nesterov=True,
                             weight_decay=weight_decay, **kwargs)
    elif name == "fused_adam":
        from apex.optimizers import FusedAdam

        optimizer = FusedAdam(parameters, learning_rate, eps=eps,
                              weight_decay=weight_decay, adam_w_mode=True, **kwargs)
    else:
        raise KeyError(f"Cannot get optimizer by name {optimizer_name}")

    # Currently either no_wd or per-group lr
    if no_weight_decay_on_bias:
        optimizer.add_param_group({"params": bias_params, "weight_decay": 0})

    return optimizer
def get_optimizer(optimizer_name: str,
                  parameters,
                  learning_rate: float,
                  weight_decay=1e-5,
                  eps=1e-5,
                  **kwargs) -> Optimizer:
    """Build an optimizer by case-insensitive name.

    Supports torch.optim, torch-optimizer and (lazily imported) apex fused
    optimizers. Raises ValueError for an unsupported name.
    """
    from torch.optim import SGD, Adam, RMSprop, AdamW
    from torch_optimizer import RAdam, Lamb, DiffGrad, NovoGrad, Ranger

    name = optimizer_name.lower()

    # Built-in torch.optim choices.
    if name == "sgd":
        return SGD(parameters, learning_rate, momentum=0.9, nesterov=True,
                   weight_decay=weight_decay, **kwargs)
    if name == "adam":
        # As Jeremy suggests
        return Adam(parameters, learning_rate, weight_decay=weight_decay, eps=eps, **kwargs)
    if name == "rms":
        return RMSprop(parameters, learning_rate, weight_decay=weight_decay, **kwargs)
    if name == "adamw":
        return AdamW(parameters, learning_rate, weight_decay=weight_decay, eps=eps, **kwargs)
    if name == "radam":
        # As Jeremy suggests
        return RAdam(parameters, learning_rate, weight_decay=weight_decay, eps=eps, **kwargs)

    # Optimizers from torch-optimizer
    if name == "ranger":
        return Ranger(parameters, learning_rate, eps=eps, weight_decay=weight_decay, **kwargs)
    if name == "lamb":
        return Lamb(parameters, learning_rate, eps=eps, weight_decay=weight_decay, **kwargs)
    if name == "diffgrad":
        return DiffGrad(parameters, learning_rate, eps=eps, weight_decay=weight_decay, **kwargs)
    if name == "novograd":
        return NovoGrad(parameters, learning_rate, eps=eps, weight_decay=weight_decay, **kwargs)

    # Optimizers from Apex (Fused version is faster on GPU with tensor cores)
    if name == "fused_lamb":
        from apex.optimizers import FusedLAMB

        return FusedLAMB(parameters, learning_rate, eps=eps,
                         weight_decay=weight_decay, **kwargs)
    if name == "fused_sgd":
        from apex.optimizers import FusedSGD

        return FusedSGD(parameters, learning_rate, momentum=0.9, nesterov=True,
                        weight_decay=weight_decay, **kwargs)
    if name == "fused_adam":
        from apex.optimizers import FusedAdam

        return FusedAdam(parameters, learning_rate, eps=eps,
                         weight_decay=weight_decay, adam_w_mode=True, **kwargs)

    raise ValueError("Unsupported optimizer name " + optimizer_name)