            param.data.copy_(v[offset:offset + param.numel()]
                             .view(param.size()).to(args.device))
            offset += param.numel()
        utils.bn_update(train_loader, model)
        print("Performance of model", i, "on the curve", end=":")
        utils.mata_eval(model, meta_valloader, meta_testloader, 'curve')
else:
    assert len(args.checkpoint) == 1
    if args.subspace == 'MNPCA_MANY':
        swag_model = SWAG(model_cfg.base,
                          args.subspace,
                          {
                              'max_rank': 20,
                              'cov_mat': args.sample_collect,
                              # 'pca_rank': args.rank,
                          },
                          1e-6,
                          *model_cfg.args,
                          num_classes=num_classes,
                          **model_cfg.kwargs)
        swag_model_pca = SWAGPCA(model_cfg.base,
                                 'pca',
                                 {
                                     'max_rank': 20,
                                     'pca_rank': args.rank,
                                 },
                                 1e-6,
                                 *model_cfg.args,
                                 num_classes=num_classes,
                                 **model_cfg.kwargs)
        swag_model.to(args.device)
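# --- Illustrative sketch (not part of the original script) ---
# The offset loop above writes a flat parameter vector `v` back into the
# model, parameter by parameter. A minimal round-trip helper pair,
# assuming only standard PyTorch:
import torch

def flatten_params(model):
    # pack all parameters into a single 1-D vector, in parameter order
    return torch.cat([p.data.view(-1) for p in model.parameters()])

def unflatten_params(model, v, device):
    # write the vector back using the same running-offset scheme as above
    offset = 0
    for param in model.parameters():
        n = param.numel()
        param.data.copy_(v[offset:offset + n].view(param.size()).to(device))
        offset += n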
#     args.num_workers,
#     transform_train=model_cfg.transform_train,
#     transform_test=model_cfg.transform_test,
#     shuffle_train=True,
#     use_validation=not args.use_test,
#     split_classes=args.split_classes
# )

print('Preparing model')
print(*model_cfg.args)
swag_model = SWAG(model_cfg.base,
                  num_classes=num_classes,
                  subspace_type='pca',
                  subspace_kwargs={
                      'max_rank': args.max_rank,
                      'pca_rank': args.rank,
                  },
                  *model_cfg.args,
                  **model_cfg.kwargs)
swag_model.to(args.device)

print('Loading: %s' % args.checkpoint)
ckpt = torch.load(args.checkpoint)
swag_model.load_state_dict(ckpt['state_dict'], strict=False)
swag_model.set_swa()
# print("SWA:", utils.eval(loaders["test"], swag_model,
#                          criterion=losses.cross_entropy))
utils.mata_eval(swag_model, meta_valloader, meta_testloader, 'SWAG:')

mean, var, cov_factor = swag_model.get_space()
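# --- Illustrative sketch (not part of the original script) ---
# `get_space()` above returns the mean, the per-parameter variance, and
# the low-rank deviation matrix of the SWAG posterior. A hypothetical
# sampler for the low-rank-plus-diagonal Gaussian of the SWAG paper,
# assuming `cov_factor` has shape (rank, num_params) and CPU tensors:
import torch

def sample_from_subspace(mean, var, cov_factor, scale=0.5):
    rank = cov_factor.size(0)
    z_diag = torch.randn_like(mean)   # diagonal component
    z_low = torch.randn(rank)         # low-rank component
    diag_part = var.sqrt() * z_diag
    low_rank_part = cov_factor.t() @ z_low / max(rank - 1, 1) ** 0.5
    # the two parts are averaged as in the SWAG paper; `scale` rescales
    # the whole covariance
    return mean + scale ** 0.5 * (diag_part + low_rank_part) / 2 ** 0.5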
        offset = 0
        for param in model.parameters():
            param.data.copy_(v[offset:offset + param.numel()]
                             .view(param.size()).to(args.device))
            offset += param.numel()
        utils.bn_update(train_loader, model)
        print("Performance of model", i, "on the curve", end=":")
        utils.mata_eval(model, meta_valloader, meta_testloader, 'curve')
else:
    assert len(args.checkpoint) == 1
    swag_model = SWAG(model_cfg.base,
                      subspace_type='pca',
                      subspace_kwargs={
                          'max_rank': 20,
                          'pca_rank': args.rank,
                      },
                      num_classes=num_classes,
                      args=model_cfg.args,
                      kwargs=model_cfg.kwargs)
    swag_model.to(args.device)

    print('Loading: %s' % args.checkpoint[0])
    ckpt = torch.load(args.checkpoint[0])
    swag_model.load_state_dict(ckpt['state_dict'], strict=False)

    # first evaluate the SWA solution
    swag_model.set_swa()
    utils.bn_update(train_loader, swag_model)
    # print(utils.eval(meta_testloader, swag_model, losses.cross_entropy))
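# --- Illustrative sketch (not part of the original script) ---
# After set_swa() the model carries averaged weights, so the BatchNorm
# running statistics collected during training no longer match; that is
# why utils.bn_update is called above. A simplified version of what such
# a helper typically does (the repo's version also handles momentum
# reweighting and device placement):
import torch

def bn_update_sketch(loader, model):
    model.train()
    # reset the running mean/variance of every BatchNorm layer
    for module in model.modules():
        if isinstance(module, torch.nn.modules.batchnorm._BatchNorm):
            module.reset_running_stats()
    # one pass over the training data re-estimates the statistics
    with torch.no_grad():
        for input, _ in loader:
            model(input)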
# )

print('Preparing model')
print(*model_cfg.args)
model = model_cfg.base(*model_cfg.args,
                       num_classes=num_classes,
                       **model_cfg.kwargs)
model.to(args.device)

if args.cov_mat:
    args.no_cov_mat = False
else:
    args.no_cov_mat = True

if args.swag:
    print('SWAG training')
    swag_model = SWAG(model_cfg.base,
                      args.subspace,
                      subspace_kwargs={'max_rank': args.max_num_models},
                      num_classes=num_classes,
                      args=model_cfg.args,
                      kwargs=model_cfg.kwargs)
    swag_model.to(args.device)
else:
    print('SGD training')


def schedule(epoch, args):
    steps = np.sum(epoch > np.asarray(args.lr_decay_epochs))  # currently unused
    t = epoch / (args.swag_start if args.swag else args.epochs)
    lr_ratio = args.swag_lr / args.lr_init if args.swag else 0.01
    if t <= 0.5:
        factor = 1.0
    elif t <= 0.9:
        factor = 1.0 - (1.0 - lr_ratio) * (t - 0.5) / 0.4
    else:
        factor = lr_ratio
    return factor * args.lr_init
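# --- Illustrative sketch (not part of the original script) ---
# A worked example of the piecewise-linear schedule above, with
# hypothetical settings lr_init=0.1 and swag_lr=0.01 (so lr_ratio=0.1):
# constant until t=0.5, linear decay until t=0.9, then flat at swag_lr.
lr_init, lr_ratio = 0.1, 0.1
for t in [0.0, 0.5, 0.7, 0.9, 1.0]:
    if t <= 0.5:
        factor = 1.0
    elif t <= 0.9:
        factor = 1.0 - (1.0 - lr_ratio) * (t - 0.5) / 0.4
    else:
        factor = lr_ratio
    print(t, factor * lr_init)
# prints: 0.0 -> 0.1, 0.5 -> 0.1, 0.7 -> 0.055, 0.9 -> 0.01, 1.0 -> 0.01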
class RegressionRunner(RegressionModel):

    def __init__(self, base, epochs, criterion,
                 batch_size=50, lr_init=1e-2, momentum=0.9, wd=1e-4,
                 swag_lr=1e-3, swag_freq=1, swag_start=50,
                 subspace_type='pca', subspace_kwargs={'max_rank': 20},
                 use_cuda=False, use_swag=False,
                 double_bias_lr=False, model_variance=True,
                 num_samples=30, scale=0.5, const_lr=False,
                 *args, **kwargs):
        self.base = base
        self.model = base(*args, **kwargs)

        num_pars = 0
        for p in self.model.parameters():
            num_pars += p.numel()
        print('number of parameters: ', num_pars)

        if use_cuda:
            self.model.cuda()

        if use_swag:
            self.swag_model = SWAG(base,
                                   subspace_type=subspace_type,
                                   subspace_kwargs=subspace_kwargs,
                                   *args, **kwargs)
            if use_cuda:
                self.swag_model.cuda()
        else:
            self.swag_model = None
        self.use_cuda = use_cuda

        if not double_bias_lr:
            pars = self.model.parameters()
        else:
            pars = []
            for name, module in self.model.named_parameters():
                if 'bias' in str(name):
                    print('Doubling lr of ', name)
                    pars.append({'params': module, 'lr': 2.0 * lr_init})
                else:
                    pars.append({'params': module, 'lr': lr_init})

        self.optimizer = torch.optim.SGD(pars, lr=lr_init,
                                         momentum=momentum, weight_decay=wd)
        self.const_lr = const_lr
        self.batch_size = batch_size

        # TODO: set up criterions better for classification
        if model_variance:
            self.criterion = criterion(noise_var=None)
        else:
            self.criterion = criterion(noise_var=1.0)

        if self.criterion.noise_var is not None:
            self.var = self.criterion.noise_var

        self.epochs = epochs
        self.lr_init = lr_init
        self.use_swag = use_swag
        self.swag_start = swag_start
        self.swag_lr = swag_lr
        self.swag_freq = swag_freq
        self.num_samples = num_samples
        self.scale = scale

    def train(self, model, loader, optimizer, criterion,
              lr_init=1e-2, epochs=3000,
              swag_model=None, swag=False,
              swag_start=2000, swag_freq=50, swag_lr=1e-3,
              print_freq=100, use_cuda=False, const_lr=False):
        # copied from Pavel's regression notebook
        if const_lr:
            lr = lr_init

        train_res_list = []
        for epoch in range(epochs):
            if not const_lr:
                t = (epoch + 1) / swag_start if swag else (epoch + 1) / epochs
                lr_ratio = swag_lr / lr_init if swag else 0.05
                if t <= 0.5:
                    factor = 1.0
                elif t <= 0.9:
                    factor = 1.0 - (1.0 - lr_ratio) * (t - 0.5) / 0.4
                else:
                    factor = lr_ratio
                lr = factor * lr_init
                adjust_learning_rate(optimizer, factor)

            train_res = utils.train_epoch(loader, model, criterion, optimizer,
                                          cuda=use_cuda, regression=True)
            train_res_list.append(train_res)

            if swag and epoch > swag_start:
                swag_model.collect_model(model)

            if epoch % print_freq == 0 or epoch == epochs - 1:
                print('Epoch %d. LR: %g. Loss: %.4f' %
                      (epoch, lr, train_res['loss']))

        return train_res_list

    def fit(self, features, labels):
        self.features = torch.FloatTensor(features)
        self.labels = torch.FloatTensor(labels)

        # construct data loader; may want to turn shuffle=False for the
        # very smallest datasets (e.g. uci small)
        self.data_loader = DataLoader(TensorDataset(self.features, self.labels),
                                      batch_size=self.batch_size, shuffle=True)

        # now train with pre-specified options
        result = self.train(model=self.model,
                            loader=self.data_loader,
                            optimizer=self.optimizer,
                            criterion=self.criterion,
                            lr_init=self.lr_init,
                            swag_model=self.swag_model,
                            swag=self.use_swag,
                            swag_start=self.swag_start,
                            swag_freq=self.swag_freq,
                            swag_lr=self.swag_lr,
                            use_cuda=self.use_cuda,
                            epochs=self.epochs,
                            const_lr=self.const_lr)

        if self.criterion.noise_var is not None:
            # another forward pass through the network to estimate
            # the noise variance
            preds, targets = utils.predictions(model=self.model,
                                               test_loader=self.data_loader,
                                               regression=True,
                                               cuda=self.use_cuda)
            self.var = np.power(np.linalg.norm(preds - targets),
                                2.0) / targets.shape[0]
            print(self.var)

        return result

    def predict(self, features, swag_model=None):
        """
        Default prediction method uses the built-in low-rank Gaussian.
        For SWA, use scale = 0.0 and num_samples = 1.
        """
        swag_model = swag_model if swag_model is not None else self.swag_model

        if self.use_cuda:
            device = torch.device('cuda')
        else:
            device = torch.device('cpu')

        with torch.no_grad():
            if swag_model is None:
                self.model.eval()
                preds = self.model(
                    torch.FloatTensor(features).to(device)).data.cpu()
                if preds.size(1) == 1:
                    var = torch.ones_like(preds[:, 0]).unsqueeze(1) * self.var
                else:
                    var = preds[:, 1].view(-1, 1)
                    preds = preds[:, 0].view(-1, 1)
                print(var.mean())
            else:
                prediction = 0
                sq_prediction = 0
                for _ in range(self.num_samples):
                    swag_model.sample(scale=self.scale)
                    current_prediction = swag_model(
                        torch.FloatTensor(features).to(device)).data.cpu()
                    prediction += current_prediction
                    if current_prediction.size(1) == 2:
                        # convert the variance output to a standard deviation
                        current_prediction[:, 1] = current_prediction[:, 1]**0.5
                    sq_prediction += current_prediction**2.0

                # compute the mean of the predictions: \mu^*
                # preds = bma / self.num_samples
                preds = (prediction[:, 0] / self.num_samples).view(-1, 1)
                # var = 1/M \sum (\sigma^2(x) + \mu^2(x)) - (\mu^*)^2
                var = torch.sum(sq_prediction, 1, keepdim=True) \
                    / self.num_samples - preds.pow(2.0)
                # add the noise variance if the model is not heteroscedastic
                if prediction.size(1) == 1:
                    var = var + self.var

        return preds.numpy(), var.numpy()
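# --- Illustrative sketch (not part of the original code) ---
# Hypothetical usage of RegressionRunner on a toy 1-D problem.
# `RegressionNet` and `GaussianLikelihood` are stand-ins for a base
# network class and a criterion taking a `noise_var` argument, as
# assumed by the constructor above.
import numpy as np

X = np.random.uniform(-4.0, 4.0, size=(200, 1)).astype(np.float32)
y = (np.sin(X) + 0.1 * np.random.randn(200, 1)).astype(np.float32)

runner = RegressionRunner(RegressionNet, epochs=3000,
                          criterion=GaussianLikelihood,
                          use_swag=True, swag_start=2000,
                          num_samples=30, scale=0.5)
runner.fit(X, y)
preds, var = runner.predict(X)  # predictive mean and variance per input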
print('Preparing model')
print(*model_cfg.args)
model = model_cfg.base(*model_cfg.args,
                       num_classes=num_classes,
                       **model_cfg.kwargs)
model.to(args.device)

if args.cov_mat:
    args.no_cov_mat = False
else:
    args.no_cov_mat = True

if args.swag:
    print('SWAG training')
    swag_model = SWAG(model_cfg.base,
                      subspace_type=args.subspace,
                      subspace_kwargs={'max_rank': args.max_num_models},
                      *model_cfg.args,
                      num_classes=num_classes,
                      **model_cfg.kwargs)
    swag_model.to(args.device)
else:
    print('SGD training')


def schedule(epoch):
    t = epoch / (args.swag_start if args.swag else args.epochs)
    lr_ratio = args.swag_lr / args.lr_init if args.swag else 0.01
    if t <= 0.5:
        factor = 1.0
    elif t <= 0.9:
        factor = 1.0 - (1.0 - lr_ratio) * (t - 0.5) / 0.4
    else:
        factor = lr_ratio
    return factor * args.lr_init