def cpu_sk(self):
    """ Sinkhorn-Knopp optimization on CPU
        * stores activations in RAM
        * does matrix-vector multiplies on CPU
        * slower than GPU
    """
    # 1. aggregate inputs:
    N = len(self.pseudo_loader.dataset)
    if self.num_heads == 1:
        self.PS = np.zeros((N, self.num_clusters_per_head), dtype=self.dtype)
    else:
        self.PS_pre = np.zeros((N, self.presize), dtype=self.dtype)
    start = time.time()
    now = time.time()
    l_dl = len(self.pseudo_loader)
    batch_time = MovingAverage(intertia=0.9)
    self.model.headcount = 1
    for batch_idx, (data, _, _selected) in enumerate(self.pseudo_loader):
        data = data.to(self.device)
        mass = data.size(0)
        if self.num_heads == 1:
            # single head: store the softmaxed (N x K) predictions directly
            p = nn.functional.softmax(self.model(data), 1)
            self.PS[_selected, :] = p.detach().cpu().numpy().astype(self.dtype)
        else:
            # multiple heads: store the pre-last-layer activations (N x D)
            p = self.model(data)
            self.PS_pre[_selected, :] = p.detach().cpu().numpy().astype(self.dtype)
        batch_time.update(time.time() - now)
        now = time.time()
        if batch_idx % 50 == 0:
            print(f"Aggregating batch {batch_idx:03}/{l_dl}, speed: {mass / batch_time.avg:04.1f}Hz",
                  end='\r', flush=True)
    self.model.headcount = self.num_heads
    print("Aggregation of outputs took {0:.2f} min".format((time.time() - start) / 60.), flush=True)

    # 2. solve label assignment via Sinkhorn-Knopp:
    if self.num_heads == 1:
        optimize_L_sk(self, nh=0)
    else:
        for nh in range(self.num_heads):
            print(f"computing head {nh} ", end="\r", flush=True)
            tl = getattr(self.model, f"top_layer{nh:d}")
            time_mat = time.time()
            # clear memory from the previous head's predictions
            try:
                del self.PS
            except AttributeError:
                pass
            # apply the head's last FC layer (a matmul plus bias)
            self.PS = (self.PS_pre @ tl.weight.cpu().numpy().T.astype(self.dtype)
                       + tl.bias.cpu().numpy().astype(self.dtype))
            print(f"matmul took {(time.time() - time_mat) / 60:.2f}min", flush=True)
            self.PS = py_softmax(self.PS, 1)
            optimize_L_sk(self, nh=nh)
    return
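
# `optimize_L_sk` is called above but not defined in this section.  The function
# below is only an assumed, illustrative sketch of a plain Sinkhorn-Knopp balancing
# of the (N x K) prediction matrix under an equipartition constraint; the `lamb`
# sharpening parameter, the uniform marginals, and the function name are
# assumptions, not this repo's actual implementation.  numpy is assumed to be
# imported as np, as elsewhere in this file.
def sinkhorn_knopp_sketch(PS, lamb=25.0, n_iters=50):
    K = PS.shape[1]                       # number of clusters
    N = PS.shape[0]                       # number of data points
    Q = np.power(PS, lamb).T              # (K x N), sharpened predictions
    r = np.ones(K) / K                    # target row marginal: equal-sized clusters
    c = np.ones(N) / N                    # target column marginal: one label per point
    for _ in range(n_iters):
        # alternately rescale rows and columns to match the target marginals
        Q *= (r[:, None] / Q.sum(axis=1, keepdims=True).clip(min=1e-12))
        Q *= (c / Q.sum(axis=0, keepdims=True).clip(min=1e-12))
    # hard pseudo-label per data point
    return np.argmax(Q, axis=0)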
def aggreg_multi_gpu(model, dataloader, hc, dim, TYPE=torch.float64, model_gpus=1):
    """Accumulate activations and save them on multiple GPUs
        * this function assumes the model is on the first `model_gpus` GPUs
          so that it can write the activations on the remaining ones
        * it splits the activations evenly between the remaining GPUs
    """
    # number of gpus used to store the activations
    ngpu_store = torch.cuda.device_count() - model_gpus
    # number of batches in the dataloader
    l_dl = len(dataloader)
    # number of batches each gpu gets
    batches_per_gpu = l_dl // ngpu_store
    # number of data points each gpu gets
    points_per_gpu = batches_per_gpu * dataloader.batch_size
    # empty array of indices that we need to keep track of
    indices = torch.empty(len(dataloader.dataset), dtype=torch.long)
    # set up matrix PS: (N x K) when using one head, otherwise (N x D),
    # where D is the dim before the last FC layer
    PS = [torch.empty(points_per_gpu, dim, device='cuda:' + str(i), dtype=TYPE)
          for i in range(model_gpus, model_gpus + ngpu_store - 1)]
    # the last storage gpu accommodates the remainder
    PS.append(torch.empty(len(dataloader.dataset) - (ngpu_store - 1) * points_per_gpu,
                          dim,
                          device='cuda:' + str(model_gpus + ngpu_store - 1),
                          dtype=TYPE))
    # slice sizes, i.e. how many activations will live on each gpu
    slices = [qq.shape[0] for qq in PS]
    print("slice sizes: ", slices, flush=True)
    batch_time = MovingAverage(intertia=0.9)
    now = time.time()
    st = 0
    softmax = torch.nn.Softmax(dim=1).to('cuda:0')
    # switch the model to output the last-FC activations for one head
    # and the pre-last activations for multiple heads
    model.headcount = 1
    for batch_idx, (data, _, _selected) in enumerate(dataloader):
        data = data.to(torch.device('cuda:0'))
        mass = data.size(0)
        en = st + mass
        # j keeps track of which part of PS we're writing to
        j = min((batch_idx // batches_per_gpu), ngpu_store - 1)
        subs = j * points_per_gpu
        if hc == 1:
            # one head: save the softmaxed (N x K) matrix
            p = softmax(model(data)).detach().to(TYPE)
            PS[j][st - subs:en - subs, :].copy_(p)
        else:
            # multiple heads: save the pre-last-layer (N x D) matrix
            PS[j][st - subs:en - subs, :].copy_(model(data).detach())
        indices[st:en].copy_(_selected)
        st = en
        batch_time.update(time.time() - now)
        now = time.time()
        if batch_idx % 50 == 0:
            print(f"Aggregating batch {batch_idx:03}/{l_dl}, speed: {mass / batch_time.avg:04.1f}Hz. "
                  f"To rGPU {j + 1}",
                  end='\r', flush=True)
    torch.cuda.synchronize()  # just in case
    return PS, indices
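
# Illustrative (assumed) usage of `aggreg_multi_gpu`: how the sharded PS list is
# stitched back into a single matrix in dataset order.  The helper name, the
# CPU concatenation, and the `train_loader` argument are illustrative only;
# downstream code in this repo may keep the shards on their GPUs instead.
def gather_activations_sketch(model, train_loader, hc, dim):
    PS, indices = aggreg_multi_gpu(model, train_loader, hc=hc, dim=dim,
                                   TYPE=torch.float64, model_gpus=1)
    # each element of PS lives on its own storage GPU; concatenate the shards
    # and undo the permutation recorded in `indices`
    full = torch.cat([shard.cpu() for shard in PS], dim=0)
    ordered = torch.empty_like(full)
    ordered[indices] = full
    return ordered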
def optimize_epoch(self, model, criterion, optimizer, loader, epoch, is_validation=False):
    top1 = []
    top5 = []
    loss_value = []
    # one meter per probe depth
    for i in range(len(model.probes)):
        top1.append(TotalAverage())
        top5.append(TotalAverage())
        loss_value.append(TotalAverage())
    batch_time = MovingAverage(intertia=0.9)
    now = time.time()
    if is_validation is False:
        model.train()
        lr = self.lr_schedule(epoch)
        for pg in optimizer.param_groups:
            pg['lr'] = lr
        print(f"Starting epoch {epoch} with learning rate {lr}")
    else:
        model.eval()
    for iter, (input, label) in enumerate(loader):
        input = input.to('cuda:0')
        label = label.to('cuda:0')
        mass = input.size(0)
        total_loss = None
        if args.data in ['Imagenet', 'Places'] and is_validation and args.tencrops:
            # ten-crop evaluation: fold the crops into the batch dimension
            bs, ncrops, c, h, w = input.size()
            input = input.view(-1, c, h, w)
        predictions = model(input)
        if args.data in ['Imagenet', 'Places'] and is_validation and args.tencrops:
            # average each probe's predictions over the ten crops
            predictions = [torch.squeeze(p.view(bs, ncrops, -1).mean(1)) for p in predictions]
        for i, prediction in enumerate(predictions):
            loss = criterion(prediction, label)
            if total_loss is None:
                total_loss = loss
            else:
                total_loss = total_loss + loss
            top1_, top5_ = accuracy(prediction, label, topk=(1, 5))
            top1[i].update(top1_.item(), mass)
            top5[i].update(top5_.item(), mass)
            loss_value[i].update(loss.item(), mass)
        if is_validation is False:
            optimizer.zero_grad()
            total_loss.backward()
            optimizer.step()
        batch_time.update(time.time() - now)
        now = time.time()
    top1_str = 'top1 val' if is_validation else 'top1 train'
    top5_str = 'top5 val' if is_validation else 'top5 train'
    writer.add_scalars(top1_str,
                       {f"depth_{k + 1}": top1[k].avg for k in range(len(model.probes))},
                       epoch)
    writer.add_scalars(top5_str,
                       {f"depth_{k + 1}": top5[k].avg for k in range(len(model.probes))},
                       epoch)
    writer.add_scalars('losses',
                       {f"depth_{k + 1}": loss_value[k].avg for k in range(len(model.probes))},
                       epoch)
    print('VAL:' if is_validation else 'TRAIN:')
    for i in range(len(model.probes)):
        print(f" [{i}] t1:{top1[i].avg:04.2f} loss:{loss_value[i].avg:.2f}", end='')
    print()
    return {"loss": [x.avg for x in loss_value],
            "top1": [x.avg for x in top1],
            "top5": [x.avg for x in top5]}
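
# `accuracy` is used in the training loops above and below but is not defined in
# this section.  The function below is a standard top-k accuracy helper and only an
# assumed sketch of its behaviour (returns one percentage per k in `topk`).
def accuracy(output, target, topk=(1,)):
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)
        # indices of the top-k predictions, one column per sample
        _, pred = output.topk(maxk, dim=1, largest=True, sorted=True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))
        res = []
        for k in topk:
            correct_k = correct[:k].reshape(-1).float().sum(0)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res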
def optimize_epoch(self, model, criterion, optimizer, loader, epoch, is_validation=False):
    top1 = [TotalAverage()]
    top5 = [TotalAverage()]
    loss_value = [TotalAverage()]
    batch_time = MovingAverage(intertia=0.9)
    now = time.time()
    if is_validation is False:
        model.train()
        lr = self.lr_schedule(epoch)
        for pg in optimizer.param_groups:
            pg['lr'] = lr
        print("Starting epoch %s" % epoch)
    else:
        model.eval()
    l_dl = len(loader)
    for iter, q in enumerate(loader):
        # some loaders also return the sample indices; ignore them here
        if len(q) == 3:
            input, label, _s = q
        else:
            input, label = q
        input = input.to(self.dev)
        label = label.to(self.dev)
        mass = input.size(0)
        if is_validation and args.tencrops:
            # ten-crop evaluation: fold the crops into the batch dimension
            # and average the predictions over the crops
            bs, ncrops, c, h, w = input.size()
            input = input.view(-1, c, h, w).to(self.dev)
            predictions = model(input)
            predictions = torch.squeeze(predictions.view(bs, ncrops, -1).mean(1))
        else:
            predictions = model(input)
        loss = criterion(predictions, label)
        top1_, top5_ = accuracy(predictions, label, topk=(1, 5))
        top1[0].update(top1_.item(), mass)
        top5[0].update(top5_.item(), mass)
        loss_value[0].update(loss.item(), mass)
        if is_validation is False:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        batch_time.update(time.time() - now)
        now = time.time()
        if iter % 50 == 0:
            print(f"{'V' if is_validation else 'T'} Loss: {loss_value[0].avg:03.3f} "
                  f"Top1: {top1[0].avg:03.1f} Top5: {top5[0].avg:03.1f} "
                  f"{epoch: 3}/{iter:05}/{l_dl:05} Freq: {mass / batch_time.avg:04.1f}Hz:",
                  end='\r', flush=True)
    if is_validation:
        print("validation")
        print("val-top1: %s" % top1[0].avg)
        print("val-top5: %s" % top5[0].avg)
    if self.writer:
        str_ = 'LP/val' if is_validation else 'LP/train'
        self.writer.add_scalar(f'{str_}/top1', top1[0].avg, epoch)
        self.writer.add_scalar(f'{str_}/top5', top5[0].avg, epoch)
        self.writer.add_scalar(f'{str_}/Freq', mass / batch_time.avg, epoch)
    return {"loss": [x.avg for x in loss_value],
            "top1": [x.avg for x in top1],
            "top5": [x.avg for x in top5]}
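
# The meters `TotalAverage` and `MovingAverage` are used throughout these loops but
# are not defined in this section.  The classes below are assumed minimal sketches
# of what they track (a mass-weighted running mean and an exponential moving
# average); the `intertia` spelling matches the keyword used in the calls above.
class TotalAverage:
    def __init__(self):
        self.mass = 0.0
        self.sum = 0.0
        self.avg = 0.0

    def update(self, val, mass=1):
        # accumulate a mass-weighted mean of `val`
        self.mass += mass
        self.sum += val * mass
        self.avg = self.sum / self.mass


class MovingAverage:
    def __init__(self, intertia=0.9):
        self.intertia = intertia
        self.avg = None

    def update(self, val):
        # exponential moving average, initialised with the first value
        if self.avg is None:
            self.avg = val
        else:
            self.avg = self.intertia * self.avg + (1 - self.intertia) * val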
def optimize_epoch(self, model, optimizer, loader, epoch, validation=False):
    print(f"Starting epoch {epoch}, validation: {validation} " + "=" * 30)
    loss_value = AverageMeter()
    rotacc_value = AverageMeter()
    # house keeping
    if not validation:
        model.train()
        lr = self.lr_schedule(epoch)
        for pg in optimizer.param_groups:
            pg['lr'] = lr
    else:
        model.eval()
    XE = torch.nn.CrossEntropyLoss().to(self.dev)
    l_dl = len(loader)
    now = time.time()
    batch_time = MovingAverage(intertia=0.9)
    for iter, (data, label, selected) in enumerate(loader):
        now = time.time()
        if not validation:
            niter = epoch * len(loader.dataset) + iter * args.batch_size
        data = data.to(self.dev)
        mass = data.size(0)
        # each sample comes with its 4 rotated copies; `where` indexes the
        # unrotated (0-degree) copy of every sample
        where = np.arange(mass, dtype=int) * 4
        data = data.view(mass * 4, 3, data.size(3), data.size(4))
        rotlabel = torch.tensor(range(4)).view(-1, 1).repeat(mass, 1).view(-1).to(self.dev)
        #################### train CNN ###########################################
        if not validation:
            final = model(data)
            if args.onlyrot:
                loss = torch.Tensor([0]).to(self.dev)
            else:
                if args.hc == 1:
                    loss = XE(final[0][where], self.L[selected])
                else:
                    loss = torch.mean(torch.stack(
                        [XE(final[k][where], self.L[k, selected]) for k in range(args.hc)]))
            rotloss = XE(final[-1], rotlabel)
            pred = torch.argmax(final[-1], 1)
            total_loss = loss + rotloss
            optimizer.zero_grad()
            total_loss.backward()
            optimizer.step()
            correct = (pred == rotlabel).to(torch.float)
            rotacc = correct.sum() / float(mass)
        else:
            final = model(data)
            pred = torch.argmax(final[-1], 1)
            correct = (pred == rotlabel).to(torch.float)
            rotacc = correct.sum() / float(mass)
            total_loss = torch.Tensor([0])
            loss = torch.Tensor([0])
            rotloss = torch.Tensor([0])
        rotacc_value.update(rotacc.item(), mass)
        loss_value.update(total_loss.item(), mass)
        batch_time.update(time.time() - now)
        now = time.time()
        print(f"Loss: {loss_value.avg:03.3f}, RotAcc: {rotacc_value.avg:03.3f} | "
              f"{epoch: 3}/{iter:05}/{l_dl:05} Freq: {mass / batch_time.avg:04.1f}Hz:",
              end='\r', flush=True)
        # logging every few iterations
        if iter % args.logiter == 0:
            if not validation:
                print(niter, f" Loss: {loss.item():.3f}", flush=True)
                with torch.no_grad():
                    if not args.onlyrot:
                        pred = torch.argmax(final[0][where], dim=1)
                        pseudoloss = XE(final[0][where], pred)
                if not args.onlyrot:
                    self.writer.add_scalar('Pseudoloss', pseudoloss.item(), niter)
                self.writer.add_scalar('lr', self.lr_schedule(epoch), niter)
                self.writer.add_scalar('Loss', loss.item(), niter)
                self.writer.add_scalar('RotLoss', rotloss.item(), niter)
                self.writer.add_scalar('RotAcc', rotacc.item(), niter)
                if iter > 0:
                    self.writer.add_scalar('Freq(Hz)', mass / batch_time.avg, niter)
    # end-of-epoch logging
    if self.writer and (epoch % self.log_interval == 0):
        write_conv(self.writer, model, epoch)
        if validation:
            print('val Rot-Acc: ', rotacc_value.avg)
            self.writer.add_scalar('val Rot-Acc', rotacc_value.avg, epoch)
    files.save_checkpoint_all(self.checkpoint_dir, model, args.arch,
                              optimizer, self.L, epoch, lowest=False)
    return {'loss': loss_value.avg}
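
# The loader above is assumed to yield, per image, a stack of its four 90-degree
# rotations (hence the `mass * 4` reshape and the repeating 0..3 `rotlabel`).
# The helper below is only an illustrative sketch of how such a stack could be
# built with torch.rot90; it is not the dataset code used by this repo.
def four_rotations_sketch(img):
    # img: (3, H, W) tensor; returns (4, 3, H, W) with 0/90/180/270 degree rotations
    return torch.stack([torch.rot90(img, k, dims=(1, 2)) for k in range(4)], dim=0)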