def evaluate(args, model, eval_dataloader, metrics):
    # Eval!
    logger.info("  Num examples = %d", len(eval_dataloader))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = AverageMeter()
    metrics.reset()
    preds = []
    targets = []
    pbar = ProgressBar(n_total=len(eval_dataloader), desc='Evaluating')
    for bid, batch in enumerate(eval_dataloader):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {'input_ids': batch[0],
                      'attention_mask': batch[1],
                      'labels': batch[3]}
            inputs['token_type_ids'] = batch[2]
            outputs = model(**inputs)
            loss, logits = outputs[:2]
        eval_loss.update(loss.item(), n=1)
        preds.append(logits.cpu().detach())
        targets.append(inputs['labels'].cpu().detach())
        pbar(bid)
    preds = torch.cat(preds, dim=0).cpu().detach()
    targets = torch.cat(targets, dim=0).cpu().detach()
    metrics(preds, targets)
    eval_log = {"eval_acc": metrics.value(), 'eval_loss': eval_loss.avg}
    return eval_log
def init_losses(self):
    # Five meters: total loss, localization (GIoU) loss, confidence loss,
    # class loss, and one loss per detection head.
    # The total loss is the sum of the GIoU, confidence, and class losses;
    # every meter holds a running average over a period of training.
    # The per-branch losses correspond, in order, to the detection heads
    # at strides 32, 16, and 8.
    self.losses = {
        'loss': AverageMeter(),
        'giou_loss': AverageMeter(),
        'conf_loss': AverageMeter(),
        'class_loss': AverageMeter(),
        'loss_per_branch': [AverageMeter() for _ in range(3)],
    }
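# Every snippet in this file relies on an AverageMeter helper that is not
# shown here. The class below is only a minimal sketch of the interface the
# loops appear to assume (reset / update(val, n) / val / avg / sum, plus an
# optional name and format string); the real implementation in each project
# may differ.
class AverageMeter:
    """Tracks the most recent value and a running average."""

    def __init__(self, name='', fmt=':f'):
        self.name = name
        self.fmt = fmt
        self.reset()

    def reset(self):
        self.val = 0.0
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, val, n=1):
        # `val` is the per-item value, `n` the number of items it covers
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

    def __str__(self):
        fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})'
        return fmtstr.format(**self.__dict__)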
def eval(val_loader, model, criterion, device, out_file):
    '''
    Run evaluation
    '''
    losses = AverageMeter()
    accs = AverageMeter()

    # switch to eval mode
    model.eval()

    with torch.no_grad():
        for i, (x, target) in enumerate(val_loader):
            x = x.to(device)
            target = target.to(device)

            # Forward pass
            logits = model(x)
            loss = criterion(logits, target)

            # measure accuracy and record loss
            acc = accuracy_topk(logits.data, target)
            accs.update(acc.item(), x.size(0))
            losses.update(loss.item(), x.size(0))

    text = '\n Test\t Loss ({loss.avg:.4f})\t Accuracy ({prec.avg:.3f})\n'.format(
        loss=losses, prec=accs)
    print(text)
    with open(out_file, 'a') as f:
        f.write(text)
def validate(val_loader, backbone, model, acc_prefixes, args):
    batch_time = AverageMeter('Time', ':.3f')

    # switch to evaluate mode
    model.eval()

    # TODO: Aniruddha
    pred_var_stack = [torch.Tensor() for _ in range(5)]
    labels_var_stack = torch.Tensor()
    with torch.no_grad():
        end = time.time()
        for i, (images, target) in enumerate(val_loader):
            images = images.cuda(non_blocking=True)
            target = target.cuda(non_blocking=True)

            # compute output
            features = backbone(images)
            outputs = model(features)
            if not i:
                acc_meters = [
                    NoBatchAverageMeter('', ':11.2f')
                    for _ in range(len(outputs))
                ]
                progress = NoTabProgressMeter(
                    len(val_loader), [batch_time, *acc_meters], prefix='Test: ')

            # measure accuracy
            for output, acc_meter in zip(outputs, acc_meters):
                acc1, _ = accuracy(output, target, topk=(1, 5))
                acc_meter.update(acc1[0], images.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % args.print_freq == 0 or i == len(val_loader) - 1:
                line = progress.display(i)
                len_prefixes = len(acc_prefixes) * len(acc_prefixes[0])
                prefix_line = ' ' * (len(line) - len_prefixes)
                prefix_line += ''.join(acc_prefixes)
                logger.info(prefix_line)
                logger.info(line)

            for layer_id in range(5):
                pred_var_stack[layer_id] = torch.cat(
                    (pred_var_stack[layer_id], outputs[layer_id].cpu()), dim=0)
            labels_var_stack = torch.cat((labels_var_stack, target.cpu()), dim=0)

    return acc_meters, pred_var_stack, labels_var_stack
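# The `accuracy(output, target, topk=(1, 5))` helper used above (and in the
# linear-probe training loop further down) is not defined in this file. A
# common definition, sketched here as an assumption, returns the top-k
# accuracies in percent, one single-element tensor per requested k.
import torch


def accuracy(output, target, topk=(1,)):
    """Computes the precision@k for the specified values of k."""
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)

        # indices of the top-k predictions: [batch, maxk] -> [maxk, batch]
        _, pred = output.topk(maxk, dim=1, largest=True, sorted=True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))

        res = []
        for k in topk:
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res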
def train(train_loader):
    pbar = ProgressBar(n_batch=len(train_loader))
    train_loss = AverageMeter()
    train_acc = AverageMeter()
    count = 0
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output, loss = model(data, y=target, loss_fn=nn.CrossEntropyLoss())
        pred = output.argmax(
            dim=1, keepdim=True)  # get the index of the max log-probability
        correct = pred.eq(target.view_as(pred)).sum().item()
        loss.backward()
        optimizer.step()
        count += data.size(0)
        train_acc.update(correct, n=1)
        pbar.batch_step(batch_idx=batch_idx,
                        info={
                            'loss': loss.item(),
                            'acc': correct / data.size(0)
                        },
                        bar_type='Training')
        train_loss.update(loss.item(), n=1)
    print(' ')
    return {'loss': train_loss.avg, 'acc': train_acc.sum / count}
def validation(opt, val_loader, model, epoch):
    # average meters to record the training statistics
    batch_time = AverageMeter()
    data_time = AverageMeter()
    train_logger = LogCollector()

    model.val_start()
    end = time.time()
    loss = 0
    for i, val_data in enumerate(val_loader):
        data_time.update(time.time() - end)

        model.logger = train_logger
        # running mean of the validation loss
        loss = (model.val_forward(*val_data) + loss * i) / (i + 1)

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

    # Print log info
    logging.info('Epoch: [{0}]\t'
                 '{e_log}\t'
                 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'.format(
                     epoch,
                     batch_time=batch_time,
                     data_time=data_time,
                     e_log=str(model.logger)))
    return loss
def train(train_loader):
    pbar = ProgressBar(n_total=len(train_loader), desc='Training')
    train_loss = AverageMeter()
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = loss_fn(output, target)
        loss.backward()
        optimizer.step()
        pbar(step=batch_idx, info={'loss': loss.item()})
        train_loss.update(loss.item(), n=1)
    return {'loss': train_loss.avg}
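# ProgressBar is another project-local helper these loops assume. Its
# interface differs between snippets (some call `pbar(step=..., info=...)`,
# others `pbar.batch_step(...)`); the sketch below mimics only the callable
# form used directly above and is an assumption, not the original class.
import sys
import time


class ProgressBar:
    def __init__(self, n_total, desc='Training', width=30):
        self.n_total = n_total
        self.desc = desc
        self.width = width
        self.start = time.time()

    def __call__(self, step, info=None):
        # draw a simple text bar plus any scalar metrics passed in `info`
        done = int(self.width * (step + 1) / self.n_total)
        bar = '#' * done + '-' * (self.width - done)
        extra = ''
        if info:
            extra = ' ' + ' '.join(f'{k}: {v:.4f}' for k, v in info.items())
        elapsed = time.time() - self.start
        sys.stdout.write(f'\r{self.desc} [{bar}] {step + 1}/{self.n_total} '
                         f'{elapsed:.1f}s{extra}')
        sys.stdout.flush()
        if step + 1 == self.n_total:
            sys.stdout.write('\n')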
def train(train_loader, lr_scheduler=None):
    pbar = ProgressBar(n_batch=len(train_loader))
    train_loss = AverageMeter()
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = loss_fn(output, target)
        loss.backward()
        optimizer.step()
        if lr_scheduler is not None:
            lr_scheduler.step()
        pbar.batch_step(batch_idx=batch_idx,
                        info={'loss': loss.item()},
                        bar_type='Training')
        train_loss.update(loss.item(), n=1)
    return {'loss': train_loss.avg}
def train(dataloader):
    pbar = ProgressBar(n_total=len(dataloader), desc='Training')
    train_loss = AverageMeter()
    model.train()
    for batch_idx, batch in enumerate(dataloader):
        b_features, b_target, b_idx = (batch['features'].to(DEVICE),
                                       batch['target'].to(DEVICE),
                                       batch['idx'].to(DEVICE))
        optimizer.zero_grad()
        with autocast():
            logits, probs = model(b_features)
            loss = F.cross_entropy(logits, b_target)
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()
        pbar(step=batch_idx, info={'loss': loss.item()})
        train_loss.update(loss.item(), n=1)
    return {'loss': train_loss.avg}
def eval(self, loader):
    self.model.eval()
    losses = AverageMeter()
    correct = 0
    with torch.no_grad():
        pbar = ImProgressBar(len(loader))
        for i, (imgs, targets) in enumerate(loader):
            imgs, targets = imgs.cuda(), targets.cuda()
            outputs = self.model(imgs)
            _, predicted = torch.max(outputs.data, dim=1)
            correct += (predicted == targets).sum().item()
            loss = self.criterion(outputs, targets)
            losses.update(loss.item(), 1)
            pbar.update(i)
        pbar.finish()
    return losses.avg, correct / len(loader.dataset)
def train(dataloader):
    pbar = ProgressBar(n_total=len(dataloader), desc='Training')
    train_loss = AverageMeter()
    for batch_idx, batch in enumerate(dataloader):
        # forward
        probas = model.forward(batch['features'])
        # backward
        grad_w, grad_b = model.backward(batch['features'], batch['target'], probas)

        # manual regularization -- account for mini-batches
        l2_reg = model.LAMBDA * model.weights / len(dataloader)

        # update weights
        model.weights -= learning_rate * (grad_w + l2_reg)
        model.bias -= learning_rate * grad_b

        # record loss
        loss = model._logit_cost(batch['target'], probas)

        # update meter
        train_loss.update(loss.item(), n=1)

        # update progress bar
        pbar(step=batch_idx, info={'batch_loss': loss.item()})
    return {'train_loss': train_loss.avg}
def train(self):
    self.model.train()
    losses = AverageMeter()
    correct = 0
    pbar = ImProgressBar(len(self.train_loader))
    for i, (imgs, targets) in enumerate(self.train_loader):
        imgs, targets = imgs.cuda(), targets.cuda()
        outputs = self.model(imgs)
        _, predicted = torch.max(outputs.data, dim=1)
        correct += (predicted == targets).sum().item()
        loss = self.criterion(outputs, targets)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        losses.update(loss.item(), 1)
        pbar.update(i)
    pbar.finish()
    return losses.avg, correct / len(self.train_loader.dataset)
def train(train_loader, backbone, model, optimizer, acc_prefixes, epoch, args):
    batch_time = AverageMeter('B', ':.2f')
    data_time = AverageMeter('D', ':.2f')

    # switch to train mode
    model.train()

    end = time.time()
    for i, (images, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        images = images.cuda(non_blocking=True)
        target = target.cuda(non_blocking=True)

        with torch.no_grad():
            features = backbone(images)
        outputs = model(features)
        if not i:
            acc_meters = [
                NoBatchAverageMeter('', ':>11.2f')
                for _ in range(len(outputs))
            ]
            progress = NoTabProgressMeter(
                len(train_loader), [batch_time, data_time, *acc_meters],
                prefix="Epoch: [{}]".format(epoch))

        # measure accuracy
        optimizer.zero_grad()
        for output, acc_meter in zip(outputs, acc_meters):
            loss = F.cross_entropy(output, target)
            loss.backward()
            acc1, _ = accuracy(output, target, topk=(1, 5))
            acc_meter.update(acc1[0], images.size(0))
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            line = progress.display(i)
            len_prefixes = len(acc_prefixes) * len(acc_prefixes[0])
            prefix_line = ' ' * (len(line) - len_prefixes)
            prefix_line += ''.join(acc_prefixes)
            logger.info(prefix_line)
            logger.info(line)
def train(dataloader):
    pbar = ProgressBar(n_total=len(dataloader), desc='Training')
    train_loss = AverageMeter()
    for batch_idx, batch in enumerate(dataloader):
        # forward
        y_hat = model.forward(batch['features'].float())
        # backward
        grad_w, grad_b = model.backward(batch['features'], batch['target'], y_hat)

        # manual regularization
        l2_reg = model.LAMBDA * model.weights
        l2_reg = l2_reg.reshape(2, 1)

        # update weights
        model.weights -= learning_rate * (grad_w + l2_reg).view(-1)
        model.bias -= (learning_rate * grad_b).view(-1)

        # record loss
        loss = model.loss(batch['target'], y_hat)

        # update meter
        train_loss.update(loss.item(), n=1)

        # update progress bar
        pbar(step=batch_idx, info={'batch_loss': loss.item()})
    return {'train_loss': train_loss.avg}
def train(train_loader, model, criterion, optimizer, epoch, device, print_freq=25):
    '''
    Run one train epoch
    '''
    losses = AverageMeter()

    # switch to train mode
    model.train()

    for i, (id, mask, target) in enumerate(train_loader):
        id = id.to(device)
        mask = mask.to(device)
        target = target.to(device)

        # Forward pass
        logits = model(id, mask)
        loss = criterion(logits, target)

        # Backward pass and update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure accuracy and record loss
        losses.update(loss.item(), id.size(0))

        if i % print_freq == 0:
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})'.format(
                      epoch, i, len(train_loader), loss=losses))
def train(dataloader):
    pbar = ProgressBar(n_total=len(dataloader), desc='Training')
    train_loss = AverageMeter()
    model.train()
    for batch_idx, batch in enumerate(dataloader):
        b_features, b_target, b_idx = (batch['features'].to(DEVICE),
                                       batch['target'].to(DEVICE),
                                       batch['idx'].to(DEVICE))
        optimizer.zero_grad()
        with autocast():
            logits, probs = model(b_features)
            loss = F.cross_entropy(logits, b_target)

            # regularize loss -- but not the intercept
            LAMBDA, L2 = 2, 0.
            for name, p in model.named_parameters():
                if 'weight' in name:
                    L2 = L2 + (p**2).sum()
            loss = loss + 2. / b_target.size(0) * LAMBDA * L2

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()
        pbar(step=batch_idx, info={'loss': loss.item()})
        train_loss.update(loss.item(), n=1)
    return {'loss': train_loss.avg}
def eval(val_loader, model, criterion, device):
    '''
    Run evaluation
    '''
    losses = AverageMeter()

    # switch to eval mode
    model.eval()

    with torch.no_grad():
        for i, (id, mask, target) in enumerate(val_loader):
            id = id.to(device)
            mask = mask.to(device)
            target = target.to(device)

            # Forward pass
            logits = model(id, mask)
            loss = criterion(logits, target)

            # measure accuracy and record loss
            losses.update(loss.item(), id.size(0))

    print('Test\t Loss ({loss.avg:.4f})\n'.format(loss=losses))
def train_epoch(dataset, keep_prob=0.5, batch_size=2048):
    _accs = AverageMeter()
    _losses = AverageMeter()
    sampleNum = dataset.sample_number
    for batch in tqdm(range(sampleNum // batch_size)):
        X_batch, y_batch = dataset.next_batch(batch_size=batch_size)
        feed_dict = {
            X_input: X_batch,
            y_input: y_batch,
            keepProb: keep_prob,
            batch_size: batch_size
        }
        fetches = [acc, loss, train_op]
        _acc, _loss, _ = sess.run(fetches, feed_dict)
        _accs.update(_acc)
        _losses.update(_loss)
    return _accs, _losses
def train(opt, train_loader, model, epoch):
    # average meters to record the training statistics
    batch_time = AverageMeter()
    data_time = AverageMeter()
    train_logger = LogCollector()

    # switch to train mode
    model.train_start()

    end = time.time()
    for i, train_data in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        # make sure train logger is used
        model.logger = train_logger

        # Update the model
        model.train_forward(*train_data)

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        # Print log info
        if model.Eiters % opt.log_step == 0:
            logging.info(
                'Epoch: [{0}][{1}/{2}]\t'
                '{e_log}\t'
                'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'.format(
                    epoch,
                    i,
                    len(train_loader),
                    batch_time=batch_time,
                    data_time=data_time,
                    e_log=str(model.logger)))

        # Record logs in tensorboard
        tb_logger.log_value('epoch', epoch, step=model.Eiters)
        tb_logger.log_value('step', i, step=model.Eiters)
        tb_logger.log_value('batch_time', batch_time.val, step=model.Eiters)
        tb_logger.log_value('data_time', data_time.val, step=model.Eiters)
        model.logger.tb_log(tb_logger, step=model.Eiters)
def test(test_loader):
    pbar = ProgressBar(n_batch=len(test_loader))
    valid_loss = AverageMeter()
    valid_acc = AverageMeter()
    model.eval()
    count = 0
    with torch.no_grad():
        for batch_idx, (data, target) in enumerate(test_loader):
            data, target = data.to(device), target.to(device)
            output, loss = model(data, y=target, loss_fn=nn.CrossEntropyLoss())
            pred = output.argmax(
                dim=1, keepdim=True)  # get the index of the max log-probability
            correct = pred.eq(target.view_as(pred)).sum().item()
            # record the scalar loss rather than the tensor
            valid_loss.update(loss.item(), n=data.size(0))
            valid_acc.update(correct, n=1)
            count += data.size(0)
            pbar.batch_step(batch_idx=batch_idx, info={}, bar_type='Testing')
    return {'valid_loss': valid_loss.avg, 'valid_acc': valid_acc.sum / count}
def test(test_loader):
    pbar = ProgressBar(n_total=len(test_loader), desc='Testing')
    valid_loss = AverageMeter()
    valid_acc = AverageMeter()
    model.eval()
    count = 0
    with torch.no_grad():
        for batch_idx, (data, target) in enumerate(test_loader):
            data, target = data.to(device), target.to(device)
            output = model(data)
            loss = loss_fn(output, target).item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct = pred.eq(target.view_as(pred)).sum().item()
            valid_loss.update(loss, n=data.size(0))
            valid_acc.update(correct, n=1)
            count += data.size(0)
            pbar(step=batch_idx)
    return {'valid_loss': valid_loss.avg, 'valid_acc': valid_acc.sum / count}
def train(train_loader, model, criterion, optimizer, epoch, device, out_file, print_freq=1):
    '''
    Run one train epoch
    '''
    losses = AverageMeter()
    accs = AverageMeter()

    # switch to train mode
    model.train()

    for i, (x, target) in enumerate(train_loader):
        x = x.to(device)
        target = target.to(device)

        # Forward pass
        logits = model(x)
        loss = criterion(logits, target)

        # Backward pass and update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure accuracy and record loss
        acc = accuracy_topk(logits.data, target)
        accs.update(acc.item(), x.size(0))
        losses.update(loss.item(), x.size(0))

        if i % print_freq == 0:
            text = '\n Epoch: [{0}][{1}/{2}]\t Loss {loss.val:.4f} ({loss.avg:.4f})\t Accuracy {prec.val:.3f} ({prec.avg:.3f})'.format(
                epoch, i, len(train_loader), loss=losses, prec=accs)
            print(text)
            with open(out_file, 'a') as f:
                f.write(text)
def test(dataloader):
    pbar = ProgressBar(n_total=len(dataloader), desc='Testing')
    valid_loss = AverageMeter()
    valid_acc = AverageMeter()
    count = 0
    for batch_idx, batch in enumerate(dataloader):
        # forward -- skip backward prop
        probas = model.forward(batch['features'])

        # record loss
        loss = model._logit_cost(batch['target'], probas)

        # get predictions
        prediction = torch.where(probas > 0.5,
                                 torch.tensor(1, device=device),
                                 torch.tensor(0, device=device)).view(-1)

        # compare
        correct = prediction.eq(batch['target']).sum().item()

        valid_loss.update(loss.item(), n=batch['features'].size(0))
        valid_acc.update(correct, n=1)
        count += batch['features'].size(0)
        pbar(step=batch_idx)
    return {'valid_loss': valid_loss.avg, 'valid_acc': valid_acc.sum / count}
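# The manual-gradient loops above (the ones that call model.forward /
# model.backward / model._logit_cost and update model.weights in place)
# assume a hand-rolled logistic-regression object that is not shown in this
# file. The class below is only a sketch of that assumed interface, with the
# usual sigmoid forward pass, analytic gradients of the mean negative
# log-likelihood, and an L2 coefficient stored as LAMBDA; the projects'
# actual classes may differ in shape conventions and naming.
import torch


class LogisticRegressionScratch:
    def __init__(self, num_features, LAMBDA=0.0, device='cpu'):
        self.weights = torch.zeros(num_features, device=device)
        self.bias = torch.zeros(1, device=device)
        self.LAMBDA = LAMBDA  # L2 strength, applied by the training loop

    def forward(self, x):
        # x: [batch, num_features] -> probabilities in (0, 1)
        logits = x.float().matmul(self.weights) + self.bias
        return torch.sigmoid(logits)

    def backward(self, x, y, probas):
        # gradients of the mean negative log-likelihood w.r.t. weights and bias
        errors = probas - y.float()
        grad_w = x.float().t().matmul(errors) / y.size(0)
        grad_b = errors.mean()
        return grad_w, grad_b

    def _logit_cost(self, y, probas):
        # binary cross-entropy, clamped for numerical stability
        probas = probas.clamp(1e-7, 1 - 1e-7)
        y = y.float()
        return -(y * torch.log(probas) + (1 - y) * torch.log(1 - probas)).mean()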
def main(args):
    train_info = []
    best_epoch = np.zeros(5)
    for val_folder_index in range(5):
        best_balance_acc = 0
        whole_train_list = ['D8E6', '117E', '676F', 'E2D7', 'BE52']
        val_WSI_list = whole_train_list[val_folder_index]
        train_WSI_list = whole_train_list
        train_WSI_list.pop(val_folder_index)

        train_directory = '../data/finetune/1percent/'
        # trailing slash added so the WSI id joins into a valid path
        valid_directory = '../data/finetune/1percent/'

        dataset = {}
        dataset_train0 = datasets.ImageFolder(
            root=train_directory + train_WSI_list[0],
            transform=get_aug(train=False, train_classifier=True, **args.aug_kwargs))
        dataset_train1 = datasets.ImageFolder(
            root=train_directory + train_WSI_list[1],
            transform=get_aug(train=False, train_classifier=True, **args.aug_kwargs))
        dataset_train2 = datasets.ImageFolder(
            root=train_directory + train_WSI_list[2],
            transform=get_aug(train=False, train_classifier=True, **args.aug_kwargs))
        dataset_train3 = datasets.ImageFolder(
            root=train_directory + train_WSI_list[3],
            transform=get_aug(train=False, train_classifier=True, **args.aug_kwargs))
        dataset['valid'] = datasets.ImageFolder(
            root=valid_directory + val_WSI_list,
            transform=get_aug(train=False, train_classifier=False, **args.aug_kwargs))
        dataset['train'] = data.ConcatDataset(
            [dataset_train0, dataset_train1, dataset_train2, dataset_train3])

        train_loader = torch.utils.data.DataLoader(
            dataset=dataset['train'],
            batch_size=args.eval.batch_size,
            shuffle=True,
            **args.dataloader_kwargs)
        test_loader = torch.utils.data.DataLoader(
            dataset=dataset['valid'],
            batch_size=args.eval.batch_size,
            shuffle=False,
            **args.dataloader_kwargs)

        model = get_backbone(args.model.backbone)
        classifier = nn.Linear(in_features=model.output_dim,
                               out_features=9,
                               bias=True).to(args.device)

        assert args.eval_from is not None
        save_dict = torch.load(args.eval_from, map_location='cpu')
        msg = model.load_state_dict(
            {
                k[9:]: v
                for k, v in save_dict['state_dict'].items()
                if k.startswith('backbone.')
            },
            strict=True)
        # print(msg)
        model = model.to(args.device)
        model = torch.nn.DataParallel(model)
        classifier = torch.nn.DataParallel(classifier)

        # define optimizer
        optimizer = get_optimizer(
            args.eval.optimizer.name,
            classifier,
            lr=args.eval.base_lr * args.eval.batch_size / 256,
            momentum=args.eval.optimizer.momentum,
            weight_decay=args.eval.optimizer.weight_decay)

        # define lr scheduler
        lr_scheduler = LR_Scheduler(
            optimizer,
            args.eval.warmup_epochs,
            args.eval.warmup_lr * args.eval.batch_size / 256,
            args.eval.num_epochs,
            args.eval.base_lr * args.eval.batch_size / 256,
            args.eval.final_lr * args.eval.batch_size / 256,
            len(train_loader),
        )

        loss_meter = AverageMeter(name='Loss')
        acc_meter = AverageMeter(name='Accuracy')

        # Start training
        global_progress = tqdm(range(0, args.eval.num_epochs), desc=f'Evaluating')
        for epoch in global_progress:
            loss_meter.reset()
            model.eval()
            classifier.train()
            local_progress = tqdm(train_loader,
                                  desc=f'Epoch {epoch}/{args.eval.num_epochs}',
                                  disable=True)
            for idx, (images, labels) in enumerate(local_progress):
                classifier.zero_grad()
                with torch.no_grad():
                    feature = model(images.to(args.device))
                preds = classifier(feature)
                loss = F.cross_entropy(preds, labels.to(args.device))
                loss.backward()
                optimizer.step()
                loss_meter.update(loss.item())
                lr = lr_scheduler.step()
                local_progress.set_postfix({
                    'lr': lr,
                    "loss": loss_meter.val,
                    'loss_avg': loss_meter.avg
                })

            writer.add_scalar('Valid/Loss', loss_meter.avg, epoch)
            writer.add_scalar('Valid/Lr', lr, epoch)
            writer.flush()

            PATH = ('checkpoint/exp_0228_triple_1percent/' + val_WSI_list + '/' +
                    val_WSI_list + '_tunelinear_' + str(epoch) + '.pth')
            torch.save(classifier, PATH)

            classifier.eval()
            correct, total = 0, 0
            acc_meter.reset()
            pred_label_for_f1 = np.array([])
            true_label_for_f1 = np.array([])
            for idx, (images, labels) in enumerate(test_loader):
                with torch.no_grad():
                    feature = model(images.to(args.device))
                    preds = classifier(feature).argmax(dim=1)
                    correct = (preds == labels.to(args.device)).sum().item()
                    preds_arr = preds.cpu().detach().numpy()
                    labels_arr = labels.cpu().detach().numpy()
                    pred_label_for_f1 = np.concatenate([pred_label_for_f1, preds_arr])
                    true_label_for_f1 = np.concatenate([true_label_for_f1, labels_arr])
                    acc_meter.update(correct / preds.shape[0])
            f1 = f1_score(true_label_for_f1, pred_label_for_f1, average='macro')
            balance_acc = balanced_accuracy_score(true_label_for_f1, pred_label_for_f1)
            print('Epoch: ', str(epoch), f'Accuracy = {acc_meter.avg * 100:.2f}')
            print('F1 score = ', f1, 'balance acc: ', balance_acc)
            if balance_acc > best_balance_acc:
                best_epoch[val_folder_index] = epoch
                best_balance_acc = balance_acc
            train_info.append([val_WSI_list, epoch, f1, balance_acc])

    with open('checkpoint/exp_0228_triple_1percent/train_info.csv', 'w') as f:
        # using csv.writer method from CSV package
        write = csv.writer(f)
        write.writerows(train_info)
    print(best_epoch)
def train_step(self, epoch, trainloader):
    """Training step for each epoch.

    Args:
        epoch: current epoch
        trainloader: dataloader for train set

    Return:
        None
    """
    self.model.train()

    epoch_loss = AverageMeter()
    batch_time = AverageMeter()
    data_time = AverageMeter()
    metrics_meter = dict()
    for k in self.measures.train.keys():
        metrics_meter[k] = AverageMeter()

    bar = Bar('Processing', max=len(trainloader))
    end = time.time()
    for batch_idx, (data, targets) in enumerate(trainloader):
        data_time.update(time.time() - end)
        if self.config.cuda:
            data, targets = data.cuda(), targets.cuda()

        preds = self.model(data)
        loss = self.model.loss_function(preds, targets)

        # backward
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # calculate measure statistics
        batch_measure = dict()
        for k, func in self.metrics_func.items():
            if k.startswith('top'):
                batch_measure[k] = func(preds, targets)[0]
            else:
                batch_measure[k] = func(preds, targets)
            if isinstance(batch_measure[k], t.autograd.Variable):
                batch_measure[k] = batch_measure[k].item()
            metrics_meter[k].update(batch_measure[k], data.size(0))

        # record statistics
        self.stats.batch_loss.append(loss.item())
        epoch_loss.update(loss.item())
        batch_time.update(time.time() - end)
        end = time.time()

        # plot progress
        measure_bar = ' | '.join(
            ['%s : %.4f' % (k, v.avg) for k, v in metrics_meter.items()])
        bar.suffix = '({batch}/{size}) Data: {data:.3f}s | Batch: {bt:.3f}s | Total: {total:} | ETA: {eta:} | Loss: {loss:.4f} | '.format(
            batch=batch_idx + 1,
            size=len(trainloader),
            data=data_time.avg,
            bt=batch_time.avg,
            total=bar.elapsed_td,
            eta=bar.eta_td,
            loss=epoch_loss.avg) + measure_bar
        bar.next()
    bar.finish()

    for k in metrics_meter.keys():
        self.measures.val[k] = metrics_meter[k].avg

    # plot on tensorboard
    '''
    for k, v in metrics_meter.items():
        self.metrics[k].append(v.avg)
        log_value('train %s' % k, v.avg, epoch)
    '''
    self.stats.train_epoch_loss.append(epoch_loss.avg)
    # log_value('epoch_loss', epoch_loss.avg, epoch)
    logger.info(
        ('%02i - ' % (epoch + 1)) +
        ' / '.join(['train loss %.5f' % epoch_loss.avg] +
                   [k + ' %.5f' % v.avg for k, v in metrics_meter.items()]))
def validate(self, epoch, valdataloader):
    self.model.eval()

    epoch_loss = AverageMeter()
    batch_time = AverageMeter()
    data_time = AverageMeter()
    metrics_meter = dict()
    for k in self.measures.val.keys():
        metrics_meter[k] = AverageMeter()

    end = time.time()
    bar = Bar('Processing', max=len(valdataloader))
    with t.no_grad():
        for batch_idx, (data, targets) in enumerate(valdataloader):
            data_time.update(time.time() - end)
            if self.config.cuda:
                data, targets = data.cuda(), targets.cuda()

            preds = self.model(data)
            loss = self.model.loss_function(preds, targets)

            # calculate measure statistics
            batch_measure = dict()
            for k, func in self.metrics_func.items():
                if k.startswith('top'):
                    batch_measure[k] = func(preds, targets)[0]
                else:
                    batch_measure[k] = func(preds, targets)
                if isinstance(batch_measure[k], t.autograd.Variable):
                    batch_measure[k] = batch_measure[k].item()
                metrics_meter[k].update(batch_measure[k], data.size(0))

            self.stats.batch_loss.append(loss.item())
            epoch_loss.update(loss.item())
            batch_time.update(time.time() - end)
            end = time.time()

            # plot progress
            measure_bar = ' | '.join([
                '%s : %.4f' % (k, v.avg) for k, v in metrics_meter.items()
            ])
            bar.suffix = '({batch}/{size}) Data: {data:.3f}s | Batch: {bt:.3f}s | Total: {total:} | ETA: {eta:} | Loss: {loss:.4f} | '.format(
                batch=batch_idx + 1,
                size=len(valdataloader),
                data=data_time.avg,
                bt=batch_time.avg,
                total=bar.elapsed_td,
                eta=bar.eta_td,
                loss=epoch_loss.avg) + measure_bar
            bar.next()
        bar.finish()

    for k in metrics_meter.keys():
        self.measures.val[k] = metrics_meter[k]

    self.stats.eval_epoch_loss.append(epoch_loss.avg)
    # log_value('val loss', epoch_loss.avg, epoch)
    # for k, v in metrics_meter.items():
    #     self.metrics[k].append(v.avg)
    #     log_value('val %s' % k, v.avg, epoch)
    to_log = dict([('epoch', epoch)] +
                  [(k, v.avg) for k, v in metrics_meter.items()])
    logger.debug("__log__:%s" % json.dumps(to_log))
    return to_log
def main(args):
    train_set = get_dataset(
        args.dataset,
        args.data_dir,
        transform=get_aug(args.model, args.image_size, train=False, train_classifier=True),
        train=True,
        download=args.download,  # default is False
        debug_subset_size=args.batch_size if args.debug else None
    )
    test_set = get_dataset(
        args.dataset,
        args.data_dir,
        transform=get_aug(args.model, args.image_size, train=False, train_classifier=False),
        train=False,
        download=args.download,  # default is False
        debug_subset_size=args.batch_size if args.debug else None
    )
    train_loader = torch.utils.data.DataLoader(
        dataset=train_set,
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.num_workers,
        pin_memory=True,
        drop_last=True
    )
    test_loader = torch.utils.data.DataLoader(
        dataset=test_set,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.num_workers,
        pin_memory=True,
        drop_last=True
    )

    model = get_backbone(args.backbone)
    classifier = nn.Linear(in_features=model.output_dim, out_features=10, bias=True).to(args.device)

    assert args.eval_from is not None
    save_dict = torch.load(args.eval_from, map_location='cpu')
    msg = model.load_state_dict({k[9:]: v for k, v in save_dict['state_dict'].items() if k.startswith('backbone.')}, strict=True)
    # print(msg)
    model = model.to(args.device)
    model = torch.nn.DataParallel(model)

    # if torch.cuda.device_count() > 1: classifier = torch.nn.SyncBatchNorm.convert_sync_batchnorm(classifier)
    classifier = torch.nn.DataParallel(classifier)

    # define optimizer
    optimizer = get_optimizer(
        args.optimizer, classifier,
        lr=args.base_lr * args.batch_size / 256,
        momentum=args.momentum,
        weight_decay=args.weight_decay)

    # define lr scheduler
    lr_scheduler = LR_Scheduler(
        optimizer,
        args.warmup_epochs, args.warmup_lr * args.batch_size / 256,
        args.num_epochs, args.base_lr * args.batch_size / 256, args.final_lr * args.batch_size / 256,
        len(train_loader),
    )

    loss_meter = AverageMeter(name='Loss')
    acc_meter = AverageMeter(name='Accuracy')

    # Start training
    global_progress = tqdm(range(0, args.num_epochs), desc=f'Evaluating')
    for epoch in global_progress:
        loss_meter.reset()
        model.eval()
        classifier.train()
        local_progress = tqdm(train_loader, desc=f'Epoch {epoch}/{args.num_epochs}', disable=args.hide_progress)
        for idx, (images, labels) in enumerate(local_progress):
            classifier.zero_grad()
            with torch.no_grad():
                feature = model(images.to(args.device))
            preds = classifier(feature)
            loss = F.cross_entropy(preds, labels.to(args.device))
            loss.backward()
            optimizer.step()
            loss_meter.update(loss.item())
            lr = lr_scheduler.step()
            local_progress.set_postfix({'lr': lr, "loss": loss_meter.val, 'loss_avg': loss_meter.avg})

        if args.head_tail_accuracy and epoch != 0 and (epoch + 1) != args.num_epochs:
            continue

        local_progress = tqdm(test_loader, desc=f'Test {epoch}/{args.num_epochs}', disable=args.hide_progress)
        classifier.eval()
        correct, total = 0, 0
        acc_meter.reset()
        for idx, (images, labels) in enumerate(local_progress):
            with torch.no_grad():
                feature = model(images.to(args.device))
                preds = classifier(feature).argmax(dim=1)
                correct = (preds == labels.to(args.device)).sum().item()
                acc_meter.update(correct / preds.shape[0])
            local_progress.set_postfix({'accuracy': acc_meter.avg})
        global_progress.set_postfix({"epoch": epoch, 'accuracy': acc_meter.avg * 100})
def main(args, model=None):
    assert args.eval_from is not None or model is not None

    train_set = get_dataset(
        args.dataset,
        args.data_dir,
        transform=get_aug(args.model, args.image_size, train=False, train_classifier=True),
        train=True,
        download=args.download,  # default is False
        debug_subset_size=args.batch_size if args.debug else None  # Use a subset of dataset for debugging.
    )
    test_set = get_dataset(
        args.dataset,
        args.data_dir,
        transform=get_aug(args.model, args.image_size, train=False, train_classifier=False),
        train=False,
        download=args.download,  # default is False
        debug_subset_size=args.batch_size if args.debug else None)
    train_loader = torch.utils.data.DataLoader(dataset=train_set,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=args.num_workers,
                                               pin_memory=True,
                                               drop_last=True)
    test_loader = torch.utils.data.DataLoader(dataset=test_set,
                                              batch_size=args.batch_size,
                                              shuffle=False,
                                              num_workers=args.num_workers,
                                              pin_memory=True,
                                              drop_last=True)

    if args.local_rank >= 0 and not torch.distributed.is_initialized():
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend="nccl", init_method="env://")

    # only build and load a backbone when none was passed in; unconditionally
    # overwriting the `model` argument here would defeat the purpose of the parameter
    if model is None:
        model = get_backbone(args.backbone).to(args.device)
        save_dict = torch.load(args.eval_from, map_location=args.device)
        model.load_state_dict(
            {
                k[9:]: v
                for k, v in save_dict['state_dict'].items()
                if k.startswith('backbone.')
            },
            strict=True)

    output_dim = model.output_dim
    if args.local_rank >= 0:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    classifier = nn.Linear(in_features=output_dim, out_features=10, bias=True).to(args.device)
    if args.local_rank >= 0:
        classifier = torch.nn.parallel.DistributedDataParallel(
            classifier,
            device_ids=[args.local_rank],
            output_device=args.local_rank)

    # define optimizer
    optimizer = get_optimizer(args.optimizer,
                              classifier,
                              lr=args.base_lr * args.batch_size / 256,
                              momentum=args.momentum,
                              weight_decay=args.weight_decay)

    # TODO: linear lr warm up for byol simclr swav
    # args.warm_up_epochs

    # define lr scheduler
    lr_scheduler = LR_Scheduler(optimizer, args.warmup_epochs,
                                args.warmup_lr * args.batch_size / 256,
                                args.num_epochs,
                                args.base_lr * args.batch_size / 256,
                                args.final_lr * args.batch_size / 256,
                                len(train_loader))

    loss_meter = AverageMeter(name='Loss')
    acc_meter = AverageMeter(name='Accuracy')

    # Start training
    global_progress = tqdm(range(0, args.num_epochs), desc=f'Evaluating')
    for epoch in global_progress:
        loss_meter.reset()
        model.eval()
        classifier.train()
        local_progress = tqdm(train_loader,
                              desc=f'Epoch {epoch}/{args.num_epochs}',
                              disable=args.hide_progress)
        for idx, (images, labels) in enumerate(local_progress):
            classifier.zero_grad()
            with torch.no_grad():
                feature = model(images.to(args.device))
            preds = classifier(feature)
            loss = F.cross_entropy(preds, labels.to(args.device))
            loss.backward()
            optimizer.step()
            loss_meter.update(loss.item())
            lr = lr_scheduler.step()
            local_progress.set_postfix({
                'lr': lr,
                "loss": loss_meter.val,
                'loss_avg': loss_meter.avg
            })

        if args.head_tail_accuracy and epoch != 0 and (epoch + 1) != args.num_epochs:
            continue

        local_progress = tqdm(test_loader,
                              desc=f'Test {epoch}/{args.num_epochs}',
                              disable=args.hide_progress)
        classifier.eval()
        correct, total = 0, 0
        acc_meter.reset()
        for idx, (images, labels) in enumerate(local_progress):
            with torch.no_grad():
                feature = model(images.to(args.device))
                preds = classifier(feature).argmax(dim=1)
                correct = (preds == labels.to(args.device)).sum().item()
                acc_meter.update(correct / preds.shape[0])
            local_progress.set_postfix({'accuracy': acc_meter.avg})
        global_progress.set_postfix({
            "epoch": epoch,
            'accuracy': acc_meter.avg * 100
        })
def main(args):
    train_set = get_dataset(
        args.dataset,
        args.data_dir,
        transform=get_aug(args.model, args.image_size, train=False, train_classifier=True),
        train=True,
        download=args.download  # default is False
    )
    test_set = get_dataset(
        args.dataset,
        args.data_dir,
        transform=get_aug(args.model, args.image_size, train=False, train_classifier=False),
        train=False,
        download=args.download  # default is False
    )

    if args.debug:
        args.batch_size = 20
        args.num_epochs = 2
        args.num_workers = 0
        train_set = torch.utils.data.Subset(train_set, range(0, args.batch_size))  # take only one batch
        test_set = torch.utils.data.Subset(test_set, range(0, args.batch_size))

    train_loader = torch.utils.data.DataLoader(dataset=train_set,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=args.num_workers,
                                               pin_memory=True,
                                               drop_last=True)
    # evaluate on the held-out test set (the original passed train_set here)
    test_loader = torch.utils.data.DataLoader(dataset=test_set,
                                              batch_size=args.batch_size,
                                              shuffle=False,
                                              num_workers=args.num_workers,
                                              pin_memory=True,
                                              drop_last=True)

    # define model
    # model = get_model(args.model, args.backbone)
    backbone = get_backbone(args.backbone, castrate=False)
    in_features = backbone.fc.in_features
    backbone.fc = nn.Identity()
    model = backbone

    assert args.eval_from is not None
    save_dict = torch.load(args.eval_from, map_location='cpu')
    msg = model.load_state_dict(
        {
            k[9:]: v
            for k, v in save_dict['state_dict'].items()
            if k.startswith('backbone.')
        },
        strict=True)
    print(msg)
    model = model.to(args.device)
    model = torch.nn.DataParallel(model)

    # if torch.cuda.device_count() > 1: model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
    classifier = nn.Linear(in_features=in_features, out_features=10, bias=True).to(args.device)
    classifier = torch.nn.DataParallel(classifier)
    # breakpoint()

    # define optimizer
    optimizer = get_optimizer(args.optimizer,
                              classifier,
                              lr=args.base_lr * args.batch_size / 256,
                              momentum=args.momentum,
                              weight_decay=args.weight_decay)

    # TODO: linear lr warm up for byol simclr swav
    # args.warm_up_epochs

    # define lr scheduler
    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.num_epochs, eta_min=0)

    loss_meter = AverageMeter(name='Loss')
    acc_meter = AverageMeter(name='Accuracy')

    # Start training
    for epoch in tqdm(range(0, args.num_epochs), desc=f'Evaluating'):
        loss_meter.reset()
        model.eval()
        classifier.train()
        p_bar = tqdm(train_loader, desc=f'Epoch {epoch}/{args.num_epochs}')
        for idx, (images, labels) in enumerate(p_bar):
            # breakpoint()
            classifier.zero_grad()
            with torch.no_grad():
                feature = model(images.to(args.device))
            # breakpoint()
            preds = classifier(feature)
            loss = F.cross_entropy(preds, labels.to(args.device))
            # loss = model.forward(images1.to(args.device), images2.to(args.device))
            loss.backward()
            optimizer.step()
            loss_meter.update(loss.item())
            p_bar.set_postfix({
                "loss": loss_meter.val,
                'loss_avg': loss_meter.avg
            })
        lr_scheduler.step()

        p_bar = tqdm(test_loader, desc=f'Test {epoch}/{args.num_epochs}')
        classifier.eval()
        correct, total = 0, 0
        acc_meter.reset()
        for idx, (images, labels) in enumerate(p_bar):
            with torch.no_grad():
                feature = model(images.to(args.device))
                preds = classifier(feature).argmax(dim=1)
                correct = (preds == labels.to(args.device)).sum().item()
                acc_meter.update(correct / preds.shape[0])
            p_bar.set_postfix({'accuracy': acc_meter.avg})
def main(args):
    train_loader = torch.utils.data.DataLoader(
        dataset=get_dataset(
            transform=get_aug(train=False, train_classifier=True, **args.aug_kwargs),
            train=True,
            **args.dataset_kwargs),
        batch_size=args.eval.batch_size,
        shuffle=True,
        **args.dataloader_kwargs)
    test_loader = torch.utils.data.DataLoader(
        dataset=get_dataset(
            transform=get_aug(train=False, train_classifier=False, **args.aug_kwargs),
            train=False,
            **args.dataset_kwargs),
        batch_size=args.eval.batch_size,
        shuffle=False,
        **args.dataloader_kwargs)

    model = get_backbone(args.model.backbone)
    classifier = nn.Linear(in_features=model.output_dim, out_features=10, bias=True).to(args.device)

    assert args.eval_from is not None
    save_dict = torch.load(args.eval_from, map_location='cpu')
    msg = model.load_state_dict(
        {
            k[9:]: v
            for k, v in save_dict['state_dict'].items()
            if k.startswith('backbone.')
        },
        strict=True)
    # print(msg)
    model = model.to(args.device)
    model = torch.nn.DataParallel(model)

    # if torch.cuda.device_count() > 1: classifier = torch.nn.SyncBatchNorm.convert_sync_batchnorm(classifier)
    classifier = torch.nn.DataParallel(classifier)

    # define optimizer
    optimizer = get_optimizer(args.eval.optimizer.name,
                              classifier,
                              lr=args.eval.base_lr * args.eval.batch_size / 256,
                              momentum=args.eval.optimizer.momentum,
                              weight_decay=args.eval.optimizer.weight_decay)

    # define lr scheduler
    lr_scheduler = LR_Scheduler(
        optimizer,
        args.eval.warmup_epochs,
        args.eval.warmup_lr * args.eval.batch_size / 256,
        args.eval.num_epochs,
        args.eval.base_lr * args.eval.batch_size / 256,
        args.eval.final_lr * args.eval.batch_size / 256,
        len(train_loader),
    )

    loss_meter = AverageMeter(name='Loss')
    acc_meter = AverageMeter(name='Accuracy')

    # Start training
    global_progress = tqdm(range(0, args.eval.num_epochs), desc=f'Evaluating')
    for epoch in global_progress:
        loss_meter.reset()
        model.eval()
        classifier.train()
        local_progress = tqdm(train_loader,
                              desc=f'Epoch {epoch}/{args.eval.num_epochs}',
                              disable=True)
        for idx, (images, labels) in enumerate(local_progress):
            # this will take the images and stick them to one another using the batch dimension
            # so it expects [C x H x W] and will turn each into a [1 x C x H x W] and then for N it will
            # concatenate them into a big tensor of [N x C x H x W]
            if type(images) == list:
                print(images[1].shape, len(images))
                images = torch.cat([image.unsqueeze(dim=0) for image in images], dim=0)

            classifier.zero_grad()
            with torch.no_grad():
                feature = model(images.to(args.device))
            preds = classifier(feature)
            loss = F.cross_entropy(preds, labels.to(args.device))
            loss.backward()
            optimizer.step()
            loss_meter.update(loss.item())
            lr = lr_scheduler.step()
            local_progress.set_postfix({
                'lr': lr,
                "loss": loss_meter.val,
                'loss_avg': loss_meter.avg
            })

    classifier.eval()
    correct, total = 0, 0
    acc_meter.reset()
    for idx, (images, labels) in enumerate(test_loader):
        with torch.no_grad():
            feature = model(images.to(args.device))
            preds = classifier(feature).argmax(dim=1)
            correct = (preds == labels.to(args.device)).sum().item()
            acc_meter.update(correct / preds.shape[0])
    print(f'Accuracy = {acc_meter.avg*100:.2f}')