def train(epoch):
    """Run one ELBO training epoch with KL annealing; return the mean loss."""
    model.train()
    loss_meter = AverageMeter()
    progress = tqdm(total=len(train_loader))
    for step, (index, response, _, mask) in enumerate(train_loader):
        batch_size = response.size(0)
        # Move the batch onto the training device.
        index, response = index.to(device), response.to(device)
        mask = mask.long().to(device)
        anneal = get_annealing_factor(epoch, step)
        optimizer.zero_grad()
        loss = model.elbo(*model(index, response, mask),
                          annealing_factor=anneal)
        loss.backward()
        optimizer.step()
        loss_meter.update(loss.item(), batch_size)
        progress.update()
        progress.set_postfix({'Loss': loss_meter.avg})
    progress.close()
    print('====> Train Epoch: {} Loss: {:.4f}'.format(epoch, loss_meter.avg))
    return loss_meter.avg
def val(val_loader, model):
    """Score DeepCluster-v2 validation batches with normalized mutual info.

    For each batch, every prototype head's hard cluster assignment is
    compared against the (crop-tiled) ground-truth labels via NMI; the
    per-head NMIs are averaged, and the batch NMIs are averaged again
    over the loader. Returns the final average NMI.
    """
    val_nmi = AverageMeter()
    model.eval()
    start_idx = 0  # NOTE(review): unused in this function
    with torch.no_grad():
        for it, (idx, inputs, labels) in enumerate(val_loader):
            # ============ multi-res forward passes ... ============
            emb, output = model(inputs)
            emb = emb.detach()
            bs = inputs[0].size(0)  # NOTE(review): unused; inputs appears to be a list of crops — confirm
            # ============ deepcluster-v2 val nmi ... ============
            nmi = 0
            for h in range(len(args.nmb_prototypes)):
                # Hard assignment per sample from the h-th prototype head.
                scores = output[h] / args.temperature
                _, cluster_assignments = scores.max(1)
                # Labels are tiled because each sample appears once per crop.
                nmi += normalized_mutual_info_score(
                    labels.repeat(sum(args.nmb_crops)).cpu().numpy(),
                    cluster_assignments.cpu().numpy())
            nmi /= len(args.nmb_prototypes)
            # ============ misc ... ============
            val_nmi.update(nmi)
    return val_nmi.avg
def get_log_marginal_density(loader):
    """Estimate the average log marginal density over `loader`.

    Delegates to model.log_marginal with args.num_posterior_samples
    samples and accumulates the per-batch mean into a running average.
    """
    model.eval()
    density_meter = AverageMeter()
    progress = tqdm(total=len(loader))
    with torch.no_grad():
        for _, response, _, mask in loader:
            batch_size = response.size(0)
            response, mask = response.to(device), mask.long().to(device)
            batch_marginal = model.log_marginal(
                response,
                mask,
                num_samples=args.num_posterior_samples,
            ).mean()
            density_meter.update(batch_marginal.item(), batch_size)
            progress.update()
            progress.set_postfix({'Marginal': density_meter.avg})
    progress.close()
    print('====> Marginal: {:.4f}'.format(density_meter.avg))
    return density_meter.avg
def get_log_marginal_density(loader):
    """Estimate the average log marginal density via Pyro importance sampling.

    Builds an Importance posterior from model.model / model.guide, runs it
    per batch, and computes log p(x) ~= logsumexp(log_weights) - log(S).

    NOTE(review): this duplicates the name of another
    get_log_marginal_density in this file; if both live in one module the
    later definition shadows the earlier — confirm which is intended.
    """
    model.eval()
    meter = AverageMeter()
    pbar = tqdm(total=len(loader))
    with torch.no_grad():
        for _, response, _, mask in loader:
            mb = response.size(0)
            response = response.to(device)
            mask = mask.long().to(device)
            posterior = Importance(
                model.model,
                guide=model.guide,
                num_samples=args.num_posterior_samples,
            )
            posterior = posterior.run(response, mask)
            # Self-normalized importance-sampling estimate of log p(x).
            log_weights = torch.stack(posterior.log_weights)
            marginal = torch.logsumexp(log_weights, 0) - math.log(
                log_weights.size(0))
            meter.update(marginal.item(), mb)
            pbar.update()
            pbar.set_postfix({'Marginal': meter.avg})
    pbar.close()
    print('====> Marginal: {:.4f}'.format(meter.avg))
    return meter.avg
def meta_val(self, model, meta_val_way, meta_val_shot, disable_tqdm, callback, epoch):
    """Episodic few-shot validation; returns the mean episode accuracy.

    Each batch is an episode: the first way*shot samples form the support
    set (averaged into per-class centroids), the remainder the query set,
    classified via self.metric_prediction.
    """
    top1 = AverageMeter()
    model.eval()
    with torch.no_grad():
        tqdm_test_loader = warp_tqdm(self.val_loader, disable_tqdm)
        for i, (inputs, target, _) in enumerate(tqdm_test_loader):
            inputs, target = inputs.to(self.device), target.to(
                self.device, non_blocking=True)
            # NOTE(review): features are forced onto GPU 0 regardless of
            # self.device — confirm this is intended in multi-GPU setups.
            output = model(inputs, feature=True)[0].cuda(0)
            # Split the episode into support and query halves.
            train_out = output[:meta_val_way * meta_val_shot]
            train_label = target[:meta_val_way * meta_val_shot]
            test_out = output[meta_val_way * meta_val_shot:]
            test_label = target[meta_val_way * meta_val_shot:]
            # Average the shot features of each class into one centroid.
            train_out = train_out.reshape(meta_val_way, meta_val_shot, -1).mean(1)
            train_label = train_label[::meta_val_shot]
            prediction = self.metric_prediction(train_out, test_out, train_label)
            acc = (prediction == test_label).float().mean()
            top1.update(acc.item())
            if not disable_tqdm:
                tqdm_test_loader.set_description('Acc {:.2f}'.format(
                    top1.avg * 100))
    if callback is not None:
        callback.scalar('val_acc', epoch + 1, top1.avg, title='Val acc')
    return top1.avg
def train(epoch):
    """One epoch of masked binary-cross-entropy training; return avg loss."""
    model.train()
    loss_meter = AverageMeter()
    progress = tqdm(total=len(train_loader))
    for index, response, _, mask in train_loader:
        batch_size = response.size(0)
        index, response = index.to(device), response.to(device)
        mask = mask.long().to(device)
        optimizer.zero_grad()
        response_mu = model(index, response, mask)
        # Per-element BCE, zeroed where the mask is off, then averaged.
        elementwise = F.binary_cross_entropy(response_mu, response.float(),
                                             reduction='none')
        loss = (elementwise * mask).mean()
        loss.backward()
        optimizer.step()
        loss_meter.update(loss.item(), batch_size)
        progress.update()
        progress.set_postfix({'Loss': loss_meter.avg})
    progress.close()
    print('====> Train Epoch: {} Loss: {:.4f}'.format(epoch, loss_meter.avg))
    return loss_meter.avg
def test(model, criterion, test_loader, run_config):
    """Evaluate `model` on `test_loader`.

    Returns an OrderedDict with the average loss, whole-dataset accuracy,
    and wall-clock time of the pass.
    """
    device = torch.device(run_config['device'])
    model.eval()
    loss_meter = AverageMeter()
    correct_meter = AverageMeter()
    start = time.time()
    with torch.no_grad():
        for data, targets in test_loader:
            data, targets = data.to(device), targets.to(device)
            outputs = model(data)
            batch_loss = criterion(outputs, targets)
            preds = outputs.argmax(dim=1)
            loss_meter.update(batch_loss.item(), data.size(0))
            # Accumulate raw correct counts; accuracy is computed over the
            # whole dataset below, not as a per-batch average.
            correct_meter.update(preds.eq(targets).sum().item(), 1)
    accuracy = correct_meter.sum / len(test_loader.dataset)
    elapsed = time.time() - start
    test_log = collections.OrderedDict({
        'loss': loss_meter.avg,
        'accuracy': accuracy,
        'time': elapsed
    })
    return test_log
def validate_with_softmax(val_loader, model, criterion, epoch, writer=None, threshold=0.5):
    """Validate `model` on `val_loader`; returns the average top-1 accuracy.

    Optionally logs loss and top-1 accuracy to `writer` under 'Test/*'.
    NOTE(review): `threshold` is never used in this function.
    """
    # switch to evaluate mode
    model.eval()
    losses = AverageMeter('Loss', ":.4e")
    top1 = AverageMeter('Acc@1', ':6.2f')
    pbar = tqdm(val_loader)
    with torch.no_grad():
        for i, (images, target) in enumerate(pbar):
            if torch.cuda.is_available():
                images = images.cuda()
                target = target.cuda()
            # compute output
            output = model(images)
            loss = criterion(output, target)
            acc1 = accuracy(output, target, topk=(1,))
            losses.update(loss.item(), images.size(0))
            top1.update(acc1[0][0], images.size(0))
            pbar.set_description('Validation')
    print(" * Acc@1 {top1.avg:.3f}".format(top1=top1))
    if writer:
        writer.add_scalar('Test/Loss', losses.avg, epoch)
        writer.add_scalar('Test/Top1_acc', top1.avg, epoch)
    return top1.avg
def test(epoch):
    """Evaluate the variational model on the test set; return the avg loss.

    When args.n_norm_flows > 0 the model returns flow outputs (including
    log|det Jacobian| terms) and the ELBO is called with
    use_kl_divergence=False; otherwise the default ELBO is used.
    """
    model.eval()
    test_loss = AverageMeter()
    pbar = tqdm(total=len(test_loader))
    with torch.no_grad():
        for _, response, _, mask in test_loader:
            mb = response.size(0)
            response = response.to(device)
            mask = mask.long().to(device)
            if args.n_norm_flows > 0:
                # Unpack the normalizing-flow forward pass; order must
                # match the model's return signature exactly.
                (
                    response,
                    mask,
                    response_mu,
                    ability_k,
                    ability,
                    ability_mu,
                    ability_logvar,
                    ability_logabsdetjac,
                    item_feat_k,
                    item_feat,
                    item_feat_mu,
                    item_feat_logvar,
                    item_feat_logabsdetjac,
                ) = model(response, mask)
                loss = model.elbo(
                    response,
                    mask,
                    response_mu,
                    ability,
                    ability_mu,
                    ability_logvar,
                    item_feat,
                    item_feat_mu,
                    item_feat_logvar,
                    use_kl_divergence=False,
                    ability_k=ability_k,
                    item_feat_k=item_feat_k,
                    ability_logabsdetjac=ability_logabsdetjac,
                    item_logabsdetjac=item_feat_logabsdetjac,
                )
            else:
                outputs = model(response, mask)
                loss = model.elbo(*outputs)
            test_loss.update(loss.item(), mb)
            pbar.update()
            pbar.set_postfix({'Loss': test_loss.avg})
    pbar.close()
    print('====> Test Epoch: {} Loss: {:.4f}'.format(epoch, test_loss.avg))
    return test_loss.avg
def train(epoch):
    """One training epoch of the DKVMN-IRT model; return the average loss.

    Builds question (q_data) and question-answer (qa_data) index tensors
    with -1 marking missing responses, then shifts both by +1 so index 0
    can serve as padding, mirroring the DeepIRT data layout.
    """
    model.train()
    train_loss = AverageMeter()
    pbar = tqdm(total=len(train_loader))
    for batch_idx, (_, response, _, _) in enumerate(train_loader):
        mb = response.size(0)
        item_index = torch.arange(num_item).to(device)
        response = response.to(device)
        # Ragged final batches are skipped entirely; presumably the model
        # requires a fixed batch size — TODO confirm.
        if mb != args.batch_size:
            pbar.update()
            continue
        with torch.no_grad():
            item_index = item_index.unsqueeze(0).repeat(mb, 1)
            item_index[(response == -1).squeeze(2)] = -1
            # build what dkvmn_irt expects
            q_data = item_index.clone()
            a_data = response.clone().squeeze(2)
            # ??? https://github.com/ckyeungac/DeepIRT/blob/master/load_data.py
            qa_data = q_data + a_data * num_item
            qa_data[(response == -1).squeeze(2)] = -1
            # map q_data and qa_data to 0 to N+1
            q_data = q_data + 1
            qa_data = qa_data + 1
            label = response.clone().squeeze(2)
        optimizer.zero_grad()
        pred_zs, student_abilities, question_difficulties = \
            model(q_data, qa_data, label)
        loss = model.get_loss(
            pred_zs,
            student_abilities,
            question_difficulties,
            label,
        )
        loss.backward()
        # https://github.com/ckyeungac/DeepIRT/blob/master/configs.py
        # BUGFIX: nn.utils.clip_grad_norm was deprecated in PyTorch 0.4
        # and later removed; the in-place clip_grad_norm_ is the supported
        # replacement with identical behavior.
        nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
        optimizer.step()
        train_loss.update(loss.item(), mb)
        pbar.update()
        pbar.set_postfix({'Loss': train_loss.avg})
    pbar.close()
    print('====> Train Epoch: {} Loss: {:.4f}'.format(
        epoch, train_loss.avg))
    return train_loss.avg
def save_json(args, model, reglog, optimizer, loader):
    """Run inference over `loader`, dump predictions to JSON, return top-1 avg.

    Predictions are keyed by image names read from ./val_targets.json
    (positional alignment: the idx-th prediction maps to the idx-th key —
    assumes loader order matches the JSON key order; TODO confirm).
    """
    pred_label = []
    log_top1 = AverageMeter()
    for iter_epoch, (inp, target) in enumerate(loader):
        # measure data loading time
        learning_rate_decay(optimizer, len(loader) * args.epoch + iter_epoch,
                            args.lr)
        # start at iter start_iter
        if iter_epoch < args.start_iter:
            continue
        # move to gpu
        inp = inp.cuda(non_blocking=True)
        target = target.cuda(non_blocking=True)
        if 'VOC2007' in args.data_path:
            target = target.float()
        # forward
        with torch.no_grad():
            output = model(inp)
            output = reglog(output)
        # Top-1 predicted class index per sample, flattened to a 1-D array.
        _, pred = output.topk(1, 1, True, True)
        pred = pred.t()
        pred_var = pred.data.cpu().numpy().reshape(-1)
        for i in range(len(pred_var)):
            pred_label.append(pred_var[i])
        prec1 = accuracy(args, output, target)
        log_top1.update(prec1.item(), output.size(0))

    def load_json(file_path):
        # Helper: read a JSON object and return its keys (image names).
        assert os.path.exists(file_path), "{} does not exist".format(file_path)
        with open(file_path, 'r') as fp:
            data = json.load(fp)
        img_names = list(data.keys())
        return img_names

    json_predictions, img_names = {}, []
    img_names = load_json('./val_targets.json')
    for idx in range(len(pred_label)):
        json_predictions[img_names[idx]] = int(pred_label[idx])
    output_file = os.path.join(args.json_save_path, args.json_save_name)
    with open(output_file, 'w') as fp:
        json.dump(json_predictions, fp)
    return log_top1.avg
def val(val_loader, model, queue):
    """SwAV-style validation scored with NMI; returns the average NMI.

    NOTE(review): `nmi` is recomputed per crop inside the crop loop but
    only the last crop's value is passed to the meter — confirm intended.
    `loss` is initialized but never used.
    """
    norm_mut_info = AverageMeter()
    use_the_queue = False
    model.eval()
    end = time.time()
    with torch.no_grad():
        for it, (inputs, labels) in enumerate(val_loader):
            # normalize the prototypes
            with torch.no_grad():  # redundant inside the outer no_grad
                w = model.module.prototypes.weight.data.clone()
                w = nn.functional.normalize(w, dim=1, p=2)
                model.module.prototypes.weight.copy_(w)
            # ============ multi-res forward passes ... ============
            embedding, output = model(inputs)
            embedding = embedding.detach()
            bs = inputs[0].size(0)
            # ============ swav loss ... ============
            loss = 0
            for i, crop_id in enumerate(args.crops_for_assign):
                with torch.no_grad():
                    out = output[bs * crop_id:bs * (crop_id + 1)].detach()
                    # time to use the queue
                    if queue is not None:
                        if use_the_queue or not torch.all(queue[i, -1, :] == 0):
                            use_the_queue = True
                            # Prepend queued-feature scores so the sinkhorn
                            # assignment sees more samples than one batch.
                            out = torch.cat(
                                (torch.mm(queue[i],
                                          model.module.prototypes.weight.t()),
                                 out))
                        # fill the queue
                        queue[i, bs:] = queue[i, :-bs].clone()
                        queue[i, :bs] = embedding[crop_id * bs:(crop_id + 1) * bs]
                    # get assignments
                    q = distributed_sinkhorn(out)[-bs:]
                    score, cluster_assignments = q.max(1)
                    cluster_assignments = cluster_assignments.cpu().numpy()
                    nmi = normalized_mutual_info_score(labels.cpu().numpy(),
                                                       cluster_assignments)
            # ============ misc ... ============
            norm_mut_info.update(nmi)
    return norm_mut_info.avg
def validate(self, epoch):
    """Run one validation pass over self.val_loader.

    Returns the (loss, acc, acc_cls, mean_iu) AverageMeter objects
    themselves, not their averages.
    """
    self.model.eval()
    val_loss = AverageMeter()
    val_acc = AverageMeter()
    val_acc_cls = AverageMeter()
    val_mean_iu = AverageMeter()
    # inputs_all, gts_all, predictions_all = [], [], []
    # BUGFIX: evaluation previously ran without torch.no_grad();
    # model.eval() alone does not disable autograd, so each batch built an
    # unnecessary graph and wasted memory. Metrics are unchanged.
    with torch.no_grad():
        for i, (inputs, gts) in enumerate(self.val_loader):
            N = inputs.size(0)
            inputs = inputs.to(self.device)
            gts = gts.to(self.device)
            # gts = gts.to(self.device, dtype=torch.float32)
            outputs = self.model(inputs)
            preds = torch.argmax(outputs, dim=1)
            # gts = F.upsample(torch.unsqueeze(gts, 0), outputs.size()[2:], mode='nearest')
            # gts = torch.squeeze(gts, 0).to(torch.int64)
            val_loss.update(self.criterion(outputs, gts).item(), N)
            val_metric = evaluate(preds.detach(), gts.detach(),
                                  self.num_classes)
            val_acc.update(val_metric[0])
            val_acc_cls.update(val_metric[1])
            val_mean_iu.update(val_metric[2])
    return val_loss, val_acc, val_acc_cls, val_mean_iu
def train(self, epoch):
    """One segmentation training epoch.

    Returns the (loss, acc, acc_cls, mean_iu) AverageMeter objects.
    """
    self.model.train()
    train_loss = AverageMeter()
    train_acc = AverageMeter()
    train_acc_cls = AverageMeter()
    train_mean_iu = AverageMeter()
    for step, (batch_inputs, batch_targets) in enumerate(self.train_loader):
        batch_inputs = batch_inputs.to(self.device)
        batch_targets = batch_targets.to(self.device)
        # batch_targets = batch_targets.to(self.device, dtype=torch.float32)
        self.optim.zero_grad()
        logits = self.model(batch_inputs)
        predictions = torch.argmax(logits, dim=1)
        # batch_targets = F.upsample(torch.unsqueeze(batch_targets, 0), logits.size()[2:], mode='nearest')
        # batch_targets = torch.squeeze(batch_targets, 0).to(torch.int64)
        loss = self.criterion(logits, batch_targets)
        loss.backward()
        self.optim.step()
        train_loss.update(loss.item(), batch_inputs.size(0))
        metrics = evaluate(predictions.detach(), batch_targets.detach(),
                           self.num_classes)
        train_acc.update(metrics[0])
        train_acc_cls.update(metrics[1])
        train_mean_iu.update(metrics[2])
        # One-time confirmation that iteration actually began on the device.
        if epoch == 0 and step == 1:
            print('iteration is started on {}'.format(self.device))
    return train_loss, train_acc, train_acc_cls, train_mean_iu
def test(epoch):
    """Evaluate the DKVMN-IRT model on the test set; return the average loss.

    Mirrors the training loop's preprocessing: builds question (q_data)
    and question-answer (qa_data) index tensors with -1 marking missing
    responses, then shifts both by +1 so index 0 can act as padding.
    """
    model.eval()
    test_loss = AverageMeter()
    pbar = tqdm(total=len(test_loader))
    with torch.no_grad():
        for _, response, _, _ in test_loader:
            mb = response.size(0)
            item_index = torch.arange(num_item).to(device)
            response = response.to(device)
            # Ragged final batches are skipped; presumably the model
            # requires a fixed batch size — TODO confirm.
            if mb != args.batch_size:
                pbar.update()
                continue
            with torch.no_grad():  # redundant inside the outer no_grad
                item_index = item_index.unsqueeze(0).repeat(mb, 1)
                item_index[(response == -1).squeeze(2)] = -1
                # build what dkvmn_irt expects
                q_data = item_index.clone()
                a_data = response.clone().squeeze(2)
                # ??? https://github.com/ckyeungac/DeepIRT/blob/master/load_data.py
                qa_data = q_data + a_data * num_item
                qa_data[(response == -1).squeeze(2)] = -1
                # map q_data and qa_data to 0 to N+1
                q_data = q_data + 1
                qa_data = qa_data + 1
                label = response.clone().squeeze(2)
            pred_zs, student_abilities, question_difficulties = \
                model(q_data, qa_data, label)
            loss = model.get_loss(
                pred_zs,
                student_abilities,
                question_difficulties,
                label,
            )
            test_loss.update(loss.item(), mb)
            pbar.update()
            pbar.set_postfix({'Loss': test_loss.avg})
    pbar.close()
    print('====> Test Epoch: {} Loss: {:.4f}'.format(epoch, test_loss.avg))
    return test_loss.avg
def train(epoch):
    """One training epoch of the variational model; return the average loss.

    With args.n_norm_flows > 0 the model's flow outputs (including
    log|det Jacobian| terms) are unpacked and the ELBO is computed with
    use_kl_divergence=False; otherwise the analytic-KL ELBO is used.
    The KL term is warmed up via get_annealing_factor.
    """
    model.train()
    train_loss = AverageMeter()
    pbar = tqdm(total=len(train_loader))
    for batch_idx, (_, response, _, mask) in enumerate(train_loader):
        mb = response.size(0)
        response = response.to(device)
        mask = mask.long().to(device)
        annealing_factor = get_annealing_factor(epoch, batch_idx)
        optimizer.zero_grad()
        if args.n_norm_flows > 0:
            # Order must match the model's return signature exactly.
            (
                response,
                mask,
                response_mu,
                ability_k,
                ability,
                ability_mu,
                ability_logvar,
                ability_logabsdetjac,
                item_feat_k,
                item_feat,
                item_feat_mu,
                item_feat_logvar,
                item_feat_logabsdetjac,
            ) = model(response, mask)
            loss = model.elbo(
                response,
                mask,
                response_mu,
                ability,
                ability_mu,
                ability_logvar,
                item_feat,
                item_feat_mu,
                item_feat_logvar,
                annealing_factor = annealing_factor,
                use_kl_divergence = False,
                ability_k = ability_k,
                item_feat_k = item_feat_k,
                ability_logabsdetjac = ability_logabsdetjac,
                item_logabsdetjac = item_feat_logabsdetjac,
            )
        else:
            outputs = model(response, mask)
            loss = model.elbo(*outputs, annealing_factor=annealing_factor,
                              use_kl_divergence=True)
        loss.backward()
        optimizer.step()
        train_loss.update(loss.item(), mb)
        pbar.update()
        pbar.set_postfix({'Loss': train_loss.avg})
    pbar.close()
    print('====> Train Epoch: {} Loss: {:.4f}'.format(epoch, train_loss.avg))
    return train_loss.avg
def val(epoch):
    """Validate the compiled-inference objective; return the average loss.

    NOTE(review): `z` is not defined anywhere in this function — this
    raises NameError unless a module-level `z` exists. The sibling train
    loop has the same pattern; confirm whether `data` (or a sampled
    latent) was intended here.
    """
    model.eval()
    loss_meter = AverageMeter()
    with torch.no_grad():
        for data in val_loader:
            batch_size = data.size(0)
            data = data.to(device)
            z_mu, z_logvar = model(data)
            loss = compiled_inference_objective(z, z_mu, z_logvar)
            loss_meter.update(loss.item(), batch_size)
    print('====> Test Epoch: {}\tLoss: {:.4f}'.format(
        epoch, loss_meter.avg))
    return loss_meter.avg
def train(model, optimizer, scheduler, criterion, train_loader, run_config):
    """One classification training epoch; returns an OrderedDict of metrics.

    The scheduler (if any) is stepped once per epoch, after the loop.
    NOTE(review): `current_lr` is assigned but never used. Data is moved
    to `device` only when exactly one GPU is visible — presumably
    DataParallel handles placement otherwise; confirm.
    """
    device = torch.device(run_config['device'])
    for param_group in optimizer.param_groups:
        current_lr = param_group['lr']
    model.train()
    loss_meter = AverageMeter()
    accuracy_meter = AverageMeter()
    start = time.time()
    for step, (data, targets) in enumerate(train_loader):
        if torch.cuda.device_count() == 1:
            data = data.to(device)
            targets = targets.to(device)
        optimizer.zero_grad()
        outputs = model(data)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        loss_ = loss.item()
        num = data.size(0)
        accuracy = utils.accuracy(outputs, targets)[0].item()
        loss_meter.update(loss_, num)
        accuracy_meter.update(accuracy, num)
    if scheduler is not None:
        scheduler.step()
    elapsed = time.time() - start
    train_log = collections.OrderedDict({
        'loss': loss_meter.avg,
        'accuracy': accuracy_meter.avg,
        'time': elapsed
    })
    return train_log
def validate_network(val_loader, model, linear_classifier):
    """Validate a frozen backbone + linear classifier.

    Tracks the best top-1 accuracy in the module-level `best_acc` and logs
    from rank 0 only. Returns (avg loss, avg top-1, avg top-2).
    """
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top2 = AverageMeter()
    global best_acc
    # switch to evaluate mode
    model.eval()
    linear_classifier.eval()
    criterion = nn.CrossEntropyLoss().cuda()
    with torch.no_grad():
        end = time.perf_counter()
        for i, (inp, target) in enumerate(val_loader):
            # move to gpu
            inp = inp.cuda(non_blocking=True)
            target = target.cuda(non_blocking=True)
            # compute output
            output = linear_classifier(model(inp))
            loss = criterion(output, target)
            acc1, acc2 = accuracy(output, target, topk=(1, 2))
            losses.update(loss.item(), inp.size(0))
            top2.update(acc2[0], inp.size(0))
            top1.update(acc1[0], inp.size(0))
            # measure elapsed time
            batch_time.update(time.perf_counter() - end)
            end = time.perf_counter()
    if top1.avg.item() > best_acc:
        best_acc = top1.avg.item()
    if args.rank == 0:
        logger.info("Test:\t"
                    "Time {batch_time.avg:.3f}\t"
                    "Loss {loss.avg:.4f}\t"
                    "Acc@1 {top1.avg:.3f}\t"
                    "Acc@2 {top2.avg:.3f}"
                    "Best Acc@1 so far {acc:.1f}".format(batch_time=batch_time,
                                                         loss=losses,
                                                         top1=top1,
                                                         top2=top2,
                                                         acc=best_acc))
    return losses.avg, top1.avg.item(), top2.avg.item()
def test(epoch):
    """Evaluate the SVI objective on the test set; return the average loss."""
    model.eval()
    loss_meter = AverageMeter()
    progress = tqdm(total=len(test_loader))
    with torch.no_grad():
        for _, response, _, mask in test_loader:
            batch_size = response.size(0)
            response, mask = response.to(device), mask.long().to(device)
            # evaluate_loss's return value is used as-is (no .item()).
            batch_loss = svi.evaluate_loss(response, mask)
            loss_meter.update(batch_loss, batch_size)
            progress.update()
            progress.set_postfix({'Loss': loss_meter.avg})
    progress.close()
    print('====> Test Epoch: {} Loss: {:.4f}'.format(epoch, loss_meter.avg))
    return loss_meter.avg
def train(epoch):
    """One epoch of SVI training with KL annealing; return the average loss."""
    model.train()
    loss_meter = AverageMeter()
    progress = tqdm(total=len(train_loader))
    for step, (_, response, _, mask) in enumerate(train_loader):
        batch_size = response.size(0)
        response, mask = response.to(device), mask.long().to(device)
        anneal = get_annealing_factor(epoch, step)
        # svi.step performs the gradient update and returns the loss value.
        batch_loss = svi.step(response, mask, anneal)
        loss_meter.update(batch_loss, batch_size)
        progress.update()
        progress.set_postfix({'Loss': loss_meter.avg})
    progress.close()
    print('====> Train Epoch: {} Loss: {:.4f}'.format(
        epoch, loss_meter.avg))
    return loss_meter.avg
def test(epoch):
    """Evaluate the model with masked BCE on the test set; return avg loss.

    BUGFIX: the loss previously averaged the unmasked BCE over every
    element, while the matching train loop computes elementwise BCE and
    multiplies by `mask` before averaging — so -1 placeholder responses
    (invalid BCE targets) corrupted the reported test loss. The masked
    form used here matches training.
    """
    model.eval()
    test_loss = AverageMeter()
    pbar = tqdm(total=len(test_loader))
    with torch.no_grad():
        for index, response, _, mask in test_loader:
            mb = response.size(0)
            index = index.to(device)
            response = response.to(device)
            mask = mask.long().to(device)
            response_mu = model(index, response, mask)
            loss = F.binary_cross_entropy(response_mu, response.float(),
                                          reduction='none')
            loss = (loss * mask).mean()
            test_loss.update(loss.item(), mb)
            pbar.update()
            pbar.set_postfix({'Loss': test_loss.avg})
    pbar.close()
    print('====> Test Epoch: {} Loss: {:.4f}'.format(epoch, test_loss.avg))
    return test_loss.avg
def step(self):
    """LARS-style step: rescale each parameter's gradient by an adaptive
    trust ratio, then delegate to the wrapped optimizer.

    Weight decay is temporarily zeroed on each group so it can be folded
    into the gradient here instead of being applied twice. Returns the
    AverageMeter of adaptive learning rates for monitoring.
    """
    with torch.no_grad():
        stats = AverageMeter()
        weight_decays = []
        for group in self.optim.param_groups:
            # -- takes weight decay control from wrapped optimizer
            weight_decay = group[
                'weight_decay'] if 'weight_decay' in group else 0
            weight_decays.append(weight_decay)
            # -- user wants to exclude this parameter group from LARS
            # adaptation
            if ('LARS_exclude' in group) and group['LARS_exclude']:
                continue
            group['weight_decay'] = 0
            for p in group['params']:
                if p.grad is None:
                    continue
                param_norm = torch.norm(p.data)
                grad_norm = torch.norm(p.grad.data)
                # Skip the rescale when either norm is zero (nothing to
                # adapt); decay and scaling apply only inside this branch.
                if param_norm != 0 and grad_norm != 0:
                    adaptive_lr = self.trust_coefficient * (param_norm) / (
                        grad_norm + param_norm * weight_decay + self.eps)
                    stats.update(adaptive_lr)
                    # Fold weight decay into the gradient, then scale.
                    p.grad.data += weight_decay * p.data
                    p.grad.data *= adaptive_lr
    self.optim.step()
    # -- return weight decay control to wrapped optimizer
    for i, group in enumerate(self.optim.param_groups):
        group['weight_decay'] = weight_decays[i]
    return stats
def train(epoch):
    """One training epoch over multiple planes with context minibatches.

    NOTE(review): `z_i` is not defined in this function — the objective
    call raises NameError unless a module-level `z_i` exists; confirm the
    intended latent argument. The printed loss is negated, suggesting
    loss_meter tracks a (negative) objective to be reported as a bound.
    """
    model.train()
    loss_meter = AverageMeter()
    for batch_idx, data_list in enumerate(train_loader):
        x_list = [data[0] for data in data_list]
        batch_size = len(x_list[0])
        loss = 0
        for i in range(n_planes):
            x_i = x_list[i]
            x_i = x_i.to(device)
            # Sample a context minibatch for plane i and concatenate its
            # inputs and latents along the feature dimension.
            context_x_i, context_z_i = sample_minibatch(
                train_datasets[i], batch_size, args.n_mlp_samples)
            context_x_i = context_x_i.to(device)
            context_z_i = context_z_i.to(device)
            context_x_z_i = torch.cat([context_x_i, context_z_i], dim=2)
            z_mu_i, z_logvar_i = model(x_i, context_x_z_i)
            loss_i = compiled_inference_objective(z_i, z_mu_i, z_logvar_i)
            loss += loss_i
        loss_meter.update(loss.item(), batch_size)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if batch_idx % args.log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * batch_size, len(train_loader.dataset),
                100. * batch_idx / len(train_loader), -loss_meter.avg))
    print('====> Train Epoch: {}\tLoss: {:.4f}'.format(
        epoch, -loss_meter.avg))
    return loss_meter.avg
def train(epoch):
    """One training epoch of the compiled-inference objective.

    NOTE(review): `z` is not defined in this function — the objective call
    raises NameError unless a module-level `z` exists; confirm the
    intended latent argument. Also note the interval print negates the
    average loss while the epoch-end print does not (the sibling
    multi-plane train negates both) — confirm which sign is intended.
    """
    model.train()
    loss_meter = AverageMeter()
    for batch_idx, data in enumerate(train_loader):
        batch_size = data.size(0)
        data = data.to(device)
        z_mu, z_logvar = model(data)
        loss = compiled_inference_objective(z, z_mu, z_logvar)
        loss_meter.update(loss.item(), batch_size)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if batch_idx % args.log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * batch_size, len(train_loader.dataset),
                100. * batch_idx / len(train_loader), -loss_meter.avg))
    print('====> Train Epoch: {}\tLoss: {:.4f}'.format(
        epoch, loss_meter.avg))
    return loss_meter.avg
def val_epoch(epoch, data_loader, model, criterion, opt, logger):
    """Run one validation epoch, print per-batch stats, log epoch averages.

    Returns the average accuracy over the epoch.

    BUGFIX: `targets.cuda(async=True)` is a SyntaxError on Python 3.7+
    (`async` became a keyword); PyTorch 0.4 renamed the argument to
    `non_blocking`. The removed `Variable(..., volatile=True)` inference
    flag is replaced by the equivalent `torch.no_grad()` context.
    """
    print('\t************** VALIDATION **************')
    model.eval()
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    accuracies = AverageMeter()
    end_time = time.time()
    with torch.no_grad():
        for i, (inputs, targets) in enumerate(data_loader):
            data_time.update(time.time() - end_time)
            if not opt.no_cuda:
                targets = targets.cuda(non_blocking=True)
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            acc = calculate_accuracy(outputs, targets)
            losses.update(loss.item(), inputs.size(0))
            accuracies.update(acc, inputs.size(0))
            batch_time.update(time.time() - end_time)
            end_time = time.time()
            print('\tBatch: [{0}/{1}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Acc {acc.val:.3f} ({acc.avg:.3f})'.format(
                      i + 1, len(data_loader), batch_time=batch_time,
                      data_time=data_time, loss=losses, acc=accuracies))
    logger.log({'epoch': epoch, 'loss': losses.avg, 'acc': accuracies.avg})
    return accuracies.avg
def train(train_loader, model, optimizer, epoch, lr_schedule, queue, args):
    """One SwAV-style training epoch using a class-wise EMA feature queue.

    Per iteration: set the scheduled LR, L2-normalize the prototypes,
    update per-class EMA features in `queue` (all-reduced across workers),
    derive soft cluster targets from the queue via sinkhorn, and minimize
    cross-entropy between those targets and each crop's softmax scores.
    Returns ((epoch, avg loss), queue).
    """
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    softmax = nn.Softmax(dim=1).cuda()
    model.train()
    end = time.time()
    for it, inputs in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        # update learning rate
        iteration = epoch * len(train_loader) + it
        for param_group in optimizer.param_groups:
            param_group["lr"] = lr_schedule[iteration]
        # normalize the prototypes
        with torch.no_grad():
            w = model.module.prototypes.weight.data.clone()
            w = nn.functional.normalize(w, dim=1, p=2)
            model.module.prototypes.weight.copy_(w)
        # ============ data split ===========
        inputs, target = inputs
        # ============ multi-res forward passes ... ============
        embedding, output = model(inputs)
        embedding = embedding.detach()
        bs = inputs[0].size(0)
        # ============ EMA class-wise feature vector ==========
        # Each class row blends in the mean of the sample's two crop
        # embeddings (indices b and bs + b — assumes exactly two global
        # crops per sample; TODO confirm).
        for b in range(bs):
            queue[target[b]] = queue[target[b]] * 0.99 + (
                embedding[b] + embedding[bs + b]) * 0.01 / 2
        queue = nn.functional.normalize(queue, dim=1, p=2)
        dist.all_reduce(queue)
        queue /= args.world_size
        queue = nn.functional.normalize(queue, dim=1, p=2)
        # ============ swav loss ... ============
        loss = 0
        with torch.no_grad():
            # Queue-vs-prototype scores become sinkhorn targets.
            q = torch.mm(queue, model.module.prototypes.weight.t())
            q = q / args.epsilon
            if args.improve_numerical_stability:
                # Subtract the global max before exponentiating.
                M = torch.max(q)
                dist.all_reduce(M, op=dist.ReduceOp.MAX)
                q -= M
            q = torch.exp(q).t()
            q = sinkhorn(q, args.sinkhorn_iterations)
            # q = distributed_sinkhorn(q, args.sinkhorn_iterations)
            # match q /w label (1000, num_p) --> (bsz, num_p)
            for b in range(bs):
                if b == 0:
                    matched_q = q[target[b]].unsqueeze(0)
                else:
                    matched_q = torch.cat([matched_q,
                                           q[target[b]].unsqueeze(0)], 0)
        # cluster assignment prediction
        subloss = 0
        for v in np.arange(np.sum(args.nmb_crops)):
            p = softmax(output[bs * v:bs * (v + 1)] / args.temperature)
            subloss -= torch.mean(torch.sum(matched_q * torch.log(p), dim=1))
        loss += subloss / np.sum(args.nmb_crops)
        # ============ backward and optim step ... ============
        optimizer.zero_grad()
        if args.use_fp16:
            with apex.amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()
        # cancel some gradients
        if iteration < args.freeze_prototypes_niters:
            for name, p in model.named_parameters():
                if "prototypes" in name:
                    p.grad = None
        optimizer.step()
        # ============ misc ... ============
        losses.update(loss.item(), inputs[0].size(0))
        batch_time.update(time.time() - end)
        end = time.time()
        if args.rank == 0 and it % 50 == 0:
            logger.info("Epoch: [{0}][{1}]\t"
                        "Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t"
                        "Data {data_time.val:.3f} ({data_time.avg:.3f})\t"
                        "Loss {loss.val:.4f} ({loss.avg:.4f})\t"
                        "Lr: {lr:.4f}".format(
                            epoch,
                            it,
                            batch_time=batch_time,
                            data_time=data_time,
                            loss=losses,
                            lr=optimizer.optim.param_groups[0]["lr"],
                        ))
    return (epoch, losses.avg), queue
def train_loc_model(model, data_loaders, optimizer, scheduler, seg_loss, num_epochs, weight_dir, snapshot_name, log_dir, best_score=0):
    """Train the building-localization model with TensorBoard logging.

    Every 2nd epoch runs validation (dice at threshold 0.5) and saves a
    checkpoint when the validation dice improves. Returns the best dice.

    NOTE(review): `scheduler` is only read via get_lr() for the progress
    text and is never stepped here — confirm whether stepping happens
    outside this function.
    """
    writer = SummaryWriter(log_dir + 'localization')
    print('Tensorboard is recording into folder: ' + log_dir + 'localization')
    torch.cuda.empty_cache()
    for epoch in range(num_epochs):
        losses = AverageMeter()
        dices = AverageMeter()
        iterator = data_loaders['train']
        iterator = tqdm(iterator)
        model.train()
        for i, sample in enumerate(iterator):
            imgs = sample["img"].cuda(non_blocking=True)
            msks = sample["msk"].cuda(non_blocking=True)
            out = model(imgs)
            loss = seg_loss(out, msks)
            # Track a soft dice score on channel 0 for monitoring only.
            with torch.no_grad():
                _probs = torch.sigmoid(out[:, 0, ...])
                dice_sc = 1 - dice_round(_probs, msks[:, 0, ...])
            losses.update(loss.item(), imgs.size(0))
            dices.update(dice_sc, imgs.size(0))
            iterator.set_description("Epoch {}/{}, lr {:.7f}; Loss {loss.val:.4f} ({loss.avg:.4f}); Dice {dice.val:.4f} ({dice.avg:.4f})".format(
                epoch, num_epochs, scheduler.get_lr()[-1], loss=losses, dice=dices))
            optimizer.zero_grad()
            loss.backward()
            # NOTE(review): unusually tight clip value (0.999) — confirm.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.999)
            optimizer.step()
        writer.add_scalar('Train/Loss', losses.avg, epoch)
        writer.add_scalar('Train/Dice', dices.avg, epoch)
        writer.flush()
        # Validate every other epoch.
        if epoch % 2 == 0:
            torch.cuda.empty_cache()
            model = model.eval()
            dices0 = []
            _thr = 0.5
            iterator = data_loaders['val']
            iterator = tqdm(iterator)
            with torch.no_grad():
                for i, sample in enumerate(iterator):
                    msks = sample["msk"].numpy()
                    imgs = sample["img"].cuda(non_blocking=True)
                    out = model(imgs)
                    msk_pred = torch.sigmoid(out[:, 0, ...]).cpu().numpy()
                    for j in range(msks.shape[0]):
                        dices0.append(dice(msks[j, 0], msk_pred[j] > _thr))
            d = np.mean(dices0)
            writer.add_scalar('Val/Dice', d, epoch)
            writer.flush()
            print("Val Dice: {}".format(d))
            if d > best_score:
                best_score = d
                torch.save({
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'best_score': d,
                }, path.join(weight_dir, snapshot_name + '_best'))
            print("score: {}\tscore_best: {}".format(d, best_score))
    writer.close()
    return best_score
def train_cls_model(model, data_loaders, optimizer, scheduler, seg_loss, ce_loss, num_epochs, weight_dir, snapshot_name, log_dir, best_score=0):
    """Train the damage-classification model and validate every 2nd epoch.

    The combined validation score is 0.3 * localization dice +
    0.7 * harmonic-mean damage F1; a checkpoint is saved when it improves.
    Returns the best score seen.

    BUGFIXES vs the original:
    - 'Train/Cls_loss' now logs loss5 (cross-entropy); it previously
      logged loss4 (the Destroyed seg loss) a second time.
    - The Val F1 scalar tags were shifted by one (overall f1 logged under
      NoDamage_F1, each class under the next class's tag); each tag now
      matches its value, consistent with the print statement.
    """
    torch.cuda.empty_cache()
    writer = SummaryWriter(log_dir + 'classification')
    print('Tensorboard is recording into folder: ' + log_dir + 'classification')
    for epoch in range(num_epochs):
        losses = AverageMeter()
        dices = AverageMeter()
        iterator = tqdm(data_loaders['train'])
        model.train()
        for i, sample in enumerate(iterator):
            imgs = sample["img"].cuda(non_blocking=True)
            msks = sample["msk"].cuda(non_blocking=True)
            lbl_msk = sample["lbl_msk"].cuda(non_blocking=True)
            out = model(imgs)
            # Per-channel seg losses: 0 = localization, 1..4 = damage levels.
            loss_loc = seg_loss(out[:, 0, ...], msks[:, 0, ...])
            loss1 = seg_loss(out[:, 1, ...], msks[:, 1, ...])
            loss2 = seg_loss(out[:, 2, ...], msks[:, 2, ...])
            loss3 = seg_loss(out[:, 3, ...], msks[:, 3, ...])
            loss4 = seg_loss(out[:, 4, ...], msks[:, 4, ...])
            loss5 = ce_loss(out, lbl_msk)
            loss = 0.1 * loss_loc + 0.1 * loss1 + 0.3 * loss2 + 0.3 * loss3 + 0.2 * loss4 + loss5 * 11
            # Soft dice on the localization channel, for monitoring only.
            with torch.no_grad():
                _probs = torch.sigmoid(out[:, 0, ...])
                dice_sc = 1 - dice_round(_probs, msks[:, 0, ...])
            losses.update(loss.item(), imgs.size(0))
            dices.update(dice_sc, imgs.size(0))
            iterator.set_description("Epoch {}/{}, lr {:.7f}; Loss {loss.val:.4f} ({loss.avg:.4f}); Dice {dice.val:.4f} ({dice.avg:.4f})".format(
                epoch, num_epochs, scheduler.get_lr()[-1], loss=losses, dice=dices))
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.999)
            optimizer.step()
        writer.add_scalar('Train/Loss', losses.avg, epoch)
        writer.add_scalar('Train/Dice', dices.avg, epoch)
        writer.add_scalar('Train/Loc_loss', loss_loc, epoch)
        writer.add_scalar('Train/NoDamage_loss', loss1, epoch)
        writer.add_scalar('Train/MinorDamage_loss', loss2, epoch)
        writer.add_scalar('Train/MajorDamage_loss', loss3, epoch)
        writer.add_scalar('Train/Destroyed_loss', loss4, epoch)
        writer.add_scalar('Train/Cls_loss', loss5, epoch)
        writer.flush()
        # Validate every other epoch.
        if epoch % 2 == 0:
            torch.cuda.empty_cache()
            model = model.eval()
            dices0 = []
            # Per-class pixel counts for F1 over the 4 damage classes.
            tp = np.zeros((4,))
            fp = np.zeros((4,))
            fn = np.zeros((4,))
            _thr = 0.3
            iterator = tqdm(data_loaders['val'])
            with torch.no_grad():
                for i, sample in enumerate(iterator):
                    msks = sample["msk"].numpy()
                    lbl_msk = sample["lbl_msk"].numpy()
                    imgs = sample["img"].cuda(non_blocking=True)
                    out = model(imgs)
                    msk_pred = torch.sigmoid(out[:, 0, ...]).cpu().numpy()
                    msk_damage_pred = torch.sigmoid(out).cpu().numpy()[:, 1:, ...]
                    for j in range(msks.shape[0]):
                        dices0.append(dice(msks[j, 0], msk_pred[j] > _thr))
                        # Evaluate damage class only inside true buildings.
                        targ = lbl_msk[j][msks[j, 0] > 0]
                        pred = msk_damage_pred[j].argmax(axis=0)
                        pred = pred * (msk_pred[j] > _thr)
                        pred = pred[msks[j, 0] > 0]
                        for c in range(4):
                            tp[c] += np.logical_and(pred == c, targ == c).sum()
                            fn[c] += np.logical_and(pred != c, targ == c).sum()
                            fp[c] += np.logical_and(pred == c, targ != c).sum()
            d0 = np.mean(dices0)
            f1_sc = np.zeros((4,))
            for c in range(4):
                f1_sc[c] = 2 * tp[c] / (2 * tp[c] + fp[c] + fn[c])
            # Harmonic mean of the per-class F1 scores.
            f1 = 4 / np.sum(1.0 / (f1_sc + 1e-6))
            sc = 0.3 * d0 + 0.7 * f1
            print("Val Score: {}, Dice: {}, F1: {}, F1_no-damage: {}, F1_minor-damage: {}, F1_major-damage: {}, F1_destroyed: {}".format(
                sc, d0, f1, f1_sc[0], f1_sc[1], f1_sc[2], f1_sc[3]))
            writer.add_scalar('Val/Score', sc, epoch)
            writer.add_scalar('Val/Dice', d0, epoch)
            writer.add_scalar('Val/NoDamage_F1', f1_sc[0], epoch)
            writer.add_scalar('Val/MinorDamage_F1', f1_sc[1], epoch)
            writer.add_scalar('Val/MajorDamage_F1', f1_sc[2], epoch)
            writer.add_scalar('Val/Destroyed_F1', f1_sc[3], epoch)
            writer.add_scalar('Val/Cls_F1', f1, epoch)
            writer.flush()
            if sc > best_score:
                torch.save({
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'best_score': sc,
                }, path.join(weight_dir, snapshot_name + '_best'))
                best_score = sc
            print("score: {}\tscore_best: {}".format(sc, best_score))
    writer.close()
    return best_score
def train(loader, model, optimizer, epoch, schedule, local_memory_index, local_memory_embeddings):
    """One DeepCluster-v2 training epoch.

    Clusters the memory bank into per-head assignments, then minimizes
    cross-entropy between each head's prototype scores and its (crop-
    tiled) assignments, while refreshing the memory bank with the new
    embeddings. Returns ((epoch, avg loss), index bank, embedding bank).
    """
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    model.train()
    cross_entropy = nn.CrossEntropyLoss(ignore_index=-100)
    # Recompute cluster assignments from the memory bank for this epoch.
    assignments = cluster_memory(model, local_memory_index,
                                 local_memory_embeddings, len(loader.dataset))
    logger.info('Clustering for epoch {} done.'.format(epoch))
    end = time.time()
    start_idx = 0
    for it, (idx, inputs) in enumerate(loader):
        # measure data loading time
        data_time.update(time.time() - end)
        # update learning rate
        iteration = epoch * len(loader) + it
        for param_group in optimizer.param_groups:
            param_group["lr"] = schedule[iteration]
        # ============ multi-res forward passes ... ============
        emb, output = model(inputs)
        emb = emb.detach()
        bs = inputs[0].size(0)
        # ============ deepcluster-v2 loss ... ============
        loss = 0
        for h in range(len(args.nmb_prototypes)):
            scores = output[h] / args.temperature
            # Targets tiled once per crop to match the stacked crop scores.
            targets = assignments[h][idx].repeat(sum(
                args.nmb_crops)).cuda(non_blocking=True)
            loss += cross_entropy(scores, targets)
        loss /= len(args.nmb_prototypes)
        # ============ backward and optim step ... ============
        optimizer.zero_grad()
        loss.backward()
        # cancel some gradients
        if iteration < args.freeze_prototypes_niters:
            for name, p in model.named_parameters():
                if "prototypes" in name:
                    p.grad = None
        optimizer.step()
        # ============ update memory banks ... ============
        local_memory_index[start_idx:start_idx + bs] = idx
        for i, crop_idx in enumerate(args.crops_for_assign):
            local_memory_embeddings[i][start_idx : start_idx + bs] = \
                emb[crop_idx * bs : (crop_idx + 1) * bs]
        start_idx += bs
        # ============ misc ... ============
        losses.update(loss.item(), inputs[0].size(0))
        batch_time.update(time.time() - end)
        end = time.time()
        if args.rank == 0 and it % 50 == 0:
            logger.info("Epoch: [{0}][{1}]\t"
                        "Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t"
                        "Data {data_time.val:.3f} ({data_time.avg:.3f})\t"
                        "Loss {loss.val:.4f} ({loss.avg:.4f})\t"
                        "Lr: {lr:.4f}".format(
                            epoch,
                            it,
                            batch_time=batch_time,
                            data_time=data_time,
                            loss=losses,
                            lr=optimizer.optim.param_groups[0]["lr"],
                        ))
    return (epoch, losses.avg), local_memory_index, local_memory_embeddings