def extract_feature(model, data_loader, use_gpu, print_freq=1, metric=None):
    """Extract features for every sample in a query/gallery loader.

    :param model: the model that has been trained; called as ``model(imgs)``
        and expected to return one feature vector per image (e.g. resnet50 f)
    :param data_loader: query_loader / gallery_loader yielding
        ``(imgs, pids, camids)`` batches
    :param use_gpu: delivered from main.py; moves image batches to CUDA
    :param print_freq: default 1 (currently unused)
    :param metric: currently unused placeholder
    :return: feature (N x D torch tensor) & pids (N ndarray) & camids (N ndarray)
    """
    model.eval()
    batch_time = AverageMeter()
    features, pids, camids = [], [], []
    # FIX: the original reused the names ``features``/``pids``/``camids`` for
    # the per-batch values, clobbering the accumulator lists (and
    # ``features.append(features)`` appended a tensor to itself). Distinct
    # per-batch names make the accumulation actually work.
    for batch_idx, (imgs, batch_pids, batch_camids) in enumerate(data_loader):
        if use_gpu:
            imgs = imgs.cuda()
        end = time.time()
        batch_features = model(imgs)  # resnet50: f
        batch_time.update(time.time() - end)
        features.append(batch_features.data.cpu())
        pids.extend(batch_pids)
        camids.extend(batch_camids)
    features = torch.cat(features, 0)
    pids = np.asarray(pids)
    camids = np.asarray(camids)
    # NOTE: ``data_loader`` is printed via its repr; presumably identifies the
    # query/gallery split — confirm against the loader's __repr__.
    print("Extracted features for {} set, obtained {}-by-{} matrix".format(
        data_loader, features.size(0), features.size(1)))
    return features, pids, camids
def train(train_loader, model, criterion, optimizer, epoch):
    """Run one training epoch of the referring-expression grounding model.

    Uses module-level ``opt`` (config dict) and ``args`` (CLI namespace).
    ``criterion`` returns ``(loss, score)`` where ``score`` are per-box
    matching scores.
    """
    global opt
    losses = AverageMeter()
    # switch to train mode
    model.train()
    criterion.train()
    step = epoch * len(train_loader)
    pred_gt_same = []  # 1/0 per sample: predicted box index == ground-truth index
    for i, (box, cls, feature, lfeat, lrel, sents, sents_gt, gt_boxes,
            img_ids, sent_ids) in enumerate(train_loader):
        step += 1
        if opt['gpus'] is not None:
            box = box.cuda()
            cls = cls.cuda()
            feature = feature.cuda()
            lfeat = lfeat.cuda()
            lrel = lrel.cuda()
            sents = sents.cuda()
            sents_gt = sents_gt.cuda()
        # compute output
        score = model(feature, cls, lfeat, lrel, sents)
        loss, score = criterion(score, box, cls, sents_gt)
        losses.update(loss.item())
        cls = to_numpy(cls)
        final_score = to_numpy(score.detach())
        # mask padded boxes (cls == -1) so argmax never selects them
        final_score[cls == -1] = -999
        pred_ind = np.argmax(final_score, 1)
        sents_gt = to_numpy(sents_gt)
        for j in range(pred_ind.size):
            if sents_gt[j] == pred_ind[j]:
                pred_gt_same.append(1)
            else:
                pred_gt_same.append(0)
        # compute gradient and do Adam step
        optimizer.zero_grad()
        loss.backward()
        clip_gradient(optimizer, opt['grad_clip'])
        optimizer.step()
        if i % args.print_freq == 0:
            if i != 0:
                # precision over the most recent print_freq batches
                same = np.sum(
                    pred_gt_same[-args.print_freq * opt['batch_size']:]) / float(
                        args.print_freq * opt['batch_size'])
                print('Epoch: [{0}][{1}/{2}]\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                      'Prec {same:.4f}'.format(epoch, i, len(train_loader),
                                               loss=losses, same=same))
def train_model_for_epoch(model, train_dataloader, loss_fn, optimizer, epoch): """ train model for a epoch :param model: :param train_dataloader: :param loss_fn: :param optimizer: :return: """ # prepared model.train() num_iter = len(train_dataloader) if cfg.max_iter <= 0 else min(len(train_dataloader), cfg.max_iter) loss_states = loss_fn.get_loss_states() data_time, batch_time = AverageMeter(), AverageMeter() running_loss = AverageMeter() avg_loss_states = {l: AverageMeter() for l in loss_states} start = time.time() last = time.time() phase = "train" # foreach the images for iter_id, train_data in enumerate(train_dataloader): if iter_id >= num_iter: break # load data time data_time.update(time.time() - last) inputs, targets, infos = train_data # to device inputs = inputs.to(device) # forward the models and loss outputs = model(inputs) loss, loss_stats = loss_fn(outputs, targets) # update the weights optimizer.zero_grad() loss.backward() optimizer.step() # network time and update time batch_time.update(time.time() - last) last = time.time() # handle the log and accumulate the loss # logger.open_summary_writer() log_item = '{phase} per epoch: [{0}][{1}/{2}]|Tot: {total:} '.format( epoch, iter_id, num_iter, phase=phase, total=last - start) for l in avg_loss_states: if l in loss_stats: avg_loss_states[l].update( loss_stats[l].item(), inputs.size(0)) log_item = log_item + '|{}:{:.4f}'.format(l, avg_loss_states[l].avg) # logger.scalar_summary('{phase}/epoch/{}'.format(l, phase=phase), avg_loss_states[l].avg, epoch* num_iter + iter_id) # logger.close_summary_writer() running_loss.update(loss.item(), inputs.size(0)) log_item = log_item + '|Data {dt.val:.3f}s({dt.avg:.3f}s) ' \ '|Net {bt.avg:.3f}s'.format(dt=data_time, bt=batch_time) logger.write(log_item, level=1) del inputs, loss torch.cuda.empty_cache() if (iter_id + 1) % cfg.save_span == 0: executor.submit(save_checkpoint, model.state_dict(), epoch, running_loss.avg, data_cfg.save_dir, iter_id) return running_loss, 
avg_loss_states
def validate(val_loader, model, criterion, epoch=-1):
    """Evaluate the grounding model on the validation set.

    Mirrors ``train``: uses module-level ``opt``/``args``; ``criterion``
    returns ``(loss, score)``.
    :return: (average loss, overall precision across the set)
    """
    global opt
    losses = AverageMeter()
    # switch to eval mode
    model.eval()
    criterion.eval()
    pred_gt_same = []  # 1/0 per sample: predicted box index == ground-truth index
    with torch.no_grad():
        for i, (box, cls, feature, lfeat, lrel, sents, sents_gt, gt_boxes,
                img_ids, sent_ids) in enumerate(val_loader):
            if opt['gpus'] is not None:
                box = box.cuda()
                cls = cls.cuda()
                feature = feature.cuda()
                lfeat = lfeat.cuda()
                lrel = lrel.cuda()
                sents = sents.cuda()
                sents_gt = sents_gt.cuda()
            # compute output
            score = model(feature, cls, lfeat, lrel, sents)
            loss, score = criterion(score, box, cls, sents_gt)
            losses.update(loss.item())
            cls = to_numpy(cls)
            final_score = to_numpy(score.detach())
            # mask padded boxes (cls == -1) so argmax never selects them
            final_score[cls == -1] = -999
            pred_ind = np.argmax(final_score, 1)
            sents_gt = to_numpy(sents_gt)
            for j in range(pred_ind.size):
                if sents_gt[j] == pred_ind[j]:
                    pred_gt_same.append(1)
                else:
                    pred_gt_same.append(0)
            if i % args.print_freq == 0:
                if i != 0:
                    # precision over the most recent print_freq batches
                    same = np.sum(
                        pred_gt_same[-args.print_freq * opt['batch_size']:]) / float(
                            args.print_freq * opt['batch_size'])
                    print('Epoch: [{0}][{1}/{2}]\t'
                          'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                          'Prec {same:.4f}'.format(epoch, i, len(val_loader),
                                                   loss=losses, same=same))
    # overall precision over every validation sample
    same = np.sum(pred_gt_same) / float(len(pred_gt_same))
    print('Epoch: [{0}]\t'
          'Loss {1:.4f}\t'
          'Prec {2:.4f}'.format(epoch, losses.avg, same))
    return losses.avg, same
def train(self, train_loader, val_loader, optimizer, lr_scheduler, tb_logger):
    """Joint weight/architecture training loop (DMCP search phase).

    ``lr_scheduler`` is a pair: index 0 for the weight optimizer, index 1
    for the architecture parameters. After all epochs, a concrete model is
    sampled from the learned architecture distribution.
    """
    print_freq = self.config.logging.print_freq
    batch_time = AverageMeter(print_freq)
    data_time = AverageMeter(print_freq)
    # three meters each — presumably one per sampled width (min/max/random,
    # as in the sibling slimmable trainer) — confirm in _train_one_batch
    loss_meter = [AverageMeter(print_freq) for _ in range(3)]
    top1_meter = [AverageMeter(print_freq) for _ in range(3)]
    top5_meter = [AverageMeter(print_freq) for _ in range(3)]
    # track stats of architecture parameter
    arch_loss_meter = AverageMeter(print_freq)
    floss_meter = AverageMeter(print_freq)
    eflops_meter = AverageMeter(print_freq)
    arch_top1_meter = AverageMeter(print_freq)
    meters = [
        top1_meter, top5_meter, loss_meter, arch_loss_meter, floss_meter,
        eflops_meter, arch_top1_meter, data_time
    ]
    criterions = self._get_criterion()
    self._sample_width()
    end = time.time()
    for e in range(self.cur_epoch, self.config.training.epoch):
        # train
        self.model.train()
        if self.config.distributed.enable:
            # reshuffle shards per epoch under DistributedSampler
            train_loader.sampler.set_epoch(e)
        for batch_idx, (x, y) in enumerate(train_loader):
            x, y = x.cuda(), y.cuda()
            self._train_one_batch(x, y, optimizer, lr_scheduler, meters,
                                  criterions, end)
            batch_time.update(time.time() - end)
            end = time.time()
            cur_lr = lr_scheduler[0].get_lr()[0]
            cur_arch_lr = lr_scheduler[1].get_lr()[0]
            # logging
            self._logging(tb_logger, e, batch_idx, len(train_loader),
                          meters + [batch_time], [cur_lr, cur_arch_lr])
        # validation
        self.validate(val_loader, train_loader, self.config.validation.width,
                      tb_logger=tb_logger)
        self.save(optimizer, e)
    # sample model
    self._info('sampling model...')
    dmcp_utils.sample_model(self.config, self.model)
    self._info('draw layer flops distribution...')
    dmcp_utils.layer_flops_distribution(self.config, self.model)
def train(self, train_loader, val_loader, optimizer, lr_scheduler, tb_logger):
    """Fixed-architecture training loop with step-based validation.

    Validates every ``val_freq`` steps once ``start_val`` is reached and
    keeps a copy of the best checkpoint at ``<save_path>/best/best.pth``.
    """
    print_freq = self.config.logging.print_freq
    flops = calc_adaptive_model_flops(self.model,
                                      self.config.dataset.input_size)
    params = calc_model_parameters(self.model)
    self._info('flops: {}, params: {}'.format(flops, params))
    # meters
    batch_time = AverageMeter(print_freq)
    data_time = AverageMeter(print_freq)
    loss_meter = AverageMeter(print_freq)
    top1_meter = AverageMeter(print_freq)
    top5_meter = AverageMeter(print_freq)
    meters = [top1_meter, top5_meter, loss_meter, data_time]
    criterion = self._get_criterion()
    end = time.time()
    for e in range(self.cur_epoch, self.config.training.epoch):
        # train
        if self.config.distributed.enable:
            # reshuffle shards per epoch under DistributedSampler
            train_loader.sampler.set_epoch(e)
        for batch_idx, (x, y) in enumerate(train_loader):
            # re-assert train mode each batch: validate() below flips to eval
            self.model.train()
            x, y = x.cuda(), y.cuda()
            self._train_one_batch(x, y, optimizer, lr_scheduler, meters,
                                  [criterion], end)
            batch_time.update(time.time() - end)
            end = time.time()
            cur_lr = lr_scheduler.get_lr()[0]
            self._logging(tb_logger, e, batch_idx, len(train_loader),
                          meters + [batch_time], cur_lr)
            # validation
            if self.cur_step >= self.config.validation.start_val \
                    and self.cur_step % self.config.validation.val_freq == 0:
                best_top1 = self.best_top1
                self.validate(val_loader, tb_logger=tb_logger)
                save_file = self.save(optimizer, e, best_top1=self.best_top1)
                # validate() raised self.best_top1 if this step improved it;
                # copy the just-saved checkpoint as the new best
                if self.best_top1 > best_top1:
                    from shutil import copyfile
                    best_file_dir = os.path.join(self.config.save_path, 'best')
                    if not os.path.exists(best_file_dir):
                        os.makedirs(best_file_dir)
                    best_file = os.path.join(best_file_dir, 'best.pth')
                    copyfile(save_file, best_file)
def validate(val_loader, model, criterion, rank, args, logger, cfg):
    """Distributed validation pass.

    :param val_loader: yields ``(data, label, _, meta)``; ``data`` may be a
        single tensor or a list of tensors (multi-pathway input)
    :param rank: local GPU rank the batch is moved to
    :param args: needs ``nprocs`` (world size) and ``print_freq``
    :param cfg: ``cfg.DETECTION.ENABLE`` switches to box-conditioned forward
    :return: all-reduced top-1 average over the whole set
    """
    batch_time = AverageMeter('Time', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    progress = ProgressMeter(len(val_loader), [batch_time, losses, top1, top5],
                             prefix='Test: ')

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        end = time.time()
        # renamed loop variable: ``iter`` shadowed the builtin
        for batch_idx, valdata in enumerate(val_loader):
            data, label, _, meta = valdata
            if isinstance(data, (list,)):
                for i in range(len(data)):
                    data[i] = data[i].cuda(rank)
                batch_size = data[0].size(0)
            else:
                data = data.cuda(rank)
                # FIX: was data[0].size(0), which for a plain tensor indexes
                # the first *sample* and returns its leading dim, not the
                # batch size.
                batch_size = data.size(0)
            label = label.cuda(rank)
            for key, val in meta.items():
                if isinstance(val, (list,)):
                    for i in range(len(val)):
                        val[i] = val[i].cuda(rank)
                else:
                    meta[key] = val.cuda(rank)

            # compute output
            if cfg.DETECTION.ENABLE:
                output = model(data, meta["boxes"])
            else:
                output = model(data)
            loss = criterion(output, label)

            # measure accuracy and record loss, averaged across processes
            acc1, acc5 = topks_correct(output, label, (1, 5))
            # torch.distributed.barrier()
            reduced_loss = reduce_mean(loss, args.nprocs)
            reduced_acc1 = reduce_mean(acc1, args.nprocs)
            reduced_acc5 = reduce_mean(acc5, args.nprocs)

            losses.update(reduced_loss.item(), batch_size)
            top1.update(reduced_acc1.item(), batch_size)
            top5.update(reduced_acc5.item(), batch_size)

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if batch_idx % args.print_freq == 0:
                val_message = progress.display(batch_idx)
                logger.info('Val Phase:' + val_message)

    # TODO: this should also be done with the ProgressMeter
    logger.info(' * Val Acc@1 {top1.avg:.3f} Val Acc@5 {top5.avg:.3f}'.format(
        top1=top1, top5=top5))
    return top1.avg
def validate(self, val_loader, tb_logger=None):
    """Distributed validation: per-rank evaluation followed by all-reduce.

    Averages loss/top-1/top-5 over the global sample count, updates
    ``self.best_top1`` on the master rank, and logs to TensorBoard.
    """
    batch_time = AverageMeter(0)
    loss_meter = AverageMeter(0)
    top1_meter = AverageMeter(0)
    top5_meter = AverageMeter(0)
    self.model.eval()
    criterion = nn.CrossEntropyLoss()
    end = time.time()
    with torch.no_grad():
        for batch_idx, (x, y) in enumerate(val_loader):
            x, y = x.cuda(), y.cuda()
            num = x.size(0)
            out = self.model(x)
            loss = criterion(out, y)
            top1, top5 = accuracy(out, y, top_k=(1, 5))
            loss_meter.update(loss.item(), num)
            top1_meter.update(top1.item(), num)
            top5_meter.update(top5.item(), num)
            batch_time.update(time.time() - end)
            end = time.time()
            if batch_idx % self.config.logging.print_freq == 0:
                self._info(
                    'Test: [{0}/{1}]\tTime {batch_time.val:.3f} ({batch_time.avg:.3f})'
                    .format(batch_idx, len(val_loader), batch_time=batch_time))
    # reconstruct per-rank sums from (avg * count), then all-reduce so every
    # rank ends up with dataset-wide statistics
    total_num = torch.tensor([loss_meter.count]).cuda()
    loss_sum = torch.tensor([loss_meter.avg * loss_meter.count]).cuda()
    top1_sum = torch.tensor([top1_meter.avg * top1_meter.count]).cuda()
    top5_sum = torch.tensor([top5_meter.avg * top5_meter.count]).cuda()
    dist.all_reduce(total_num)
    dist.all_reduce(loss_sum)
    dist.all_reduce(top1_sum)
    dist.all_reduce(top5_sum)
    val_loss = loss_sum.item() / total_num.item()
    val_top1 = top1_sum.item() / total_num.item()
    val_top5 = top5_sum.item() / total_num.item()
    self._info(
        'Prec@1 {:.3f}\tPrec@5 {:.3f}\tLoss {:.3f}\ttotal_num={}'.format(
            val_top1, val_top5, val_loss, loss_meter.count))
    # only the master rank tracks the best score and writes TensorBoard
    if dist.is_master():
        if val_top1 > self.best_top1:
            self.best_top1 = val_top1
        if tb_logger is not None:
            tb_logger.add_scalar('loss_val', val_loss, self.cur_step)
            tb_logger.add_scalar('acc1_val', val_top1, self.cur_step)
            tb_logger.add_scalar('acc5_val', val_top5, self.cur_step)
def train(train_loader, model, criterion, optimizer, epoch, cfgs):
    """Train ``model`` for one epoch over ``train_loader``.

    Tracks data/batch time, loss and top-1/top-5 accuracy, emitting a
    progress line every ``cfgs['print_freq']`` batches via the module logger.
    """
    logger = logging.getLogger('{}.train'.format(cfgs['log_name']))
    batch_time = AverageMeter('Time', ':6.3f')
    data_time = AverageMeter('Data', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    progress = ProgressMeter(len(train_loader),
                             [batch_time, data_time, losses, top1, top5],
                             prefix="Epoch: [{}]".format(epoch))

    model.train()  # enable dropout / batch-norm statistics updates

    tick = time.time()
    for step, (images, target) in enumerate(train_loader):
        # time spent waiting on the data loader
        data_time.update(time.time() - tick)

        images = images.cuda(non_blocking=True)
        target = target.cuda(non_blocking=True)

        # forward pass and loss
        output = model(images)
        loss = criterion(output, target)

        # bookkeeping: loss and accuracy weighted by batch size
        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        n = images.size(0)
        losses.update(loss.item(), n)
        top1.update(acc1[0], n)
        top5.update(acc5[0], n)

        # backward pass and SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_time.update(time.time() - tick)
        tick = time.time()

        if step % cfgs['print_freq'] == 0:
            logger.info(progress.display(step))
def train(train_source_iter, train_target_iter, classifier, domain_adv,
          optimizer, lr_schedule, epoch, device, args):
    """One epoch of domain-adversarial training.

    Draws paired batches from the source and target iterators and minimizes
    source classification loss plus ``args.trade_off`` times the adversarial
    domain loss computed on the shared features.
    """
    batch_time = AverageMeter('Time', ':5.2f')
    data_time = AverageMeter('Data', ':5.2f')
    losses = AverageMeter('Loss', ':6.2f')
    cls_accs = AverageMeter('Cls Acc', ':3.1f')
    domain_accs = AverageMeter('Domain Acc', ':3.1f')
    progress = ProgressMeter(
        args.iter_per_epoch,
        [batch_time, data_time, losses, cls_accs, domain_accs],
        prefix='Epoch: [{}]'.format(epoch))

    classifier.train()

    end = time.time()
    for i in range(args.iter_per_epoch):
        x_s, label_s = next(train_source_iter)
        x_t, _ = next(train_target_iter)  # target labels are never used

        x_s, x_t = x_s.to(device), x_t.to(device)
        label_s = label_s.to(device)
        data_time.update(time.time() - end)

        # forward pass; computing the loss here is the crucial part
        x = torch.cat((x_s, x_t), dim=0)  # single joint batch: source then target
        y, f = classifier(x)
        y_s, y_t = y.chunk(2, dim=0)
        f_s, f_t = f.chunk(2, dim=0)

        cls_loss = F.cross_entropy(y_s, label_s)
        # the adversarial loss — the core of the domain-adaptation method
        adv_loss = domain_adv(f_s, f_t)
        loss_total = cls_loss + args.trade_off * adv_loss

        # update the various metrics
        cls_acc = accuracy(y_s, label_s)[0]
        domain_acc = domain_adv.domain_discriminator_accuracy
        losses.update(loss_total.item(), x_s.size(0))
        cls_accs.update(cls_acc.item(), x_s.size(0))
        domain_accs.update(domain_acc.item(), x_s.size(0))

        # backward pass
        optimizer.zero_grad()
        loss_total.backward()
        optimizer.step()
        # step the learning-rate schedule once per iteration
        lr_schedule.step()

        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            progress.display(i)
def train(self, train_dataloader: DataLoader, val_dataloader: DataLoader,
          epoches=100):
    """Full training loop with step-based validation, best-model
    checkpointing and early stopping.

    :param train_dataloader: training batches (transformed via _batch_trans)
    :param val_dataloader: passed to self.val() for periodic evaluation
    :param epoches: maximum number of epochs
    :return: best validation accuracy observed
    """
    best_score = 0
    early_n = 0  # consecutive validations without improvement
    for epo in range(epoches):
        step_n = 0
        train_avg_loss = AverageMeter()
        train_data_iter = tqdm.tqdm(train_dataloader)
        for batch in train_data_iter:
            self.model.train()
            batch_data = self._batch_trans(batch)
            train_loss = self.step(step_n, batch_data)
            train_avg_loss.update(train_loss.item(), 1)
            status = '[{0}] lr = {1:.7f} batch_loss = {2:.3f} avg_loss = {3:.3f} '.format(
                epo + 1, self.scheduler.get_lr()[0], train_loss.item(),
                train_avg_loss.avg)
            #if step_n%self.log_steps ==0:
            #    print(status)
            train_data_iter.set_description(status)
            step_n += 1
            # validate every self.val_steps global steps
            # (self.global_step is presumably advanced inside self.step — confirm)
            if self.global_step % self.val_steps == 0:
                ## val
                m = self.val(val_dataloader)
                acc = m['acc']
                if best_score < acc:
                    early_n = 0
                    best_score = acc
                    model_path = os.path.join(self.save_dir, 'best.pth')
                    torch.save(self.model.state_dict(), model_path)
                else:
                    early_n += 1
                self.logger.write("steps: {} ,mean ap : {:.4f} , best ap: {:.4f}". \
                    format(self.global_step, acc, best_score))
                self.logger.write(str(m))
                self.logger.write("==" * 50)
                # early stop once patience is exhausted
                if early_n > self.early_stop_n:
                    return best_score
    return best_score
def validate(val_loader, model, criterion, cfgs):
    """Evaluate ``model`` on ``val_loader`` and log top-1/top-5 accuracy.

    :param cfgs: dict providing 'log_name' and 'print_freq'
    :return: top-1 average accuracy
    """
    logger = logging.getLogger('{}.validate'.format(cfgs['log_name']))
    batch_time = AverageMeter('Time', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    progress = ProgressMeter(len(val_loader), [batch_time, losses, top1, top5],
                             prefix='Test: ')

    # switch to evaluate mode
    model.eval()

    with torch.no_grad():
        end = time.time()
        for i, (images, target) in enumerate(val_loader):
            images = images.cuda(non_blocking=True)
            target = target.cuda(non_blocking=True)

            # compute output
            output = model(images)
            loss = criterion(output, target)

            # measure accuracy and record loss
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            losses.update(loss.item(), images.size(0))
            top1.update(acc1[0], images.size(0))
            top5.update(acc5[0], images.size(0))

            # measure elapsed time
            # FIX: this update/reset pair appeared twice back-to-back,
            # double-counting every batch in batch_time; the duplicate
            # is removed.
            batch_time.update(time.time() - end)
            end = time.time()

            if i % cfgs['print_freq'] == 0:
                logger.info(progress.display(i))

    # TODO: this should also be done with the ProgressMeter
    logger.info(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'.format(
        top1=top1, top5=top5))

    return top1.avg
def validate(model, loss_func, val_loader):
    """Run a full validation pass, displaying running stats after each batch.

    :return: average validation loss
    """
    losses = AverageMeter('Loss', ':4.4f')
    top1 = AverageMeter('Acc@1', ':4.2f')
    top5 = AverageMeter('Acc@5', ':4.2f')
    progress = ProgressMeter(len(val_loader), [losses, top1, top5],
                             prefix='Validation: ')

    model.eval()
    with torch.no_grad():
        for batch_idx, batch in enumerate(val_loader, 1):
            inputs = batch['image'].to(device)
            targets = batch['label'].to(device)

            logits = model(inputs)
            batch_loss = loss_func(logits, targets)
            acc1, acc5 = accuracy(logits, targets, topk=(1, 5))

            # unweighted updates: averages are per-batch, not per-sample
            losses.update(batch_loss.item())
            top1.update(acc1[0])
            top5.update(acc5[0])

            # displayed every batch (no print-frequency throttling here)
            progress.display(batch_idx)

    return losses.avg
def train(model, optimizer, loss_func, train_loader, epoch):
    """Train for one epoch, then print total wall time since the module-level
    ``TIME`` stamp."""
    losses = AverageMeter('Loss', ':4.4f')
    top1 = AverageMeter('Acc@1', ':4.2f')
    top5 = AverageMeter('Acc@5', ':4.2f')
    progress = ProgressMeter(len(train_loader), [losses, top1, top5],
                             prefix="Epoch: [{}]".format(epoch + 1))

    model.train()
    for batch_idx, batch in enumerate(train_loader, 1):
        inputs = batch['image'].to(device)
        targets = batch['label'].to(device)

        logits = model(inputs)
        batch_loss = loss_func(logits, targets)
        acc1, acc5 = accuracy(logits, targets, topk=(1, 5))

        # unweighted updates: averages are per-batch, not per-sample
        losses.update(batch_loss.item())
        top1.update(acc1[0])
        top5.update(acc5[0])

        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        if batch_idx % PRINT_FREQ == 0:
            progress.display(batch_idx)

    # wall-clock time elapsed since the global TIME stamp was taken
    elapsed = time() - TIME
    print("Total time Elapsed (H:m:s):", timedelta(seconds=elapsed))
def train_once(runner):
    """Train ``runner.model`` for ``runner.train_epochs`` epochs.

    Uses the module-level ``conf`` for CUDA availability. The runner bundles
    the model, loaders, optimizer, regularizer, scheduler and accuracy
    helper.
    """
    for epoch in range(runner.train_epochs):
        losses = AverageMeter()
        top1 = AverageMeter()
        top5 = AverageMeter()
        runner.model.train()
        for i, data in enumerate(runner.tr_loader):
            images, labels = data
            if conf.get()['cuda']['avail']:
                images, labels = images.to(runner.device), labels.to(
                    runner.device)
                # re-assigning .to() each batch is redundant after the first
                # call but harmless (no-op once the model is on the device)
                runner.model = runner.model.to(runner.device)
            runner.optimizer.zero_grad()
            # the regularizer wraps the forward pass: returns (outputs, loss)
            outputs, loss = runner.regularizer(images, labels)
            loss.backward()
            runner.optimizer.step()
            ttop1, ttop5 = runner.accuracy(outputs, labels, (1, 5))
            losses.update(loss.item(), images.size(0))
            top1.update(ttop1.item(), images.size(0))
            top5.update(ttop5.item(), images.size(0))
        print(
            '[{:d}/{:d}] <<<TRAIN>>> lr({:.10f}) loss({:.4f}) top1({:.3f}) top5({:.3f})'
            .format(epoch + 1, runner.train_epochs,
                    runner.optimizer.param_groups[0]['lr'], losses.avg,
                    top1.avg, top5.avg))
        # one scheduler step per epoch
        runner.scheduler.step()
def evaluate(conf, model, test_loader):
    """Evaluate ``model`` on the test loader and print top-1/top-5 averages."""
    model.eval()
    top1 = AverageMeter()
    top5 = AverageMeter()

    with torch.no_grad():
        for batch in test_loader:
            imgs = batch[0].type(torch.FloatTensor)
            labels = batch[1].type(torch.LongTensor)
            if conf.get()['cuda']['avail']:
                cuda_dev = conf.get()['cuda']['device']
                imgs, labels = imgs.to(cuda_dev), labels.to(cuda_dev)

            preds = model(imgs)
            acc1, acc5 = accuracy(preds, labels, (1, 5))

            n = imgs.size(0)
            top1.update(acc1.item(), n)
            top5.update(acc5.item(), n)

    print('<<<TEST>>> top1({:.4f}) top5({:.4f})'.format(top1.avg, top5.avg))
def evaluate(dataloader, model, dev, topk=(1, )):
    """
    Evaluate classification accuracy over ``dataloader``.

    :param dataloader: yields dicts with 'data' and 'label' tensors
    :param model: returns a dict containing a 'logits' entry
    :param dev: devices, gpu or cpu
    :param topk: [tuple] ks forwarded to ``accuracy``; note only the first
        entry's accuracy is accumulated and returned
    :return: [float] average top-``topk[0]`` accuracy
    """
    model.eval()
    acc_meter = AverageMeter()
    acc_meter.reset()
    with torch.no_grad():
        for sample in tqdm(dataloader, ncols=100, ascii=' >'):
            inputs = sample['data'].to(dev)
            labels = sample['label'].to(dev)

            logits = model(inputs)['logits']
            batch_acc = accuracy(logits, labels, topk)
            acc_meter.update(batch_acc[0], inputs.size(0))
    return acc_meter.avg
def train(self, train_loader, val_loader, optimizer, lr_scheduler, tb_logger): print_freq = self.config.logging.print_freq # meters batch_time = AverageMeter(print_freq) data_time = AverageMeter(print_freq) # track stats of min width, max width and random width loss_meter = [AverageMeter(print_freq) for _ in range(3)] top1_meter = [AverageMeter(print_freq) for _ in range(3)] top5_meter = [AverageMeter(print_freq) for _ in range(3)] meters = [top1_meter, top5_meter, loss_meter, data_time] criterions = self._get_criterion() self._sample_width() end = time.time() for e in range(self.cur_epoch, self.config.training.epoch): # train self.model.train() if self.config.distributed.enable: train_loader.sampler.set_epoch(e) for batch_idx, (x, y) in enumerate(train_loader): x, y = x.cuda(), y.cuda() self._train_one_batch(x, y, optimizer, lr_scheduler, meters, criterions, end) batch_time.update(time.time() - end) end = time.time() cur_lr = lr_scheduler.get_lr()[0] self._logging(tb_logger, e, batch_idx, len(train_loader), meters + [batch_time], cur_lr) # validation self.validate(val_loader, train_loader, self.config.validation.width, tb_logger=tb_logger) self.save(optimizer, lr_scheduler, e)
def validate(val_loader, model, criterion, epoch, start_time):
    """Run the validation set through a trained classifier.

    Uses module-level ``args`` (``short_epoch``, ``distributed``), ``log``
    and ``tb`` (TensorBoard helper).

    Params:
        val_loader: Loader for validation set.
        model: Classifier instance.
        criterion: Loss function.
        epoch: Current training epoch.
        start_time: Time when training started.

    Returns:
        (top-1 average, top-5 average)
    """
    timer = TimeMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()
    model.eval()
    eval_start_time = time.time()
    for i, (imgs, target) in enumerate(val_loader):
        imgs, target = imgs.cuda(), target.cuda()
        # terminates epoch early (smoke-test mode)
        if args.short_epoch and (i > 10):
            break
        batch_num = i + 1
        timer.batch_start()
        if args.distributed:
            # helper also all-reduces metrics across ranks
            top1acc, top5acc, loss, batch_total = distributed_predict(
                imgs, target, model, criterion)
        else:
            with torch.no_grad():
                output = model(imgs)
                loss = criterion(output, target).data
            batch_total = imgs.size(0)
            top1acc, top5acc = accuracy(output.data, target, topk=(1, 5))
        # Eval batch done. Logging results
        timer.batch_end()
        losses.update(loss, batch_total)
        top1.update(top1acc, batch_total)
        top5.update(top5acc, batch_total)
        if should_print(batch_num, val_loader, args) is True:
            output = (
                f'Test: [{epoch}][{batch_num}/{len(val_loader)}]\t'
                f'Time {timer.batch_time.val:.3f} ({timer.batch_time.avg:.3f})\t'
                f'Loss {losses.val:.4f} ({losses.avg:.4f})\t'
                f'Acc@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                f'Acc@5 {top5.val:.3f} ({top5.avg:.3f})')
            log.verbose(output)
    tb.log_eval(top1.avg, top5.avg, time.time() - eval_start_time)
    tb.log('epoch', epoch)
    return top1.avg, top5.avg
def main(args):
    """Smoke-test the SYN data pipeline: iterate the loader, timing batch
    loading, and visualize the first clip/skeleton of every batch."""
    # SYN Dataset
    dataset = SYN(
        root=args.root,
        w=args.width,
        h=args.height,
        t=args.time,
        dataset='train',
        train=True,
        avi_dir=args.avi_dir,
        usual_transform=False,
    )

    # Pytorch dataloader
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=args.batch_size,
                                             num_workers=args.workers,
                                             pin_memory=args.cuda,
                                             collate_fn=my_collate)

    # Loop
    data_time = AverageMeter()
    start_data = time.time()
    for i, dict_input in enumerate(dataloader):
        # time the loader took to produce this batch
        duration_data = time.time() - start_data
        data_time.update(duration_data)

        # Get the data
        clip, skeleton = dict_input['clip'], dict_input[
            'skeleton']  # (B, C, T, 224, 224), (B, T, 2, 25, 2)

        # Show first sample's first frame together with its skeleton
        show_one_img(clip[0, :, 0], skeleton[0, 0])

        print("{}/{} : {time.val:.3f} ({time.avg:.3f}) sec/batch".format(
            i + 1, len(dataloader), time=data_time))
        sys.stdout.flush()

        # restart the clock for the next batch
        start_data = time.time()
def train(self, epoch, data_loader, optimizer, print_freq, use_gpu):
    """Run one training epoch.

    :param epoch: train epoch (0-based; printed as epoch + 1)
    :param data_loader: train_loader variable
    :param optimizer: optim we define in the main.py
    :param print_freq: in config.yml
    :param use_gpu: get from torch.cuda.is_available
    """
    self.model.train()

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    # precisions = AverageMeter()

    tick = time.time()
    for step, inputs in enumerate(data_loader):
        # _parse_data hides per-dataset unpacking; it also moves imgs/pids
        # to the GPU when use_gpu is set (equivalent of
        # imgs, pids = imgs.cuda(), pids.cuda()).
        imgs, pids = self._parse_data(inputs=inputs, use_gpu=use_gpu)
        data_time.update(time.time() - tick)

        loss = self._forward(model=self.model, imgs=imgs, pids=pids,
                             eval=self.eval)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_time.update(time.time() - tick)
        tick = time.time()
        losses.update(loss.item(), pids.size(0))

        if (step + 1) % print_freq == 0:
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(
                      epoch + 1, step + 1, len(data_loader),
                      batch_time=batch_time, data_time=data_time,
                      loss=losses))
def train(train_loader, model, optimizer, epoch, device='cpu', print_freq=100,
          parallel=False, save_prefix='checkpoint', save_path=None):
    """Train a homography-estimation model for one epoch; optionally save a
    checkpoint afterwards.

    :param parallel: model is DataParallel-wrapped; unwrap before saving
    :param save_path: directory for the epoch checkpoint (skipped if None)
    """
    loss_total = AverageMeter()
    batch_time = AverageMeter()
    data_time = AverageMeter()

    model.train()
    tick = time.time()
    for step, (img_a, patch_a, patch_b, corners, gt_delta) in enumerate(train_loader):
        data_time.update(time.time() - tick)

        img_a = img_a.to(device)
        patch_a = patch_a.to(device)
        patch_b = patch_b.to(device)
        corners = corners.to(device)
        delta = gt_delta.to(device)  # kept for parity; loss comes from forward_train

        model_inp = {
            'img_a': img_a,
            'patch_a': patch_a,
            'patch_b': patch_b,
            'corners': corners
        }
        loss = model.forward_train(model_inp)
        loss_total.update(loss.item(), img_a.size(0))

        # backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - tick)
        tick = time.time()

        if step % print_freq == 0:
            print('Epoch: [{}][{}/{}] time: {:.3f} loss_total: {:.4f}'.format(
                epoch, step, len(train_loader), batch_time.avg,
                loss_total.avg))

    print('Epoch: [{}] Time cost: {:.3f}'.format(epoch, batch_time.sum))

    if save_path:
        # unwrap DataParallel so the checkpoint loads on a single device
        state_dict = model.module.state_dict() if parallel else model.state_dict()
        torch.save(
            state_dict,
            os.path.join(save_path, '%s_epoch_%d.pth' % (save_prefix, epoch)))
def validate(dataloader, target_iter, classifier, device, args):
    """Validate ``classifier`` on ``dataloader``; optionally accumulate a
    per-class confusion matrix when ``args.per_class_eval`` is set.

    :return: top-1 average accuracy
    """
    batch_time = AverageMeter('Time', ':6.3f')
    losses = AverageMeter('Loss', ':.4e')
    top1 = AverageMeter('Acc@1', ':6.2f')
    top5 = AverageMeter('Acc@5', ':6.2f')
    progress = ProgressMeter(len(target_iter), [batch_time, losses, top1, top5],
                             prefix='Test: ')

    classifier.eval()

    confmat = None
    if args.per_class_eval:
        classes = dataloader.dataset.classes
        confmat = ConfusionMatrix(len(classes))

    with torch.no_grad():
        tick = time.time()
        for step, (images, target) in enumerate(dataloader):
            images, target = images.to(device), target.to(device)

            # classifier returns (logits, features); features are unused here
            output, _ = classifier(images)
            loss = F.cross_entropy(output, target)

            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            if confmat:
                confmat.update(target, output.argmax(1))

            n = images.size(0)
            losses.update(loss.item(), n)
            top1.update(acc1.item(), n)
            top5.update(acc5.item(), n)

            batch_time.update(time.time() - tick)
            tick = time.time()

            if step % args.print_freq == 0:
                progress.display(step)

        print(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'.format(top1=top1,
                                                                    top5=top5))
        if confmat:
            print(confmat.format(classes))

    return top1.avg
def evaluate(self, method, epoch):
    """Evaluate on the test loader, track best top-1/top-5/loss, and save a
    "best" checkpoint.

    :param method: tag forwarded to ``save_checkpoint``
    :param epoch: current epoch (0-based; printed as epoch + 1)
    """
    self.model.eval()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()
    with torch.no_grad():
        if self.train_precision == 'fp16':
            self.model.half()  # match half-precision inputs below
        for i, data in enumerate(tqdm(self.test_loader)):
            if conf.get()['data']['dali']['avail']:
                # DALI pipeline batches arrive as a one-element list of dicts
                timages = data[0]["data"]
                tlabels = data[0]["label"].squeeze().long()
            else:
                timages = data[0].type(torch.FloatTensor)
                tlabels = data[1].type(torch.LongTensor)
            if self.conf.get()['cuda']['avail']:
                timages, tlabels = timages.to(self.device), tlabels.to(
                    self.device)
            if self.train_precision == 'fp16':
                timages = timages.half()
            toutputs = self.model(timages)
            tloss = self.criterion(toutputs, tlabels)
            ttop1, ttop5 = self.accuracy(toutputs, tlabels, (1, 5))
            losses.update(tloss.item(), timages.size(0))
            top1.update(ttop1.item(), timages.size(0))
            top5.update(ttop5.item(), timages.size(0))
    if self.best_acc_top1 < top1.avg:
        self.best_acc_top1 = top1.avg
    if self.best_acc_top5 < top5.avg:
        self.best_acc_top5 = top5.avg
        # NOTE(review): the "best" checkpoint is saved on a top-5
        # improvement, not top-1 — confirm this is intended.
        self.save_checkpoint(method, "best")
    if self.best_test_loss > losses.avg:
        self.best_test_loss = losses.avg
    print(
        '[{:d}/{:d}] <<<TEST>>> loss({:.4f}) top1({:.4f}) top5({:.4f}) best-top1({:.4f}) best-top5({:.4f})'
        .format(epoch + 1, self.train_epochs, losses.avg, top1.avg, top5.avg,
                self.best_acc_top1, self.best_acc_top5))
def validate(data_loader, G, F1, F2, args):
    """Evaluate both classifier heads (F1, F2) on top of feature extractor G.

    :return: (head-1 top-1 average, head-2 top-1 average)
    """
    batch_time = AverageMeter('Time', ':6.3f')
    top1_1 = AverageMeter('Acc_1', ':6.2f')
    top1_2 = AverageMeter('Acc_2', ':6.2f')
    progress = ProgressMeter(len(data_loader), [batch_time, top1_1, top1_2],
                             prefix='Test: ')

    G.eval()
    F1.eval()
    F2.eval()

    confmat = None
    if args.per_class_eval:
        classes = data_loader.dataset.classes
        confmat = ConfusionMatrix(len(classes))

    with torch.no_grad():
        tick = time.time()
        for step, (images, target) in enumerate(data_loader):
            images = images.to(device)
            target = target.to(device)

            feats = G(images)
            y1, y2 = F1(feats), F2(feats)
            acc1, = accuracy(y1, target)
            acc2, = accuracy(y2, target)
            if confmat:
                # confusion matrix tracks head 1's predictions
                confmat.update(target, y1.argmax(1))

            n = images.size(0)
            top1_1.update(acc1.item(), n)
            top1_2.update(acc2.item(), n)

            batch_time.update(time.time() - tick)
            tick = time.time()

            if step % args.print_freq == 0:
                progress.display(step)

        print(' * Acc1 {top1_1.avg:.3f} Acc2 {top1_2.avg:.3f}'.format(
            top1_1=top1_1, top1_2=top1_2))
        if confmat:
            print(confmat.format(classes))

    return top1_1.avg, top1_2.avg
def do_train(cfg, model, center_criterion, train_loader, val_loader, optimizer,
             optimizer_center, scheduler, loss_fn, num_query, writer):
    """Main re-ID training loop with TensorBoard logging and optional FP16/SWA.

    Args:
        cfg: experiment config node (SOLVER/MODEL/INPUT/DATASETS sections).
        model: re-ID network; wrapped in ``nn.DataParallel`` on multi-GPU.
        center_criterion: center-loss module whose params get a scaled gradient.
        train_loader / val_loader: data loaders (val_loader unused here).
        optimizer / optimizer_center: main and center-loss optimizers.
        scheduler: LR scheduler, stepped once per epoch.
        loss_fn: returns (total, id, triplet, center) losses.
        num_query: unused in this loop (kept for signature compatibility).
        writer: TensorBoard SummaryWriter.
    """
    log_period = cfg.SOLVER.LOG_PERIOD
    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD
    device = "cuda"
    epochs = cfg.SOLVER.MAX_EPOCHS

    # Trace the graph once with a dummy batch so TensorBoard can render it.
    tmp_input_data = torch.rand(
        (10, 3, cfg.INPUT.SIZE_TRAIN[0], cfg.INPUT.SIZE_TRAIN[1]))
    writer.add_graph(model, (tmp_input_data, ))

    logger = logging.getLogger("reid_baseline.train")
    logger.info('start training')

    if device:
        model.to(device)
        if torch.cuda.device_count() > 1:
            print('Using {} GPUs for training'.format(
                torch.cuda.device_count()))
            model = nn.DataParallel(model)
            model = model.cuda()
        else:
            # apex AMP is only initialized on single-GPU runs (original behavior).
            if cfg.SOLVER.FP16:
                model, optimizer = amp.initialize(model, optimizer,
                                                  opt_level='O1')

    loss_meter = AverageMeter()
    id_loss_meter = AverageMeter()
    tri_loss_meter = AverageMeter()
    cen_loss_meter = AverageMeter()
    acc_meter = AverageMeter()
    lr_meter = AverageMeter()

    if cfg.SOLVER.SWA:
        swa_model = torch.optim.swa_utils.AveragedModel(model)

    # train
    for epoch in range(1, epochs + 1):
        start_time = time.time()
        loss_meter.reset()
        acc_meter.reset()
        id_loss_meter.reset()
        tri_loss_meter.reset()
        cen_loss_meter.reset()
        lr_meter.reset()
        model.train()

        if cfg.SOLVER.GRADUAL_UNLOCK:
            model.base.gradual_unlock(cfg.SOLVER.MAX_EPOCHS, epoch)

        for n_iter, (img, vid) in enumerate(train_loader):
            optimizer.zero_grad()
            optimizer_center.zero_grad()
            img = img.to(device)
            target = vid.to(device)

            if cfg.DATASETS.MIXUP:
                img, target_a, target_b, lam = mixup_data(img, target)
            score, feat = model(img, target)
            if cfg.DATASETS.MIXUP:
                all_loss = mixup_criterion(loss_fn, score, feat, target_a,
                                           target_b, lam)
            else:
                all_loss = loss_fn(score, feat, target)
            loss, id_loss, tri_loss, cen_loss = all_loss

            if cfg.SOLVER.FP16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            optimizer.step()

            if 'center' in cfg.MODEL.METRIC_LOSS_TYPE:
                # Undo the center-loss weight on the center params' gradients
                # so the centers themselves are updated at full magnitude.
                for param in center_criterion.parameters():
                    param.grad.data *= (1. / cfg.SOLVER.CENTER_LOSS_WEIGHT)
                optimizer_center.step()

            loss_meter.update(loss.item(), img.shape[0])
            id_loss_meter.update(id_loss.item(), img.shape[0])
            # Triplet/center losses may be plain floats when disabled.
            if torch.is_tensor(tri_loss):
                tri_loss_meter.update(tri_loss.item(), img.shape[0])
            else:
                tri_loss_meter.update(tri_loss, 1)
            if torch.is_tensor(cen_loss):
                cen_loss_meter.update(cen_loss.item(), img.shape[0])
            else:
                cen_loss_meter.update(cen_loss, 1)

            acc = (score.max(1)[1] == target).float().mean()
            acc_meter.update(acc, 1)
            lr_meter.update(scheduler.get_last_lr()[0])

            global_step = (epoch - 1) * len(train_loader) + n_iter
            writer.add_scalar('data/total_loss', loss_meter.avg, global_step)
            writer.add_scalar('data/id_loss', id_loss_meter.avg, global_step)
            writer.add_scalar('data/tri_loss', tri_loss_meter.avg, global_step)
            writer.add_scalar('data/cen_loss', cen_loss_meter.avg, global_step)
            writer.add_scalar('data/learning_rate', lr_meter.avg, global_step)

            if (n_iter + 1) % log_period == 0:
                logger.info(
                    "Epoch[{}] Iteration[{}/{}] Loss: {:.3f}, Acc: {:.3f}, Base Lr: {:.2e}"
                    .format(epoch, (n_iter + 1), len(train_loader),
                            loss_meter.avg, acc_meter.avg,
                            scheduler.get_last_lr()[0]))

        scheduler.step()
        end_time = time.time()
        time_per_batch = (end_time - start_time) / (n_iter + 1)
        logger.info(
            "Epoch {} done. Time per batch: {:.3f}[s] Speed: {:.1f}[samples/s]"
            .format(epoch, time_per_batch,
                    train_loader.batch_size / time_per_batch))

        if epoch % checkpoint_period == 0:
            src_path = os.path.join(cfg.OUTPUT_DIR,
                                    cfg.MODEL.NAME + '_{}.pth'.format(epoch))
            torch.save(model.state_dict(), src_path)
            # Best-effort backup to shared storage; failure must not stop training.
            try:
                dest_root = os.path.join(
                    '/mnt/nfs-internstorage/user/zjf/NAIC2020/models',
                    cfg.SAVE_FLAG)
                # FIX: exist_ok avoids the exists()/mkdir race of the original.
                os.makedirs(dest_root, exist_ok=True)
                dst_path = os.path.join(
                    dest_root, cfg.MODEL.NAME + '_{}.pth'.format(epoch))
                shutil.copy(src_path, dst_path)
            except OSError:  # FIX: was a bare except (swallowed KeyboardInterrupt too)
                print('No bak models...')

        if cfg.SOLVER.SWA and epoch in cfg.SOLVER.SWA_START:
            swa_model.update_parameters(model)
            logger.info('swa combine the {} epoch model'.format(epoch))

    if cfg.SOLVER.SWA:
        try:
            swa_model.cpu()
            # Recompute BN statistics for the averaged weights before saving.
            torch.optim.swa_utils.update_bn(train_loader, swa_model)
            swa_model.cuda()
            src_path = os.path.join(cfg.OUTPUT_DIR,
                                    cfg.MODEL.NAME + '_swa.pth')
            torch.save(swa_model.state_dict(), src_path)
            logger.info('swa model is successfuly saved.')
        except Exception:  # FIX: was a bare except; keep best-effort semantics
            logger.info('swa model save failed.')
def train(trn_loader, model, criterion, optimizer, scheduler, epoch, G, args):
    """Train the classifier for a single epoch.

    Params:
        trn_loader: Loader for training set.
        model: Classifier instance.
        criterion: Loss function.
        optimizer: Optimization algorithm to use for training.
        scheduler: Learning rate scheduler.
        epoch: Current training epoch.
        G: Model of natural variation.
        args: Command line arguments.
    """
    net_meter = NetworkMeter()
    timer = TimeMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    model.train()
    for i, (imgs, target) in enumerate(trn_loader):
        # Debug/smoke-test mode: only run a handful of batches.
        if args.short_epoch and (i > 10):
            break
        imgs, target = imgs.cuda(), target.cuda()
        batch_num = i + 1
        timer.batch_start()
        scheduler.update_lr(epoch, i + 1, len(trn_loader))

        # FIX: replaced non-idiomatic `flag is True` comparisons with plain
        # truthiness (flags are argparse booleans).
        if args.mda:
            imgs, target = mda_train(imgs, target, model, G, args)
        elif args.mrt:
            imgs, target = mrt_train(imgs, target, model, criterion, G, args)
        elif args.mat:
            imgs, target = mat_train(imgs, target, model, criterion, G, args)
        elif args.pgd:
            imgs, target = pgd_train(imgs, target, model, criterion)

        output = model(imgs)
        loss = criterion(output, target)

        optimizer.zero_grad()
        if args.half_prec:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()
        optimizer.step()
        timer.batch_end()

        corr1, corr5 = correct(output.data, target, topk=(1, 5))
        reduced_loss, batch_total = loss.data.item(), imgs.size(0)
        if args.distributed:
            # Aggregate counts/losses across workers before computing rates.
            metrics = torch.tensor([batch_total, reduced_loss, corr1,
                                    corr5]).float().cuda()
            batch_total, reduced_loss, corr1, corr5 = dist_utils.sum_tensor(
                metrics).cpu().numpy()
            reduced_loss = reduced_loss / dist_utils.env_world_size()
        else:
            corr1, corr5 = corr1.item(), corr5.item()

        top1acc = corr1 * (100.0 / batch_total)
        top5acc = corr5 * (100.0 / batch_total)
        losses.update(reduced_loss, batch_total)
        top1.update(top1acc, batch_total)
        top5.update(top5acc, batch_total)

        if should_print(batch_num, trn_loader, args):
            tb.log_memory()
            tb.log_trn_times(timer.batch_time.val, timer.data_time.val,
                             imgs.size(0))
            tb.log_trn_loss(losses.val, top1.val, top5.val)

            recv_gbit, transmit_gbit = net_meter.update_bandwidth()
            tb.log("sizes/batch_total", batch_total)
            tb.log('net/recv_gbit', recv_gbit)
            tb.log('net/transmit_gbit', transmit_gbit)

            output = (
                f'Epoch: [{epoch}][{batch_num}/{len(trn_loader)}]\t'
                f'Time {timer.batch_time.val:.3f} ({timer.batch_time.avg:.3f})\t'
                f'Loss {losses.val:.4f} ({losses.avg:.4f})\t'
                f'Acc@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                f'Acc@5 {top5.val:.3f} ({top5.avg:.3f})\t'
                f'Data {timer.data_time.val:.3f} ({timer.data_time.avg:.3f})\t'
                f'BW {recv_gbit:.3f} {transmit_gbit:.3f}')
            log.verbose(output)

        tb.update_step_count(batch_total)
def do_train(cfg, model, center_criterion, train_loader, val_loader, optimizer,
             optimizer_center, scheduler, loss_fn, num_query):
    """Train a two-head re-ID model, checkpoint periodically, and run R1/mAP
    evaluation every ``cfg.SOLVER.EVAL_PERIOD`` epochs.
    """
    log_period = cfg.SOLVER.LOG_PERIOD
    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD
    eval_period = cfg.SOLVER.EVAL_PERIOD
    device = "cuda"
    epochs = cfg.SOLVER.MAX_EPOCHS
    start_epoch = cfg.SOLVER.START_EPOCH

    logger = logging.getLogger("reid_baseline.train")
    logger.info('start training')

    if device:
        if torch.cuda.device_count() > 1:
            print('Using {} GPUs for training'.format(
                torch.cuda.device_count()))
            model = nn.DataParallel(model)
        model.to(device)

    loss_meter = AverageMeter()
    acc_meter1 = AverageMeter()
    acc_meter2 = AverageMeter()
    evaluator = R1_mAP_eval(num_query, max_rank=50,
                            feat_norm=cfg.TEST.FEAT_NORM)

    # Freeze early backbone stages; reach through .module under DataParallel.
    if torch.cuda.device_count() > 1:
        model.module.base._freeze_stages()
    else:
        model.base._freeze_stages()
    logger.info('Freezing the stages number:{}'.format(cfg.MODEL.FROZEN))

    # train
    for epoch in range(start_epoch, epochs + 1):
        epoch_start = time.time()
        loss_meter.reset()
        acc_meter1.reset()
        acc_meter2.reset()
        evaluator.reset()
        # Scheduler stepped at the top of each epoch (original ordering).
        scheduler.step()
        model.train()

        for it, (img, vid, _) in enumerate(train_loader):
            optimizer.zero_grad()
            optimizer_center.zero_grad()
            img = img.to(device)
            target = vid.to(device)

            scores, feat = model(img, target)
            loss = loss_fn(scores, feat, target)
            loss.backward()
            optimizer.step()

            if 'center' in cfg.MODEL.METRIC_LOSS_TYPE:
                # Rescale center-param gradients to undo the loss weight.
                for param in center_criterion.parameters():
                    param.grad.data *= (1. / cfg.SOLVER.CENTER_LOSS_WEIGHT)
                optimizer_center.step()

            # One accuracy value per classification head.
            acc = [(score.max(1)[1] == target).float().mean()
                   for score in scores]
            loss_meter.update(loss.item(), img.shape[0])
            acc_meter1.update(acc[0].item(), 1)
            acc_meter2.update(acc[1].item(), 1)

            if (it + 1) % log_period == 0:
                logger.info(
                    "Epoch[{}] Iteration[{}/{}] Loss: {:.3f}, Acc1: {:.3f}, Acc2: {:.3f}, Base Lr: {:.2e}"
                    .format(epoch, (it + 1), len(train_loader),
                            loss_meter.avg, acc_meter1.avg, acc_meter2.avg,
                            scheduler.get_lr()[0]))

        time_per_batch = (time.time() - epoch_start) / (it + 1)
        logger.info(
            "Epoch {} done. Time per batch: {:.3f}[s] Speed: {:.1f}[samples/s]"
            .format(epoch, time_per_batch,
                    train_loader.batch_size / time_per_batch))

        if epoch % checkpoint_period == 0:
            # NOTE(review): 'static_dict' key spelling kept as-is — downstream
            # checkpoint loaders read these exact keys.
            if torch.cuda.device_count() > 1:
                state = model.module.state_dict()
            else:
                state = model.state_dict()
            torch.save(
                {
                    'static_dict': state,
                    'optimizer_static_dict': optimizer.state_dict()
                },
                os.path.join(cfg.OUTPUT_DIR,
                             cfg.MODEL.NAME + '_{}.pth'.format(epoch)))

        if epoch % eval_period == 0:
            model.eval()
            for it, (img, vid, camid, _, _) in enumerate(val_loader):
                with torch.no_grad():
                    img = img.to(device)
                    feat = model(img)
                    evaluator.update((feat, vid, camid))
            cmc, mAP, _, _, _, _, _ = evaluator.compute()
            logger.info("Validation Results - Epoch: {}".format(epoch))
            logger.info("mAP: {:.1%}".format(mAP))
            for r in [1, 5, 10]:
                logger.info("CMC curve, Rank-{:<3}:{:.1%}".format(
                    r, cmc[r - 1]))
def do_train(cfg, model, train_loader, optimizer, scheduler, loss_fn):
    """Mixed-precision training loop with optional AugMix JSD consistency loss
    and an optional channel-head output.

    Args:
        cfg: experiment config (SOLVER/INPUT/MODEL sections used here).
        model: network returning (logits, features[, channel_head_feature]).
        train_loader: yields (img, vid); with AugMix, ``img`` is a list of the
            original view plus two augmented views.
        optimizer / scheduler / loss_fn: standard training components.
    """
    log_period = cfg.SOLVER.LOG_PERIOD
    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD
    device = "cuda"
    epochs = cfg.SOLVER.MAX_EPOCHS

    logger = logging.getLogger("reid_baseline.train")
    logger.info('start training')

    if device:
        model.to(device)
        if torch.cuda.device_count() > 1:
            print('Using {} GPUs for training'.format(
                torch.cuda.device_count()))
            model = nn.DataParallel(model)

    loss_meter = AverageMeter()
    acc_meter = AverageMeter()

    # train
    scaler = GradScaler()
    for epoch in range(1, epochs + 1):
        start_time = time.time()
        loss_meter.reset()
        acc_meter.reset()
        model.train()

        for n_iter, (img, vid) in enumerate(train_loader):
            optimizer.zero_grad()
            if cfg.INPUT.AUGMIX:
                # img is [orig, augmix1, augmix2]; run all three views in one
                # forward pass and add the Jensen-Shannon consistency term.
                bs = img[0].size(0)
                images_cat = torch.cat(img, dim=0).to(
                    device)  # [3 * batch, 3, 32, 32]
                target = vid.to(device)
                with autocast():
                    logits, feat = model(images_cat, target)
                    logits_orig, logits_augmix1, logits_augmix2 = logits[:bs], logits[
                        bs:2 * bs], logits[2 * bs:]
                    loss = loss_fn(logits_orig, feat, target)
                    p_orig, p_augmix1, p_augmix2 = F.softmax(
                        logits_orig, dim=-1), F.softmax(
                            logits_augmix1,
                            dim=-1), F.softmax(logits_augmix2, dim=-1)
                    # Clamp mixture distribution to avoid exploding KL divergence
                    p_mixture = torch.clamp(
                        (p_orig + p_augmix1 + p_augmix2) / 3., 1e-7, 1).log()
                    loss += 12 * (
                        F.kl_div(p_mixture, p_orig, reduction='batchmean') +
                        F.kl_div(p_mixture, p_augmix1, reduction='batchmean') +
                        F.kl_div(p_mixture, p_augmix2,
                                 reduction='batchmean')) / 3.
                # FIX: `score` was never assigned in this branch, so the
                # accuracy computation below raised NameError under AugMix.
                # Accuracy is measured on the clean (original) view.
                score = logits_orig
            else:
                img = img.to(device)
                target = vid.to(device)
                with autocast():
                    if cfg.MODEL.CHANNEL_HEAD:
                        score, feat, channel_head_feature = model(img, target)
                        loss = loss_fn(score, feat, channel_head_feature,
                                       target)
                    else:
                        score, feat = model(img, target)
                        loss = loss_fn(score, feat, target)

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            acc = (score.max(1)[1] == target).float().mean()
            # FIX: use target.shape[0] — with AugMix, `img` is a list and has
            # no .shape; target always carries the per-sample batch size.
            loss_meter.update(loss.item(), target.shape[0])
            acc_meter.update(acc, 1)

            if (n_iter + 1) % log_period == 0:
                logger.info(
                    "Epoch[{}] Iteration[{}/{}] Loss: {:.3f}, Acc: {:.3f}, Base Lr: {:.2e}"
                    .format(epoch, (n_iter + 1), len(train_loader),
                            loss_meter.avg, acc_meter.avg,
                            scheduler.get_lr()[0]))

        scheduler.step()
        end_time = time.time()
        time_per_batch = (end_time - start_time) / (n_iter + 1)
        logger.info(
            "Epoch {} done. Time per batch: {:.3f}[s] Speed: {:.1f}[samples/s]"
            .format(epoch, time_per_batch,
                    train_loader.batch_size / time_per_batch))

        if epoch % checkpoint_period == 0:
            torch.save(
                model.state_dict(),
                os.path.join(cfg.OUTPUT_DIR,
                             cfg.MODEL.NAME + '_{}.pth'.format(epoch)))
def main(cfg, device):
    """Co-training entry point: trains two ResNets with sample selection and
    label-distribution learning on noisy-label data, then renames the result
    directory with the best achieved accuracy.
    """
    init_seeds()
    cfg.use_fp16 = False if device.type == 'cpu' else cfg.use_fp16

    # logging ----------------------------------------------------------------
    logger_root = f'Results/{cfg.dataset}'
    if not os.path.isdir(logger_root):
        os.makedirs(logger_root, exist_ok=True)
    logtime = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
    result_dir = os.path.join(logger_root, f'{logtime}-{cfg.log}')
    logger = Logger(logging_dir=result_dir, DEBUG=False)
    logger.set_logfile(logfile_name='log.txt')
    save_params(cfg, f'{result_dir}/params.json', json_format=True)
    logger.debug(f'Result Path: {result_dir}')

    # model, optimizer, scheduler --------------------------------------------
    opt_lvl = 'O1' if cfg.use_fp16 else 'O0'
    n_classes = cfg.n_classes
    net1 = ResNet(arch=cfg.net1, num_classes=n_classes, pretrained=True)
    optimizer1 = build_sgd_optimizer(net1.parameters(), cfg.lr,
                                     cfg.weight_decay)
    net1, optimizer1 = amp.initialize(net1.to(device), optimizer1,
                                      opt_level=opt_lvl,
                                      keep_batchnorm_fp32=None,
                                      loss_scale=None, verbosity=0)
    net2 = ResNet(arch=cfg.net2, num_classes=n_classes, pretrained=True)
    optimizer2 = build_sgd_optimizer(net2.parameters(), cfg.lr,
                                     cfg.weight_decay)
    net2, optimizer2 = amp.initialize(net2.to(device), optimizer2,
                                      opt_level=opt_lvl,
                                      keep_batchnorm_fp32=None,
                                      loss_scale=None, verbosity=0)
    lr_plan = make_lr_plan(cfg.lr, cfg.stage1, cfg.epochs)
    with open(f'{result_dir}/network.txt', 'w') as f:
        f.writelines(net1.__repr__())
        f.write('\n\n---------------------------\n\n')
        # FIX: the original wrote net1's repr twice; the second dump is net2.
        f.writelines(net2.__repr__())

    # drop rate scheduler ----------------------------------------------------
    T_k = cfg.stage1
    final_drop_rate = 0.25
    final_ldl_rate = cfg.ldl_rate
    drop_rate_scheduler = np.ones(cfg.epochs) * final_drop_rate
    # Ramp 0 -> final_drop_rate during warmup, then ramp toward the LDL rate.
    drop_rate_scheduler[:T_k] = np.linspace(0, final_drop_rate, T_k)
    drop_rate_scheduler[T_k:cfg.epochs] = np.linspace(final_drop_rate,
                                                      final_ldl_rate,
                                                      cfg.epochs - T_k)

    # dataset, dataloader ----------------------------------------------------
    transform = build_transform(rescale_size=cfg.rescale_size,
                                crop_size=cfg.crop_size)
    dataset = build_webfg_dataset(os.path.join(cfg.database, cfg.dataset),
                                  CLDataTransform(transform['train']),
                                  transform['test'])
    logger.debug(f"Number of Training Samples: {dataset['n_train_samples']}")
    logger.debug(f"Number of Testing Samples: {dataset['n_test_samples']}")
    train_loader = DataLoader(dataset['train'], batch_size=cfg.batch_size,
                              shuffle=True, num_workers=8, pin_memory=True)
    test_loader = DataLoader(dataset['test'], batch_size=16, shuffle=False,
                             num_workers=8, pin_memory=True)

    # meters -----------------------------------------------------------------
    train_loss1, train_loss2 = AverageMeter(), AverageMeter()
    train_accuracy1, train_accuracy2 = AverageMeter(), AverageMeter()
    iter_time = AverageMeter()

    # training ---------------------------------------------------------------
    start_epoch = 0
    best_accuracy1, best_accuracy2 = 0.0, 0.0
    best_epoch1, best_epoch2 = None, None
    if cfg.dataset == 'cifar100' and cfg.noise_type != 'clean':
        t = torch.tensor(dataset['train'].noisy_labels)
    else:
        t = torch.tensor(dataset['train'].targets)
    # One learnable soft-label table per network, seeded as scaled one-hot.
    labels2learn1 = torch.full(size=(dataset['n_train_samples'], n_classes),
                               fill_value=0.0)
    labels2learn1.scatter_(dim=1, index=torch.unsqueeze(t, dim=1),
                           value=1.0 * 10)
    # FIX: the original aliased labels2learn2 to labels2learn1, so the two
    # networks silently overwrote each other's learned label distributions.
    labels2learn2 = labels2learn1.clone()
    flag = [0, 0, 0]

    for epoch in range(start_epoch, cfg.epochs):
        start_time = time.time()
        train_loss1.reset()
        train_accuracy1.reset()
        train_loss2.reset()
        train_accuracy2.reset()
        net1.train()
        net2.train()
        adjust_lr(optimizer1, lr_plan[epoch])
        adjust_lr(optimizer2, lr_plan[epoch])
        optimizer1.zero_grad()
        optimizer2.zero_grad()

        # train this epoch
        for it, sample in enumerate(train_loader):
            s = time.time()
            indices = sample['index']
            x1, x2 = sample['data']
            x1, x2 = x1.to(device), x2.to(device)
            y0 = sample['label'].to(device)
            y = get_smoothed_label_distribution(y0, nc=n_classes,
                                                epsilon=cfg.epsilon)
            output1 = net1(x1)
            output2 = net2(x2)
            logits1 = output1['logits']
            logits2 = output2['logits']

            if epoch < cfg.stage1:
                # warmup: plain cross-entropy on smoothed labels
                if flag[0] == 0:
                    step_flagging('stage 1')
                    flag[0] += 1
                loss1 = cross_entropy(logits1, y)
                loss2 = cross_entropy(logits2, y)
            else:
                # learn label distributions
                if flag[1] == 0:
                    step_flagging('stage 2')
                    flag[1] += 1
                with torch.no_grad():
                    cce_losses1 = cross_entropy(logits1, y, reduction='none')
                    cce_losses2 = cross_entropy(logits2, y, reduction='none')
                    losses1 = cce_losses1
                    losses2 = cce_losses2
                    sample_selection = sample_selector(
                        losses1, losses2, drop_rate_scheduler[epoch])

                # for selected "clean" samples, train in a co-teaching manner
                logits_clean1 = logits1[sample_selection['clean2']]
                logits_clean2 = logits2[sample_selection['clean1']]
                y_clean1 = y[sample_selection['clean2']]
                y_clean2 = y[sample_selection['clean1']]
                losses_clean1 = cross_entropy(
                    logits_clean1, y_clean1, reduction='none') + entropy_loss(
                        logits_clean1, reduction='none')  # (Nc1)
                losses_clean2 = cross_entropy(
                    logits_clean2, y_clean2, reduction='none') + entropy_loss(
                        logits_clean2, reduction='none')  # (Nc2)
                loss_c1_1 = losses_clean1.mean()
                loss_c2_1 = losses_clean2.mean()

                # for selected "unclean" samples, train in a label distribution
                # learning manner (exchange again)
                y_t1 = labels2learn1[indices, :].clone().to(device)
                y_t2 = labels2learn2[indices, :].clone().to(device)
                y_t1.requires_grad = True
                y_t2.requires_grad = True
                y_d1 = F.softmax(y_t1, dim=1) + 1e-8
                y_d2 = F.softmax(y_t2, dim=1) + 1e-8
                logits_unclean1 = logits1[sample_selection['unclean2']]
                logits_unclean2 = logits2[sample_selection['unclean1']]
                y_d_unclean1 = y_d1[sample_selection['unclean2']]
                y_d_unclean2 = y_d2[sample_selection['unclean1']]

                # mixup between unclean samples and randomly drawn clean ones
                w1 = np.random.beta(cfg.phi, cfg.phi, logits_unclean1.size(0))
                w2 = np.random.beta(cfg.phi, cfg.phi, logits_unclean2.size(0))
                w1 = x1.new(w1).view(logits_unclean1.size(0), 1, 1, 1)
                w2 = x2.new(w2).view(logits_unclean2.size(0), 1, 1, 1)
                idx1 = np.random.choice(
                    sample_selection['clean2'].cpu().numpy(),
                    logits_unclean1.size(0),
                    replace=False if sample_selection['clean2'].size(0) >=
                    logits_unclean1.size(0) else True)
                idx1 = torch.tensor(idx1).to(device)
                idx2 = np.random.choice(
                    sample_selection['clean1'].cpu().numpy(),
                    logits_unclean2.size(0),
                    replace=False if sample_selection['clean1'].size(0) >=
                    logits_unclean2.size(0) else True)
                idx2 = torch.tensor(idx2).to(device)
                mixed_x1 = w1 * x1[sample_selection['unclean2']] + (
                    1 - w1) * x1[idx1]
                mixed_x2 = w2 * x2[sample_selection['unclean1']] + (
                    1 - w2) * x2[idx2]
                mixed_y1 = w1 * y_d_unclean1 + (1 - w1) * y_d1[idx1]
                mixed_y2 = w2 * y_d_unclean2 + (1 - w2) * y_d2[idx2]
                mixed_output1 = net1(mixed_x1)
                mixed_output2 = net2(mixed_x2)
                mixed_logits1 = mixed_output1['logits']
                mixed_logits2 = mixed_output2['logits']
                loss_c1_2 = kl_div(
                    F.softmax(mixed_logits1, dim=1) + 1e-8, mixed_y1).mean()
                loss_c2_2 = kl_div(
                    F.softmax(mixed_logits2, dim=1) + 1e-8, mixed_y2).mean()
                loss_c1 = loss_c1_1 + loss_c1_2 * cfg.beta
                loss_c2 = loss_c2_1 + loss_c2_2 * cfg.beta

                # consistency loss
                loss_o1 = cross_entropy(
                    F.softmax(y_t1[sample_selection['clean2']], dim=1),
                    y[sample_selection['clean2']])
                loss_o2 = cross_entropy(
                    F.softmax(y_t2[sample_selection['clean1']], dim=1),
                    y[sample_selection['clean1']])

                # final loss
                loss1 = (1 - cfg.alpha) * loss_c1 + cfg.alpha * loss_o1
                loss2 = (1 - cfg.alpha) * loss_c2 + cfg.alpha * loss_o2

            train_acc1 = accuracy(logits1, y0, topk=(1,))
            train_acc2 = accuracy(logits2, y0, topk=(1,))
            train_loss1.update(loss1.item(), x1.size(0))
            train_loss2.update(loss2.item(), x2.size(0))
            train_accuracy1.update(train_acc1[0], x1.size(0))
            train_accuracy2.update(train_acc2[0], x2.size(0))

            if cfg.use_fp16:
                with amp.scale_loss(loss1, optimizer1) as scaled_loss1:
                    scaled_loss1.backward()
                with amp.scale_loss(loss2, optimizer2) as scaled_loss2:
                    scaled_loss2.backward()
            else:
                loss1.backward()
                loss2.backward()
            optimizer1.step()
            optimizer2.step()
            optimizer1.zero_grad()
            optimizer2.zero_grad()

            if epoch >= cfg.stage1:
                # Gradient step on the soft-label tables themselves, then
                # write the updated rows back to CPU storage.
                y_t1.data.sub_(cfg.lmd * y_t1.grad.data)
                y_t2.data.sub_(cfg.lmd * y_t2.grad.data)
                labels2learn1[indices, :] = y_t1.detach().clone().cpu().data
                labels2learn2[indices, :] = y_t2.detach().clone().cpu().data
                del y_t1, y_t2

            iter_time.update(time.time() - s, 1)
            if (cfg.log_freq is not None and
                    (it + 1) % cfg.log_freq == 0) or (it + 1
                                                      == len(train_loader)):
                total_mem = torch.cuda.get_device_properties(
                    0).total_memory / 2**30
                mem = torch.cuda.memory_reserved() / 2**30
                console_content = f"Epoch:[{epoch + 1:>3d}/{cfg.epochs:>3d}] " \
                                  f"Iter:[{it + 1:>4d}/{len(train_loader):>4d}] " \
                                  f"Train Accuracy 1:[{train_accuracy1.avg:6.2f}] " \
                                  f"Train Accuracy 2:[{train_accuracy2.avg:6.2f}] " \
                                  f"Loss 1:[{train_loss1.avg:4.4f}] " \
                                  f"Loss 2:[{train_loss2.avg:4.4f}] " \
                                  f"GPU-MEM:[{mem:6.3f}/{total_mem:6.3f} Gb] " \
                                  f"{iter_time.avg:6.2f} sec/iter"
                logger.debug(console_content)

        # evaluate this epoch
        test_accuracy1 = evaluate(test_loader, net1, device)
        test_accuracy2 = evaluate(test_loader, net2, device)
        if test_accuracy1 > best_accuracy1:
            best_accuracy1 = test_accuracy1
            best_epoch1 = epoch + 1
            torch.save(net1.state_dict(), f'{result_dir}/net1_best_epoch.pth')
        if test_accuracy2 > best_accuracy2:
            best_accuracy2 = test_accuracy2
            best_epoch2 = epoch + 1
            torch.save(net2.state_dict(), f'{result_dir}/net2_best_epoch.pth')

        # logging this epoch
        runtime = time.time() - start_time
        logger.info(f'epoch: {epoch + 1:>3d} | '
                    f'train loss(1/2): ({train_loss1.avg:>6.4f}/{train_loss2.avg:>6.4f}) | '
                    f'train accuracy(1/2): ({train_accuracy1.avg:>6.3f}/{train_accuracy2.avg:>6.3f}) | '
                    f'test accuracy(1/2): ({test_accuracy1:>6.3f}/{test_accuracy2:>6.3f}) | '
                    f'epoch runtime: {runtime:6.2f} sec | '
                    f'best accuracy(1/2): ({best_accuracy1:6.3f}/{best_accuracy2:6.3f}) @ epoch: ({best_epoch1:03d}/{best_epoch2:03d})')
        # NOTE(review): per-epoch plot/label dump placement reconstructed from
        # collapsed source — confirm it belongs inside the epoch loop.
        plot_results_cotraining(result_file=f'{result_dir}/log.txt')
        torch.save(labels2learn1, f'{result_dir}/labels_learned.pt')

    # rename results dir -----------------------------------------------------
    best_accuracy = max(best_accuracy1, best_accuracy2)
    os.rename(result_dir, f'{result_dir}-bestAcc_{best_accuracy:.4f}')