def run_main(self):
    """Fix the random seed"""
    np.random.seed(self.seed)
    torch.manual_seed(self.seed)

    """RandAugment parameters"""
    if self.flag_randaug == 1:
        if self.rand_n == 0 and self.rand_m == 0:
            if self.n_model == 'ResNet':
                self.rand_n = 2
                self.rand_m = 9
            elif self.n_model == 'WideResNet':
                if self.n_data == 'CIFAR-10':
                    self.rand_n = 3
                    self.rand_m = 5
                elif self.n_data == 'CIFAR-100':
                    self.rand_n = 2
                    self.rand_m = 14
                elif self.n_data == 'SVHN':
                    self.rand_n = 3
                    self.rand_m = 7

    """Dataset"""
    traintest_dataset = dataset.MyDataset_training(
        n_data=self.n_data, num_data=self.num_training_data, seed=self.seed,
        flag_randaug=self.flag_randaug, rand_n=self.rand_n, rand_m=self.rand_m,
        cutout=self.cutout)
    self.num_channel, self.num_classes, self.size_after_cnn, self.input_size, self.hidden_size = \
        traintest_dataset.get_info(n_data=self.n_data)

    n_samples = len(traintest_dataset)
    if self.num_training_data == 0:
        self.num_training_data = n_samples

    if self.flag_traintest == 1:
        # train_size = self.num_classes * 100
        train_size = int(n_samples * 0.65)
        test_size = n_samples - train_size

        train_dataset, test_dataset = torch.utils.data.random_split(
            traintest_dataset, [train_size, test_size])
        train_sampler = None
        test_sampler = None
    else:
        train_dataset = traintest_dataset
        test_dataset = dataset.MyDataset_test(n_data=self.n_data)
        train_sampler = train_dataset.sampler
        test_sampler = test_dataset.sampler

    num_workers = 16
    train_shuffle = True
    test_shuffle = False
    if train_sampler:
        train_shuffle = False
    if test_sampler:
        test_shuffle = False

    self.train_loader = torch.utils.data.DataLoader(
        dataset=train_dataset, batch_size=self.batch_size_training,
        sampler=train_sampler, shuffle=train_shuffle,
        num_workers=num_workers, pin_memory=True)
    self.test_loader = torch.utils.data.DataLoader(
        dataset=test_dataset, batch_size=self.batch_size_test,
        sampler=test_sampler, shuffle=test_shuffle,
        num_workers=num_workers, pin_memory=True)

    """Transfer learning"""
    if self.flag_transfer == 1:
        pretrained = True
        num_classes = 1000
    else:
        pretrained = False
        num_classes = self.num_classes

    """Neural network model"""
    model = None
    if self.n_model == 'CNN':
        model = cnn.ConvNet(num_classes=self.num_classes, num_channel=self.num_channel,
                            size_after_cnn=self.size_after_cnn, n_aug=self.n_aug)
    elif self.n_model == 'ResNet':
        model = resnet.ResNet(n_data=self.n_data, depth=50, num_classes=self.num_classes,
                              num_channel=self.num_channel, n_aug=self.n_aug, bottleneck=True)  # resnet50
        # model = resnet.ResNet(n_data=self.n_data, depth=200, num_classes=self.num_classes, num_channel=self.num_channel, bottleneck=True)  # resnet200
    elif self.n_model == 'WideResNet':
        # model = resnet.wide_resnet50_2(num_classes=self.num_classes, num_channel=self.num_channel)
        # model = WideResNet(depth=40, widen_factor=2, dropout_rate=0.0, num_classes=num_class, num_channel=self.num_channel)  # wresnet40_2
        model = wideresnet.WideResNet(depth=28, widen_factor=10, dropout_rate=0.0,
                                      num_classes=self.num_classes, num_channel=self.num_channel,
                                      n_aug=self.n_aug)  # wresnet28_10
    print(torch.__version__)

    """Transfer learning"""
    if self.flag_transfer == 1:
        for param in model.parameters():
            param.requires_grad = False

        num_features = model.fc.in_features
        model.fc = nn.Linear(num_features, self.num_classes)

    """Show parameters"""
    if self.show_params == 1:
        params = 0
        for p in model.parameters():
            if p.requires_grad:
                params += p.numel()
        print(params)

    """GPU setting"""
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = model.to(device)
    if device == 'cuda':
        if self.gpu_multi == 1:
            model = torch.nn.DataParallel(model)
        torch.backends.cudnn.benchmark = True
        print('GPU={}'.format(torch.cuda.device_count()))

    """Loss function"""
    if self.lb_smooth > 0.0:
        criterion = objective.SmoothCrossEntropyLoss(self.lb_smooth)
    else:
        criterion = objective.SoftCrossEntropy()

    """Optimizer"""
    optimizer = 0
    if self.opt == 0:  # Adam
        if self.flag_transfer == 1:
            optimizer = torch.optim.Adam(model.fc.parameters(), lr=0.001)
        else:
            optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    elif self.opt == 1:  # SGD
        if self.n_model == 'ResNet':
            lr = 0.1
            weight_decay = 0.0001
        elif self.n_model == 'WideResNet':
            if self.n_data == 'SVHN':
                lr = 0.005
                weight_decay = 0.001
            else:
                lr = 0.1
                weight_decay = 0.0005
        else:
            lr = 0.1
            weight_decay = 0.0005

        optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9,
                                    weight_decay=weight_decay, nesterov=True)

    if self.flag_lars == 1:
        from torchlars import LARS
        optimizer = LARS(optimizer)

    """Learning rate scheduling"""
    scheduler = None
    if self.flag_lr_schedule == 2:
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=self.num_epochs, eta_min=0.)
    elif self.flag_lr_schedule == 3:
        if self.num_epochs == 90:
            scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, [30, 60, 80])
        elif self.num_epochs == 180:
            scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, [60, 120, 160])
        elif self.num_epochs == 270:
            scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, [90, 180, 240])

    if self.flag_warmup == 1:
        if self.n_model == 'ResNet':
            multiplier = 2
            total_epoch = 3
        elif self.n_model == 'WideResNet':
            multiplier = 2
            if self.n_data == 'SVHN':
                total_epoch = 3
            else:
                total_epoch = 5
        else:
            multiplier = 2
            total_epoch = 3

        scheduler = GradualWarmupScheduler(optimizer, multiplier=multiplier,
                                           total_epoch=total_epoch, after_scheduler=scheduler)

    """Initialize"""
    self.flag_noise = np.random.randint(0, 2, self.num_training_data)

    if self.flag_acc5 == 1:
        results = np.zeros((self.num_epochs, 6))
    else:
        results = np.zeros((self.num_epochs, 5))
    start_time = timeit.default_timer()

    t = 0
    # fixed_interval = 10
    fixed_interval = 1
    loss_fixed_all = np.zeros(self.num_epochs // fixed_interval)
    self.loss_training_batch = np.zeros(
        int(self.num_epochs * np.ceil(self.num_training_data / self.batch_size_training)))

    for epoch in range(self.num_epochs):
        """Training"""
        model.train()
        start_epoch_time = timeit.default_timer()  # Get the start time of this epoch
        loss_each_all = np.zeros(self.num_training_data)
        loss_training_all = 0
        loss_test_all = 0

        """Learning rate scheduling"""
        # if self.flag_lr_schedule == 1:
        #     if self.num_epochs == 200:
        #         if epoch == 100:
        #             optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

        if self.flag_variance == 1:  # when computing the variance of training loss
            if epoch % fixed_interval == 0:
                loss_fixed = np.zeros(self.num_training_data // self.batch_size_training)
                for i, (images, labels, index) in enumerate(self.train_loader):
                    if np.array(images.data.cpu()).ndim == 3:
                        images = images.reshape(images.shape[0], 1, images.shape[1], images.shape[2]).to(device)
                    else:
                        images = images.to(device)
                    labels = labels.to(device)

                    flag_onehot = 0
                    if self.flag_spa == 1:
                        outputs_fixed = model.forward(images)
                        if flag_onehot == 0:
                            labels = np.identity(self.num_classes)[labels]
                    else:
                        outputs_fixed = model(images)
                        labels = np.identity(self.num_classes)[np.array(labels.data.cpu())]

                    labels = util.to_var(torch.from_numpy(labels).float())
                    loss_fixed[i] = criterion.forward(outputs_fixed, labels)

                loss_fixed_all[t] = np.var(loss_fixed)
                t += 1

        total_steps = len(self.train_loader)
        steps = 0
        num_training_data = 0
        for i, (images, labels, index) in enumerate(self.train_loader):
            steps += 1
            if np.array(images.data.cpu()).ndim == 3:
                images = images.reshape(images.shape[0], 1, images.shape[1], images.shape[2]).to(device)
            else:
                images = images.to(device)
            labels = labels.to(device)

            """Save images"""
            if self.save_images == 1:
                util.save_images(images)

            """Get training loss for each sample by inputting data before applying data augmentation"""
            if self.flag_spa == 1:
                outputs = model(images)

                labels_spa = labels.clone()
                if labels_spa.ndim == 1:
                    labels_spa = torch.eye(self.num_classes, device='cuda')[labels_spa].clone()  # To one-hot

                loss_each = criterion.forward_each_example(outputs, labels_spa)  # Loss for each sample
                loss_each_all[index] = np.array(loss_each.data.cpu())  # Put losses for target samples
                self.flag_noise = util.flag_update(loss_each_all, self.judge_noise)  # Update the noise flag

            """Forward propagation"""
            if self.flag_spa == 1:
                images, labels = util.self_paced_augmentation(
                    images=images, labels=labels, flag_noise=self.flag_noise,
                    index=np.array(index.data.cpu()), n_aug=self.n_aug,
                    num_classes=self.num_classes)
            else:
                images, labels = util.run_n_aug(
                    x=images, y=labels, n_aug=self.n_aug, num_classes=self.num_classes)

            outputs = model(images)

            if labels.ndim == 1:
                labels = torch.eye(self.num_classes, device='cuda')[labels].clone()

            loss_training = criterion.forward(outputs, labels)
            loss_training_all += loss_training.item() * outputs.shape[0]
            # self.loss_training_batch[int(i + epoch * np.ceil(self.num_training_data / self.batch_size_training))] = loss_training * outputs.shape[0]
            num_training_data += images.shape[0]

            """Back propagation and update"""
            optimizer.zero_grad()
            loss_training.backward()
            optimizer.step()

            """When changing flag_noise randomly"""
            """
            if self.flag_spa == 1:
                outputs = model.forward(x=images)

                if labels.ndim == 1:
                    y_soft = torch.eye(self.num_classes, device='cuda')[labels]  # Convert to one-hot
                loss_each = criterion.forward_each_example(outputs, y_soft)  # Loss for each sample
                loss_each_all[index] = np.array(loss_each.data.cpu())  # Put losses for target samples

                self.flag_noise = util.flag_update(loss_each_all, self.judge_noise)  # Update the noise flag
            """

        loss_training_each = loss_training_all / num_training_data
        # np.random.shuffle(self.flag_noise)

        """Test"""
        model.eval()
        with torch.no_grad():
            if self.flag_acc5 == 1:
                top1 = list()
                top5 = list()
            else:
                correct = 0
                total = 0
            num_test_data = 0

            for images, labels in self.test_loader:
                if np.array(images.data).ndim == 3:
                    images = images.reshape(images.shape[0], 1, images.shape[1], images.shape[2]).to(device)
                else:
                    images = images.to(device)
                labels = labels.to(device)

                outputs = model(x=images)

                if self.flag_acc5 == 1:
                    acc1, acc5 = util.accuracy(outputs.data, labels.long(), topk=(1, 5))
                    top1.append(acc1[0].item())
                    top5.append(acc5[0].item())
                else:
                    _, predicted = torch.max(outputs.data, 1)
                    correct += (predicted == labels.long()).sum().item()
                    total += labels.size(0)

                if labels.ndim == 1:
                    labels = torch.eye(self.num_classes, device='cuda')[labels]
                loss_test = criterion.forward(outputs, labels)
                loss_test_all += loss_test.item() * outputs.shape[0]
                num_test_data += images.shape[0]

            """Compute test results"""
            top1_avg = 0
            top5_avg = 0
            test_accuracy = 0
            if self.flag_acc5 == 1:
                top1_avg = sum(top1) / float(len(top1))
                top5_avg = sum(top5) / float(len(top5))
            else:
                test_accuracy = 100.0 * correct / total
            loss_test_each = loss_test_all / num_test_data

        """Compute running time"""
        end_epoch_time = timeit.default_timer()
        epoch_time = end_epoch_time - start_epoch_time

        num_flag = np.sum(self.flag_noise == 1)

        """Learning rate scheduling"""
        if self.flag_lr_schedule > 1 and scheduler is not None:
            scheduler.step(epoch - 1 + float(steps) / total_steps)

        """Show results"""
        flag_log = 1
        if flag_log == 1:
            if self.flag_acc5 == 1:
                print('Epoch [{}/{}], Train Loss: {:.4f}, Top1 Test Acc: {:.3f} %, Top5 Test Acc: {:.3f} %, Test Loss: {:.4f}, Epoch Time: {:.2f}s, Num_flag: {}'
                      .format(epoch + 1, self.num_epochs, loss_training_each, top1_avg, top5_avg,
                              loss_test_each, epoch_time, num_flag))
            else:
                print('Epoch [{}/{}], Train Loss: {:.4f}, Test Acc: {:.3f} %, Test Loss: {:.4f}, Epoch Time: {:.2f}s, Num_flag: {}'
                      .format(epoch + 1, self.num_epochs, loss_training_each, test_accuracy,
                              loss_test_each, epoch_time, num_flag))

        if self.flag_wandb == 1:
            wandb.log({"loss_training_each": loss_training_each})
            wandb.log({"test_accuracy": test_accuracy})
            wandb.log({"loss_test_each": loss_test_each})
            wandb.log({"num_flag": num_flag})
            wandb.log({"epoch_time": epoch_time})

        if self.save_file == 1:
            if flag_log == 1:
                if self.flag_acc5 == 1:
                    results[epoch][0] = loss_training_each
                    results[epoch][1] = top1_avg
                    results[epoch][2] = top5_avg
                    results[epoch][3] = loss_test_each
                    results[epoch][4] = num_flag
                    results[epoch][5] = epoch_time
                else:
                    results[epoch][0] = loss_training_each
                    results[epoch][1] = test_accuracy
                    results[epoch][2] = loss_test_each
                    results[epoch][3] = num_flag
                    results[epoch][4] = epoch_time

    end_time = timeit.default_timer()

    flag_log = 1
    if flag_log == 1:
        print(' ran for %.4fm' % ((end_time - start_time) / 60.))

    """Show accuracy"""
    top1_avg_max = np.max(results[:, 1])
    print(top1_avg_max)
    if flag_log == 1 and self.flag_acc5 == 1:
        top5_avg_max = np.max(results[:, 2])
        print(top5_avg_max)

    """Save files"""
    if self.save_file == 1:
        if self.flag_randaug == 1:
            np.savetxt('results/data_%s_model_%s_num_%s_randaug_%s_n_%s_m_%s_seed_%s_acc_%s.csv'
                       % (self.n_data, self.n_model, self.num_training_data, self.flag_randaug,
                          self.rand_n, self.rand_m, self.seed, top1_avg_max),
                       results, delimiter=',')
        else:
            if self.flag_spa == 1:
                np.savetxt('results/data_%s_model_%s_judge_%s_aug_%s_num_%s_seed_%s_acc_%s.csv'
                           % (self.n_data, self.n_model, self.judge_noise, self.n_aug,
                              self.num_training_data, self.seed, top1_avg_max),
                           results, delimiter=',')
            else:
                np.savetxt('results/data_%s_model_%s_aug_%s_num_%s_seed_%s_acc_%s.csv'
                           % (self.n_data, self.n_model, self.n_aug, self.num_training_data,
                              self.seed, top1_avg_max),
                           results, delimiter=',')

        if self.flag_variance == 1:
            np.savetxt('results/loss_variance_judge_%s_aug_%s_acc_%s.csv'
                       % (self.judge_noise, self.n_aug, top1_avg_max),
                       loss_fixed_all, delimiter=',')
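# The training loop above relies on util.flag_update(loss_each_all, self.judge_noise) to decide
# which samples are flagged for self-paced augmentation. Its real implementation lives in util.py
# and is not shown here; the helper below is only an illustrative sketch of one plausible rule
# (flag samples whose last recorded loss exceeds the judge_noise threshold), assuming numpy is
# imported as np. It is not the repository's actual code.
def flag_update_sketch(loss_each_all, judge_noise):
    """Return a 0/1 flag per sample: 1 if its per-sample loss exceeds the threshold."""
    return (np.asarray(loss_each_all) > judge_noise).astype(np.int64)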
def train(self, dataloader, temperature, ckpt_path, n_epochs=90, save_size=10):
    # trainers
    criterion = NTCrossEntropyLoss(temperature, self.batch_size, self.device).to(self.device)
    optimizer = LARS(torch.optim.SGD(self.model.parameters(), lr=4))
    # optimizer = optimizer.to(self.device)

    losses = []
    for epoch in range(n_epochs):
        with tqdm(total=len(dataloader)) as progress:
            running_loss = 0
            i = 0
            for (xis, xjs), _ in dataloader:
                i += 1
                optimizer.zero_grad()

                xis = xis.to(self.device)
                xjs = xjs.to(self.device)

                # Get representations and projections
                his, zis = self.model(xis)
                hjs, zjs = self.model(xjs)

                # normalize
                zis = F.normalize(zis, dim=1)
                zjs = F.normalize(zjs, dim=1)

                loss = criterion(zis, zjs)
                running_loss += loss.item()

                # optimize
                loss.backward()
                optimizer.step()

                # update tqdm
                progress.set_description('train loss:{:.4f}'.format(loss.item()))
                progress.update()

                # record loss
                if i % save_size == (save_size - 1):
                    losses.append(running_loss / save_size)
                    running_loss = 0

        # save model
        if epoch % 10 == 0:
            torch.save({
                'epoch': epoch,
                'model_state_dict': self.model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': losses,
            }, ckpt_path + f'{epoch}')

    return self.return_model(), losses
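# Hypothetical usage of the contrastive training loop above (object and path names are assumptions,
# not from the source). Each dataloader batch must yield two augmented views (xis, xjs) of the same
# images, and self.model must return (representation, projection) pairs.
# trainer = SimCLRTrainer(...)  # assumed wrapper holding self.model, self.batch_size, self.device
# model, losses = trainer.train(dataloader, temperature=0.5,
#                               ckpt_path='./checkpoints/simclr_epoch_', n_epochs=90)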
def train(cfg, writer, logger):
    # Setup random seeds to a determined value for reproduction
    # seed = 1337
    # torch.manual_seed(seed)
    # torch.cuda.manual_seed(seed)
    # np.random.seed(seed)
    # random.seed(seed)
    # np.random.default_rng(seed)

    # Setup Augmentations
    augmentations = cfg.train.augment
    logger.info(f'using augments: {augmentations}')
    data_aug = get_composed_augmentations(augmentations)

    # Setup Dataloader
    data_loader = get_loader(cfg.data.dataloader)
    data_path = cfg.data.path
    logger.info("Using dataset: {}".format(data_path))

    t_loader = data_loader(
        data_path,
        # transform=None,
        # time_shuffle = cfg.data.time_shuffle,
        # to_tensor=False,
        data_format=cfg.data.format,
        split=cfg.data.train_split,
        norm=cfg.data.norm,
        augments=data_aug
    )
    v_loader = data_loader(
        data_path,
        # transform=None,
        # time_shuffle = cfg.data.time_shuffle,
        # to_tensor=False,
        data_format=cfg.data.format,
        split=cfg.data.val_split,
    )

    train_data_len = len(t_loader)
    logger.info(f'num of train samples: {train_data_len} \nnum of val samples: {len(v_loader)}')

    batch_size = cfg.train.batch_size
    epoch = cfg.train.epoch
    train_iter = int(np.ceil(train_data_len / batch_size) * epoch)
    logger.info(f'total train iter: {train_iter}')

    trainloader = data.DataLoader(t_loader,
                                  batch_size=batch_size,
                                  num_workers=cfg.train.n_workers,
                                  shuffle=True,
                                  persistent_workers=True,
                                  drop_last=True)
    valloader = data.DataLoader(v_loader,
                                batch_size=10,
                                # persis
                                num_workers=cfg.train.n_workers,)

    # Setup Model
    device = f'cuda:{cfg.gpu[0]}'
    model = get_model(cfg.model, 2).to(device)
    input_size = (cfg.model.input_nbr, 512, 512)
    logger.info(f"Using Model: {cfg.model.arch}")
    # logger.info(f'model summary: {summary(model, input_size=(input_size, input_size), is_complex=True)}')
    model = torch.nn.DataParallel(model, device_ids=cfg.gpu)  # runs on multiple GPUs automatically; this works well

    # Setup optimizer, lr_scheduler and loss function
    optimizer_cls = get_optimizer(cfg)
    optimizer_params = {k: v for k, v in vars(cfg.train.optimizer).items()
                        if k not in ('name', 'wrap')}
    optimizer = optimizer_cls(model.parameters(), **optimizer_params)
    logger.info("Using optimizer {}".format(optimizer))
    if hasattr(cfg.train.optimizer, 'wrap') and cfg.train.optimizer.wrap == 'lars':
        optimizer = LARS(optimizer=optimizer)
        logger.info(f'wrap optimizer with {cfg.train.optimizer.wrap}')

    scheduler = get_scheduler(optimizer, cfg.train.lr)
    loss_fn = get_loss_function(cfg)
    logger.info(f"Using loss {str(cfg.train.loss)}")

    # load checkpoints
    val_cls_1_acc = 0
    best_cls_1_acc_now = 0
    best_cls_1_acc_iter_now = 0
    val_macro_OA = 0
    best_macro_OA_now = 0
    best_macro_OA_iter_now = 0
    start_iter = 0
    if cfg.train.resume is not None:
        if os.path.isfile(cfg.train.resume):
            logger.info(
                "Loading model and optimizer from checkpoint '{}'".format(cfg.train.resume)
            )

            # load model state
            checkpoint = torch.load(cfg.train.resume)
            model.load_state_dict(checkpoint["model_state"])
            optimizer.load_state_dict(checkpoint["optimizer_state"])
            scheduler.load_state_dict(checkpoint["scheduler_state"])
            # best_cls_1_acc_now = checkpoint["best_cls_1_acc_now"]
            # best_cls_1_acc_iter_now = checkpoint["best_cls_1_acc_iter_now"]
            start_iter = checkpoint["epoch"]
            logger.info(
                "Loaded checkpoint '{}' (iter {})".format(
                    cfg.train.resume, checkpoint["epoch"]
                )
            )

            # copy tensorboard files
            resume_src_dir = osp.split(cfg.train.resume)[0]
            # shutil.copytree(resume_src_dir, writer.get_logdir())
            for file in os.listdir(resume_src_dir):
                if not ('.log' in file or '.yml' in file or '_last_model' in file):
                    # if 'events.out.tfevents' in file:
                    resume_dst_dir = writer.get_logdir()
                    fu.copy(osp.join(resume_src_dir, file), resume_dst_dir)
        else:
            logger.info("No checkpoint found at '{}'".format(cfg.train.resume))

    # Setup Metrics
    running_metrics_val = runningScore(2)
    runing_metrics_train = runningScore(2)
    val_loss_meter = averageMeter()
    train_time_meter = averageMeter()

    # train
    it = start_iter
    train_start_time = time.time()
    train_val_start_time = time.time()
    model.train()
    while it < train_iter:
        for (file_a, file_b, label, mask) in trainloader:
            it += 1
            file_a = file_a.to(device)
            file_b = file_b.to(device)
            label = label.to(device)
            mask = mask.to(device)

            optimizer.zero_grad()
            # print(f'dtype: {file_a.dtype}')
            outputs = model(file_a, file_b)
            loss = loss_fn(input=outputs, target=label, mask=mask)
            loss.backward()
            # print('conv11: ', model.conv11.weight.grad, model.conv11.weight.grad.shape)
            # print('conv21: ', model.conv21.weight.grad, model.conv21.weight.grad.shape)
            # print('conv31: ', model.conv31.weight.grad, model.conv31.weight.grad.shape)

            # In PyTorch 1.1.0 and later, you should call `optimizer.step()` before `lr_scheduler.step()`
            optimizer.step()
            scheduler.step()

            # record the acc of the minibatch
            pred = outputs.max(1)[1].cpu().numpy()
            runing_metrics_train.update(label.cpu().numpy(), pred, mask.cpu().numpy())

            train_time_meter.update(time.time() - train_start_time)

            if it % cfg.train.print_interval == 0:
                # acc of the samples between print_interval
                score, _ = runing_metrics_train.get_scores()
                train_cls_0_acc, train_cls_1_acc = score['Acc']
                fmt_str = "Iter [{:d}/{:d}] train Loss: {:.4f} Time/Image: {:.4f},\n0:{:.4f}\n1:{:.4f}"
                print_str = fmt_str.format(it, train_iter,
                                           loss.item(),  # extracts the loss's value as a Python float
                                           train_time_meter.avg / cfg.train.batch_size,
                                           train_cls_0_acc, train_cls_1_acc)
                runing_metrics_train.reset()
                train_time_meter.reset()
                logger.info(print_str)

                writer.add_scalar('loss/train_loss', loss.item(), it)
                writer.add_scalars('metrics/train', {'cls_0': train_cls_0_acc, 'cls_1': train_cls_1_acc}, it)
                # writer.add_scalar('train_metrics/acc/cls_0', train_cls_0_acc, it)
                # writer.add_scalar('train_metrics/acc/cls_1', train_cls_1_acc, it)

            if it % cfg.train.val_interval == 0 or it == train_iter:
                val_start_time = time.time()
                model.eval()  # change behavior like drop out
                with torch.no_grad():  # disable autograd, save memory usage
                    for (file_a_val, file_b_val, label_val, mask_val) in valloader:
                        file_a_val = file_a_val.to(device)
                        file_b_val = file_b_val.to(device)

                        outputs = model(file_a_val, file_b_val)
                        # tensor.max() returns the maximum value and its indices
                        pred = outputs.max(1)[1].cpu().numpy()
                        running_metrics_val.update(label_val.numpy(), pred, mask_val.numpy())

                        label_val = label_val.to(device)
                        mask_val = mask_val.to(device)
                        val_loss = loss_fn(input=outputs, target=label_val, mask=mask_val)
                        val_loss_meter.update(val_loss.item())

                    score, _ = running_metrics_val.get_scores()
                    val_cls_0_acc, val_cls_1_acc = score['Acc']

                    writer.add_scalar('loss/val_loss', val_loss_meter.avg, it)
                    logger.info(f"Iter [{it}/{train_iter}], val Loss: {val_loss_meter.avg:.4f} Time/Image: {(time.time()-val_start_time)/len(v_loader):.4f}\n0: {val_cls_0_acc:.4f}\n1:{val_cls_1_acc:.4f}")
                    # lr_now = optimizer.param_groups[0]['lr']
                    # logger.info(f'lr: {lr_now}')
                    # writer.add_scalar('lr', lr_now, it+1)
                    logger.info('0: {:.4f}\n1:{:.4f}'.format(val_cls_0_acc, val_cls_1_acc))
                    writer.add_scalars('metrics/val', {'cls_0': val_cls_0_acc, 'cls_1': val_cls_1_acc}, it)
                    # writer.add_scalar('val_metrics/acc/cls_0', val_cls_0_acc, it)
                    # writer.add_scalar('val_metrics/acc/cls_1', val_cls_1_acc, it)

                    val_loss_meter.reset()
                    running_metrics_val.reset()

                    # OA=score["Overall_Acc"]
                    val_macro_OA = (val_cls_0_acc + val_cls_1_acc) / 2
                    if val_macro_OA >= best_macro_OA_now and it > 200:
                        best_macro_OA_now = val_macro_OA
                        best_macro_OA_iter_now = it
                        state = {
                            "epoch": it,
                            "model_state": model.state_dict(),
                            "optimizer_state": optimizer.state_dict(),
                            "scheduler_state": scheduler.state_dict(),
                            "best_macro_OA_now": best_macro_OA_now,
                            'best_macro_OA_iter_now': best_macro_OA_iter_now,
                        }
                        save_path = os.path.join(writer.file_writer.get_logdir(),
                                                 "{}_{}_best_model.pkl".format(cfg.model.arch, cfg.data.dataloader))
                        torch.save(state, save_path)

                        logger.info("best OA now = %.8f" % (best_macro_OA_now))
                        logger.info("best OA iter now = %d" % (best_macro_OA_iter_now))

                train_val_time = time.time() - train_val_start_time
                remain_time = train_val_time * (train_iter - it) / it
                m, s = divmod(remain_time, 60)
                h, m = divmod(m, 60)
                if s != 0:
                    train_time = "Remain train time = %d hours %d minutes %d seconds \n" % (h, m, s)
                else:
                    train_time = "Remain train time : train completed.\n"
                logger.info(train_time)
                model.train()

            train_start_time = time.time()

    logger.info("best OA now = %.8f" % (best_macro_OA_now))
    logger.info("best OA iter now = %d" % (best_macro_OA_iter_now))

    state = {
        "epoch": it,
        "model_state": model.state_dict(),
        "optimizer_state": optimizer.state_dict(),
        "scheduler_state": scheduler.state_dict(),
        "best_macro_OA_now": best_macro_OA_now,
        'best_macro_OA_iter_now': best_macro_OA_iter_now,
    }
    save_path = os.path.join(writer.file_writer.get_logdir(),
                             "{}_{}_last_model.pkl".format(cfg.model.arch, cfg.data.dataloader))
    torch.save(state, save_path)
class task2_train():
    def __init__(self, device, train_loader, val_loader, model, MODEL_PATH):
        self.device = device
        self.model_path = MODEL_PATH
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.model = model.to(device)
        base_optimizer = optim.Adam(self.model.parameters(), lr=2e-4)
        self.optimizer = LARS(optimizer=base_optimizer, eps=1e-8, trust_coef=0.001)
        self.criterion = nn.BCEWithLogitsLoss()
        self.scaler = GradScaler()

    def train(self):
        self.model.train()
        train_loss = 0
        label_pred = []
        label_true = []
        AC_total, SE_total, SP_total, DC_total, JS_total = 0, 0, 0, 0, 0
        start = time.time()
        for batch_idx, (inputs, GT) in enumerate(self.train_loader):
            if batch_idx % 10 == 0 and batch_idx:
                end = time.time()
                print("finished training %s images, using %.4fs" % (str(batch_idx * 8), end - start))
            inputs, GT = inputs.to(self.device), GT.to(self.device)
            self.optimizer.zero_grad()
            with autocast():
                SR = self.model(inputs)
                loss = loss_f(inputs=SR, targets=GT.long())
            self.scaler.scale(loss).backward()
            self.scaler.step(self.optimizer)
            self.scaler.update()
            train_loss += loss.item()
            label_pred.append(SR.data.max(dim=1)[1].cpu().numpy())
            label_true.append(GT.data.cpu().numpy())

        label_pred = np.concatenate(label_pred, axis=0)
        label_true = np.concatenate(label_true, axis=0)
        for lbt, lbp in zip(label_true, label_pred):
            AC, SE, SP, DC, JS = self.evaluate(lbt, lbp)
            AC_total += AC
            SE_total += SE
            SP_total += SP
            DC_total += DC
            JS_total += JS
        print('Training Loss: %.4f\nAC: %.4f, SE: %.4f, SP: %.4f, DC: %.4f, JS: %.4f' %
              (train_loss, AC_total / len(label_true), SE_total / len(label_true),
               SP_total / len(label_true), DC_total / len(label_true), JS_total / len(label_true)))

    def _fast_hist(self, truth, pred):
        mask = (truth >= 0) & (truth < 19)
        hist = np.bincount(19 * truth[mask].astype(int) + pred[mask],
                           minlength=19**2).reshape(19, 19)
        return hist

    def evaluate(self, ground_truth, predictions, smooth=1):
        confusion_matrix = np.zeros((19, 19))
        for lt, lp in zip(ground_truth, predictions):
            confusion_matrix += self._fast_hist(lt.flatten(), lp.flatten())
        fn = confusion_matrix.sum(1) - np.diag(confusion_matrix)
        fp = confusion_matrix.sum(0) - np.diag(confusion_matrix)
        tp = np.diag(confusion_matrix)
        tn = np.array([confusion_matrix.sum() for i in range(19)]) \
            - confusion_matrix.sum(1) - confusion_matrix.sum(0) + np.diag(confusion_matrix)

        AC_array = (tp + tn) / np.maximum(1.0, fn + fp + tp + tn)
        AC = AC_array.mean()
        SE_array = tp / np.maximum(1.0, confusion_matrix.sum(1))
        SE = SE_array.mean()
        SP_array = tn / np.maximum(1.0, tn + fp)
        SP = SP_array.mean()
        DC_array = (2 * tp) / np.maximum(1.0, confusion_matrix.sum(0) + confusion_matrix.sum(1))
        DC = DC_array.mean()
        JS_array = tp / np.maximum(1.0, confusion_matrix.sum(0) + confusion_matrix.sum(1) - np.diag(confusion_matrix))
        JS = JS_array.mean()
        return AC, SE, SP, DC, JS

    def saveModel(self):
        torch.save(self.model.state_dict(), self.model_path + "model.pth")
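# Hypothetical usage of task2_train (loader/model construction is assumed, not from the source).
# The class wraps Adam in torchlars' LARS and trains with mixed precision via autocast/GradScaler.
# trainer = task2_train(device, train_loader, val_loader, model, MODEL_PATH='./weights/')
# for epoch in range(num_epochs):
#     trainer.train()
# trainer.saveModel()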
for epoch in range(100):
    progress.update(t_epoch, advance=1)
    progress.reset(t_ds)
    progress.reset(t_test)

    for images, labels in dl:
        images = images.to(config.device).expand(-1, 3, -1, -1)
        labels = labels.to(config.device)

        classes = eff_net(images)
        loss = cross_entropy(classes, labels)
        losses.append(loss.item())
        progress.update(t_ds, advance=1, description=f'[magenta] {mean(losses):.5f}')

        optim.zero_grad()
        loss.backward()
        optim.step()

    correct = 0
    total = 0
    for images, labels in test:
        images = images.to(config.device).expand(-1, 3, -1, -1)
        labels = labels.to(config.device)

        classes = eff_net(images)
        classes = torch.argmax(classes, dim=1)
        for label, cls in zip(classes, labels):
            if label == cls:
                correct += 1
            total += 1

        progress.update(t_test, advance=1)