# Imports assumed from a pytorch-deeplab-xception style layout; adjust the
# module paths to the actual repository.
import os

import numpy as np
import torch
from tqdm import tqdm

from mypath import Path
from dataloaders import make_data_loader
from utils.saver import Saver
from utils.summaries import TensorboardSummary
from utils.loss import SegmentationLosses
from utils.calculate_weights import calculate_weigths_labels
from utils.metrics import Evaluator
from utils.lr_scheduler import LR_Scheduler
from modeling.sync_batchnorm.replicate import patch_replication_callback
from auto_deeplab import AutoDeeplab
from architect import Architect


class Trainer(object):
    def __init__(self, args):
        self.args = args

        # Define Saver
        self.saver = Saver(args)
        self.saver.save_experiment_config()

        # Define Tensorboard Summary
        self.summary = TensorboardSummary(self.saver.experiment_dir)
        self.writer = self.summary.create_summary()

        # Define Dataloader
        kwargs = {'num_workers': args.workers, 'pin_memory': True}
        # self.train_loader1, self.train_loader2, self.val_loader, self.test_loader, self.nclass = make_data_loader(args, **kwargs)
        self.train_loader1, self.train_loader2, self.val_loader, self.nclass = make_data_loader(args, **kwargs)

        # Define Criterion: whether to use class-balanced weights
        if args.use_balanced_weights:
            classes_weights_path = os.path.join(Path.db_root_dir(args.dataset),
                                                args.dataset + '_classes_weights.npy')
            if os.path.isfile(classes_weights_path):
                weight = np.load(classes_weights_path)
            else:
                # the original referenced the removed `self.train_loader`;
                # the first split is the weight-estimation loader here
                weight = calculate_weigths_labels(args.dataset, self.train_loader1, self.nclass)
            weight = torch.from_numpy(weight.astype(np.float32))
        else:
            weight = None
        self.criterion = SegmentationLosses(weight=weight, cuda=args.cuda).build_loss(mode=args.loss_type)

        # Define network and optimizer
        model = AutoDeeplab(self.nclass, 12, self.criterion, crop_size=self.args.crop_size)
        optimizer = torch.optim.SGD(
            model.parameters(),
            args.lr,
            momentum=args.momentum,
            weight_decay=args.weight_decay
        )
        self.model, self.optimizer = model, optimizer

        # Using cuda
        if args.cuda:
            self.model = torch.nn.DataParallel(self.model, device_ids=self.args.gpu_ids)
            patch_replication_callback(self.model)
            self.model = self.model.cuda()
            print('cuda finished')

        # Define Evaluator
        self.evaluator = Evaluator(self.nclass)

        # Define lr scheduler
        self.scheduler = LR_Scheduler(args.lr_scheduler, args.lr, args.epochs,
                                      len(self.train_loader1))

        self.architect = Architect(self.model, args)

        # Resuming checkpoint
        self.best_pred = 0.0
        if args.resume is not None:
            if not os.path.isfile(args.resume):
                raise RuntimeError("=> no checkpoint found at '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            # the original had identical cuda/cpu branches here; one call suffices
            self.model.load_state_dict(checkpoint['state_dict'])
            if not args.ft:
                self.optimizer.load_state_dict(checkpoint['optimizer'])
            self.best_pred = checkpoint['best_pred']
            print("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))

        # Clear start epoch if fine-tuning
        if args.ft:
            args.start_epoch = 0

    def training(self, epoch):
        train_loss = 0.0
        self.model.train()
        tbar = tqdm(self.train_loader1)
        num_img_tr = len(self.train_loader1)
        for i, sample in enumerate(tbar):
            image, target = sample['image'], sample['label']
            # draw a fresh architecture-search batch from the second split
            search = next(iter(self.train_loader2))
            image_search, target_search = search['image'], search['label']
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()
                image_search, target_search = image_search.cuda(), target_search.cuda()

            self.scheduler(self.optimizer, i, epoch, self.best_pred)
            self.optimizer.zero_grad()
            output = self.model(image)
            torch.cuda.empty_cache()
            loss = self.criterion(output, target)
            loss.backward()
            self.optimizer.step()

            # only start updating the architecture after a warm-up period
            if epoch > 19:
                self.architect.step(image_search, target_search)

            train_loss += loss.item()
            tbar.set_description('Train loss: %.3f' % (train_loss / (i + 1)))
            self.writer.add_scalar('train/total_loss_iter', loss.item(), i + num_img_tr * epoch)

            # Show 10 * 3 inference results each epoch
            # (max(1, ...) guards against division by zero on tiny loaders)
            if i % max(1, num_img_tr // 10) == 0:
                global_step = i + num_img_tr * epoch
                self.summary.visualize_image(self.writer, self.args.dataset,
                                             image, target, output, global_step)

        self.writer.add_scalar('train/total_loss_epoch', train_loss, epoch)
        print('[Epoch: %d, numImages: %5d]' % (epoch, i * self.args.batch_size + image.data.shape[0]))
        print('Loss: %.3f' % train_loss)

        if self.args.no_val:
            # save checkpoint every epoch
            is_best = False
            self.saver.save_checkpoint({
                'epoch': epoch + 1,
                'state_dict': self.model.state_dict(),
                'optimizer': self.optimizer.state_dict(),
                'best_pred': self.best_pred,
            }, is_best)

    def validation(self, epoch):
        self.model.eval()
        self.evaluator.reset()
        tbar = tqdm(self.val_loader, desc='\r')
        test_loss = 0.0
        for i, sample in enumerate(tbar):
            image, target = sample['image'], sample['label']
            if self.args.cuda:
                image, target = image.cuda(), target.cuda()
            with torch.no_grad():
                output = self.model(image)
            loss = self.criterion(output, target)
            test_loss += loss.item()
            tbar.set_description('Test loss: %.3f' % (test_loss / (i + 1)))
            pred = output.data.cpu().numpy()
            target = target.cpu().numpy()
            pred = np.argmax(pred, axis=1)
            # Add batch sample into evaluator
            self.evaluator.add_batch(target, pred)

        # Fast test during the training
        Acc = self.evaluator.Pixel_Accuracy()
        Acc_class = self.evaluator.Pixel_Accuracy_Class()
        mIoU = self.evaluator.Mean_Intersection_over_Union()
        FWIoU = self.evaluator.Frequency_Weighted_Intersection_over_Union()
        self.writer.add_scalar('val/total_loss_epoch', test_loss, epoch)
        self.writer.add_scalar('val/mIoU', mIoU, epoch)
        self.writer.add_scalar('val/Acc', Acc, epoch)
        self.writer.add_scalar('val/Acc_class', Acc_class, epoch)
        self.writer.add_scalar('val/fwIoU', FWIoU, epoch)
        print('Validation:')
        print('[Epoch: %d, numImages: %5d]' % (epoch, i * self.args.batch_size + image.data.shape[0]))
        print("Acc:{}, Acc_class:{}, mIoU:{}, fwIoU: {}".format(Acc, Acc_class, mIoU, FWIoU))
        print('Loss: %.3f' % test_loss)

        new_pred = mIoU
        if new_pred > self.best_pred:
            is_best = True
            self.best_pred = new_pred
            self.saver.save_checkpoint({
                'epoch': epoch + 1,
                'state_dict': self.model.state_dict(),
                'optimizer': self.optimizer.state_dict(),
                'best_pred': self.best_pred,
            }, is_best)
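# ---------------------------------------------------------------------------
# A minimal driver sketch (not part of the original file): it shows how the
# Trainer above is typically wired up, alternating a training epoch with a
# validation epoch. `obtain_search_args()` is a hypothetical argument parser;
# the actual repository may construct `args` differently.
def main():
    args = obtain_search_args()              # hypothetical helper
    args.cuda = torch.cuda.is_available()
    trainer = Trainer(args)
    for epoch in range(args.start_epoch, args.epochs):
        trainer.training(epoch)
        if not args.no_val:
            trainer.validation(epoch)
    trainer.writer.close()


if __name__ == '__main__':
    main()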
    def search(self, train_x, train_y, valid_x, valid_y, metadata):
        np.random.seed(self.seed)
        torch.manual_seed(self.seed)
        torch.cuda.manual_seed(self.seed)
        cudnn.benchmark = True
        cudnn.enabled = True
        helpers.helper_function()

        n_classes = metadata['n_classes']

        # reshape it to this dataset
        # model = torchvision.models.resnet18()
        # model.conv1 = nn.Conv2d(train_x.shape[1], 64, kernel_size=(7, 7), stride=1, padding=3)
        # model.fc = nn.Linear(model.fc.in_features, n_classes, bias=True)
        # return model

        if not torch.cuda.is_available():
            logging.info('no gpu device available')
            sys.exit(1)

        criterion = nn.CrossEntropyLoss()
        criterion = criterion.cuda()
        model = Network(self.init_channels, n_classes, self.layers, criterion)
        model = model.cuda()
        logging.info("param size = %fMB", utils.count_parameters_in_MB(model))

        optimizer = torch.optim.SGD(model.parameters(),
                                    self.learning_rate,
                                    momentum=self.momentum,
                                    weight_decay=self.weight_decay)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, float(self.epochs), eta_min=self.learning_rate_min)

        architect = Architect(model)

        train_pack = list(zip(train_x, train_y))
        valid_pack = list(zip(valid_x, valid_y))
        train_loader = torch.utils.data.DataLoader(train_pack, int(self.batch_size), shuffle=False)
        valid_loader = torch.utils.data.DataLoader(valid_pack, int(self.batch_size))

        for epoch in range(self.epochs):
            scheduler.step()
            lr = scheduler.get_lr()[0]
            logging.info('epoch %d lr %e', epoch, lr)

            genotype = model.genotype()
            logging.info('genotype = %s', genotype)

            # print(F.softmax(model.alphas_normal, dim=-1))
            # print(F.softmax(model.alphas_reduce, dim=-1))

            # training
            print("++++++Start training+++++++")
            for step, (input, target) in enumerate(train_loader):
                model.train()
                n = input.size(0)
                input = Variable(input, requires_grad=False).cuda()
                target = Variable(target, requires_grad=False).cuda(non_blocking=True)

                # get a random minibatch from the search queue with replacement
                input_search, target_search = next(iter(valid_loader))
                input_search = Variable(input_search, requires_grad=False).cuda()
                target_search = Variable(target_search, requires_grad=False).cuda(non_blocking=True)

                architect.step(input, target, input_search, target_search, lr,
                               optimizer, unrolled=self.unrolled)

                optimizer.zero_grad()
                logits = model(input)
                loss = criterion(logits, target)
                loss.backward()
                # clip_grad_norm_ is the in-place, non-deprecated variant
                nn.utils.clip_grad_norm_(model.parameters(), self.grad_clip)
                optimizer.step()

                if step % self.report_freq == 0:
                    prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5))
                    print(step, loss, prec1, prec5)

            # validation
            print("++++++Start validation+++++++")
            with torch.no_grad():
                for step, (input, target) in enumerate(valid_loader):
                    input = Variable(input).cuda()
                    target = Variable(target).cuda(non_blocking=True)
                    model.eval()
                    logits = model(input)
                    loss = criterion(logits, target)

                    if step % self.report_freq == 0:
                        prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5))
                        print(step, loss, prec1, prec5)

        return model
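# ---------------------------------------------------------------------------
# `architect.step(...)` above performs the DARTS bilevel update of the
# architecture parameters alpha. As a reference, here is a minimal
# first-order sketch (i.e. unrolled=False, no second-order correction).
# It assumes the standard DARTS accessors `model.arch_parameters()` and
# `model._loss(x, y)`; the Adam hyperparameters follow the DARTS defaults.
import torch


class FirstOrderArchitect:
    def __init__(self, model, arch_lr=3e-4, arch_weight_decay=1e-3):
        self.model = model
        self.optimizer = torch.optim.Adam(
            model.arch_parameters(), lr=arch_lr,
            betas=(0.5, 0.999), weight_decay=arch_weight_decay)

    def step(self, input_valid, target_valid):
        # Minimize the validation loss w.r.t. alpha while holding the
        # network weights w fixed (they are updated by the outer SGD step).
        self.optimizer.zero_grad()
        loss = self.model._loss(input_valid, target_valid)
        loss.backward()
        self.optimizer.step()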
def train():
    use_gpu = cfg.MODEL.DEVICE == "cuda"

    # 1. make dataloader
    train_loader, val_loader, test_loader, num_query, num_class = darts_make_data_loader(cfg)
    # print(num_query, num_class)

    # 2. make model
    model = CDNetwork(num_class, cfg)
    # tensor = torch.randn(2, 3, 256, 128)
    # res = model(tensor)
    # print(res[0].size())  # [2, 751]

    # 3. make optimizer
    optimizer = make_optimizer(cfg, model)

    # 4. make lr scheduler
    lr_scheduler = make_lr_scheduler(cfg, optimizer)

    # 5. make loss
    loss_fn = darts_make_loss(cfg)
    model._set_loss(loss_fn, compute_loss_acc)

    # 6. make architect
    architect = Architect(model, cfg)

    # get parameters
    device = cfg.MODEL.DEVICE
    pretrained = cfg.MODEL.PRETRAINED != ""
    log_period = cfg.OUTPUT.LOG_PERIOD
    ckpt_period = cfg.OUTPUT.CKPT_PERIOD
    eval_period = cfg.OUTPUT.EVAL_PERIOD
    output_dir = cfg.OUTPUT.DIRS
    ckpt_save_path = output_dir + cfg.OUTPUT.CKPT_DIRS

    epochs = cfg.SOLVER.MAX_EPOCHS
    batch_size = cfg.SOLVER.BATCH_SIZE
    grad_clip = cfg.SOLVER.GRAD_CLIP

    batch_num = len(train_loader)
    log_iters = batch_num // log_period

    if not os.path.exists(ckpt_save_path):
        os.makedirs(ckpt_save_path)

    # create *_result.xlsx to save the results for analysis
    name = (cfg.OUTPUT.LOG_NAME).split(".")[0] + ".xlsx"
    result_path = cfg.OUTPUT.DIRS + name
    wb = xl.Workbook()
    sheet = wb.worksheets[0]
    # three repeated (acc, mAP, r1, r5, r10, loss) groups, one per checkpoint
    titles = ['size/M', 'speed/ms', 'final_planes',
              'acc', 'mAP', 'r1', 'r5', 'r10', 'loss',
              'acc', 'mAP', 'r1', 'r5', 'r10', 'loss',
              'acc', 'mAP', 'r1', 'r5', 'r10', 'loss']
    sheet.append(titles)
    check_epochs = [40, 80, 120, 160, 200, 240, 280, 320, 360, epochs]
    values = []

    logger = logging.getLogger("CDNet_Search.train")
    size = count_parameters(model)
    values.append(format(size, '.2f'))
    values.append(model.final_planes)

    logger.info("the param number of the model is {:.2f} M".format(size))
    logger.info("Starting Search CDNetwork")

    best_mAP, best_r1 = 0., 0.
    is_best = False
    avg_loss, avg_acc = RunningAverageMeter(), RunningAverageMeter()
    avg_time, global_avg_time = AverageMeter(), AverageMeter()

    if use_gpu:
        model = model.to(device)

    if pretrained:
        logger.info("load self-pretrained checkpoint to init")
        model.load_pretrained_model(cfg.MODEL.PRETRAINED)
    else:
        logger.info("use kaiming init to init the model")
        model.kaiming_init_()

    for epoch in range(epochs):
        lr_scheduler.step()
        lr = lr_scheduler.get_lr()[0]
        # architect lr scheduler step
        architect.lr_scheduler.step()

        # if epoch k was saved, resume from epoch k + 1
        if pretrained and epoch < model.start_epoch:
            continue

        model.train()
        avg_loss.reset()
        avg_acc.reset()
        avg_time.reset()

        for i, batch in enumerate(train_loader):
            t0 = time.time()
            imgs, labels = batch
            val_imgs, val_labels = next(iter(val_loader))

            if use_gpu:
                imgs = imgs.to(device)
                labels = labels.to(device)
                val_imgs = val_imgs.to(device)
                val_labels = val_labels.to(device)

            # 1. update alpha
            architect.step(imgs, labels, val_imgs, val_labels, lr, optimizer,
                           unrolled=cfg.SOLVER.UNROLLED)

            optimizer.zero_grad()
            res = model(imgs)
            # loss = loss_fn(scores, feats, labels)
            loss, acc = compute_loss_acc(res, labels, loss_fn)
            loss.backward()
            if grad_clip != 0:
                # clip_grad_norm_ is the in-place, non-deprecated variant
                nn.utils.clip_grad_norm_(model.parameters(), grad_clip)

            # 2. update weights
            optimizer.step()

            t1 = time.time()
            avg_time.update((t1 - t0) / batch_size)
            avg_loss.update(loss)
            avg_acc.update(acc)

            # log info
            if (i + 1) % log_iters == 0:
                logger.info("epoch {}: {}/{} with loss is {:.5f} and acc is {:.3f}".format(
                    epoch + 1, i + 1, batch_num, avg_loss.avg, avg_acc.avg))

        logger.info("end epochs {}/{} with lr: {:.5f} and avg_time is: {:.3f} ms".format(
            epoch + 1, epochs, lr, avg_time.avg * 1000))
        global_avg_time.update(avg_time.avg)

        # test the model
        if (epoch + 1) % eval_period == 0 or (epoch + 1) in check_epochs:
            model.eval()
            metrics = R1_mAP(num_query, use_gpu=use_gpu)

            with torch.no_grad():
                for vi, batch in enumerate(test_loader):
                    imgs, labels, camids = batch
                    if use_gpu:
                        imgs = imgs.to(device)
                    feats = model(imgs)
                    metrics.update((feats, labels, camids))

            # compute cmc and mAP
            cmc, mAP = metrics.compute()
            logger.info("validation results at epoch {}".format(epoch + 1))
            logger.info("mAP:{:.2%}".format(mAP))
            for r in [1, 5, 10]:
                logger.info("CMC curve, Rank-{:<3}:{:.2%}".format(r, cmc[r - 1]))

            # determine whether the current model is the best
            if mAP > best_mAP:
                is_best = True
                best_mAP = mAP
                logger.info("Get a new best mAP")
            if cmc[0] > best_r1:
                is_best = True
                best_r1 = cmc[0]
                logger.info("Get a new best r1")

            # add the result to the sheet
            if (epoch + 1) in check_epochs:
                val = [avg_acc.avg, mAP, cmc[0], cmc[4], cmc[9]]
                change = [format(v * 100, '.2f') for v in val]
                change.append(format(avg_loss.avg, '.3f'))
                values.extend(change)

        # whether to save the model
        if (epoch + 1) % ckpt_period == 0 or is_best:
            torch.save(model.state_dict(),
                       ckpt_save_path + "checkpoint_{}.pth".format(epoch + 1))
            model._parse_genotype(file=ckpt_save_path + "genotype_{}.json".format(epoch + 1))
            logger.info("checkpoint {} was saved".format(epoch + 1))

            if is_best:
                torch.save(model.state_dict(), ckpt_save_path + "best_ckpt.pth")
                model._parse_genotype(file=ckpt_save_path + "best_genotype.json")
                logger.info("best_checkpoint was saved")
                is_best = False

    values.insert(1, format(global_avg_time.avg * 1000, '.2f'))
    sheet.append(values)
    wb.save(result_path)
    logger.info("Ending Search CDNetwork")
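# ---------------------------------------------------------------------------
# `R1_mAP` above accumulates (features, labels, camids) and then ranks the
# gallery for every query. As a reference for what `metrics.compute()`
# returns, here is a minimal sketch of the rank-1 and mAP computation.
# It omits the camera-id filtering that full re-ID evaluation applies.
import numpy as np


def cmc_map_sketch(qf, gf, q_ids, g_ids):
    # qf, gf: L2-normalized query/gallery feature matrices, shape [n, d]
    dist = 1.0 - qf @ gf.T                   # cosine distance
    aps, rank1_hits, valid_q = [], 0.0, 0
    for i in range(len(q_ids)):
        order = np.argsort(dist[i])
        matches = (g_ids[order] == q_ids[i]).astype(np.float32)
        if matches.sum() == 0:
            continue                         # no gallery match for this query
        valid_q += 1
        rank1_hits += matches[0]
        precision = np.cumsum(matches) / (np.arange(len(matches)) + 1)
        aps.append(float((precision * matches).sum() / matches.sum()))
    return rank1_hits / max(valid_q, 1), float(np.mean(aps)) if aps else 0.0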
    else:
        device = torch.device('cuda:{}'.format(args.gpu))

    # (excerpt; the matching `if` branch is not shown)
    # each step draws one batch; the batch size is 64 (256 data pairs)
    for step, (input, target) in enumerate(train_queue):
        model.train()
        n = input.size(0)
        input = input.to(device)
        target = target.to(device)

        # get a random minibatch from the search queue with replacement
        input_search, target_search = next(iter(valid_queue))
        input_search = input_search.to(device)
        target_search = target_search.to(device)

        architect.step(input, target, input_search, target_search, lr,
                       optimizer, unrolled=args.unrolled)

        optimizer.zero_grad()
        logits = model(input)
        logits = logits.to(device)
        loss = criterion(logits, target)
        evaluater = Evaluator(dataset_classes)
        loss.backward()
        # clip_grad_norm_ is the in-place, non-deprecated variant
        nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)
        optimizer.step()

        # prec = utils.Accuracy(logits, target)
        # prec1 = utils.MIoU(logits, target, dataset_classes)
        # NOTE: Evaluator.add_batch usually expects argmax'd label maps
        # (see the confusion-matrix sketch below)
        evaluater.add_batch(target, logits)
        miou = evaluater.Mean_Intersection_over_Union()
        fscore = evaluater.Fx_Score()
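# ---------------------------------------------------------------------------
# The `Evaluator` used above is the usual confusion-matrix segmentation
# evaluator. A minimal sketch of its mIoU computation, assuming integer
# label maps and argmax'd predictions (names here are illustrative):
import numpy as np


class MiniEvaluator:
    def __init__(self, num_class):
        self.num_class = num_class
        self.confusion = np.zeros((num_class, num_class), dtype=np.int64)

    def add_batch(self, gt, pred):
        # gt / pred: integer label arrays of identical shape
        mask = (gt >= 0) & (gt < self.num_class)
        idx = self.num_class * gt[mask].astype(np.int64) + pred[mask]
        self.confusion += np.bincount(
            idx, minlength=self.num_class ** 2).reshape(self.num_class, -1)

    def Mean_Intersection_over_Union(self):
        inter = np.diag(self.confusion)
        union = self.confusion.sum(0) + self.confusion.sum(1) - inter
        return float(np.mean(inter / np.maximum(union, 1)))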
def train(cfg):
    use_gpu = cfg.device == 'cuda'

    # 1. make dataloader
    train_loader, val_loader, test_loader, num_query, num_class = darts_make_data_loader(cfg)
    # print(num_query)

    # 2. make model
    if cfg.model_name == 'ssnet':
        model = SSNetwork(num_class, cfg, use_gpu)
    elif cfg.model_name == 'fsnet':
        model = FSNetwork(num_class, cfg.in_planes, cfg.init_size, cfg.layers,
                          use_gpu, cfg.pretrained)

    # 3. make optimizer
    optimizer = darts_make_optimizer(cfg, model)
    # print(optimizer)

    # 4. make lr scheduler
    lr_scheduler = darts_make_lr_scheduler(cfg, optimizer)
    # print(lr_scheduler)

    # 5. make loss
    loss_func = darts_make_loss(cfg)
    model._set_loss(loss_func, compute_loss_acc)

    # 6. make architect
    architect = Architect(model, cfg)

    # get parameters
    log_period = cfg.log_period
    ckpt_period = cfg.ckpt_period
    eval_period = cfg.eval_period
    output_dir = cfg.output_dir
    device = cfg.device
    epochs = cfg.max_epochs
    ckpt_save_path = output_dir + cfg.ckpt_dir

    batch_size = cfg.batch_size
    batch_num = len(train_loader)
    log_iters = batch_num // log_period
    pretrained = cfg.pretrained is not None
    parallel = False
    use_neck = cfg.use_neck

    if not os.path.exists(ckpt_save_path):
        os.makedirs(ckpt_save_path)

    logger = logging.getLogger("DARTS.train")
    size = count_parameters(model)
    logger.info("the param number of the model is {:.2f} M".format(size))
    logger.info("Start training")

    if pretrained:
        start_epoch = model.start_epoch

    if parallel:
        model = nn.DataParallel(model)

    if use_gpu:
        model = model.to(device)

    best_mAP, best_r1 = 0., 0.
    is_best = False
    avg_loss, avg_acc = RunningAverageMeter(), RunningAverageMeter()
    avg_time = AverageMeter()

    for epoch in range(epochs):
        lr_scheduler.step()
        lr = lr_scheduler.get_lr()[0]
        # architect lr scheduler step
        architect.lr_scheduler.step()

        if pretrained and epoch < model.start_epoch:
            continue

        model.train()
        avg_loss.reset()
        avg_acc.reset()
        avg_time.reset()

        for i, batch in enumerate(train_loader):
            t0 = time.time()
            imgs, labels = batch
            val_imgs, val_labels = next(iter(val_loader))

            if use_gpu:
                imgs = imgs.to(device)
                labels = labels.to(device)
                val_imgs = val_imgs.to(device)
                val_labels = val_labels.to(device)

            # 1. update alpha
            architect.step(imgs, labels, val_imgs, val_labels, lr, optimizer,
                           unrolled=cfg.unrolled)

            optimizer.zero_grad()
            res = model(imgs)
            # loss = loss_func(score, feats, labels)
            loss, acc = compute_loss_acc(use_neck, res, labels, loss_func)
            loss.backward()
            # clip_grad_norm_ is the in-place, non-deprecated variant
            nn.utils.clip_grad_norm_(model.parameters(), cfg.grad_clip)

            # 2. update weights
            optimizer.step()

            t1 = time.time()
            avg_time.update((t1 - t0) / batch_size)
            avg_loss.update(loss)
            avg_acc.update(acc)

            # log info
            if (i + 1) % log_iters == 0:
                logger.info("epoch {}: {}/{} with loss is {:.5f} and acc is {:.3f}".format(
                    epoch + 1, i + 1, batch_num, avg_loss.avg, avg_acc.avg))

        logger.info("end epochs {}/{} with lr: {:.5f} and avg_time is: {:.3f} ms".format(
            epoch + 1, epochs, lr, avg_time.avg * 1000))

        # test the model
        if (epoch + 1) % eval_period == 0:
            model.eval()
            metrics = R1_mAP(num_query, use_gpu=use_gpu)

            with torch.no_grad():
                for vi, batch in enumerate(test_loader):
                    imgs, labels, camids = batch
                    if use_gpu:
                        imgs = imgs.to(device)
                    feats = model(imgs)
                    metrics.update((feats, labels, camids))

            # compute cmc and mAP
            cmc, mAP = metrics.compute()
            logger.info("validation results at epoch {}".format(epoch + 1))
            logger.info("mAP:{:.2%}".format(mAP))
            for r in [1, 5, 10]:
                logger.info("CMC curve, Rank-{:<3}:{:.2%}".format(r, cmc[r - 1]))

            # determine whether the current model is the best
            if mAP > best_mAP:
                is_best = True
                best_mAP = mAP
                logger.info("Get a new best mAP")
            if cmc[0] > best_r1:
                is_best = True
                best_r1 = cmc[0]
                logger.info("Get a new best r1")

        # whether to save the model
        if (epoch + 1) % ckpt_period == 0 or is_best:
            if parallel:
                torch.save(model.module.state_dict(),
                           ckpt_save_path + "checkpoint_{}.pth".format(epoch + 1))
                model.module._parse_genotype(file=ckpt_save_path + "genotype_{}.json".format(epoch + 1))
            else:
                torch.save(model.state_dict(),
                           ckpt_save_path + "checkpoint_{}.pth".format(epoch + 1))
                model._parse_genotype(file=ckpt_save_path + "genotype_{}.json".format(epoch + 1))
            logger.info("checkpoint {} was saved".format(epoch + 1))

            if is_best:
                if parallel:
                    torch.save(model.module.state_dict(), ckpt_save_path + "best_ckpt.pth")
                    model.module._parse_genotype(file=ckpt_save_path + "best_genotype.json")
                else:
                    torch.save(model.state_dict(), ckpt_save_path + "best_ckpt.pth")
                    model._parse_genotype(file=ckpt_save_path + "best_genotype.json")
                logger.info("best_checkpoint was saved")
                is_best = False

    logger.info("training is done")
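# ---------------------------------------------------------------------------
# The `parallel` branches above exist because nn.DataParallel wraps the
# network and prefixes every state_dict key with 'module.'; saving
# `model.module.state_dict()` keeps checkpoints loadable by the unwrapped
# model. A minimal sketch of that save path (file name is illustrative):
import torch
import torch.nn as nn


def save_unwrapped(model, path):
    # Strip the DataParallel wrapper (if any) so checkpoint keys carry
    # no 'module.' prefix.
    state = (model.module.state_dict()
             if isinstance(model, nn.DataParallel)
             else model.state_dict())
    torch.save(state, path)

# usage sketch:
#   net = nn.DataParallel(MyNet())          # MyNet is a placeholder
#   save_unwrapped(net, "best_ckpt.pth")
#   MyNet().load_state_dict(torch.load("best_ckpt.pth"))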