def my_test():
    writer_image = SummaryWriter(os.path.join(args.summary_dir, 'segtest'))
    model = MultiTaskCNN(38, depth_channel=1, pretrained=False, arch='resnet50', use_aspp=False)
    load_ckpt(model, None, args.last_ckpt, device)
    model.eval()
    model = model.to(device)

    val_data = my_data_eval.ReadData(transform=torchvision.transforms.Compose(
        [my_data_eval.scaleNorm(),
         my_data_eval.ToTensor(),
         Normalize()]),
        data_dir=args.data_dir)
    val_loader = DataLoader(val_data, batch_size=1, shuffle=False, num_workers=4, pin_memory=False)

    with torch.no_grad():
        for batch_idx, sample in enumerate(val_loader):
            # origin_image = sample['origin_image'].numpy()
            # origin_depth = sample['origin_depth'].numpy()
            image = sample['image'].to(device)
            depth = sample['depth'].to(device)
            time1 = time.time()
            pred = model(image, depth)
            time2 = time.time()
            print('Inference time:', time2 - time1, '\nFPS: ', 1 / (time2 - time1))
            output = torch.max(pred, 1)[1]
            # output = output.squeeze(0).cpu().numpy()
            output = output.cpu().numpy()
            grid_image1 = make_grid(image[:1].clone().cpu().data, 1, normalize=True)
            writer_image.add_image('image', grid_image1, batch_idx)
            grid_image2 = make_grid(depth[:1].clone().cpu().data, 1, normalize=True)
            writer_image.add_image('depth', grid_image2, batch_idx)
            grid_image3 = make_grid(utils.color_label(torch.max(pred[:1], 1)[1]), 1,
                                    normalize=False, range=(0, 255))
            writer_image.add_image('Predicted label', grid_image3, batch_idx)
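The timing above wraps a single forward pass in plain time.time() calls. Since CUDA kernels run asynchronously, a synchronized and averaged measurement is usually more reliable; a minimal sketch of such a helper (measure_inference_time is hypothetical, not part of this codebase) could look like:

import time
import torch

def measure_inference_time(model, image, depth, warmup=5, runs=20):
    # Hypothetical helper: times GPU inference correctly by synchronizing
    # before reading the clock and averaging over several runs.
    model.eval()
    with torch.no_grad():
        for _ in range(warmup):                  # warm-up iterations are not timed
            model(image, depth)
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        start = time.time()
        for _ in range(runs):
            model(image, depth)
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        elapsed = (time.time() - start) / runs
    return elapsed, 1.0 / elapsed                # seconds per frame, FPS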
def inference():
    model = RedNet_model.RedNet(pretrained=False)
    load_ckpt(model, None, args.last_ckpt, device)
    model.eval()
    model.to(device)

    image = imageio.imread(args.rgb)
    depth = imageio.imread(args.depth)

    # Bi-linear
    image = skimage.transform.resize(image, (image_h, image_w), order=1,
                                     mode='reflect', preserve_range=True)
    # Nearest-neighbor
    depth = skimage.transform.resize(depth, (image_h, image_w), order=0,
                                     mode='reflect', preserve_range=True)

    image = image / 255
    image = torch.from_numpy(image).float()
    depth = torch.from_numpy(depth).float()
    image = image.permute(2, 0, 1)
    depth.unsqueeze_(0)

    image = torchvision.transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                             std=[0.229, 0.224, 0.225])(image)
    depth = torchvision.transforms.Normalize(mean=[19050], std=[9650])(depth)

    image = image.to(device).unsqueeze_(0)
    depth = depth.to(device).unsqueeze_(0)

    pred = model(image, depth)
    output = utils.color_label(torch.max(pred, 1)[1] + 1)[0]
    imageio.imsave(args.output, output.cpu().numpy().transpose((1, 2, 0)))
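utils.color_label is imported from elsewhere in the project; a minimal sketch of what such a colouring helper typically does, assuming an arbitrary per-class RGB palette (the project's real palette will differ), is:

import numpy as np
import torch

# Assumed palette: one RGB triple per class id; generated here only for illustration.
NUM_CLASSES = 40
_palette = np.random.RandomState(0).randint(0, 256, size=(NUM_CLASSES + 1, 3), dtype=np.uint8)
_palette[0] = 0  # class/void id 0 rendered as black

def color_label(label):
    # label: (B, H, W) tensor of class ids -> (B, 3, H, W) uint8 colour image,
    # which is what make_grid(..., normalize=False, range=(0, 255)) expects.
    label = label.detach().cpu().numpy()
    colored = _palette[label]                    # fancy indexing -> (B, H, W, 3)
    return torch.from_numpy(colored.transpose((0, 3, 1, 2)))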
def train():
    train_data = ACNet_data.SUNRGBD(transform=transforms.Compose([ACNet_data.scaleNorm(),
                                                                  ACNet_data.RandomScale((1.0, 1.4)),
                                                                  ACNet_data.RandomHSV((0.9, 1.1),
                                                                                       (0.9, 1.1),
                                                                                       (25, 25)),
                                                                  ACNet_data.RandomCrop(image_h, image_w),
                                                                  ACNet_data.RandomFlip(),
                                                                  ACNet_data.ToTensor(),
                                                                  ACNet_data.Normalize()]),
                                    phase_train=True,
                                    data_dir=args.data_dir)
    train_loader = DataLoader(train_data, batch_size=args.batch_size, shuffle=True,
                              num_workers=args.workers, pin_memory=False)
    num_train = len(train_data)

    if args.last_ckpt:
        model = ACNet_models_V1.ACNet(num_class=40, pretrained=False)
    else:
        model = ACNet_models_V1.ACNet(num_class=40, pretrained=True)
    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = nn.DataParallel(model)
    CEL_weighted = utils.CrossEntropyLoss2d(weight=nyuv2_frq)
    model.train()
    model.to(device)
    CEL_weighted.to(device)

    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr,
                                momentum=args.momentum, weight_decay=args.weight_decay)
    global_step = 0
    if args.last_ckpt:
        global_step, args.start_epoch = load_ckpt(model, optimizer, args.last_ckpt, device)

    lr_decay_lambda = lambda epoch: args.lr_decay_rate ** (epoch // args.lr_epoch_per_decay)
    scheduler = LambdaLR(optimizer, lr_lambda=lr_decay_lambda)

    writer = SummaryWriter(args.summary_dir)

    for epoch in range(int(args.start_epoch), args.epochs):
        scheduler.step(epoch)
        local_count = 0
        last_count = 0
        end_time = time.time()
        if epoch % args.save_epoch_freq == 0 and epoch != args.start_epoch:
            save_ckpt(args.ckpt_dir, model, optimizer, global_step, epoch, local_count, num_train)

        for batch_idx, sample in enumerate(train_loader):
            image = sample['image'].to(device)
            depth = sample['depth'].to(device)
            target_scales = [sample[s].to(device) for s in ['label', 'label2', 'label3', 'label4', 'label5']]
            optimizer.zero_grad()
            pred_scales = model(image, depth, args.checkpoint)
            loss = CEL_weighted(pred_scales, target_scales)
            loss.backward()
            optimizer.step()
            local_count += image.data.shape[0]
            global_step += 1
            if global_step % args.print_freq == 0 or global_step == 1:
                time_inter = time.time() - end_time
                count_inter = local_count - last_count
                print_log(global_step, epoch, local_count, count_inter, num_train, loss, time_inter)
                end_time = time.time()
                for name, param in model.named_parameters():
                    writer.add_histogram(name, param.clone().cpu().data.numpy(), global_step, bins='doane')
                grid_image = make_grid(image[:3].clone().cpu().data, 3, normalize=True)
                writer.add_image('image', grid_image, global_step)
                grid_image = make_grid(depth[:3].clone().cpu().data, 3, normalize=True)
                writer.add_image('depth', grid_image, global_step)
                grid_image = make_grid(utils.color_label(torch.max(pred_scales[0][:3], 1)[1] + 1), 3,
                                       normalize=False, range=(0, 255))
                writer.add_image('Predicted label', grid_image, global_step)
                grid_image = make_grid(utils.color_label(target_scales[0][:3]), 3,
                                       normalize=False, range=(0, 255))
                writer.add_image('Groundtruth label', grid_image, global_step)
                writer.add_scalar('CrossEntropyLoss', loss.data, global_step=global_step)
                writer.add_scalar('Learning rate', scheduler.get_lr()[0], global_step=global_step)
                last_count = local_count

    save_ckpt(args.ckpt_dir, model, optimizer, global_step, args.epochs, 0, num_train)
    print("Training completed")
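utils.CrossEntropyLoss2d receives the list of multi-scale predictions and the matching list of down-sampled labels. A hedged sketch of such a loss, assuming class-frequency weights and that label 0 marks unlabeled pixels with stored labels being 1-based (suggested by the "+ 1" used when colouring predictions); the repo's actual implementation may differ:

import torch
import torch.nn as nn

class CrossEntropyLoss2d(nn.Module):
    # Sketch: weighted 2D cross-entropy summed over all prediction scales,
    # ignoring pixels whose label is 0 (assumed void).
    def __init__(self, weight=None):
        super().__init__()
        self.ce = nn.CrossEntropyLoss(weight=weight, reduction='none')

    def forward(self, preds_scales, targets_scales):
        total = 0
        for pred, target in zip(preds_scales, targets_scales):
            mask = (target > 0).float()                              # assumed: 0 = void
            per_pixel = self.ce(pred, (target - 1).clamp(min=0).long())  # assumed 1-based labels
            total = total + (per_pixel * mask).sum() / mask.sum().clamp(min=1)
        return total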
def train():
    # Log data for display in TensorBoard
    writer_loss = SummaryWriter(os.path.join(args.summary_dir, 'loss'))
    # writer_loss1 = SummaryWriter(os.path.join(args.summary_dir, 'loss', 'loss1'))
    # writer_loss2 = SummaryWriter(os.path.join(args.summary_dir, 'loss', 'loss2'))
    # writer_loss3 = SummaryWriter(os.path.join(args.summary_dir, 'loss', 'loss3'))
    writer_acc = SummaryWriter(os.path.join(args.summary_dir, 'macc'))

    # Prepare the datasets
    train_data = data_eval.ReadData(transform=transforms.Compose([
        data_eval.scaleNorm(),
        data_eval.RandomScale((1.0, 1.4)),
        data_eval.RandomHSV((0.9, 1.1), (0.9, 1.1), (25, 25)),
        data_eval.RandomCrop(image_h, image_w),
        data_eval.RandomFlip(),
        data_eval.ToTensor(),
        data_eval.Normalize()
    ]), data_dir=args.train_data_dir)
    train_loader = DataLoader(train_data, batch_size=args.batch_size, shuffle=True,
                              num_workers=args.workers, pin_memory=False, drop_last=True)
    val_data = data_eval.ReadData(transform=transforms.Compose([
        data_eval.scaleNorm(),
        data_eval.RandomScale((1.0, 1.4)),
        data_eval.RandomCrop(image_h, image_w),
        data_eval.ToTensor(),
        data_eval.Normalize()
    ]), data_dir=args.val_data_dir)
    val_loader = DataLoader(val_data, batch_size=args.batch_size, shuffle=True,
                            num_workers=args.workers, pin_memory=False, drop_last=True)
    num_train = len(train_data)
    # num_val = len(val_data)

    # build model
    if args.last_ckpt:
        model = MultiTaskCNN_Atten(38, depth_channel=1, pretrained=False, arch='resnet50', use_aspp=True)
    else:
        model = MultiTaskCNN_Atten(38, depth_channel=1, pretrained=True, arch='resnet50', use_aspp=True)

    # build optimizer
    if args.optimizer == 'rmsprop':
        optimizer = torch.optim.RMSprop(model.parameters(), args.lr)
    elif args.optimizer == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(), args.lr, momentum=0.9, weight_decay=1e-4)
    elif args.optimizer == 'adam':
        optimizer = torch.optim.Adam(model.parameters(), args.lr)
    else:  # rmsprop
        print('not supported optimizer \n')
        return None

    global_step = 0
    max_miou_val = 0
    loss_count = 0
    # If a trained checkpoint exists, restore global_step and start_epoch from it
    if args.last_ckpt:
        global_step, args.start_epoch = load_ckpt(model, optimizer, args.last_ckpt, device)
    # if torch.cuda.device_count() > 1 and args.cuda and torch.cuda.is_available():
    #     print("Let's use", torch.cuda.device_count(), "GPUs!")
    #     model = torch.nn.DataParallel(model).to(device)
    model = model.to(device)
    model.train()
    # cal_param(model, data)

    loss_func = nn.CrossEntropyLoss()

    for epoch in range(int(args.start_epoch), args.epochs):
        torch.cuda.empty_cache()
        # if epoch <= freeze_epoch:
        #     for layer in [model.conv1, model.maxpool, model.layer1, model.layer2, model.layer3, model.layer4]:
        #         for param in layer.parameters():
        #             param.requires_grad = False
        tq = tqdm(total=len(train_loader) * args.batch_size)
        if loss_count >= 10:
            args.lr = 0.5 * args.lr
            loss_count = 0
        lr = poly_lr_scheduler(optimizer, args.lr, iter=epoch, max_iter=args.epochs)
        optimizer.param_groups[0]['lr'] = lr
        # scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 30, gamma=0.5)
        tq.set_description('epoch %d, lr %f' % (epoch, args.lr))
        loss_record = []
        # loss1_record = []
        # loss2_record = []
        # loss3_record = []
        local_count = 0
        # print('1')
        for batch_idx, data in enumerate(train_loader):
            image = data['image'].to(device)
            depth = data['depth'].to(device)
            label = data['label'].long().to(device)
            # print('label', label.shape)
            output, output_sup1, output_sup2 = model(image, depth)
            loss1 = loss_func(output, label)
            loss2 = loss_func(output_sup1, label)
            loss3 = loss_func(output_sup2, label)
            loss = loss1 + loss2 + loss3
            tq.update(args.batch_size)
            tq.set_postfix(loss='%.6f' % loss)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            global_step += 1
            local_count += image.data.shape[0]
            # writer_loss.add_scalar('loss_step', loss, global_step)
            # writer_loss1.add_scalar('loss1_step', loss1, global_step)
            # writer_loss2.add_scalar('loss2_step', loss2, global_step)
            # writer_loss3.add_scalar('loss3_step', loss3, global_step)
            loss_record.append(loss.item())
            # loss1_record.append(loss1.item())
            # loss2_record.append(loss2.item())
            # loss3_record.append(loss3.item())
            if global_step % args.print_freq == 0 or global_step == 1:
                for name, param in model.named_parameters():
                    writer_loss.add_histogram(name, param.clone().cpu().data.numpy(), global_step, bins='doane')
                writer_loss.add_graph(model, [image, depth])
                grid_image1 = make_grid(image[:3].clone().cpu().data, 3, normalize=True)
                writer_loss.add_image('image', grid_image1, global_step)
                grid_image2 = make_grid(depth[:3].clone().cpu().data, 3, normalize=True)
                writer_loss.add_image('depth', grid_image2, global_step)
                grid_image3 = make_grid(utils.color_label(torch.max(output[:3], 1)[1]), 3,
                                        normalize=False, range=(0, 255))
                writer_loss.add_image('Predicted label', grid_image3, global_step)
                grid_image4 = make_grid(utils.color_label(label[:3]), 3, normalize=False, range=(0, 255))
                writer_loss.add_image('Groundtruth label', grid_image4, global_step)
        tq.close()
        loss_train_mean = np.mean(loss_record)
        with open(log_file, 'a') as f:
            f.write(str(epoch) + '\t' + str(loss_train_mean))
        # loss1_train_mean = np.mean(loss1_record)
        # loss2_train_mean = np.mean(loss2_record)
        # loss3_train_mean = np.mean(loss3_record)
        writer_loss.add_scalar('epoch/loss_epoch_train', float(loss_train_mean), epoch)
        # writer_loss1.add_scalar('epoch/sub_loss_epoch_train', float(loss1_train_mean), epoch)
        # writer_loss2.add_scalar('epoch/sub_loss_epoch_train', float(loss2_train_mean), epoch)
        # writer_loss3.add_scalar('epoch/sub_loss_epoch_train', float(loss3_train_mean), epoch)
        print('loss for train : %f' % loss_train_mean)

        print('----validation starting----')
        # tq_val = tqdm(total=len(val_loader) * args.batch_size)
        # tq_val.set_description('epoch %d' % epoch)
        model.eval()
        val_total_time = 0
        with torch.no_grad():
            sys.stdout.flush()
            tbar = tqdm(val_loader)
            acc_meter = AverageMeter()
            intersection_meter = AverageMeter()
            union_meter = AverageMeter()
            a_meter = AverageMeter()
            b_meter = AverageMeter()
            for batch_idx, sample in enumerate(tbar):
                # origin_image = sample['origin_image'].numpy()
                # origin_depth = sample['origin_depth'].numpy()
                image_val = sample['image'].to(device)
                depth_val = sample['depth'].to(device)
                label_val = sample['label'].numpy()
                start = time.time()
                pred = model(image_val, depth_val)
                end = time.time()
                duration = end - start
                val_total_time += duration
                # tq_val.set_postfix(fps='%.4f' % (args.batch_size / (end - start)))
                print_str = 'Test step [{}/{}].'.format(batch_idx + 1, len(val_loader))
                tbar.set_description(print_str)
                output_val = torch.max(pred, 1)[1]
                output_val = output_val.squeeze(0).cpu().numpy()
                acc, pix = accuracy(output_val, label_val)
                intersection, union = intersectionAndUnion(output_val, label_val, args.num_class)
                acc_meter.update(acc, pix)
                a_m, b_m = macc(output_val, label_val, args.num_class)
                intersection_meter.update(intersection)
                union_meter.update(union)
                a_meter.update(a_m)
                b_meter.update(b_m)
            fps = len(val_loader) / val_total_time
            print('fps = %.4f' % fps)
            tbar.close()
            mAcc = (a_meter.average() / (b_meter.average() + 1e-10))
            with open(log_file, 'a') as f:
                f.write(' ' + str(mAcc.mean()) + '\n')
            iou = intersection_meter.sum / (union_meter.sum + 1e-10)
            writer_acc.add_scalar('epoch/Acc_epoch_train', mAcc.mean(), epoch)
        print('----validation finished----')
        model.train()

        # Save a checkpoint whenever the validation mIoU improves
        if epoch != args.start_epoch:
            if iou.mean() >= max_miou_val:
                print('mIoU:', iou.mean())
                if not os.path.isdir(args.ckpt_dir):
                    os.mkdir(args.ckpt_dir)
                save_ckpt(args.ckpt_dir, model, optimizer, global_step, epoch, local_count, num_train)
                max_miou_val = iou.mean()
                # max_macc_val = mAcc.mean()
            else:
                loss_count += 1
        torch.cuda.empty_cache()
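poly_lr_scheduler is called once per epoch above but not shown; a minimal sketch of the usual polynomial decay (the power=0.9 default is an assumption):

def poly_lr_scheduler(optimizer, init_lr, iter, max_iter, power=0.9):
    # Polynomial learning-rate decay: lr = init_lr * (1 - iter / max_iter) ** power.
    # The caller above also copies the returned value into optimizer.param_groups[0]['lr'].
    lr = init_lr * (1 - iter / max_iter) ** power
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    return lr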
def train():
    train_dirs = [train_dir for train_dir in [args.train_dir, args.train_dir2] if train_dir is not None]
    train_data = ACNet_data.FreiburgForest(
        transform=transforms.Compose([
            ACNet_data.ScaleNorm(),
            # ACNet_data.RandomRotate((-13, 13)),
            # ACNet_data.RandomSkew((-0.05, 0.10)),
            ACNet_data.RandomScale((1.0, 1.4)),
            ACNet_data.RandomHSV((0.9, 1.1), (0.9, 1.1), (25, 25)),
            ACNet_data.RandomCrop(image_h, image_w),
            ACNet_data.RandomFlip(),
            ACNet_data.ToTensor(),
            ACNet_data.Normalize()
        ]),
        data_dirs=train_dirs,
        modal1_name=args.modal1,
        modal2_name=args.modal2,
    )
    valid_dirs = [valid_dir for valid_dir in [args.valid_dir, args.valid_dir2] if valid_dir is not None]
    valid_data = ACNet_data.FreiburgForest(
        transform=transforms.Compose([
            ACNet_data.ScaleNorm(),
            ACNet_data.ToTensor(),
            ACNet_data.Normalize()
        ]),
        data_dirs=valid_dirs,
        modal1_name=args.modal1,
        modal2_name=args.modal2,
    )

    '''
    # Split dataset into training and validation
    dataset_length = len(data)
    valid_split = 0.05  # tiny split due to the small size of the dataset
    valid_length = int(valid_split * dataset_length)
    train_length = dataset_length - valid_length
    train_data, valid_data = torch.utils.data.random_split(data, [train_length, valid_length])
    '''

    train_loader = DataLoader(train_data, batch_size=args.batch_size, shuffle=True,
                              num_workers=args.workers, pin_memory=False)
    valid_loader = DataLoader(valid_data, batch_size=args.batch_size * 3, shuffle=False,
                              num_workers=1, pin_memory=False)

    # Initialize model
    if args.last_ckpt:
        model = ACNet_models_V1.ACNet(num_class=5, pretrained=False)
    else:
        model = ACNet_models_V1.ACNet(num_class=5, pretrained=True)
    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = nn.DataParallel(model)
    model.train()
    model.to(device)

    # Initialize criterion, optimizer and scheduler
    criterion = utils.CrossEntropyLoss2d(weight=freiburgforest_frq)
    criterion.to(device)
    # TODO: try with different optimizers and schedulers (CyclicLR exp_range for example)
    # TODO: try with a smaller LR (currently loss decay is too steep and then doesn't change)
    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum,
                                weight_decay=args.weight_decay)
    # lr_decay_lambda = lambda epoch: args.lr_decay_rate ** (epoch // args.lr_epoch_per_decay)
    # scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lr_decay_lambda)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
        optimizer, T_0=int(np.ceil(args.epochs / 7)), T_mult=2, eta_min=8e-4)
    # scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=args.epochs, eta_min=5e-4)

    global_step = 0

    # TODO: add early stop to avoid overfitting

    # Continue training from previous checkpoint
    if args.last_ckpt:
        global_step, args.start_epoch = utils.load_ckpt(model, optimizer, scheduler, args.last_ckpt, device)

    writer = SummaryWriter(args.summary_dir)

    losses = []
    for epoch in tqdm(range(int(args.start_epoch), args.epochs)):
        if epoch % args.save_epoch_freq == 0 and epoch != args.start_epoch:
            utils.save_ckpt(args.ckpt_dir, model, optimizer, scheduler, global_step, epoch)

        for batch_idx, sample in enumerate(train_loader):
            modal1, modal2 = sample['modal1'].to(device), sample['modal2'].to(device)
            target_scales = [sample[s].to(device) for s in ['label', 'label2', 'label3', 'label4', 'label5']]
            optimizer.zero_grad()
            pred_scales = model(modal1, modal2, args.checkpoint)
            loss = criterion(pred_scales, target_scales)
            loss.backward()
            optimizer.step()
            losses.append(loss.item())
            global_step += 1

            if global_step % args.print_freq == 0 or global_step == 1:
                for name, param in model.named_parameters():
                    writer.add_histogram(name, param.detach().cpu().numpy(), global_step, bins='doane')
                grid_image = make_grid(modal1[:3].detach().cpu(), 3, normalize=False)
                writer.add_image('Modal1', grid_image, global_step)
                grid_image = make_grid(modal2[:3].detach().cpu(), 3, normalize=False)
                writer.add_image('Modal2', grid_image, global_step)
                grid_image = make_grid(utils.color_label(torch.argmax(pred_scales[0][:3], 1) + 1), 3,
                                       normalize=True, range=(0, 255))
                writer.add_image('Prediction', grid_image, global_step)
                grid_image = make_grid(utils.color_label(target_scales[0][:3]), 3,
                                       normalize=True, range=(0, 255))
                writer.add_image('GroundTruth', grid_image, global_step)
                writer.add_scalar('Loss', loss.item(), global_step=global_step)
                writer.add_scalar('Loss average', sum(losses) / len(losses), global_step=global_step)
                writer.add_scalar('Learning rate', scheduler.get_last_lr()[0], global_step=global_step)

                # Compute validation metrics
                with torch.no_grad():
                    model.eval()
                    losses_val = []
                    acc_list = []
                    iou_list = []
                    for sample_val in valid_loader:
                        modal1_val, modal2_val = sample_val['modal1'].to(device), sample_val['modal2'].to(device)
                        target_val = sample_val['label'].to(device)

                        pred_val = model(modal1_val, modal2_val)
                        losses_val.append(criterion([pred_val], [target_val]).item())
                        acc_list.append(utils.accuracy(
                            (torch.argmax(pred_val, 1) + 1).detach().cpu().numpy().astype(int),
                            target_val.detach().cpu().numpy().astype(int))[0])
                        iou_list.append(utils.compute_IoU(
                            y_pred=(torch.argmax(pred_val, 1) + 1).detach().cpu().numpy().astype(int),
                            y_true=target_val.detach().cpu().numpy().astype(int),
                            num_classes=5
                        ))

                    writer.add_scalar('Loss validation', sum(losses_val) / len(losses_val), global_step=global_step)
                    writer.add_scalar('Accuracy', sum(acc_list) / len(acc_list), global_step=global_step)
                    iou = np.mean(np.stack(iou_list, axis=0), axis=0)
                    writer.add_scalar('IoU_Road', iou[0], global_step=global_step)
                    writer.add_scalar('IoU_Grass', iou[1], global_step=global_step)
                    writer.add_scalar('IoU_Vegetation', iou[2], global_step=global_step)
                    writer.add_scalar('IoU_Sky', iou[3], global_step=global_step)
                    writer.add_scalar('IoU_Obstacle', iou[4], global_step=global_step)
                    writer.add_scalar('mIoU', np.mean(iou), global_step=global_step)
                    model.train()

                losses = []

        scheduler.step()

    utils.save_ckpt(args.ckpt_dir, model, optimizer, scheduler, global_step, args.epochs)
    print("Training completed")
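utils.compute_IoU is only referenced here; a sketch consistent with the call site (1-based labels for the five Freiburg Forest classes, one IoU value per class) could look like:

import numpy as np

def compute_IoU(y_pred, y_true, num_classes):
    # Per-class intersection-over-union; labels assumed 1-based (1..num_classes),
    # matching the "+ 1" applied to argmax predictions at the call site.
    ious = np.zeros(num_classes)
    for c in range(1, num_classes + 1):
        pred_c = (y_pred == c)
        true_c = (y_true == c)
        union = np.logical_or(pred_c, true_c).sum()
        inter = np.logical_and(pred_c, true_c).sum()
        ious[c - 1] = inter / union if union > 0 else 0.0
    return ious  # shape (num_classes,), later averaged over validation batches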
def inference():
    writer_image = SummaryWriter(os.path.join(args.summary_dir, 'segtest'))
    model = MultiTaskCNN(38, depth_channel=1, pretrained=False, arch='resnet50', use_aspp=False)
    load_ckpt(model, None, args.last_ckpt, device)
    model.eval()
    model = model.to(device)

    val_data = data_eval.ReadData(transform=torchvision.transforms.Compose(
        [data_eval.scaleNorm(),
         data_eval.ToTensor(),
         Normalize()]),
        data_dir=args.data_dir)
    val_loader = DataLoader(val_data, batch_size=1, shuffle=False, num_workers=4, pin_memory=False)

    acc_meter = AverageMeter()
    intersection_meter = AverageMeter()
    union_meter = AverageMeter()
    a_meter = AverageMeter()
    b_meter = AverageMeter()
    test_total_time = 0
    with torch.no_grad():
        for batch_idx, sample in enumerate(val_loader):
            # origin_image = sample['origin_image'].to(device)
            # origin_depth = sample['origin_depth'].to(device)
            image = sample['image'].to(device)
            depth = sample['depth'].to(device)
            label = sample['label'].numpy()
            show_label = sample['label'].long().to(device)
            time1 = time.time()
            pred = model(image, depth)
            time2 = time.time()
            test_total_time += (time2 - time1)
            output = torch.max(pred, 1)[1]
            # output = output.squeeze(0).cpu().numpy()
            output = output.cpu().numpy()
            acc, pix = accuracy(output, label)
            intersection, union = intersectionAndUnion(output, label, args.num_class)
            acc_meter.update(acc, pix)
            a_m, b_m = macc(output, label, args.num_class)
            intersection_meter.update(intersection)
            union_meter.update(union)
            a_meter.update(a_m)
            b_meter.update(b_m)
            if batch_idx % 50 == 0:
                grid_image1 = make_grid(image[:1].clone().cpu().data, 1, normalize=True)
                writer_image.add_image('image', grid_image1, batch_idx)
                grid_image2 = make_grid(depth[:1].clone().cpu().data, 1, normalize=True)
                writer_image.add_image('depth', grid_image2, batch_idx)
                grid_image3 = make_grid(utils.color_label(torch.max(pred[:1], 1)[1]), 1,
                                        normalize=False, range=(0, 255))
                writer_image.add_image('Predicted label', grid_image3, batch_idx)
                grid_image4 = make_grid(utils.color_label(show_label[:1]), 1, normalize=False, range=(0, 255))
                writer_image.add_image('Groundtruth label', grid_image4, batch_idx)
            print('[{}] iter {}, accuracy: {}'.format(
                datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), batch_idx, acc))
            # if batch_idx % 1 == 0:
            #     if args.visualize:
            #         visualize_result(origin_image, origin_depth, label, output, batch_idx, args)
            #         visualize_result(origin_image, origin_depth, label - 1, output - 1, batch_idx, args)

    print('Average inference time:', test_total_time / len(val_data), '\nfps:', len(val_data) / test_total_time)
    iou = intersection_meter.sum / (union_meter.sum + 1e-10)
    for i, _iou in enumerate(iou):
        print('class [{}], IoU: {}'.format(i, _iou))
    # mAcc: per-pixel classification accuracy between prediction and ground truth, averaged over classes
    mAcc = (a_meter.average() / (b_meter.average() + 1e-10))
    print(mAcc.mean())
    print('[Eval Summary]:')
    print('Mean IoU: {:.4f}, Accuracy: {:.2f}%'.format(iou.mean(), acc_meter.average() * 100))
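The metric helpers AverageMeter, accuracy, intersectionAndUnion, and macc come from the project's utils module; a rough, simplified sketch of how they can be implemented to satisfy the calls above (the real implementations may handle void labels differently):

import numpy as np

class AverageMeter(object):
    # Sketch: keeps a running (optionally weighted) sum so .sum and .average()
    # can be read out after the evaluation loop, as done above.
    def __init__(self):
        self.sum = 0
        self.count = 0

    def update(self, val, weight=1):
        self.sum = self.sum + val * weight
        self.count = self.count + weight

    def average(self):
        return self.sum / (self.count + 1e-10)

def accuracy(preds, label):
    # Overall pixel accuracy over valid (non-negative) pixels.
    valid = (label >= 0)
    acc_sum = (valid * (preds == label)).sum()
    valid_sum = valid.sum()
    return float(acc_sum) / (valid_sum + 1e-10), valid_sum

def intersectionAndUnion(pred, label, num_class):
    # Per-class intersection and union counts; classes assumed 0..num_class-1.
    inter = np.zeros(num_class)
    union = np.zeros(num_class)
    for c in range(num_class):
        pred_c = (pred == c)
        label_c = (label == c)
        inter[c] = np.logical_and(pred_c, label_c).sum()
        union[c] = np.logical_or(pred_c, label_c).sum()
    return inter, union

def macc(pred, label, num_class):
    # Per-class correct-pixel counts (a) and per-class pixel totals (b);
    # mAcc above is mean(a / b) accumulated over the whole validation set.
    a = np.zeros(num_class)
    b = np.zeros(num_class)
    for c in range(num_class):
        a[c] = np.logical_and(pred == c, label == c).sum()
        b[c] = (label == c).sum()
    return a, b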
def train():
    # Log data for display in TensorBoard
    writer = SummaryWriter(args.summary_dir)

    # Prepare the dataset
    train_data = data_eval.ReadData(transform=transforms.Compose([
        data_eval.scaleNorm(),
        data_eval.RandomScale((1.0, 1.4)),
        data_eval.RandomHSV((0.9, 1.1), (0.9, 1.1), (25, 25)),
        data_eval.RandomCrop(image_h, image_w),
        data_eval.RandomFlip(),
        data_eval.ToTensor(),
        data_eval.Normalize()
    ]), data_dir=args.data_dir)
    train_loader = DataLoader(train_data, batch_size=args.batch_size, shuffle=True,
                              num_workers=args.workers, pin_memory=False, drop_last=True)
    num_train = len(train_data)
    # data = iter(train_loader).next()
    # print('data:', data['image'].shape)

    # build model
    if args.last_ckpt:
        model = MultiTaskCNN(38, depth_channel=1, pretrained=False, arch='resnet18')
    else:
        model = MultiTaskCNN(38, depth_channel=1, pretrained=True, arch='resnet18')
    model = model.to(device)

    # build optimizer
    if args.optimizer == 'rmsprop':
        optimizer = torch.optim.RMSprop(model.parameters(), args.lr)
    elif args.optimizer == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(), args.lr, momentum=0.9, weight_decay=1e-4)
    elif args.optimizer == 'adam':
        optimizer = torch.optim.Adam(model.parameters(), args.lr)
    else:  # rmsprop
        print('not supported optimizer \n')
        return None

    global_step = 0
    # If a trained checkpoint exists, restore global_step and start_epoch from it
    if args.last_ckpt:
        global_step, args.start_epoch = load_ckpt(model, optimizer, args.last_ckpt, device)
    # # Report which GPUs are being used
    # if torch.cuda.device_count() > 1:
    #     print("Let's use", torch.cuda.device_count(), "GPUs!")
    #     # nn.DataParallel(module, device_ids=None, output_device=None, dim=0): run the model on multiple GPUs
    #     model = nn.DataParallel(model)
    model.train()
    # cal_param(model, data)

    loss_func = nn.CrossEntropyLoss(weight=weight.float())

    for epoch in range(int(args.start_epoch), args.epochs):
        tq = tqdm(total=len(train_loader) * args.batch_size)
        lr = poly_lr_scheduler(optimizer, args.lr, iter=epoch, max_iter=args.epochs)
        tq.set_description('epoch %d, lr %f' % (epoch, lr))
        loss_record = []
        local_count = 0
        # print('1')
        for batch_idx, data in enumerate(train_loader):
            # print(batch_idx)
            image = data['image'].to(device)
            depth = data['depth'].to(device)
            label = data['label'].long().to(device)
            # print('label', label.shape)
            output, output_sup1, output_sup2 = model(image, depth)
            loss1 = loss_func(output, label)
            loss2 = loss_func(output_sup1, label)
            loss3 = loss_func(output_sup2, label)
            loss = loss1 + loss2 + loss3
            tq.update(args.batch_size)
            tq.set_postfix(loss='%.6f' % loss)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            global_step += 1
            local_count += image.data.shape[0]
            writer.add_scalar('loss_step', loss, global_step)
            loss_record.append(loss.item())
            if global_step % args.print_freq == 0 or global_step == 1:
                for name, param in model.named_parameters():
                    writer.add_histogram(name, param.clone().cpu().data.numpy(), global_step, bins='doane')
                writer.add_graph(model, [image, depth])
                grid_image1 = make_grid(image[:3].clone().cpu().data, 3, normalize=True)
                writer.add_image('image', grid_image1, global_step)
                grid_image2 = make_grid(depth[:3].clone().cpu().data, 3, normalize=True)
                writer.add_image('depth', grid_image2, global_step)
                grid_image3 = make_grid(utils.color_label(torch.max(output[:3], 1)[1]), 3,
                                        normalize=False, range=(0, 255))
                writer.add_image('Predicted label', grid_image3, global_step)
                grid_image4 = make_grid(utils.color_label(label[:3]), 3, normalize=False, range=(0, 255))
                writer.add_image('Groundtruth label', grid_image4, global_step)
        tq.close()
        loss_train_mean = np.mean(loss_record)
        writer.add_scalar('epoch/loss_epoch_train', float(loss_train_mean), epoch)
        print('loss for train : %f' % loss_train_mean)

        # Save the weights every save_epoch_freq epochs
        if epoch % args.save_epoch_freq == 0 and epoch != args.start_epoch:
            if not os.path.isdir(args.ckpt_dir):
                os.mkdir(args.ckpt_dir)
            save_ckpt(args.ckpt_dir, model, optimizer, global_step, epoch, local_count, num_train)
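load_ckpt and save_ckpt are shared by all of these scripts; a minimal sketch matching the call signatures above (the checkpoint filename pattern and the stored keys are assumptions):

import os
import torch

def save_ckpt(ckpt_dir, model, optimizer, global_step, epoch, local_count, num_train):
    # Store everything needed to resume training; the filename pattern is assumed.
    state = {
        'global_step': global_step,
        'epoch': epoch,
        'local_count': local_count,
        'num_train': num_train,
        'state_dict': model.state_dict(),
        'optimizer': optimizer.state_dict(),
    }
    path = os.path.join(ckpt_dir, 'ckpt_epoch_{:04d}.pth'.format(epoch))
    torch.save(state, path)
    print('Checkpoint saved to {}'.format(path))

def load_ckpt(model, optimizer, ckpt_path, device):
    # Restore model (and optionally optimizer) state; returns (global_step, start_epoch)
    # as expected by the training loops above.
    checkpoint = torch.load(ckpt_path, map_location=device)
    model.load_state_dict(checkpoint['state_dict'])
    if optimizer is not None and 'optimizer' in checkpoint:
        optimizer.load_state_dict(checkpoint['optimizer'])
    return checkpoint['global_step'], checkpoint['epoch']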