def evaluate(opt): """Evaluates a pretrained model using a specified test set """ MIN_DEPTH = 1e-3 MAX_DEPTH = 80 opt.load_weights_folder = os.path.expanduser(opt.load_weights_folder) assert os.path.isdir(opt.load_weights_folder), \ "Cannot find a folder at {}".format(opt.load_weights_folder) print("-> Loading weights from {}".format(opt.load_weights_folder)) filenames = readlines(os.path.join(splits_dir, opt.split, "val_files.txt")) encoder_path = os.path.join(opt.load_weights_folder, "encoder.pth") decoder_path = os.path.join(opt.load_weights_folder, "depth.pth") encoder_dict = torch.load(encoder_path) if opt.use_stereo: opt.frame_ids.append("s") if opt.dataset == 'cityscape': dataset = datasets.CITYSCAPERawDataset(opt.data_path, filenames, encoder_dict['height'], encoder_dict['width'], opt.frame_ids, 4, is_train=False, tag=opt.dataset, load_meta=True, is_sep_train_seman=False) elif opt.dataset == 'kitti': dataset = datasets.KITTIRAWDataset(opt.data_path, filenames, encoder_dict['height'], encoder_dict['width'], opt.frame_ids, 4, is_train=False, tag=opt.dataset) else: raise ValueError("No predefined dataset") dataloader = DataLoader(dataset, batch_size=opt.batch_size, shuffle=False, num_workers=opt.num_workers, pin_memory=True, drop_last=True) encoder = networks.ResnetEncoder(opt.num_layers, False) if opt.switchMode == 'on': depth_decoder = networks.DepthDecoder(encoder.num_ch_enc, isSwitch=True, isMulChannel=opt.isMulChannel) else: depth_decoder = networks.DepthDecoder(encoder.num_ch_enc) model_dict = encoder.state_dict() encoder.load_state_dict( {k: v for k, v in encoder_dict.items() if k in model_dict}) depth_decoder.load_state_dict(torch.load(decoder_path)) encoder.cuda() encoder.eval() depth_decoder.cuda() depth_decoder.eval() # x = torch.ones(2, 2, requires_grad=True) # print(x) # y = x + 2 + x # y = y.detach() # print(y) # z = y * y * 3 # out = z.mean() # print(z, out) # out.backward() # print(x.grad) ##--------------------Visualization parameter here----------------------------## sfx = torch.nn.Softmax(dim=1) mergeDisp = Merge_MultDisp(opt.scales, batchSize=opt.batch_size, isMulChannel=opt.isMulChannel) svRoot = '/media/shengjie/other/sceneUnderstanding/monodepth2/internalRe/figure_visual' index = 0 isvisualize = True viewEdgeMerge = False isHist = False useGtSeman = True viewSurfaceNormal = True viewSelfOcclu = True viewDispUp = True viewSmooth = True viewMulReg = True viewBorderRegress = False viewBorderSimilarity = False viewRandomSample = True viewSemanReg = False viewDepthGuess = False height = 256 width = 512 tensor23dPts = Tensor23dPts() if isHist: rec = np.zeros((19, 100)) if opt.isMulChannel: app = os.path.join('mulDispOn', opt.model_name) else: app = os.path.join('mulDispOff', opt.model_name) dirpath = os.path.join(svRoot, app) if not os.path.exists(dirpath): os.makedirs(dirpath) if viewEdgeMerge: comp1dgrad = Comp1dgrad().cuda() if viewSurfaceNormal: compsn = ComputeSurfaceNormal(height=height, width=width, batch_size=opt.batch_size).cuda() if viewSelfOcclu: selfclu = SelfOccluMask().cuda() with torch.no_grad(): for idx, inputs in enumerate(dataloader): # if idx != 12: # continue for key, ipt in inputs.items(): if not (key == 'height' or key == 'width' or key == 'tag' or key == 'cts_meta'): inputs[key] = ipt.to(torch.device("cuda")) input_color = inputs[("color", 0, 0)].cuda() # input_color = torch.flip(input_color, dims=[3]) features = encoder(input_color) outputs = dict() outputs.update( depth_decoder(features, computeSemantic=True, computeDepth=False)) 
outputs.update( depth_decoder(features, computeSemantic=False, computeDepth=True)) # view the processed semantic seperate training data # for viewInd in range(opt.batch_size): # label = inputs['semanTrain_label'] # visualize_semantic(label[viewInd, 0, :, :].cpu().numpy()).show() # fig_rgb = inputs['semanTrain_rgb'][viewInd, :, :, :].permute(1, 2, 0).cpu().numpy() # fig_rgb = (fig_rgb * 255).astype(np.uint8) # fig_rgb = pil.fromarray(fig_rgb) # fig_rgb.show() if isHist: mulDisp = outputs[('mul_disp', 0)] scaled_disp, mulDepth = disp_to_depth(mulDisp, 0.1, 100) mulDepth = mulDepth.cpu() for i in range(mulDisp.shape[1]): rec[i, :] += torch.histc(mulDepth[:, i, :, :], bins=100, min=0, max=100).numpy() if isvisualize: if useGtSeman: # outputs[('mul_disp', 0)][:,2,:,:] = outputs[('mul_disp', 0)][:,2,:,:] * 0 # outputs[('mul_disp', 0)][:, 12, :, :] = outputs[('mul_disp', 0)][:, 12, :, :] * 0 mergeDisp(inputs, outputs, eval=False) else: mergeDisp(inputs, outputs, eval=True) dispMap = outputs[('disp', 0)] scaled_disp, depthMap = disp_to_depth(dispMap, 0.1, 100) depthMap = depthMap * STEREO_SCALE_FACTOR # _, mul_depthMap = disp_to_depth(outputs[('mul_disp', 0)], 0.1, 100) # mul_depthMap = mul_depthMap * STEREO_SCALE_FACTOR if viewDispUp: fig_dispup = compDispUp.visualize(scaled_disp, viewindex=index) if viewSmooth: rgb = inputs[('color_aug', 0, 0)] smoothfig = comSmooth.visualize(rgb=rgb, disp=scaled_disp, viewindex=index) if useGtSeman: fig_seman = tensor2semantic(inputs['seman_gt'], ind=index, isGt=True) else: fig_seman = tensor2semantic(outputs[('seman', 0)], ind=index) if viewSemanReg: foregroundType = [ 11, 12, 13, 14, 15, 16, 17, 18 ] # person, rider, car, truck, bus, train, motorcycle, bicycle softmaxedSeman = F.softmax(outputs[('seman', 0)], dim=1) forePredMask = torch.sum( softmaxedSeman[:, foregroundType, :, :], dim=1, keepdim=True) foreGtMask = torch.ones(dispMap.shape).cuda().byte() for m in foregroundType: foreGtMask = foreGtMask * (inputs['seman_gt'] != m) foreGtMask = 1 - foreGtMask foreGtMask = foreGtMask.float() forePredMask[forePredMask > 0.5] = 1 forePredMask[forePredMask <= 0.5] = 0 forePredMask = foreGtMask rdSampleSeman.visualizeBorderSample(dispMap, forePredMask, gtMask=foreGtMask, viewIndex=index) cm = plt.get_cmap('magma') viewForePred = forePredMask[index, :, :, :].squeeze( 0).detach().cpu().numpy() viewForePred = (cm(viewForePred) * 255).astype(np.uint8) # pil.fromarray(viewForePred).show() viewForeGt = foreGtMask[index, :, :, :].squeeze( 0).detach().cpu().numpy() viewForeGt = (cm(viewForeGt) * 255).astype(np.uint8) # pil.fromarray(viewForeGt).show() forePredictCombined = np.concatenate( [viewForePred, viewForeGt], axis=0) # pil.fromarray(forePredictCombined).show() pil.fromarray(forePredictCombined).save( os.path.join(dirpath, str(idx) + '_fg.png')) if viewDepthGuess: wallType = [2, 3, 4] # Building, wall, fence roadType = [0, 1, 9] # road, sidewalk, terrain foregroundType = [ 5, 6, 7, 11, 12, 13, 14, 15, 16, 17, 18 ] # pole, traffic light, traffic sign, person, rider, car, truck, bus, train, motorcycle, bicycle wallTypeMask = torch.ones(dispMap.shape).cuda().byte() roadTypeMask = torch.ones(dispMap.shape).cuda().byte() foreGroundMask = torch.ones(dispMap.shape).cuda().byte() with torch.no_grad(): for m in wallType: wallTypeMask = wallTypeMask * (inputs['seman_gt'] != m) wallTypeMask = (1 - wallTypeMask).float() for m in roadType: roadTypeMask = roadTypeMask * (inputs['seman_gt'] != m) roadTypeMask = (1 - roadTypeMask).float() for m in foregroundType: foreGroundMask = 
foreGroundMask * ( inputs['seman_gt'] != m) foreGroundMask = (1 - foreGroundMask).float() originalSieze = [2048, 1024] # currentSize = np.array([dispMap.shape[3], dispMap.shape[2]]) # scaleFac = np.eye(4) # scaleFac[0,0] = currentSize[0] / originalSieze[0] # scaleFac[1,1] = currentSize[1] / originalSieze[1] # scaleFac = torch.Tensor(scaleFac).view(1,4,4).repeat(opt.batch_size, 1, 1).cuda() # scaledIntrinsic = scaleFac @ inputs['realIn'] scaledIntrinsic = inputs['realIn'] depthGuess.visualizeDepthGuess( realDepth=depthMap, dispAct=dispMap, foredgroundMask=foreGroundMask, wallTypeMask=wallTypeMask, groundTypeMask=roadTypeMask, intrinsic=scaledIntrinsic, extrinsic=inputs['realEx'], semantic=inputs['seman_gt_eval'], cts_meta=inputs['cts_meta'], viewInd=index) # realDepth, foredgroundMask, wallTypeMask, groundTypeMask, intrinsic, extrinsic fig_rgb = tensor2rgb(inputs[('color', 0, 0)], ind=index) fig_disp = tensor2disp(outputs[('disp', 0)], ind=index) fig_3d, veh_coord, veh_coord_gt = tensor23dPts.visualize3d( depthMap, ind=index, intrinsic=inputs['cts_meta']['intrinsic'][index, :, :], extrinsic=inputs['cts_meta']['extrinsic'][index, :, :], gtmask=inputs['cts_meta']['mask'][index, :, :], gtdepth=inputs['cts_meta']['depthMap'][index, :, :], semanticMap=inputs['seman_gt_eval'][index, :, :]) # check: # torch.inverse(inputs['invcamK'][index, :, :] @ inputs['realIn'][index, :, :]) - inputs['cts_meta']['extrinsic'][index, :, :] fig_grad = None if viewSurfaceNormal: # surnorm = compsn.visualize(depthMap = depthMap, invcamK = inputs['invcamK'].cuda(), orgEstPts = veh_coord, gtEstPts = veh_coord_gt, viewindex = index) surnorm = compsn.visualize( depthMap=depthMap, invcamK=inputs['invcamK'].cuda(), orgEstPts=veh_coord, gtEstPts=veh_coord_gt, viewindex=index) surnormMap = compsn(depthMap=depthMap, invcamK=inputs['invcamK'].cuda()) if viewMulReg: depthMapLoc = depthMap / STEREO_SCALE_FACTOR skyId = 10 skyMask = inputs['seman_gt'] == skyId skyerr = objReg.visualize_regularizeSky(depthMapLoc, skyMask, viewInd=index) wallType = [2, 3, 4] # Building, wall, fence roadType = [0, 1, 9] # road, sidewalk, terrain permuType = [5, 7] # Pole, traffic sign chanWinSize = 5 wallMask = torch.ones_like(skyMask) roadMask = torch.ones_like(skyMask) permuMask = torch.ones_like(skyMask) with torch.no_grad(): for m in wallType: wallMask = wallMask * (inputs['seman_gt'] != m) wallMask = 1 - wallMask wallMask = wallMask[:, :, 1:-1, 1:-1] for m in roadType: roadMask = roadMask * (inputs['seman_gt'] != m) roadMask = 1 - roadMask roadMask = roadMask[:, :, 1:-1, 1:-1] for m in permuType: permuMask = permuMask * (inputs['seman_gt'] != m) permuMask = 1 - permuMask permuMask = permuMask[:, :, 1:-1, 1:-1] BdErrFig, viewRdErrFig = objReg.visualize_regularizeBuildingRoad( surnormMap, wallMask, roadMask, dispMap, viewInd=index) padSize = int((chanWinSize - 1) / 2) permuMask = permuMask[:, :, padSize:-padSize, padSize:-padSize] surVarFig = objReg.visualize_regularizePoleSign( surnormMap, permuMask, dispMap, viewInd=index) if viewBorderRegress: foregroundType = [ 5, 6, 7, 11, 12, 13, 14, 15, 16, 17, 18 ] # pole, traffic light, traffic sign, person, rider, car, truck, bus, train, motorcycle, bicycle backgroundType = [ 0, 1, 2, 3, 4, 8, 9, 10 ] # road, sidewalk, building, wall, fence, vegetation, terrain, sky suppressType = [255] # Suppress no label lines # foreGroundMask = torch.sum(inputs['seman_gt'][:, foregroundType, :, :], dim=1, keepdim=True) # backGroundMask = torch.sum(inputs['seman_gt'][:, backgroundType, :, :], dim=1, keepdim=True) 
foreGroundMask = torch.ones(dispMap.shape).cuda().byte() backGroundMask = torch.ones(dispMap.shape).cuda().byte() suppresMask = torch.ones(dispMap.shape).cuda().byte() with torch.no_grad(): for m in foregroundType: foreGroundMask = foreGroundMask * ( inputs['seman_gt'] != m) foreGroundMask = 1 - foreGroundMask for m in backgroundType: backGroundMask = backGroundMask * ( inputs['seman_gt'] != m) backGroundMask = 1 - backGroundMask for m in suppressType: suppresMask = suppresMask * (inputs['seman_gt'] != m) suppresMask = 1 - suppresMask suppresMask = suppresMask.float() combinedMask = torch.cat( [foreGroundMask, backGroundMask], dim=1).float() # borderRegFig = borderRegress.visualize_computeBorder(dispMap, combinedMask, suppresMask = suppresMask, viewIndex=index) borderRegFig = None else: borderRegFig = None # if viewBorderSimilarity: # foregroundType = [5, 6, 7, 11, 12, 13, 14, 15, 16, 17, # 18] # pole, traffic light, traffic sign, person, rider, car, truck, bus, train, motorcycle, bicycle # backgroundType = [0, 1, 2, 3, 4, 8, 9, # 10] # road, sidewalk, building, wall, fence, vegetation, terrain, sky # suppressType = [255] # Suppress no label lines # foreGroundMask = torch.ones(dispMap.shape).cuda().byte() # backGroundMask = torch.ones(dispMap.shape).cuda().byte() # suppresMask = torch.ones(dispMap.shape).cuda().byte() # # with torch.no_grad(): # for m in foregroundType: # foreGroundMask = foreGroundMask * (inputs['seman_gt'] != m) # foreGroundMask = 1 - foreGroundMask # for m in backgroundType: # backGroundMask = backGroundMask * (inputs['seman_gt'] != m) # backGroundMask = 1 - backGroundMask # for m in suppressType: # suppresMask = suppresMask * (inputs['seman_gt'] != m) # suppresMask = 1 - suppresMask # suppresMask = suppresMask.float() # combinedMask = torch.cat([foreGroundMask, backGroundMask], dim=1).float() # # borderSimFig = borderSim.visualize_borderSimilarity(dispMap, foreGroundMask.float(), suppresMask = suppresMask, viewIndex=index) if viewRandomSample: foregroundType = [ 5, 6, 7, 11, 12, 13, 14, 15, 16, 17, 18 ] # pole, traffic light, traffic sign, person, rider, car, truck, bus, train, motorcycle, bicycle backgroundType = [ 0, 1, 2, 3, 4, 8, 9, 10 ] # road, sidewalk, building, wall, fence, vegetation, terrain, sky suppressType = [255] # Suppress no label lines foreGroundMask = torch.ones(dispMap.shape).cuda().byte() backGroundMask = torch.ones(dispMap.shape).cuda().byte() suppresMask = torch.ones(dispMap.shape).cuda().byte() with torch.no_grad(): for m in foregroundType: foreGroundMask = foreGroundMask * ( inputs['seman_gt'] != m) foreGroundMask = 1 - foreGroundMask for m in suppressType: suppresMask = suppresMask * (inputs['seman_gt'] != m) suppresMask = 1 - suppresMask suppresMask = suppresMask.float() foreGroundMask = foreGroundMask.float() rdSampleOnBorder.visualize_randomSample(dispMap, foreGroundMask, suppresMask, viewIndex=index) # rdSampleOnBorder.randomSampleReg(dispMap, foreGroundMask) if viewEdgeMerge: grad_disp = comp1dgrad(outputs[('mul_disp', 0)]) fig_grad = tensor2disp(grad_disp, ind=index, vmax=1) fig_grad = fig_grad.resize([512, 256]) if viewSelfOcclu: fl = inputs[("K", 0)][:, 0, 0] bs = torch.abs(inputs["stereo_T"][:, 0, 3]) clufig, suppressedDisp = selfclu.visualize(dispMap, viewind=index) if fig_grad is not None: grad_seman = ( np.array(fig_grad)[:, :, 0:3].astype(np.float) * 0.7 + np.array(fig_seman).astype(np.float) * 0.3).astype( np.uint8) # combined = [np.array(fig_disp)[:, :, 0:3], np.array(fig_grad)[:, :, 0:3], np.array(fig_seman), np.array(fig_rgb)] 
combined = [ grad_seman, np.array(fig_disp)[:, :, 0:3], np.array(fig_rgb) ] combined = np.concatenate(combined, axis=1) else: if viewSurfaceNormal and viewSelfOcclu: surnorm = surnorm.resize([512, 256]) surnorm_mixed = pil.fromarray( (np.array(surnorm) * 0.2 + np.array(fig_disp)[:, :, 0:3] * 0.8).astype( np.uint8)) disp_seman = ( np.array(fig_disp)[:, :, 0:3].astype(np.float) * 0.8 + np.array(fig_seman).astype(np.float) * 0.2).astype( np.uint8) supprressed_disp_seman = ( np.array(suppressedDisp)[:, :, 0:3].astype( np.float) * 0.8 + np.array(fig_seman).astype(np.float) * 0.2).astype( np.uint8) rgb_seman = ( np.array(fig_seman).astype(np.float) * 0.5 + np.array(fig_rgb).astype(np.float) * 0.5).astype( np.uint8) # clud_disp = (np.array(clufig)[:, :, 0:3].astype(np.float) * 0.3 + np.array(fig_disp)[:, :, 0:3].astype( # np.float) * 0.7).astype(np.uint8) comb1 = np.concatenate([ np.array(supprressed_disp_seman)[:, :, 0:3], np.array(suppressedDisp)[:, :, 0:3] ], axis=1) comb2 = np.concatenate([ np.array(disp_seman)[:, :, 0:3], np.array(fig_disp)[:, :, 0:3] ], axis=1) comb3 = np.concatenate([ np.array(surnorm_mixed)[:, :, 0:3], np.array(surnorm)[:, :, 0:3] ], axis=1) comb4 = np.concatenate([ np.array(fig_seman)[:, :, 0:3], np.array(rgb_seman)[:, :, 0:3] ], axis=1) comb6 = np.concatenate([ np.array(clufig)[:, :, 0:3], np.array(fig_dispup)[:, :, 0:3] ], axis=1) fig3dsize = np.ceil( np.array([ comb4.shape[1], comb4.shape[1] / fig_3d.size[0] * fig_3d.size[1] ])).astype(np.int) comb5 = np.array(fig_3d.resize(fig3dsize)) # combined = np.concatenate([comb1, comb6, comb2, comb3, comb4, comb5], axis=0) combined = np.concatenate([comb1, comb2, comb4, comb3], axis=0) else: disp_seman = ( np.array(fig_disp)[:, :, 0:3].astype(np.float) * 0.8 + np.array(fig_seman).astype(np.float) * 0.2).astype( np.uint8) rgb_seman = ( np.array(fig_seman).astype(np.float) * 0.5 + np.array(fig_rgb).astype(np.float) * 0.5).astype( np.uint8) # combined = [np.array(disp_seman)[:,:,0:3], np.array(fig_disp)[:, :, 0:3], np.array(fig_seman), np.array(fig_rgb)] combined = [ np.array(disp_seman)[:, :, 0:3], np.array(fig_disp)[:, :, 0:3], np.array(fig_seman), np.array(rgb_seman) ] combined = np.concatenate(combined, axis=1) fig = pil.fromarray(combined) # fig.show() fig.save(os.path.join(dirpath, str(idx) + '.png')) if borderRegFig is not None: borderRegFig.save( os.path.join(dirpath, str(idx) + '_borderRegress.png')) # fig_3d.save(os.path.join(dirpath, str(idx) + '_fig3d.png')) # for k in range(10): # fig_disp = tensor2disp(outputs[('disp', 0)], ind=k) # fig_rgb = tensor2rgb(inputs[('color', 0, 0)], ind=k) # combined = [np.array(fig_disp)[:, :, 0:3], np.array(fig_rgb)] # combined = np.concatenate(combined, axis=1) # fig = pil.fromarray(combined) # fig.save( # os.path.join('/media/shengjie/other/sceneUnderstanding/monodepth2/internalRe/MoredispOrg' + str(k) + '.png')) # fig_rgb.save(os.path.join(svRoot, app, 'rgb' + str(idx) + '.png')) # fig_seman.save(os.path.join(svRoot, app, 'semantic'+ str(idx) + '.png')) # fig_disp.save(os.path.join(svRoot, app, 'disp'+ str(idx) + '.png')) # a = inputs['seman_gt_eval'] # scaled_disp, _ = disp_to_depth(outputs[('disp', 0)], 0.1, 100) print("%dth saved" % idx) # If compute the histogram if isHist: svPath = '/media/shengjie/other/sceneUnderstanding/monodepth2/internalRe/mul_channel_depth' carId = 13 prob = copy.deepcopy(rec) ind = np.arange(prob.shape[1] * 2) for i in range(prob.shape[0]): prob[i, :] = prob[i, :] / np.sum(prob[i, :]) for i in range(prob.shape[0]): trainStr = trainId2label[i][0] fig, ax 
= plt.subplots()
            rects1 = ax.bar(ind[0::2], prob[carId, :], label='obj:car')
            rects2 = ax.bar(ind[1::2], prob[i, :], label='obj:' + trainStr)
            ax.set_ylabel('Meter in percentile')
            ax.set_xlabel('Meters')
            ax.set_title('Scale Changes between scale car and scale %s' % trainStr)
            ax.legend()
            plt.savefig(os.path.join(svPath, str(i)), dpi=200)
            plt.close(fig)
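# --------------------------------------------------------------------------- #
# Illustrative sketch, not part of the original evaluation scripts: the code
# above converts the network's sigmoid disparity to depth via disp_to_depth()
# before applying STEREO_SCALE_FACTOR. Assuming the usual monodepth2-style
# convention, that conversion amounts to the helper below; the name
# _disp_to_depth_sketch is hypothetical.
# --------------------------------------------------------------------------- #
def _disp_to_depth_sketch(disp, min_depth=0.1, max_depth=100.0):
    """Map sigmoid output in [0, 1] to (scaled_disp, depth) within a depth range."""
    min_disp = 1.0 / max_depth                      # disparity of the farthest point
    max_disp = 1.0 / min_depth                      # disparity of the nearest point
    scaled_disp = min_disp + (max_disp - min_disp) * disp
    depth = 1.0 / scaled_disp                       # e.g. disp = 1 -> depth = min_depth
    return scaled_disp, depth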
def evaluate(opt): """Evaluates a pretrained model using a specified test set """ is_use_disparity = True is_eval_morph = True is_cts_bst = True MIN_DEPTH = 1e-3 MAX_DEPTH = 80 if is_use_disparity: getDisp = get_disparity_predict() opt.load_weights_folder = os.path.expanduser(opt.load_weights_folder) assert os.path.isdir(opt.load_weights_folder), \ "Cannot find a folder at {}".format(opt.load_weights_folder) print("-> Loading weights from {}".format(opt.load_weights_folder)) filenames = readlines(os.path.join(splits_dir, opt.split, "val_files.txt")) encoder_path = os.path.join(opt.load_weights_folder, "encoder.pth") decoder_path = os.path.join(opt.load_weights_folder, "depth.pth") encoder_dict = torch.load(encoder_path) if opt.dataset == 'cityscape': dataset = datasets.CITYSCAPERawDataset(opt.data_path, filenames, encoder_dict['height'], encoder_dict['width'], [0], 4, is_train=False, tag=opt.dataset) elif opt.dataset == 'kitti': dataset = datasets.KITTISemanticDataset(opt.data_path, filenames, encoder_dict['height'], encoder_dict['width'], [0], 4, is_train=False, tag=opt.dataset) train_dataset_predict = datasets.KITTIRAWDataset( opt.data_path, filenames, encoder_dict['height'], encoder_dict['width'], [0,'s'], 4, tag='kitti', is_train=False, img_ext='png', load_meta=False, is_load_semantics=True, is_predicted_semantics=True, load_morphed_depth=False) train_dataset_gt = datasets.KITTIRAWDataset( opt.data_path, filenames, encoder_dict['height'], encoder_dict['width'], [0,'s'], 4, tag='kitti', is_train=False, img_ext='png', load_meta=False, is_load_semantics=True, is_predicted_semantics=False, load_morphed_depth=False) else: raise ValueError("No predefined dataset") dataloader_predict = DataLoader(train_dataset_predict, 1, shuffle=False, num_workers=opt.num_workers, pin_memory=True, drop_last=False) dataloader_gt = DataLoader(train_dataset_gt, 1, shuffle=False, num_workers=opt.num_workers, pin_memory=True, drop_last=False) dataloader_predict_iter = iter(dataloader_predict) dataloader_gt_iter = iter(dataloader_gt) encoder = networks.ResnetEncoder(opt.num_layers, False) if opt.switchMode == 'on': depth_decoder = networks.DepthDecoder(encoder.num_ch_enc, isSwitch=True, isMulChannel=opt.isMulChannel) else: depth_decoder = networks.DepthDecoder(encoder.num_ch_enc) model_dict = encoder.state_dict() encoder.load_state_dict({k: v for k, v in encoder_dict.items() if k in model_dict}) depth_decoder.load_state_dict(torch.load(decoder_path)) encoder.cuda() encoder.eval() depth_decoder.cuda() depth_decoder.eval() sfx = torch.nn.Softmax(dim=1) depth_pos = '/media/shengjie/other/sceneUnderstanding/bts/result_bts_eigen/raw' print("Evaluation starts") width = 1216 height = 352 height_s = int(0.40810811 * height) height_e = int(0.99189189 * height) width_s = int(0.03594771 * width) width_e = int(0.96405229 * width) if not is_use_disparity: ms = Morph_semantics(height=206, width=1129) else: ms = Morph_semantics(height=218, width=1153) with torch.no_grad(): for idx in range(dataloader_gt.__len__()): inputs_predict = dataloader_predict_iter.__next__() inputs_gt = dataloader_gt_iter.__next__() if not is_cts_bst: inputs_predict['seman_gt_eval'] = inputs_predict['seman_gt_eval'] else: tcomp = filenames[idx].split(' ') path = os.path.join('/media/shengjie/other/sceneUnderstanding/SDNET/cts_best_seman', tcomp[0].split('/')[0] +'_' + tcomp[0].split('/')[1] + '_' + tcomp[1].zfill(10) + '.png') cts_pred = Image.open(path) cts_pred = np.array(cts_pred) for k in np.unique(cts_pred): cts_pred[cts_pred == k] = 
labels[k].trainId inputs_predict['seman_gt_eval'] = torch.from_numpy(cts_pred).unsqueeze(0) # tensor2semantic(inputs_predict['seman_gt_eval'].unsqueeze(1), ind=0).show() # tensor2semantic(inputs_gt['seman_gt_eval'].unsqueeze(1), ind=0).show() # tensor2semantic(inputs_predict['seman_gt_eval'].unsqueeze(1), ind=0).show() # input_color = inputs[("color", 0, 0)].cuda() # outputs = depth_decoder(encoder(input_color),computeSemantic = True, computeDepth = False) resized_gt = inputs_gt['seman_gt_eval'].unsqueeze(1) # resized_gt = F.interpolate(inputs_gt['seman_gt_eval'].unsqueeze(1).float(), [height, width], mode='nearest') # resized_gt = resized_gt.squeeze(1).byte() resized_pred = F.interpolate(inputs_predict['seman_gt_eval'].unsqueeze(1).float(), [inputs_gt['seman_gt_eval'].shape[1], inputs_gt['seman_gt_eval'].shape[2]], mode='nearest') resized_pred = resized_pred.byte() resized_rgb = F.interpolate(inputs_gt[('color', 0, 0)], [inputs_gt['seman_gt_eval'].shape[1], inputs_gt['seman_gt_eval'].shape[2]], mode='bilinear', align_corners=True) resized_pred_list = list() resized_morph_list = list() groundTruthNp_list = list() if not is_use_disparity: t_height = resized_gt.shape[2] t_width = resized_gt.shape[3] top_margin = int(t_height - 352) left_margin = int((t_width - 1216) / 2) resized_gt = resized_gt[:,:,top_margin:top_margin + 352, left_margin:left_margin + 1216] resized_pred = resized_pred[:,:,top_margin:top_margin + 352, left_margin:left_margin + 1216] # tensor2semantic(resized_gt, ind=0).show() # tensor2semantic(resized_pred, ind=0).show() resized_rgb = F.interpolate(inputs_gt[('color', 0, 0)], [inputs_gt['seman_gt_eval'].shape[1], inputs_gt['seman_gt_eval'].shape[2]], mode='bilinear', align_corners=True) resized_rgb = resized_rgb[:,:,top_margin:top_margin + 352, left_margin:left_margin + 1216] pred_depth = get_depth_predict(filenames[idx]) resized_depth = pred_depth # resized_gt = resized_gt.cpu().numpy().astype(np.uint8) # resized_pred = resized_pred.cpu().numpy().astype(np.uint8) # resized_depth = pred_depth # visualize_semantic(gt[0,:,:]).show() # visualize_semantic(pred[0,:,:]).show() # pred_depth = get_depth_predict(filenames[idx]) # pred_depth = F.interpolate(pred_depth.float(), [height, width], mode='bilinear', align_corners=True) # resized_pred = resized_pred.unsqueeze(1) # resized_gt = resized_gt.unsqueeze(1) # tensor2semantic(resized_pred, ind=0).show() # tensor2semantic(resized_gt, ind=0).show() # tensor2disp(1 / pred_depth, vmax=0.15, ind=0).show() # disp_map = tensor2disp(1 / pred_depth, vmax=0.15, ind=0) # disp_map_combined = combined_2_img(disp_map, tensor2rgb(resized_rgb, ind=0), 0.5) pred_depth_cropped = resized_depth[:,:,height_s : height_e, width_s : width_e] resized_pred_cropped = resized_pred[:,:,height_s : height_e, width_s : width_e] resized_gt_cropped = resized_gt[:,:,height_s : height_e, width_s : width_e] resized_rgb_cropped = resized_rgb[:,:,height_s : height_e, width_s : width_e] # tensor2semantic(resized_pred_cropped, ind=0).show() # tensor2semantic(resized_gt_cropped, ind=0).show() # tensor2disp(1 / pred_depth_cropped, vmax=0.15, ind=0).show() figseman_gt = tensor2semantic(resized_gt_cropped, ind=0) figseman_pred = tensor2semantic(resized_pred_cropped, ind=0) figdisp = tensor2disp(1 / pred_depth_cropped, vmax=0.15, ind=0) combined_2_img(figseman_pred, figdisp, 0.7).show() combined_2_img(figseman_gt, figdisp, 0.7).show() seman_morphed = ms.morh_semantics(pred_depth_cropped, resized_pred_cropped) else: pred_depth = getDisp.read_disparity_predict(filenames[idx]) 
pred_depth = torch.from_numpy(pred_depth).unsqueeze(0).unsqueeze(0) pred_depth = F.interpolate(pred_depth, [inputs_gt['seman_gt_eval'].shape[1], inputs_gt['seman_gt_eval'].shape[2]], mode='bilinear', align_corners=True) # tensor2disp(pred_depth, ind=0, percentile=95).show() if pred_depth.shape[2] < 371 or pred_depth.shape[3] < 1197: print("Error") pred_depth_cropped = pred_depth[:, :, 153:371, 44:1197] resized_pred_cropped = resized_pred[:, :, 153:371, 44:1197] resized_gt_cropped = resized_gt[:, :, 153:371, 44:1197] resized_rgb_cropped = resized_rgb[:, :, 153:371, 44:1197] # figdisp = tensor2disp(pred_depth_cropped, percentile=95, ind=0) # figseman = tensor2semantic(resized_gt_cropped, ind=0) # figcombined = combined_2_img(figdisp, figseman, 0.7) # figcombined.show() # # figdisp = tensor2disp(pred_depth_cropped, percentile=95, ind=0) # figseman = tensor2semantic(resized_pred_cropped, ind=0) # figcombined = combined_2_img(figdisp, figseman, 0.7) # figcombined.show() seman_morphed = ms.morh_semantics(pred_depth_cropped, resized_pred_cropped) ms.compute_edge_distance(pred_depth_cropped, resized_pred_cropped, resized_gt_cropped) resized_pred_list.append(resized_pred_cropped.squeeze(1).detach().cpu().numpy()) resized_morph_list.append(seman_morphed.squeeze(1).detach().cpu().numpy().astype(np.uint8)) groundTruthNp_list.append(resized_gt_cropped.squeeze(1).detach().cpu().numpy()) sv_path = '/media/shengjie/other/sceneUnderstanding/SDNET/visualization/semantic_morph' gt_blended = combined_2_img(tensor2semantic(resized_gt_cropped, ind=0), tensor2rgb(resized_rgb_cropped, ind=0), 0.2) pred_blended = combined_2_img(tensor2semantic(resized_pred_cropped, ind=0), tensor2rgb(resized_rgb_cropped, ind=0), 0.2) morph_blended = combined_2_img(tensor2semantic(seman_morphed, ind=0), tensor2rgb(resized_rgb_cropped, ind=0), 0.2) improved_region = (seman_morphed.cuda().byte() == resized_gt_cropped.cuda().byte()) > (resized_pred_cropped.cuda().byte() == resized_gt_cropped.cuda().byte()) deterized_region = (seman_morphed.cuda().byte() == resized_gt_cropped.cuda().byte()) < ( resized_pred_cropped.cuda().byte() == resized_gt_cropped.cuda().byte()) improve_blend = combined_2_img(tensor2disp(improved_region, vmax = 1, ind=0), tensor2rgb(resized_rgb_cropped, ind=0), 0.6) deterized_blend = combined_2_img(tensor2disp(deterized_region, vmax = 1, ind=0), tensor2rgb(resized_rgb_cropped, ind=0), 0.6) cat_img = concat_imgs([gt_blended, pred_blended, morph_blended, improve_blend, deterized_blend]) cat_img.save(os.path.join('/media/shengjie/other/sceneUnderstanding/SDNET/visualization/semantic_morph', str(idx) + '.png')) # groundTruthNp = resized_gt_cropped.squeeze(1).detach().cpu().numpy() # if is_eval_morph: # predictionNp = seman_morphed.byte().squeeze(1).detach().cpu().numpy() # else: # predictionNp = resized_pred_cropped.squeeze(1).detach().cpu().numpy() print("Finish %dth batch" % idx) ms.show_dis_comp() for pp in range(2): nbPixels = 0 count255 = 0 confMatrix = generateMatrix(args) for k in range(len(resized_pred_list)): groundTruthNp = groundTruthNp_list[k] if pp == 0: predictionNp = resized_pred_list[k] else: predictionNp = resized_morph_list[k] nbPixels = nbPixels + groundTruthNp.shape[0] * groundTruthNp.shape[1] * groundTruthNp.shape[2] encoding_value = 256 # precomputed encoded = (groundTruthNp.astype(np.int32) * encoding_value) + predictionNp values, cnt = np.unique(encoded, return_counts=True) for value, c in zip(values, cnt): pred_id = value % encoding_value gt_id = int((value - pred_id) / encoding_value) if 
pred_id == 255 or gt_id == 255: count255 = count255 + c continue if not gt_id in args.evalLabels: printError("Unknown label with id {:}".format(gt_id)) confMatrix[gt_id][pred_id] += c if confMatrix.sum() + count255!= nbPixels: printError( 'Number of analyzed pixels and entries in confusion matrix disagree: contMatrix {}, pixels {}'.format( confMatrix.sum(), nbPixels)) classScoreList = {} for label in args.evalLabels: labelName = trainId2label[label].name classScoreList[labelName] = getIouScoreForLabel(label, confMatrix, args) vals = np.array(list(classScoreList.values())) print(vals) mIOU = np.mean(vals[np.logical_not(np.isnan(vals))]) if pp == 0: print("Original mIOU is %f" % mIOU) else: print("Morphed mIOU is %f" % mIOU)
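# --------------------------------------------------------------------------- #
# Illustrative sketch, not original code: the visualization above marks where
# morphing helps or hurts by comparing per-pixel correctness before and after
# morphing (the "improved_region" / "deterized_region" masks). The same logic
# written out on plain boolean tensors, with hypothetical names:
# --------------------------------------------------------------------------- #
def _morph_change_masks_sketch(pred, morphed, gt):
    """Return masks of pixels that morphing fixed and pixels that it broke."""
    correct_before = pred == gt
    correct_after = morphed == gt
    improved = correct_after & (~correct_before)        # wrong -> correct
    deteriorated = (~correct_after) & correct_before    # correct -> wrong
    return improved, deteriorated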
def evaluate(opt):
    """Evaluates a pretrained model using a specified test set
    """
    MIN_DEPTH = 1e-3
    MAX_DEPTH = 80

    opt.load_weights_folder = os.path.expanduser(opt.load_weights_folder)
    assert os.path.isdir(opt.load_weights_folder), \
        "Cannot find a folder at {}".format(opt.load_weights_folder)
    print("-> Loading weights from {}".format(opt.load_weights_folder))

    filenames = readlines(os.path.join(splits_dir, opt.split, "val_files.txt"))
    encoder_path = os.path.join(opt.load_weights_folder, "encoder.pth")
    decoder_path = os.path.join(opt.load_weights_folder, "depth.pth")
    encoder_dict = torch.load(encoder_path)

    if opt.dataset == 'cityscape':
        dataset = datasets.CITYSCAPERawDataset(opt.data_path, filenames,
                                               encoder_dict['height'], encoder_dict['width'],
                                               [0], 4, is_train=False, tag=opt.dataset)
    elif opt.dataset == 'kitti':
        dataset = datasets.KITTIRAWDataset(
            opt.data_path, filenames, encoder_dict['height'], encoder_dict['width'],
            [0, 's'], 4, tag='kitti', is_train=False, img_ext='png', load_meta=False,
            is_load_semantics=True, is_predicted_semantics=True, load_morphed_depth=False)
    else:
        raise ValueError("No predefined dataset")
    dataloader = DataLoader(dataset, 16, shuffle=False,
                            num_workers=opt.num_workers, pin_memory=True, drop_last=False)

    encoder = networks.ResnetEncoder(opt.num_layers, False)
    if opt.switchMode == 'on':
        depth_decoder = networks.DepthDecoder(encoder.num_ch_enc, isSwitch=True,
                                              isMulChannel=opt.isMulChannel)
    else:
        depth_decoder = networks.DepthDecoder(encoder.num_ch_enc)

    model_dict = encoder.state_dict()
    encoder.load_state_dict({k: v for k, v in encoder_dict.items() if k in model_dict})
    depth_decoder.load_state_dict(torch.load(decoder_path))

    encoder.cuda()
    encoder.eval()
    depth_decoder.cuda()
    depth_decoder.eval()

    sfx = torch.nn.Softmax(dim=1)
    print("Evaluation starts")

    # Accumulate a Cityscapes-style confusion matrix over the whole split.
    confMatrix = generateMatrix(args)
    nbPixels = 0
    count255 = 0
    with torch.no_grad():
        for idx, inputs in enumerate(dataloader):
            input_color = inputs[("color", 0, 0)].cuda()
            outputs = depth_decoder(encoder(input_color), computeSemantic=True, computeDepth=False)

            gt = inputs['seman_gt_eval'].cpu().numpy().astype(np.uint8)
            pred = sfx(outputs[('seman', 0)]).detach()
            pred = torch.argmax(pred, dim=1).type(torch.float).unsqueeze(1)
            pred = F.interpolate(pred, [gt.shape[1], gt.shape[2]], mode='nearest')
            pred = pred.squeeze(1).cpu().numpy().astype(np.uint8)
            # visualize_semantic(gt[0,:,:]).show()
            # visualize_semantic(pred[0,:,:]).show()

            groundTruthNp = gt
            predictionNp = pred
            nbPixels = nbPixels + groundTruthNp.shape[0] * groundTruthNp.shape[1] * groundTruthNp.shape[2]

            # Encode every (gt, pred) pixel pair as a single integer so the
            # confusion matrix can be filled from np.unique counts.
            # encoding_value = max(groundTruthNp.max(), predictionNp.max()).astype(np.int32) + 1
            encoding_value = 256  # precomputed
            encoded = (groundTruthNp.astype(np.int32) * encoding_value) + predictionNp
            values, cnt = np.unique(encoded, return_counts=True)

            for value, c in zip(values, cnt):
                pred_id = value % encoding_value
                gt_id = int((value - pred_id) / encoding_value)
                if pred_id == 255 or gt_id == 255:
                    count255 = count255 + c
                    continue
                if gt_id not in args.evalLabels:
                    printError("Unknown label with id {:}".format(gt_id))
                confMatrix[gt_id][pred_id] += c
            print("Finish %dth batch" % idx)

    if confMatrix.sum() + count255 != nbPixels:
        printError(
            'Number of analyzed pixels and entries in confusion matrix disagree: confMatrix {}, pixels {}'.format(
                confMatrix.sum(), nbPixels))

    classScoreList = {}
    for label in args.evalLabels:
        labelName = trainId2label[label].name
        classScoreList[labelName] = getIouScoreForLabel(label, confMatrix, args)
    vals = np.array(list(classScoreList.values()))
    mIOU = np.mean(vals[np.logical_not(np.isnan(vals))])

    # if opt.save_pred_disps:
    #     output_path = os.path.join(
    #         opt.load_weights_folder, "disps_{}_split.npy".format(opt.eval_split))
    #     print("-> Saving predicted disparities to ", output_path)
    #     np.save(output_path, pred_disps)

    print("mIOU is %f" % mIOU)
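# --------------------------------------------------------------------------- #
# Illustrative sketch, not the cityscapes-scripts helpers (generateMatrix /
# getIouScoreForLabel) used above: the loop builds the confusion matrix by
# encoding each (gt, pred) pixel pair as gt * 256 + pred, counting unique
# codes, and reading per-class IoU as TP / (TP + FP + FN). A compact,
# self-contained numpy version with hypothetical names:
# --------------------------------------------------------------------------- #
import numpy as np

def _miou_from_trainids_sketch(gt, pred, num_classes=19, ignore_id=255):
    """gt, pred: integer arrays of identical shape containing trainIds."""
    valid = (gt != ignore_id) & (pred != ignore_id)
    codes, counts = np.unique(gt[valid].astype(np.int64) * 256 + pred[valid],
                              return_counts=True)
    conf = np.zeros((num_classes, num_classes), dtype=np.int64)
    conf[codes // 256, codes % 256] = counts             # scatter counts into the matrix
    tp = np.diag(conf).astype(np.float64)
    denom = conf.sum(axis=1) + conf.sum(axis=0) - tp     # TP + FN + FP per class
    iou = np.where(denom > 0, tp / np.maximum(denom, 1), np.nan)
    return np.nanmean(iou)                               # mean over classes that appear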
def evaluate(opt):
    """Evaluates a pretrained model using a specified test set
    """
    MIN_DEPTH = 1e-3
    MAX_DEPTH = 80

    viewPythonVer = False
    viewCudaVer = True

    if viewCudaVer:
        bnmorph = BNMorph(height=opt.height, width=opt.width).cuda()

    opt.load_weights_folder = os.path.expanduser(opt.load_weights_folder)
    assert os.path.isdir(opt.load_weights_folder), \
        "Cannot find a folder at {}".format(opt.load_weights_folder)
    print("-> Loading weights from {}".format(opt.load_weights_folder))

    filenames = readlines(os.path.join(splits_dir, opt.split, "val_files.txt"))
    encoder_path = os.path.join(opt.load_weights_folder, "encoder.pth")
    decoder_path = os.path.join(opt.load_weights_folder, "depth.pth")
    encoder_dict = torch.load(encoder_path)

    if opt.use_stereo:
        opt.frame_ids.append("s")

    if opt.dataset == 'cityscape':
        dataset = datasets.CITYSCAPERawDataset(
            opt.data_path, filenames, opt.height, opt.width, opt.frame_ids, 4,
            is_train=False, tag=opt.dataset, load_meta=True,
            direction_left=opt.direction_left)
    elif opt.dataset == 'kitti':
        dataset = datasets.KITTIRAWDataset(
            opt.data_path, filenames, opt.height, opt.width, opt.frame_ids, 4,
            is_train=False, tag=opt.dataset,
            is_load_semantics=opt.use_kitti_gt_semantics,
            is_predicted_semantics=opt.is_predicted_semantics,
            direction_left=opt.direction_left)
    else:
        raise ValueError("No predefined dataset")
    dataloader = DataLoader(dataset, batch_size=opt.batch_size, shuffle=False,
                            num_workers=opt.num_workers, pin_memory=True, drop_last=True)

    encoder = networks.ResnetEncoder(opt.num_layers, False, num_input_images=2)
    if opt.switchMode == 'on':
        depth_decoder = networks.DepthDecoder(
            encoder.num_ch_enc, isSwitch=True, isMulChannel=opt.isMulChannel,
            outputtwoimage=(opt.outputtwoimage == True))
    else:
        depth_decoder = networks.DepthDecoder(encoder.num_ch_enc)

    model_dict = encoder.state_dict()
    encoder.load_state_dict({k: v for k, v in encoder_dict.items() if k in model_dict})
    depth_decoder.load_state_dict(torch.load(decoder_path))

    encoder.cuda()
    encoder.eval()
    depth_decoder.cuda()
    depth_decoder.eval()

    viewIndex = 0
    tool = grad_computation_tools(batch_size=opt.batch_size, height=opt.height,
                                  width=opt.width).cuda()
    auto_morph = AutoMorph(height=opt.height, width=opt.width)
    with torch.no_grad():
        for idx, inputs in enumerate(dataloader):
            for key, ipt in inputs.items():
                if not (key == 'height' or key == 'width' or key == 'tag'
                        or key == 'cts_meta' or key == 'file_add'):
                    inputs[key] = ipt.to(torch.device("cuda"))
            input_color = torch.cat(
                [inputs[("color_aug", 0, 0)], inputs[("color_aug", 's', 0)]], dim=1).cuda()
            # input_color = inputs[("color", 0, 0)].cuda()
            # tensor2rgb(inputs[("color_aug", 0, 0)], ind=0).show()
            # tensor2rgb(inputs[("color_aug", 's', 0)], ind=0).show()
            features = encoder(input_color)
            outputs = dict()
            outputs.update(depth_decoder(features, computeSemantic=True, computeDepth=False))
            outputs.update(depth_decoder(features, computeSemantic=False, computeDepth=True))

            if not opt.view_right:
                disparityMap = outputs[('mul_disp', 0)][:, 0:1, :, :]
            else:
                disparityMap = outputs[('mul_disp', 0)][:, 1:2, :, :]
            depthMap = torch.clamp(disparityMap, max=80)
            fig_seman = tensor2semantic(inputs['seman_gt'], ind=viewIndex, isGt=True)
            fig_rgb = tensor2rgb(inputs[('color', 0, 0)], ind=viewIndex)
            fig_disp = tensor2disp(disparityMap, ind=viewIndex, vmax=0.1)

            segmentationMapGt = inputs['seman_gt']
            foregroundType = [5, 6, 7, 11, 12, 13, 14, 15, 16, 17, 18]  # pole, traffic light, traffic sign, person, rider, car, truck, bus, train, motorcycle, bicycle
            foregroundMapGt = torch.ones(disparityMap.shape).cuda().byte()
            for m in foregroundType:
                foregroundMapGt = foregroundMapGt * (segmentationMapGt != m)
            foregroundMapGt = (1 - foregroundMapGt).float()

            disparity_grad = torch.abs(tool.convDispx(disparityMap)) + \
                torch.abs(tool.convDispy(disparityMap))
            semantics_grad = torch.abs(tool.convDispx(foregroundMapGt)) + \
                torch.abs(tool.convDispy(foregroundMapGt))
            disparity_grad = disparity_grad * tool.zero_mask
            semantics_grad = semantics_grad * tool.zero_mask

            disparity_grad_bin = disparity_grad > tool.disparityTh
            semantics_grad_bin = semantics_grad > tool.semanticsTh
            # tensor2disp(disparity_grad_bin, ind=viewIndex, vmax=1).show()
            # tensor2disp(semantics_grad_bin, ind=viewIndex, vmax=1).show()

            if viewPythonVer:
                disparity_grad_bin = disparity_grad_bin.detach().cpu().numpy()
                semantics_grad_bin = semantics_grad_bin.detach().cpu().numpy()
                disparityMap_to_processed = disparityMap.detach().cpu().numpy()[viewIndex, 0, :, :]
                dispMap_morphed, dispMap_morphRec = auto_morph.automorph(
                    disparity_grad_bin[viewIndex, 0, :, :],
                    semantics_grad_bin[viewIndex, 0, :, :], disparityMap_to_processed)

                fig_disp_processed = visualizeNpDisp(dispMap_morphed, vmax=0.1)
                overlay_processed = pil.fromarray(
                    (np.array(fig_disp_processed) * 0.7 + np.array(fig_seman) * 0.3).astype(np.uint8))
                overlay_org = pil.fromarray(
                    (np.array(fig_disp) * 0.7 + np.array(fig_seman) * 0.3).astype(np.uint8))
                combined_fig = pil.fromarray(np.concatenate([
                    np.array(overlay_org), np.array(overlay_processed),
                    np.array(fig_disp), np.array(fig_disp_processed)], axis=0))
                combined_fig.save(
                    "/media/shengjie/other/sceneUnderstanding/Stereo_SDNET/visualization/border_morph_l2_3/"
                    + str(idx) + ".png")

            if viewCudaVer:
                # morphedx, morphedy = bnmorph.find_corresponding_pts(disparity_grad_bin, semantics_grad_bin, disparityMap, fig_seman, 10)
                # morphedx = (morphedx / (opt.width - 1) - 0.5) * 2
                # morphedy = (morphedy / (opt.height - 1) - 0.5) * 2
                # grid = torch.cat([morphedx, morphedy], dim=1).permute(0, 2, 3, 1)
                # disparityMap_morphed = F.grid_sample(disparityMap, grid, padding_mode="border")
                # fig_morphed = tensor2disp(disparityMap_morphed, vmax=0.08, ind=0)
                # fig_disp = tensor2disp(disparityMap, vmax=0.08, ind=0)
                # fig_combined = pil.fromarray(np.concatenate([np.array(fig_morphed), np.array(fig_disp)], axis=0))
                # fig_combined.show()
                svpath = os.path.join(opt.load_weights_folder).split('/')
                try:
                    svpath = os.path.join(
                        "/media/shengjie/other/sceneUnderstanding/Stereo_SDNET/visualization",
                        svpath[-3])
                    os.mkdir(svpath)
                except FileExistsError:
                    pass
                morphedx, morphedy, coeff = bnmorph.find_corresponding_pts(
                    disparity_grad_bin, semantics_grad_bin)
                morphedx = (morphedx / (opt.width - 1) - 0.5) * 2
                morphedy = (morphedy / (opt.height - 1) - 0.5) * 2
                grid = torch.cat([morphedx, morphedy], dim=1).permute(0, 2, 3, 1)
                disparityMap_morphed = F.grid_sample(disparityMap, grid, padding_mode="border")

                fig_morphed = tensor2disp(disparityMap_morphed, vmax=0.08, ind=0)
                fig_disp = tensor2disp(disparityMap, vmax=0.08, ind=0)
                fig_morphed_overlayed = pil.fromarray(
                    (np.array(fig_seman) * 0.5 + np.array(fig_morphed) * 0.5).astype(np.uint8))
                fig_disp_overlayed = pil.fromarray(
                    (np.array(fig_seman) * 0.5 + np.array(fig_disp) * 0.5).astype(np.uint8))
                # fig_rgb = tensor2rgb(inputs[("color", 0, 0)], ind=0)
                # fig_combined = pil.fromarray(np.concatenate([np.array(fig_disp_overlayed), np.array(fig_morphed_overlayed), np.array(fig_disp), np.array(fig_morphed), np.array(fig_rgb)], axis=0))
                fig_combined = pil.fromarray(np.concatenate([
                    np.array(fig_disp_overlayed), np.array(fig_morphed_overlayed),
                    np.array(fig_disp), np.array(fig_morphed)], axis=0))
                fig_combined.save(os.path.join(svpath, str(idx) + ".png"))
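# --------------------------------------------------------------------------- #
# Illustrative sketch, not part of the original file: the CUDA branch above
# morphs the disparity map by converting per-pixel target coordinates into a
# normalized sampling grid for F.grid_sample, which expects x and y in
# [-1, 1]; hence the (coord / (size - 1) - 0.5) * 2 rescaling. A minimal warp
# with hypothetical names (align_corners=True matches that normalization):
# --------------------------------------------------------------------------- #
import torch
import torch.nn.functional as F

def _warp_by_pixel_coords_sketch(img, xcoords, ycoords):
    """img: [B,C,H,W]; xcoords/ycoords: [B,1,H,W] pixel locations to sample from."""
    _, _, h, w = img.shape
    xn = (xcoords / (w - 1) - 0.5) * 2.0                     # [0, W-1] -> [-1, 1]
    yn = (ycoords / (h - 1) - 0.5) * 2.0                     # [0, H-1] -> [-1, 1]
    grid = torch.cat([xn, yn], dim=1).permute(0, 2, 3, 1)    # [B,H,W,2] in (x, y) order
    return F.grid_sample(img, grid, padding_mode="border", align_corners=True)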
def evaluate(opt): """Evaluates a pretrained model using a specified test set """ MIN_DEPTH = 1e-3 MAX_DEPTH = 80 opt.load_weights_folder = os.path.expanduser(opt.load_weights_folder) assert os.path.isdir(opt.load_weights_folder), \ "Cannot find a folder at {}".format(opt.load_weights_folder) print("-> Loading weights from {}".format(opt.load_weights_folder)) filenames = readlines(os.path.join(splits_dir, opt.split, "val_files.txt")) encoder_path = os.path.join(opt.load_weights_folder, "encoder.pth") decoder_path = os.path.join(opt.load_weights_folder, "depth.pth") encoder_dict = torch.load(encoder_path) # encoder's record of height and weight are of less important now if opt.use_stereo: opt.frame_ids.append("s") if opt.dataset == 'cityscape': dataset = datasets.CITYSCAPERawDataset(opt.data_path, filenames, opt.height, opt.width, opt.frame_ids, 4, is_train=False, tag=opt.dataset, load_meta=True) elif opt.dataset == 'kitti': dataset = datasets.KITTIRAWDataset(opt.data_path, filenames, opt.height, opt.width, opt.frame_ids, 4, is_train=False, tag=opt.dataset, is_load_semantics=True) else: raise ValueError("No predefined dataset") dataloader = DataLoader(dataset, batch_size=opt.batch_size, shuffle=False, num_workers=opt.num_workers, pin_memory=True, drop_last=True) encoder = networks.ResnetEncoder(opt.num_layers, False) if opt.switchMode == 'on': depth_decoder = networks.DepthDecoder(encoder.num_ch_enc, isSwitch=True, isMulChannel=opt.isMulChannel) else: depth_decoder = networks.DepthDecoder(encoder.num_ch_enc) model_dict = encoder.state_dict() encoder.load_state_dict({k: v for k, v in encoder_dict.items() if k in model_dict}) depth_decoder.load_state_dict(torch.load(decoder_path)) encoder.cuda() encoder.eval() depth_decoder.cuda() depth_decoder.eval() ##--------------------Visualization parameter here----------------------------## sfx = torch.nn.Softmax(dim=1) mergeDisp = Merge_MultDisp(opt.scales, batchSize = opt.batch_size, isMulChannel = opt.isMulChannel) svRoot = '/media/shengjie/other/sceneUnderstanding/monodepth2/internalRe/figure_visual' index = 0 isvisualize = True useGtSeman = True useSeman = False viewSurfaceNormal = False viewSelfOcclu = False viewMutuallyRegularizedBorder= False viewLiuSemanCompare = False viewSecondOrder = False viewBorderConverge = True expBin = True height = 288 width = 960 tensor23dPts = Tensor23dPts(height=height, width=width) dirpath = os.path.join(svRoot, opt.model_name) if not os.path.exists(dirpath): os.makedirs(dirpath) if viewSurfaceNormal: compsn = ComputeSurfaceNormal(height = height, width = width, batch_size = opt.batch_size).cuda() if viewSelfOcclu: selfclu = SelfOccluMask().cuda() if viewMutuallyRegularizedBorder: mrb = MutuallyRegularizedBorders(height=height, width=width, batchsize=opt.batch_size) iouFore_gtdepth2gtseman = list() iouBack_gtdepth2gtseman = list() iouValid_gtdepth2gtseman = list() iouFore_estdepth2gtseman = list() iouBack_estdepth2gtseman = list() iouValid_estdepth2gtseman = list() iouFore_estdepth2estseman = list() iouBack_estdepth2estseman = list() iouValid_estdepth2estseman = list() if viewLiuSemanCompare: cmpBCons = computeBorderDistance() compGrad = computeGradient() semanest2semangt = np.zeros(31) depth2disp = np.zeros(31) depth2semangt = np.zeros(31) disp2semanest = np.zeros(31) sfx = torch.nn.Softmax(dim=1) cmpBCons.cuda() compGrad.cuda() if viewSecondOrder: compSecGrad = SecondOrderGrad().cuda() if viewBorderConverge: borderConverge = BorderConverge(height, width, opt.batch_size).cuda() if expBin: expbinmap = 
expBinaryMap(height, width, opt.batch_size).cuda() computedNum = 0 # with torch.no_grad(): for idx, inputs in enumerate(dataloader): for key, ipt in inputs.items(): if not(key == 'height' or key == 'width' or key == 'tag' or key == 'cts_meta'): inputs[key] = ipt.to(torch.device("cuda")) input_color = inputs[("color", 0, 0)].cuda() features = encoder(input_color) outputs = dict() outputs.update(depth_decoder(features, computeSemantic=True, computeDepth=False)) outputs.update(depth_decoder(features, computeSemantic=False, computeDepth=True)) if isvisualize: if useGtSeman: mergeDisp(inputs, outputs, eval=False) else: mergeDisp(inputs, outputs, eval=True) dispMap = outputs[('disp', 0)] scaled_disp, depthMap = disp_to_depth(dispMap, 0.1, 100) depthMap = depthMap * STEREO_SCALE_FACTOR depthMap = torch.clamp(depthMap, max=80) if useGtSeman: fig_seman = tensor2semantic(inputs['seman_gt'], ind=index, isGt=True) else: if useSeman: fig_seman = tensor2semantic(outputs[('seman', 0)], ind=index) else: fig_seman = inputs[('color', 0, 0)][index, :, :, :].permute(1,2,0).cpu().numpy() fig_seman = (fig_seman * 255).astype(np.uint8) fig_seman = pil.fromarray(fig_seman) fig_rgb = tensor2rgb(inputs[('color', 0, 0)], ind=index) fig_disp = tensor2disp(outputs[('disp', 0)], ind=index, vmax=0.1) gtmask = (inputs['depth_gt'] > 0).float() gtdepth = inputs['depth_gt'] velo = inputs['velo'] fig_3d, veh_coord, veh_coord_gt = tensor23dPts.visualize3d(depthMap.detach(), ind=index, intrinsic_in=inputs['realIn'], extrinsic_in=inputs['realEx'], gtmask_in=gtmask, gtdepth_in=gtdepth, semanticMap=None, velo_in=velo, rgb_in = inputs[('color', 's', 0)], disp_in = outputs[('disp', 0)].detach() ) if viewMutuallyRegularizedBorder: foregroundType = [5, 6, 7, 11, 12, 13, 14, 15, 16, 17, 18] # pole, traffic light, traffic sign, person, rider, car, truck, bus, train, motorcycle, bicycle backgroundType = [2, 3, 4, 8, 9, 10] #building, wall, fence, vegetation, terrain, sky foreGroundMask = torch.ones(dispMap.shape).cuda().byte() backGroundMask = torch.ones(dispMap.shape).cuda().byte() with torch.no_grad(): for m in foregroundType: foreGroundMask = foreGroundMask * (inputs['seman_gt'] != m) foreGroundMask = 1 - foreGroundMask for m in backgroundType: backGroundMask = backGroundMask * (inputs['seman_gt'] != m) backGroundMask = 1 - backGroundMask # tensor2disp(foreGroundMask, ind=0, vmax=1).show() # tensor2disp(backGroundMask, ind=0, vmax=1).show() # tensor2rgb(inputs[('color', 0, 0)], ind=0).show() # tensor2semantic(inputs['seman_gt'],ind=0,isGt=True).show() iouForeMean, iouBackMean, isvalid = mrb.visualization(gtdepth, foreGroundMask, backGroundMask, viewind= index, rgb=inputs[('color', 0, 0)]) iouFore_gtdepth2gtseman.append(iouForeMean) iouBack_gtdepth2gtseman.append(iouBackMean) iouValid_gtdepth2gtseman.append(isvalid) iouForeMean, iouBackMean, isvalid = mrb.visualization(1 - dispMap, foreGroundMask, backGroundMask, viewind=index, rgb=inputs[('color', 0, 0)]) iouFore_estdepth2gtseman.append(iouForeMean) iouBack_estdepth2gtseman.append(iouBackMean) iouValid_estdepth2gtseman.append(isvalid) semanMapEst = outputs[('seman', 0)] semanMapEst_sfxed = sfx(semanMapEst) foreGroundMask_est = torch.sum(semanMapEst_sfxed[:, foregroundType, :, :], dim=1).unsqueeze(1) backGroundMask_est = torch.sum(semanMapEst_sfxed[:, backgroundType, :, :], dim=1).unsqueeze(1) other_est = 1 - (foreGroundMask_est + backGroundMask_est) tot_est = torch.cat([foreGroundMask_est, backGroundMask_est, other_est], dim=1) foreGroundMask_est_bin = (torch.argmax(tot_est, dim=1) 
== 0).unsqueeze(1) backGroundMask_est_bin = (torch.argmax(tot_est, dim=1) == 1).unsqueeze(1) iouForeMean, iouBackMean, isvalid = mrb.visualization(1 - dispMap, foreGroundMask_est_bin, backGroundMask_est_bin, viewind=index, rgb=inputs[('color', 0, 0)]) iouFore_estdepth2estseman.append(iouForeMean) iouBack_estdepth2estseman.append(iouBackMean) iouValid_estdepth2estseman.append(isvalid) # tensor2disp(foreGroundMask_est_bin, vmax=1, ind=0).show() # tensor2disp(backGroundMask_est_bin, vmax=1, ind=0).show() if viewLiuSemanCompare: foregroundType = [5, 6, 7, 11, 12, 13, 14, 15, 16, 17, 18] # pole, traffic light, traffic sign, person, rider, car, truck, bus, train, motorcycle, bicycle backgroundType = [2, 3, 4, 8, 9, 10] #building, wall, fence, vegetation, terrain, sky foreGroundMask = torch.ones(dispMap.shape).cuda().byte() backGroundMask = torch.ones(dispMap.shape).cuda().byte() with torch.no_grad(): for m in foregroundType: foreGroundMask = foreGroundMask * (inputs['seman_gt'] != m) foreGroundMask = 1 - foreGroundMask for m in backgroundType: backGroundMask = backGroundMask * (inputs['seman_gt'] != m) backGroundMask = 1 - backGroundMask dispMapEst = outputs[('disp', 0)] semanMapEst = outputs[('seman', 0)] semanMapGt = inputs['seman_gt'] depthMapGt = inputs['depth_gt'] sparseDepthmapGrad = compGrad.computegrad11_sparse(depthMapGt) sparseDepthmapGrad_bin = sparseDepthmapGrad > 0 sparseDepthmapGrad = F.interpolate(sparseDepthmapGrad, [height, width], mode='bilinear', align_corners=True) sparseDepthmapGrad_bin = F.interpolate(sparseDepthmapGrad_bin.float(), [height, width], mode='nearest') sparseDepthmapGrad = sparseDepthmapGrad * sparseDepthmapGrad_bin # depthMapGt_bin = depthMapGt > 1e-1 # depthMapGt = F.interpolate(sparseDepthmapGrad, (height, width), mode='bilinear', align_corners=False) # depthMapGt_bin = F.interpolate(depthMapGt_bin.float(), (height, width), mode='nearest') # depthMapGt = depthMapGt * depthMapGt_bin # compGrad.computegrad11_sparse(depthMapGt) # tensor2disp(depthMapGt>0, ind=0, vmax=1).show() semanMapEst_sfxed = sfx(semanMapEst) semanMapEst_inds = torch.argmax(semanMapEst_sfxed, dim=1).unsqueeze(1) seman_est_fig = tensor2semantic(semanMapEst_inds, ind=0) seman_gt_fig = tensor2semantic(semanMapGt, ind=0) depthMapGt_fig = tensor2disp(depthMapGt, ind=0, vmax=20) depthMapGt_fig = depthMapGt_fig.resize((width, height), resample=pil.BILINEAR) foreGroundMask_est = torch.sum(semanMapEst_sfxed[:,foregroundType,:,:], dim=1).unsqueeze(1) dispMapGrad = compGrad.computegrad11(dispMapEst) foreGroundMaskGrad = compGrad.computegrad11(foreGroundMask.float()) foreGroundMask_estGrad = compGrad.computegrad11(foreGroundMask_est) sparseDepthmapGrad_fig = tensor2disp(sparseDepthmapGrad, ind=0, vmax=20) dispMapGrad_fig = tensor2disp(dispMapGrad, ind=0, vmax=0.08) foreGroundMaskGrad_fig = tensor2disp(foreGroundMaskGrad, ind=0, vmax=1) foreGroundMask_estGrad_fig = tensor2disp(foreGroundMask_estGrad, ind=0, vmax=1.5) dispMapGrad_bin = dispMapGrad > 0.011 foreGroundMaskGrad_bin = foreGroundMaskGrad > 0.5 foreGroundMask_estGrad_bin = foreGroundMask_estGrad > 0.6 sparseDepthmapGrad_bin = sparseDepthmapGrad > 9 dispMapGrad_bin_fig = tensor2disp(dispMapGrad_bin, ind=0, vmax=1) foreGroundMaskGrad_bin_fig = tensor2disp(foreGroundMaskGrad_bin, ind=0, vmax=1) foreGroundMask_estGrad_bin_fig = tensor2disp(foreGroundMask_estGrad_bin, ind=0, vmax=1) sparseDepthmapGrad_bin_fig = tensor2disp(sparseDepthmapGrad_bin, ind=0, vmax=1) visualizeImage = np.concatenate([np.array(fig_rgb), np.array(fig_disp)[:,:,0:3], 
np.array(seman_est_fig), np.array(seman_gt_fig), np.array(depthMapGt_fig)[:,:,0:3]], axis=0) visualizeImage_grad = np.concatenate([np.array(fig_rgb), np.array(dispMapGrad_fig)[:,:,0:3], np.array(foreGroundMask_estGrad_fig)[:,:,0:3], np.array(foreGroundMaskGrad_fig)[:,:,0:3], np.array(sparseDepthmapGrad_fig)[:,:,0:3]], axis=0) visualizeimage_grad_bin = np.concatenate([np.array(fig_rgb), np.array(dispMapGrad_bin_fig)[:,:,0:3], np.array(foreGroundMask_estGrad_bin_fig)[:,:,0:3], np.array(foreGroundMaskGrad_bin_fig)[:,:,0:3], np.array(sparseDepthmapGrad_bin_fig)[:,:,0:3]], axis=0) tot = np.concatenate([np.array(visualizeImage), np.array(visualizeImage_grad), np.array(visualizeimage_grad_bin)], axis=1) pil.fromarray(tot).save('/media/shengjie/other/sceneUnderstanding/SDNET/visualization/borderConsistAnalysis/%d.png' % idx) # pil.fromarray(tot).show() # pil.fromarray(visualizeImage).show() # pil.fromarray(visualizeImage_grad).show() # pil.fromarray(visualizeimage_grad_bin).show() semanest2semangt = semanest2semangt + cmpBCons.computeDistance(foreGroundMask_estGrad_bin, foreGroundMaskGrad_bin) depth2disp = depth2disp + cmpBCons.computeDistance(sparseDepthmapGrad_bin, dispMapGrad_bin) depth2semangt = depth2semangt + cmpBCons.computeDistance(sparseDepthmapGrad_bin, foreGroundMaskGrad_bin) disp2semanest = disp2semanest + cmpBCons.computeDistance(dispMapGrad_bin, foreGroundMask_estGrad_bin) # tensor2disp(dispMapEst, ind=index, percentile=90).show() if viewBorderConverge: semanMapEst = outputs[('seman', 0)] semanMapEst_sfxed = sfx(semanMapEst) foregroundType = [5, 6, 7, 11, 12, 13, 14, 15, 16, 17, 18] # pole, traffic light, traffic sign, person, rider, car, truck, bus, train, motorcycle, bicycle foreGroundMask_est = torch.sum(semanMapEst_sfxed[:, foregroundType, :, :], dim=1).unsqueeze(1) dispMapEst = outputs[('disp', 0)] # borderConverge.visualization(dispMapEst, foreGroundMask_est) if expBin: expbinmap.visualization3(disparity=dispMapEst, semantics=foreGroundMask_est) a = 1 if viewSecondOrder: disp2order = compSecGrad.computegrad11(outputs[('disp', 0)]) tensor2disp(disp2order, ind=0, percentile=95).show() if viewSurfaceNormal: surnorm = compsn.visualize(depthMap=depthMap, invcamK=inputs['invcamK'].cuda().float(), orgEstPts=veh_coord, gtEstPts=veh_coord_gt, viewindex=index) surnormMap = compsn(depthMap=depthMap, invcamK=inputs['invcamK'].cuda().float()) if viewSelfOcclu: fl = inputs[("K", 0)][:, 0, 0] bs = torch.abs(inputs["stereo_T"][:, 0, 3]) clufig, suppressedDisp = selfclu.visualize(dispMap, viewind=index) if viewSurfaceNormal and viewSelfOcclu: surnorm = surnorm.resize([width, height]) surnorm_mixed = pil.fromarray( (np.array(surnorm) * 0.2 + np.array(fig_disp)[:, :, 0:3] * 0.8).astype(np.uint8)) disp_seman = (np.array(fig_disp)[:, :, 0:3].astype(np.float) * 0.8 + np.array(fig_seman).astype( np.float) * 0.2).astype(np.uint8) supprressed_disp_seman = (np.array(suppressedDisp)[:, :, 0:3].astype(np.float) * 0.8 + np.array(fig_seman).astype( np.float) * 0.2).astype(np.uint8) rgb_seman = (np.array(fig_seman).astype(np.float) * 0.5 + np.array(fig_rgb).astype( np.float) * 0.5).astype(np.uint8) # clud_disp = (np.array(clufig)[:, :, 0:3].astype(np.float) * 0.3 + np.array(fig_disp)[:, :, 0:3].astype( # np.float) * 0.7).astype(np.uint8) comb1 = np.concatenate([np.array(supprressed_disp_seman)[:, :, 0:3], np.array(suppressedDisp)[:, :, 0:3]], axis=1) comb2 = np.concatenate([np.array(disp_seman)[:, :, 0:3], np.array(fig_disp)[:, :, 0:3]], axis=1) # comb3 = np.concatenate([np.array(errFig)[:, :, 0:3], 
np.array(surnorm)[:, :, 0:3]], axis=1) comb4 = np.concatenate([np.array(fig_seman)[:, :, 0:3], np.array(rgb_seman)[:, :, 0:3]], axis=1) comb6 = np.concatenate([np.array(clufig)[:, :, 0:3], np.array(fig_disp)[:, :, 0:3]], axis=1) fig3dsize = np.ceil(np.array([comb4.shape[1] , comb4.shape[1] / fig_3d.size[0] * fig_3d.size[1]])).astype(np.int) comb5 = np.array(fig_3d.resize(fig3dsize)) # fig = pil.fromarray(combined) # fig.save(os.path.join(dirpath, str(idx) + '.png')) print("%dth img finished" % idx) # if idx >=4: # break if viewLiuSemanCompare: semanest2semangt_p = semanest2semangt / np.sum(semanest2semangt) semanest2semangt_p_ = semanest2semangt_p[0:-1] mean = np.sum(np.arange(len(semanest2semangt_p_)) * semanest2semangt_p_) std = np.sqrt(np.sum((np.arange(len(semanest2semangt_p_)) - mean) ** 2 * semanest2semangt_p_)) fig, ax = plt.subplots() ax.bar(np.arange(len(semanest2semangt_p)), semanest2semangt_p) ax.set_ylabel('Percentile') ax.set_xlabel('Distance in pixel, mean %f, std %f' % (mean, std)) ax.set_title("Pixel distance of semantic, est to gt") fig.savefig("/media/shengjie/other/sceneUnderstanding/SDNET/visualization/borderConsistAnalysis/seman_est2gt.png") plt.close(fig) depth2disp_p = depth2disp / np.sum(depth2disp) depth2disp_p_ = depth2disp_p[0:-1] mean = np.sum(np.arange(len(depth2disp_p_)) * depth2disp_p_) std = np.sqrt(np.sum((np.arange(len(depth2disp_p_)) - mean) ** 2 * depth2disp_p_)) fig, ax = plt.subplots() ax.bar(np.arange(len(depth2disp_p)), depth2disp_p) ax.set_ylabel('Percentile') ax.set_xlabel('Distance in pixel, mean %f, std %f' % (mean, std)) ax.set_title("Pixel distance of depth, gt to est") fig.savefig("/media/shengjie/other/sceneUnderstanding/SDNET/visualization/borderConsistAnalysis/depth_gt2est.png") plt.close(fig) depth2semangt_p = depth2semangt / np.sum(depth2semangt) depth2semangt_p_ = depth2semangt_p[0:-1] mean = np.sum(np.arange(len(depth2semangt_p_)) * depth2semangt_p_) std = np.sqrt(np.sum((np.arange(len(depth2semangt_p_)) - mean) ** 2 * depth2semangt_p_)) fig, ax = plt.subplots() ax.bar(np.arange(len(depth2semangt_p)), depth2semangt_p) ax.set_ylabel('Percentile') ax.set_xlabel('Distance in pixel, mean %f, std %f' % (mean, std)) ax.set_title("Pixel distance of depth and semantic, gt") fig.savefig("/media/shengjie/other/sceneUnderstanding/SDNET/visualization/borderConsistAnalysis/depth2seman_gt.png") plt.close(fig) disp2semanest_p = disp2semanest / np.sum(disp2semanest) disp2semanest_p_ = disp2semanest_p[0:-1] mean = np.sum(np.arange(len(disp2semanest_p_)) * disp2semanest_p_) std = np.sqrt(np.sum((np.arange(len(disp2semanest_p_)) - mean) ** 2 * disp2semanest_p_)) fig, ax = plt.subplots() ax.bar(np.arange(len(disp2semanest_p)), disp2semanest_p) ax.set_ylabel('Percentile') ax.set_xlabel('Distance in pixel, mean %f, std %f' % (mean, std)) ax.set_title("Pixel distance of depth and semantic, est") fig.savefig("/media/shengjie/other/sceneUnderstanding/SDNET/visualization/borderConsistAnalysis/depth2seman_est.png") plt.close(fig) if viewMutuallyRegularizedBorder: iouFore_gtdepth2gtseman = np.array(iouFore_gtdepth2gtseman) iouBack_gtdepth2gtseman = np.array(iouBack_gtdepth2gtseman) iouValid_gtdepth2gtseman = np.array(iouValid_gtdepth2gtseman) iouFore_gtdepth2gtsemanMean = np.sum(iouFore_gtdepth2gtseman * iouValid_gtdepth2gtseman) / np.sum(iouValid_gtdepth2gtseman) iouBack_gtdepth2gtsemanMean = np.sum(iouBack_gtdepth2gtseman * iouValid_gtdepth2gtseman) / np.sum(iouValid_gtdepth2gtseman) iouFore_estdepth2gtseman = np.array(iouFore_estdepth2gtseman) 
iouBack_estdepth2gtseman = np.array(iouBack_estdepth2gtseman) iouValid_estdepth2gtseman = np.array(iouValid_estdepth2gtseman) iouFore_estdepth2gtsemanMean = np.sum(iouFore_estdepth2gtseman * iouValid_estdepth2gtseman) / np.sum(iouValid_estdepth2gtseman) iouBack_estdepth2gtsemanMean = np.sum(iouBack_estdepth2gtseman * iouValid_estdepth2gtseman) / np.sum(iouValid_estdepth2gtseman) iouFore_estdepth2estseman = np.array(iouFore_estdepth2estseman) iouBack_estdepth2estseman = np.array(iouBack_estdepth2estseman) iouValid_estdepth2estseman = np.array(iouValid_estdepth2estseman) iouFore_estdepth2estsemanMean = np.sum(iouFore_estdepth2estseman * iouValid_estdepth2estseman) / np.sum(iouValid_estdepth2estseman) iouBack_estdepth2estsemanMean = np.sum(iouBack_estdepth2estseman * iouValid_estdepth2estseman) / np.sum(iouValid_estdepth2estseman) print("iouFore_gtdepth2gtsemanMean is % f" % iouFore_gtdepth2gtsemanMean) print("iouBack_gtdepth2gtsemanMean is % f" % iouBack_gtdepth2gtsemanMean) print("iouFore_estdepth2gtsemanMean is % f" % iouFore_estdepth2gtsemanMean) print("iouBack_estdepth2gtsemanMean is % f" % iouBack_estdepth2gtsemanMean) print("iouFore_estdepth2estsemanMean is % f" % iouFore_estdepth2estsemanMean) print("iouBack_estdepth2estsemanMean is % f" % iouBack_estdepth2estsemanMean)
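# --------------------------------------------------------------------------- #
# Illustrative sketch, not original code: the scripts above repeatedly build
# binary foreground/background masks by multiplying (seman_gt != id) over a
# list of trainIds and then inverting the result. An equivalent vectorized
# form using torch.isin (available in recent PyTorch releases); the helper
# name and the FOREGROUND_TRAINIDS constant are hypothetical:
# --------------------------------------------------------------------------- #
import torch

FOREGROUND_TRAINIDS = [5, 6, 7, 11, 12, 13, 14, 15, 16, 17, 18]

def _class_mask_sketch(seman_gt, trainids=FOREGROUND_TRAINIDS):
    """seman_gt: [B,1,H,W] integer trainId map -> float mask that is 1 on listed classes."""
    ids = torch.as_tensor(trainids, device=seman_gt.device, dtype=seman_gt.dtype)
    return torch.isin(seman_gt, ids).float()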