def forward(self, x):
    x = self.pad2d(x)
    in_channels, h, w = x.shape[1], x.shape[2], x.shape[3]
    pooled_features, idx = self.max_pool(x[:, :-5, ...])
    num_of_win = pooled_features.shape[2] * pooled_features.shape[3]

    # ********** Shape aggregation **********
    max_unpooled = self.max_unpool(pooled_features, idx, output_size=(h, w))

    # The division is a heuristic taken from the Attention paper. The idea is to avoid
    # the extremes of the softmax. It should be experimented with. I think it has to do
    # with random walks.
    shape_weights = torch.norm(
        max_unpooled, p=1, dim=1, keepdim=True) / torch.tensor(
            in_channels - 5, requires_grad=False).type(torch.FloatTensor).sqrt()
    # shape_weights = torch.norm(x, p=2, dim=1, keepdim=True) / torch.tensor(in_channels - 5, requires_grad=False).type(torch.FloatTensor).sqrt()
    # shape_weights = max_unpooled.sum(dim=1, keepdim=True) / torch.tensor(in_channels - 5, requires_grad=False).type(
    #     torch.FloatTensor)  # .sqrt()

    # window_shape_query = functional.softmax(unfold(shape_weights, (self.k, self.k), padding=0, stride=self.stride),
    #                                         dim=1).unsqueeze(1)
    shape_windows = unfold(shape_weights, (self.k, self.k), padding=0, stride=self.stride)
    window_shape_query = (
        shape_windows / torch.norm(shape_windows, p=1, dim=1, keepdim=True)).unsqueeze(1)
    # window_shape_query = unfold(shape_weights, (self.k, self.k), padding=0, stride=self.stride).unsqueeze(1)

    # Computing window means
    window_means = torch.sum(
        unfold(x[:, -5:-3, ...], (self.k, self.k), stride=self.stride).view(
            -1, 2, self.k * self.k, num_of_win) * window_shape_query,
        dim=2)

    # Part_1 is the contribution of the variances of the ellipses. Part_2 is the contribution
    # of how far the mean of each ellipse is from the mean of the window.
    window_var_part_1 = torch.sum(
        unfold(x[:, -3:-1, ...], (self.k, self.k), stride=self.stride).view(
            -1, 2, self.k * self.k, num_of_win) * window_shape_query,
        dim=2)
    window_var_part_2 = ((
        (unfold(x[:, -5:-3, ...], (self.k, self.k), stride=self.stride).view(
            -1, 2, self.k * self.k, num_of_win) - window_means.unsqueeze(2))**2)
        * window_shape_query).sum(dim=2)
    window_var = window_var_part_1 + window_var_part_2

    # Part_1 is the contribution of the covariances of the ellipses. Part_2 is the contribution
    # of how far the mean of each ellipse is from the mean of the window.
    window_covar_part_1 = torch.sum(
        unfold(x[:, -1:, ...], (self.k, self.k), stride=self.stride).view(
            -1, 1, self.k * self.k, num_of_win) * window_shape_query,
        dim=2)
    window_covar_part_2 = (
        ((unfold(x[:, -5:-4, ...], (self.k, self.k), stride=self.stride).view(
            -1, 1, self.k * self.k, num_of_win) - window_means[:, 0:1, :].unsqueeze(2))
         * (unfold(x[:, -4:-3, ...], (self.k, self.k), stride=self.stride).view(
             -1, 1, self.k * self.k, num_of_win) - window_means[:, 1:2, :].unsqueeze(2)))
        * window_shape_query).sum(dim=2)
    window_covar = window_covar_part_1 + window_covar_part_2

    window_aggregated_shapes = torch.cat(
        [window_means, window_var, window_covar], dim=1)
    window_aggregated_shapes = fold(
        window_aggregated_shapes,
        (pooled_features.shape[2], pooled_features.shape[3]), (1, 1))
    # plot_shapes(window_aggregated_shapes, idx=0)
    output = torch.cat([pooled_features, window_aggregated_shapes], dim=1)
    return output
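# A minimal standalone sketch (not the module above) of the unfold -> weighted sum ->
# fold-with-1x1-kernel pattern used in forward() to aggregate per-window statistics.
# Assumptions: `unfold`/`fold` are torch.nn.functional.unfold/fold; the uniform
# `weights` tensor merely stands in for `window_shape_query`, and all sizes are made up.
import torch
from torch.nn.functional import unfold, fold

x = torch.randn(2, 2, 6, 6)                 # two "mean" channels on a 6x6 grid
k, stride = 2, 2
num_of_win = ((6 - k) // stride + 1) ** 2   # 9 non-overlapping windows

windows = unfold(x, (k, k), stride=stride).view(-1, 2, k * k, num_of_win)
weights = torch.full((2, 1, k * k, num_of_win), 1.0 / (k * k))   # uniform query
window_means = (windows * weights).sum(dim=2)                    # (2, 2, num_of_win)
window_means_map = fold(window_means, (3, 3), (1, 1))            # back to a 3x3 map
print(window_means_map.shape)  # torch.Size([2, 2, 3, 3])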
def forward(self, x): """Applies network layers and ops on input image(s) x. Args: x: input image or batch of images. Shape: [batch,3,300,300]. Return: Depending on phase: test: Variable(tensor) of output class label predictions, confidence score, and corresponding location predictions for each object detected. Shape: [batch,topk,7] train: list of concat outputs from: 1: confidence layers, Shape: [batch*num_priors,num_classes] 2: localization layers, Shape: [batch,num_priors*4] 3: priorbox layers, Shape: [2,num_priors*4] """ size = x.size()[2:] batch_size = x.shape[0] sources = list() loc = list() conf = list() x = self.conv_top(x) s = self.L2Norm3_3(x) sources.append(s) patches = self.unfold(x) patches = torch.cat(torch.unbind(patches, dim=2), dim=0) patches = torch.reshape(patches, (-1, 4, 8, 8)) output_x = int((x.shape[2] - 8) / 4 + 1) output_y = int((x.shape[3] - 8) / 4 + 1) rnnX = self.rnn_model(patches, int(batch_size) * output_x * output_y) x = torch.stack(torch.split(rnnX, split_size_or_sections=int(batch_size), dim=0), dim=2) x = F.fold(x, kernel_size=(1, 1), output_size=(output_x, output_y)) x = F.pad(x, (0, 1, 0, 1), mode='replicate') for k in range(4): x = self.mob[k](x) s = self.L2Norm4_3(x) sources.append(s) for k in range(4, 8): x = self.mob[k](x) s = self.L2Norm5_3(x) sources.append(s) for k in range(8, 10): x = self.mob[k](x) sources.append(x) for k in range(10, 11): x = self.mob[k](x) sources.append(x) for k in range(11, 12): x = self.mob[k](x) sources.append(x) # apply multibox head to source layers loc_x = self.loc[0](sources[0]) conf_x = self.conf[0](sources[0]) loc_x = self.loc[1](loc_x) conf_x = self.conf[1](conf_x) max_conf, _ = torch.max(conf_x[:, 0:3, :, :], dim=1, keepdim=True) conf_x = torch.cat((max_conf, conf_x[:, 3:, :, :]), dim=1) loc.append(loc_x.permute(0, 2, 3, 1).contiguous()) conf.append(conf_x.permute(0, 2, 3, 1).contiguous()) for i in range(1, len(sources)): x = sources[i] conf.append(self.conf[i + 1](x).permute(0, 2, 3, 1).contiguous()) loc.append(self.loc[i + 1](x).permute(0, 2, 3, 1).contiguous()) features_maps = [] for i in range(len(loc)): feat = [] feat += [loc[i].size(1), loc[i].size(2)] features_maps += [feat] self.priorbox = PriorBox(size, features_maps, cfg) self.priors = self.priorbox.forward() loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1) conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1) if self.phase == 'test': output = detect_function( cfg, loc.view(loc.size(0), -1, 4), # loc preds self.softmax(conf.view(conf.size(0), -1, self.num_classes)), # conf preds self.priors.type(type(x.data)) # default boxes ) else: output = (loc.view(loc.size(0), -1, 4), conf.view(conf.size(0), -1, self.num_classes), self.priors) return output, loc, conf
def __getitem__(self, index):
    # get item in tensor shape
    scan_file = self.scan_files[index]
    if self.gt:
        label_file = self.label_files[index]

    # open a semantic laserscan
    if self.gt:
        scan = SemLaserScan(self.color_map,
                            project=True,
                            H=self.sensor_img_H,
                            W=self.sensor_img_W,
                            fov_up=self.sensor_fov_up,
                            fov_down=self.sensor_fov_down)
    else:
        scan = LaserScan(project=True,
                         H=self.sensor_img_H,
                         W=self.sensor_img_W,
                         fov_up=self.sensor_fov_up,
                         fov_down=self.sensor_fov_down)

    # open and obtain scan
    scan.open_scan(scan_file)
    if self.gt:
        scan.open_label(label_file)
        # map unused classes to used classes (also for projection)
        scan.sem_label = self.map(scan.sem_label, self.learning_map)
        scan.proj_sem_label = self.map(scan.proj_sem_label, self.learning_map)

    # make a tensor of the uncompressed data (with the max num points)
    unproj_n_points = scan.points.shape[0]
    unproj_xyz = torch.full((self.max_points, 3), -1.0, dtype=torch.float)
    unproj_xyz[:unproj_n_points] = torch.from_numpy(scan.points)
    unproj_range = torch.full([self.max_points], -1.0, dtype=torch.float)
    unproj_range[:unproj_n_points] = torch.from_numpy(scan.unproj_range)
    unproj_remissions = torch.full([self.max_points], -1.0, dtype=torch.float)
    unproj_remissions[:unproj_n_points] = torch.from_numpy(scan.remissions)
    if self.gt:
        unproj_labels = torch.full([self.max_points], -1.0, dtype=torch.int32)
        unproj_labels[:unproj_n_points] = torch.from_numpy(scan.sem_label)
    else:
        unproj_labels = []

    # get points and labels
    proj_range = torch.from_numpy(scan.proj_range).clone()
    proj_xyz = torch.from_numpy(scan.proj_xyz).clone()
    proj_remission = torch.from_numpy(scan.proj_remission).clone()
    proj_mask = torch.from_numpy(scan.proj_mask)
    if self.gt:
        proj_labels = torch.from_numpy(scan.proj_sem_label).clone()
        proj_labels = proj_labels * proj_mask
    else:
        proj_labels = []
    proj_x = torch.full([self.max_points], -1, dtype=torch.long)
    proj_x[:unproj_n_points] = torch.from_numpy(scan.proj_x)
    proj_y = torch.full([self.max_points], -1, dtype=torch.long)
    proj_y[:unproj_n_points] = torch.from_numpy(scan.proj_y)
    proj = torch.cat([
        proj_range.unsqueeze(0).clone(),
        proj_xyz.clone().permute(2, 0, 1),
        proj_remission.unsqueeze(0).clone()
    ])
    proj_blocked = proj.unsqueeze(1)  # swap batch and channel dimensions
    proj = (proj - self.sensor_img_means[:, None, None]
            ) / self.sensor_img_stds[:, None, None]
    proj = proj * proj_mask.float()

    # get name and sequence
    path_norm = os.path.normpath(scan_file)
    path_split = path_norm.split(os.sep)
    path_seq = path_split[-3]
    path_name = path_split[-1].replace(".bin", ".label")

    n, c, h, w = proj_blocked.size()
    proj2 = proj.clone()
    proj = proj.unsqueeze(0)
    mask_image = proj_mask.unsqueeze(0).unsqueeze(0).float()

    downsamplings = 4
    representations = {}
    representations['image'] = []
    representations['points'] = []
    windows_size = 3  # window size

    for i in range(downsamplings):
        proj_chan_group_points = f.unfold(proj_blocked, kernel_size=3, stride=1, padding=1)
        projmask_chan_group_points = f.unfold(mask_image, kernel_size=3, stride=1, padding=1)

        # get the mean point of each window (ignoring non-valid points)
        proj_chan_group_points_sum = torch.sum(proj_chan_group_points, dim=1)
        projmask_chan_group_points_sum = torch.sum(projmask_chan_group_points, dim=1)
        proj_chan_group_points_mean = proj_chan_group_points_sum / projmask_chan_group_points_sum

        # tile it so it can be subtracted from the other points
        tiled_proj_chan_group_points_mean = proj_chan_group_points_mean.unsqueeze(
            1).repeat(1, windows_size * windows_size, 1)

        # remove nans due to empty blocks
        is_nan = tiled_proj_chan_group_points_mean != tiled_proj_chan_group_points_mean
        tiled_proj_chan_group_points_mean[is_nan] = 0.

        # compute valid mask per point
        tiled_projmask_chan_group_points = (
            1 - projmask_chan_group_points.repeat(n, 1, 1)).byte()

        # subtract the mean point from the points
        proj_chan_group_points_relative = proj_chan_group_points - tiled_proj_chan_group_points_mean

        # set to zero the points which were non-valid at the beginning
        proj_chan_group_points_relative[tiled_projmask_chan_group_points] = 0.

        # compute distance (radius) to mean point
        # xyz_relative = proj_chan_group_points_relative[1:4, ...]
        # relative_distance = torch.norm(xyz_relative, dim=0).unsqueeze(0)

        # NOW proj_chan_group_points_relative HAS Xr, Yr, Zr, Rr, Dr relative to the mean point
        proj_norm_chan_group_points = f.unfold(proj.permute(1, 0, 2, 3),
                                               kernel_size=3, stride=1, padding=1)

        # NOW proj_norm_chan_group_points HAS X, Y, Z, R, D. Now we have to concat them both
        proj_chan_group_points_combined = torch.cat(
            [proj_norm_chan_group_points, proj_chan_group_points_relative], dim=0)

        # convert back to image for image-convolution-branch
        proj_out = f.fold(proj_chan_group_points_combined,
                          proj_blocked.shape[-2:], kernel_size=3, stride=1, padding=1)
        proj_out = proj_out.squeeze(1)

        proj = nn.functional.interpolate(proj,
                                         size=(int(proj.shape[2] / 2), int(proj.shape[3] / 2)),
                                         mode='nearest')
        proj_blocked = nn.functional.interpolate(
            proj_blocked.permute(1, 0, 2, 3),
            size=(int(proj_blocked.shape[2] / 2), int(proj_blocked.shape[3] / 2)),
            mode='nearest').permute(1, 0, 2, 3)
        mask_image = nn.functional.interpolate(
            mask_image,
            size=(int(mask_image.shape[2] / 2), int(mask_image.shape[3] / 2)),
            mode='nearest')

        representations['points'].append(proj_chan_group_points_combined)
        representations['image'].append(proj_out)

    return (proj2, proj_mask, proj_labels, unproj_labels, path_seq, path_name,
            proj_x, proj_y, proj_range, unproj_range, proj_xyz, unproj_xyz,
            proj_remission, unproj_remissions, unproj_n_points, representations)
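# A standalone sketch of the masked window-mean trick used in the loop above:
# unfold the image and its validity mask with the same kernel, then divide the
# window sums so the mean is taken over valid pixels only. Assumption: `f` is
# torch.nn.functional (as in the dataset code); all shapes are made up.
import torch
import torch.nn.functional as f

img = torch.rand(5, 1, 4, 4)                       # (channels, 1, H, W), like proj_blocked
mask = (torch.rand(1, 1, 4, 4) > 0.3).float()      # 1 = valid pixel
img = img * mask                                   # zero-out invalid pixels first

win = f.unfold(img, kernel_size=3, stride=1, padding=1)        # (5, 9, 16)
win_mask = f.unfold(mask, kernel_size=3, stride=1, padding=1)  # (1, 9, 16)
mean = win.sum(dim=1) / win_mask.sum(dim=1)   # per-window mean over valid pixels
mean[mean != mean] = 0.                       # empty windows give nan -> zero them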
def overlap_add(X, stride):
    # X: framed signal of shape (batch, n_fft, n_frames); samples in overlapping
    # regions are summed. Assumes `fold` is torch.nn.functional.fold.
    n_fft = X.shape[1]
    output_len = n_fft + stride * (X.shape[2] - 1)
    return fold(X, (1, output_len), kernel_size=(1, n_fft), stride=stride).flatten(1)
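# A minimal usage sketch of overlap_add (assumption: `fold` above is
# torch.nn.functional.fold). Frames are cut from a signal with matching frame
# length and hop, then overlap-added back into a 1-D signal per batch element.
import torch
import torch.nn.functional as F

signal = torch.randn(2, 1, 1, 16)                          # (batch, 1, 1, samples)
frames = F.unfold(signal, kernel_size=(1, 8), stride=4)    # (batch, 8, n_frames)
reconstructed = overlap_add(frames, stride=4)              # (batch, 16)
print(reconstructed.shape)  # torch.Size([2, 16])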
def rmac_hist(inp,
              L_min=7,  # 7 for fixed width, 1 for all
              L=7,
              nb_bins=8,
              eps=1e-7):
    '''
    https://github.com/filipradenovic/cnnimageretrieval-pytorch/blob/master/cirtorch/layers/functional.py#L26
    '''
    # x = inp.clone().detach()
    x = torch.empty_like(inp).copy_(inp)
    with torch.no_grad():
        ovr = 0.4  # desired overlap of neighboring regions
        steps = torch.LongTensor([2, 3, 4, 5, 6, 7])  # possible regions for the long dimension

        W = x.size(3)
        H = x.size(2)

        w = min(W, H)
        w2 = math.floor(w / 2.0 - 1)

        b = (max(H, W) - w) / (steps - 1)
        (tmp, idx) = torch.min(torch.abs(((w**2 - w * b) / w**2) - ovr), 0)  # steps(idx) regions for long dimension

        # region overplus per dimension
        Wd = 0
        Hd = 0
        if H < W:
            Wd = idx.item() + 1
        elif H > W:
            Hd = idx.item() + 1

        v = [feat_map_shape_hist(x)]
        # v = v / (torch.norm(v, p=2, dim=1, keepdim=True) + eps).expand_as(v)

        for l in range(L_min, L + 1):
            wl = math.floor(2 * w / (l + 1))
            wl2 = math.floor(wl / 2 - 1)

            if l + Wd == 1:
                b = 0
            else:
                b = (W - wl) / (l + Wd - 1)
            cenW_tmp = wl2 + torch.Tensor(range(l - 1 + Wd + 1)).long() * b
            cenW_tmp = cenW_tmp.float()
            cenW = torch.floor(cenW_tmp) - wl2  # center coordinates

            if l + Hd == 1:
                b = 0
            else:
                b = (H - wl) / (l + Hd - 1)
            cenH_tmp = wl2 + torch.Tensor(range(l - 1 + Hd + 1)).long() * b
            cenH_tmp = cenH_tmp.float()
            cenH = torch.floor(cenH_tmp) - wl2  # center coordinates
            # print(cenH, cenW, wl, wl2)

            vt_array = []
            for i_ in cenH.tolist():
                for j_ in cenW.tolist():
                    if wl == 0:
                        continue
                    R = x[:, :, (int(i_) + torch.LongTensor(range(wl)).to(device)).tolist(), :]
                    R = R[:, :, :, (int(j_) + torch.LongTensor(range(wl)).to(device)).tolist()]
                    vt = feat_map_shape_hist(R)
                    # vt = torch.histc(R, bins=nb_bins, min=R.min().item(), max=R.max().item())
                    vt = vt / (torch.norm(vt, p=2, dim=-1, keepdim=True) + eps).expand_as(vt)
                    # v += vt
                    vt_array += [vt]
            vt_array = torch.stack(vt_array, -1)

            arr_along_batch = []
            for batch_id in range(x.shape[0]):
                arr_along_batch.append(
                    F.fold(vt_array[batch_id, ...],
                           (len(cenH.tolist()), len(cenW.tolist())), (1, 1)))
            arr_along_batch = torch.stack(arr_along_batch, 0).cuda()
            # v += vt_array
            # print(arr_along_batch.shape)

    return arr_along_batch
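# A standalone sketch of the fold-with-1x1-kernel trick used above to arrange one
# descriptor per region into an (n_rows, n_cols) grid. Assumptions: `F` is
# torch.nn.functional and the sizes are made up for the example.
import torch
import torch.nn.functional as F

n_rows, n_cols, dim = 3, 4, 8
region_descriptors = torch.randn(dim, n_rows * n_cols)   # one column per region
grid = F.fold(region_descriptors.unsqueeze(0), (n_rows, n_cols), (1, 1))
print(grid.shape)  # torch.Size([1, 8, 3, 4])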
def test_fold(self):
    # With a 1x1 kernel, fold reshapes the (N, C, L) input into an (N, C, 4, 5) map.
    inp = torch.randn(3, 20, 20, device='cuda', dtype=self.dtype)
    inp_folded = F.fold(inp, (4, 5), (1, 1))
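# A hedged, standalone restatement of what the test above computes (not part of the
# original test): with a 1x1 kernel and unit stride, fold places each input column at
# exactly one output location, so it is equivalent to a plain reshape.
import torch
import torch.nn.functional as F

inp = torch.randn(3, 20, 20)
inp_folded = F.fold(inp, (4, 5), (1, 1))
assert torch.equal(inp_folded, inp.view(3, 20, 4, 5))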