def forward(self, input):
    input_shape = pytorch_utils.get_shape(input)
    assert len(input_shape) == 5

    dim = self.dim

    # permute to put the required dimension in the 2nd dimension
    if dim == 1:
        x = input
    elif dim == 2:
        x = input.permute(0, 2, 1, 3, 4)
    elif dim == 3:
        x = input.permute(0, 3, 2, 1, 4)
    elif dim == 4:
        x = input.permute(0, 4, 2, 3, 1)

    # apply batch_norm
    num_features = pytorch_utils.get_shape(x)[1]
    assert num_features == self.num_features
    x = self.layer(x)

    # permute back to the original view
    if dim == 2:
        x = x.permute(0, 2, 1, 3, 4)
    elif dim == 3:
        x = x.permute(0, 3, 2, 1, 4)
    elif dim == 4:
        x = x.permute(0, 4, 2, 3, 1)

    x_shape = pytorch_utils.get_shape(x)
    assert input_shape == x_shape

    return x
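# A minimal standalone sketch of the permutation trick above: swap the target
# dimension into the channel position, run a stock nn.BatchNorm3d, and swap
# back (the swap of dims 1 and 2 is its own inverse). Sizes are illustrative.
import torch
import torch.nn as nn

bn = nn.BatchNorm3d(num_features=10)  # normalizes dim 1 of a 5D tensor
x = torch.randn(2, 7, 10, 4, 4)       # suppose we want to normalize dim=2
y = bn(x.permute(0, 2, 1, 3, 4))      # (2, 10, 7, 4, 4)
y = y.permute(0, 2, 1, 3, 4)          # back to (2, 7, 10, 4, 4)
assert y.shape == x.shape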
def get_context_class(self, x_cs, x_so, B):
    x_cs_class = []

    # loop on multi_contexts
    for idx_context in range(self.n_contexts):
        # embedding of context
        x_c = x_cs[idx_context]  # (B, C, N)
        x_c = x_c.permute(0, 2, 1)  # (B, N, C)

        # hide N dimension
        B, N, C = pytorch_utils.get_shape(x_c)
        x_c = x_c.contiguous().view(B * N, C)  # (B*N, C)

        x_c = torch.cat((x_so, x_c), dim=1)
        layer = self.classifier_layers
        x_c = layer(x_c)

        _, C = pytorch_utils.get_shape(x_c)
        x_c = x_c.view(B, N, C)  # (B, N, C)

        # append to list of context class predictions
        x_cs_class.append(x_c.view(1, B, self.N, self.n_classes))  # (1, B, N, C)

    # process context features to get context category from x_cs features
    x_cs_class = torch.stack(x_cs_class, dim=0).view(-1, B, self.N, self.n_classes)  # (n_context, B, N, C)

    return x_cs_class
def forward(self, input):
    input_shape = pytorch_utils.get_shape(input)
    assert len(input_shape) == 2

    dim = self.dim

    # permute to put the required dimension in the 2nd dimension
    if dim == 0:
        x = input.permute(1, 0)
    else:
        x = input

    # apply batch_norm
    num_features = pytorch_utils.get_shape(x)[1]
    assert num_features == self.num_features
    x = self.layer(x)

    # permute back to the original view
    if dim == 0:
        x = x.permute(1, 0)

    x_shape = pytorch_utils.get_shape(x)
    assert input_shape == x_shape

    return x
def forward(self, input):
    # input is of shape (None, H, W, N, T)
    input_shape = pytorch_utils.get_shape(input)
    b, h, w, n, t = input_shape
    assert len(input_shape) == 5

    # reshape
    tensor = input.permute(0, 1, 2, 4, 3)  # (None, H, W, T, N)
    tensor = tensor.contiguous().view(b * h * w * t, n)  # (None*H*W*T, N)

    # sample gumbel noise
    gumbel_shape = tensor.size()
    gumbel_noise = self.gumbel_sampler.sample(gumbel_shape).cuda()
    # gumbel_noise = self.sample_gumbel(gumbel_shape)

    # gumbel sigmoid trick
    tensor = (tensor + gumbel_noise) / self.temperature
    tensor = self.sigmoid(tensor)

    # restore the original shape and permutation
    tensor = tensor.view(b, h, w, t, n)  # (None, H, W, T, N)
    tensor = tensor.permute(0, 1, 2, 4, 3)  # (None, H, W, N, T)

    return tensor
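# A minimal sketch of the gumbel-sigmoid relaxation used above, written as a
# free function for clarity: add Gumbel(0, 1) noise to the logits, divide by a
# temperature, and squash with a sigmoid. The function name and default
# temperature are illustrative assumptions, not the repo's API.
import torch

def gumbel_sigmoid(logits, temperature=0.5):
    # Gumbel(0, 1) noise via inverse-CDF sampling: g = -log(-log(u))
    u = torch.rand_like(logits).clamp(1e-9, 1.0 - 1e-9)
    g = -torch.log(-torch.log(u))
    # lower temperature pushes the output toward a hard {0, 1} gate
    return torch.sigmoid((logits + g) / temperature)

y = gumbel_sigmoid(torch.randn(4, 8))  # values in (0, 1), one gate per logit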
def forward(self, x):
    """
    :param x: (B, C, T, H, W)
    :return:
    """

    K = self.window_size

    # pad the input
    x_padded = self.padding(x)

    # get how many local windows or slices (S)
    B, C, T, H, W = pytorch_utils.get_shape(x_padded)
    S = T - K + 1
    N = self.n_heads

    tensors = []
    # loop on windows and slice them out
    for idx_slice in range(S):
        idx_start = idx_slice
        idx_stop = idx_start + K

        # slice to get the window
        x_window = x_padded[:, :, idx_start:idx_stop]
        tensors.append(x_window)

    # now that we have the windows, stack them into a new dimension
    y = torch.stack(tensors, dim=1)  # (B, S, C, K, H, W)

    # reshape to hide the slices inside the batch dimension
    y = y.view(B * S, C, K, H, W)  # (B*S, C, K, H, W)

    z = []
    # feed to the local-attention block, multi-head
    for idx_head in range(N):
        head_num = idx_head + 1
        attention_head_name = 'attention_head_%d' % (head_num)
        attention_head = getattr(self, attention_head_name)
        z_head = attention_head(y)  # one head's output per window
        z.append(z_head)

    # concat the head outputs over the channel dimension
    z = torch.cat(z, dim=1)  # (B*S, C, 1, H, W)

    # reshape to get back the slices
    z = z.view(B, S, C, H, W)  # (B, S, C, H, W)

    # permute to put slices in the temporal dimension
    z = z.permute(0, 2, 1, 3, 4)  # (B, C, S, H, W)

    # residual
    z += x

    return z
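# The window-extraction loop above can also be expressed with Tensor.unfold,
# which yields the same (B, S, C, K, H, W) stack as a view, without a Python
# loop. A minimal sketch under illustrative sizes, assuming the same padded
# 5D input:
import torch

B, C, T, H, W, K = 2, 8, 6, 4, 4, 3
x_padded = torch.randn(B, C, T, H, W)

windows = x_padded.unfold(2, K, 1)           # (B, C, S, H, W, K), S = T - K + 1
windows = windows.permute(0, 2, 1, 5, 3, 4)  # (B, S, C, K, H, W)
assert windows.shape == (B, T - K + 1, C, K, H, W)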
def forward(self, input):
    input_shape = pytorch_utils.get_shape(input)
    assert len(input_shape) == 5

    dim = self.dim

    # permute to put the required dimension in the last dimension
    if dim == 4:
        x = input
    elif dim == 3:
        x = input.permute(0, 1, 2, 4, 3)
    elif dim == 2:
        x = input.permute(0, 1, 4, 3, 2)
    elif dim == 1:
        x = input.permute(0, 4, 2, 3, 1)
    else:
        raise ValueError('unsupported dim: %d' % (dim))

    B, d1, d2, d3, d4 = pytorch_utils.get_shape(x)
    assert d4 == self.num_features

    # reshape (contiguous is needed because the permuted tensor is not)
    x = x.contiguous().view(B, d1 * d2 * d3, d4)

    # apply layer_norm
    x = self.layer(x)

    # reshape back to the original view
    x = x.view(B, d1, d2, d3, d4)

    # permute back to the original view
    if dim == 3:
        x = x.permute(0, 1, 2, 4, 3)
    elif dim == 2:
        x = x.permute(0, 1, 4, 3, 2)
    elif dim == 1:
        x = x.permute(0, 4, 2, 3, 1)

    x_shape = pytorch_utils.get_shape(x)
    assert input_shape == x_shape

    return x
def forward(self, *input):
    """
    input is two features: subject-object feature and context feature
    :param x_so: pairattn feature (B, C, N, H, W)
    :param x_c: scene feature (B, C, N, H, W)
    :return:
    """

    # embed x_so
    x_so = input[0]
    x_so = self.dense_so(x_so)
    B, C, N, _, _ = pytorch_utils.get_shape(x_so)

    x_cs = input[1:]

    # get context embeddings
    x_c = self.get_context_embeddings(x_cs, B)

    x = x_so

    # spatial pooling
    x = self.spatial_pooling(x)  # (B, C, N)
    x = x.permute(0, 2, 1)  # (B, N, C)

    # hide N dimension
    B, N, C = pytorch_utils.get_shape(x)
    x_action = x.contiguous().view(B * N, C)  # (B*N, C)

    # get context categories
    x_cs_classes = self.get_context_class(x_c, x_action, B)  # (n_contexts, B, N, C)

    x, _ = self.modulate_context_classifier(x_so, x_c, x_cs_classes, B)  # (B, N, 600)

    # add modulated response to human-object classifier and max-pool over N
    x, _ = torch.max(x, dim=1)  # (B, C)
    x = torch.sigmoid(x)

    return x
def __save_values_for_debugging(self, f, alpha):
    is_training = self.training
    if is_training:
        return

    self.f_mean = torch.mean(f)
    self.f_std = torch.std(f)

    # ratio of non-zero gating values (renamed to avoid shadowing the builtin sum)
    n_non_zero = torch.sum(alpha).item()
    n_total = np.prod(pytorch_utils.get_shape(alpha))
    self.alpha_ratio = n_non_zero / n_total
def forward(self, x):
    """
    :param x: (B, C, T, H, W)
    :return:
    """

    batch_size = x.size(0)
    x_shape = pytorch_utils.get_shape(x)
    B, C, T, H, W = x_shape

    # key embedding
    key = self.key_embedding(x)  # (B, C, T, H, W)
    key = key.view(batch_size, self.n_channels_inter, -1)  # (B, C, T*H*W)
    key = key.permute(0, 2, 1)  # (B, T*H*W, C)

    # query embedding
    query = self.query_embedding(x)  # (B, C, T, H, W)
    query = query.view(batch_size, self.n_channels_inter, -1)  # (B, C, T*H*W)

    # value embedding
    value = self.value_embedding(x)  # (B, C, T, H, W)
    value = value.view(batch_size, self.n_channels_inter, -1)  # (B, C, T*H*W)
    value = value.permute(0, 2, 1)  # (B, T*H*W, C)

    # attention
    alpha = torch.matmul(key, query)  # (B, T*H*W, T*H*W)

    # normalize over timesteps
    alpha = alpha / float(T)

    # use softmax or sigmoid
    if self.is_softmax_activation:
        alpha = F.softmax(alpha, dim=-1)  # (B, T*H*W, T*H*W)
    else:
        alpha = alpha / alpha.size(-1)  # (B, T*H*W, T*H*W)
        alpha = torch.sigmoid(alpha)  # (B, T*H*W, T*H*W)

    # multiply alpha with values
    y = torch.matmul(alpha, value)  # (B, T*H*W, C)
    y = y.permute(0, 2, 1).contiguous()  # (B, C, T*H*W)
    y = y.view(batch_size, self.n_channels_inter, T, H, W)  # (B, C, T, H, W)

    # output embedding
    y = self.output_embedding(y)

    # residual connection
    y += x

    return y
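# For reference, standard scaled dot-product attention divides the logits by
# sqrt(C) rather than by T as above. A minimal sketch over the same flattened
# (B, T*H*W, C) tensors; all names and sizes below are local to the sketch:
import torch
import torch.nn.functional as F

B, L, C = 2, 24, 16             # L = T*H*W
key = torch.randn(B, L, C)
query = torch.randn(B, C, L)
value = torch.randn(B, L, C)

alpha = torch.matmul(key, query) / (C ** 0.5)  # (B, L, L)
alpha = F.softmax(alpha, dim=-1)
y = torch.matmul(alpha, value)                 # (B, L, C)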
def forward(self, x_window):
    """
    :param x_window: (B, C, T, H, W)
    :return:
    """

    B, C, T, H, W = pytorch_utils.get_shape(x_window)
    batch_size = x_window.size(0)
    assert T % 2 == 1

    # get middle item of the window
    idx_item = T // 2
    x_item = x_window[:, :, idx_item:idx_item + 1]  # (B, C, 1, H, W)

    # query embedding
    query = self.query_embedding(x_item)  # (B, C, 1, H, W)
    query = query.view(batch_size, self.n_channels_inter, -1)  # (B, C, 1*H*W)

    # key embedding
    key = self.key_embedding(x_window)  # (B, C, T, H, W)
    key = key.view(batch_size, self.n_channels_inter, -1)  # (B, C, T*H*W)
    key = key.permute(0, 2, 1)  # (B, T*H*W, C)

    # value embedding
    value = self.value_embedding(x_window)  # (B, C, T, H, W)
    value = value.view(batch_size, self.n_channels_inter, -1)  # (B, C, T*H*W)
    value = value.permute(0, 2, 1)  # (B, T*H*W, C)

    # attention
    alpha = torch.matmul(key, query)  # (B, T*H*W, 1*H*W)
    alpha = alpha.permute(0, 2, 1)  # (B, 1*H*W, T*H*W)
    alpha = F.softmax(alpha, dim=-1)  # (B, 1*H*W, T*H*W)

    # alternative: scale over channels or over the timesteps
    # alpha = alpha / np.sqrt(self.n_channels_inter)  # (B, 1*H*W, T*H*W)
    # alpha = alpha / alpha.size(-1)  # (B, 1*H*W, T*H*W)

    # alternative: use sigmoid instead of softmax
    # alpha = F.sigmoid(alpha)  # (B, 1*H*W, T*H*W)

    # multiply alpha with values
    y = torch.matmul(alpha, value)  # (B, 1*H*W, C)
    y = y.permute(0, 2, 1).contiguous()  # (B, C, 1*H*W)
    y = y.view(batch_size, self.n_channels_inter, 1, H, W)  # (B, C, 1, H, W)

    # output embedding
    y = self.output_embedding(y)

    return y
def forward(self, input):
    # input is of shape (None, H, W, N, T)
    input_shape = pytorch_utils.get_shape(input)
    b, h, w, n, t = input_shape
    assert len(input_shape) == 5

    # sample gumbel noise
    gumbel_shape = input.size()
    gumbel_noise = self.gumbel_sampler.sample(gumbel_shape).cuda()

    # gumbel sigmoid trick
    tensor = (input + gumbel_noise) / self.temperature
    tensor = self.sigmoid(tensor)

    return tensor
def get_context_relevance(self, x_so, x_cs):
    x_cs_value = []
    B, C, N, _, _ = pytorch_utils.get_shape(x_so)

    # loop on multi_contexts
    for idx_context in range(self.n_contexts):
        # embedding of context
        x_c = x_cs[idx_context]
        x_c = x_c.view(B, C, N, 1, 1)
        x_c = self.feature_selection(x_so, x_c)  # (B, N)
        x_cs_value.append(x_c.view(1, B, N))  # (1, B, N)

    x_cs_value = torch.stack(x_cs_value, dim=0).view(self.n_contexts, B, N)  # (n_contexts, B, N)

    return x_cs_value
def forward(self, x):
    x_shape = pytorch_utils.get_shape(x)  # (None, 2)
    assert len(x_shape) == 2
    assert x_shape[1] == 2

    # x_hard as zero list
    x_hard = torch.zeros_like(x)

    # find index of max value
    _, idx = torch.max(x, dim=1, keepdim=True)

    # set max value to one
    x_hard.scatter_(1, idx, 1)

    # set gradients to be w.r.t. x instead of being w.r.t. x_hard
    y = (x_hard - x).detach() + x

    return y
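# A minimal sketch of the straight-through trick used above: the forward value
# is the hard one-hot x_hard, but because (x_hard - x) is detached, gradients
# flow through as if y were simply x. The tensor values below are illustrative.
import torch

x = torch.tensor([[0.2, 0.8]], requires_grad=True)
x_hard = torch.zeros_like(x)
x_hard.scatter_(1, x.argmax(dim=1, keepdim=True), 1.0)

y = (x_hard - x).detach() + x  # forward value: [[0., 1.]]
y.sum().backward()
print(x.grad)                  # [[1., 1.]]: identity gradient w.r.t. x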
def forward(self, x_so, x_c):
    # pairwise interaction between x_so and x_c
    f = torch.cat((x_so, x_c), dim=1)  # (B, C, N, H, W)

    # gating
    f = self.f_layers(f)  # (B, N)
    alpha = f

    # save values for debugging
    self.__save_values_for_debugging(f, alpha)

    # reshape the gating value so it can multiply the context feature
    B, N = pytorch_utils.get_shape(alpha)
    alpha = alpha.view(B, N, 1)  # (B, N, 1)

    return alpha
def forward(ctx, input):
    """
    input shape is (B, T). Hardmax on the node dimension (dim=1).
    """
    input_shape = pytorch_utils.get_shape(input)
    B, T = input_shape
    rng = torch.arange(B)

    # find idx of max
    idx = torch.argmax(input, dim=1)

    # set all but max to zero, set max to 1
    mask = torch.zeros_like(input)  # (B, T)
    mask[rng, idx] = 1.0

    # save for backward pass
    ctx.mask = mask

    output = input.clone()  # copy input
    output = output * mask  # (B, T)

    return output
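# The forward above multiplies the input by a saved 0/1 mask, so the exact
# matching backward simply gates the incoming gradient with that same mask.
# A minimal self-contained sketch; the class name and the use of @staticmethod
# are illustrative assumptions, since the repo's own class is not shown here:
import torch

class Hardmax(torch.autograd.Function):

    @staticmethod
    def forward(ctx, input):
        mask = torch.zeros_like(input)  # (B, T)
        mask[torch.arange(input.size(0)), input.argmax(dim=1)] = 1.0
        ctx.mask = mask
        return input * mask

    @staticmethod
    def backward(ctx, grad_output):
        # gradient flows only through the per-row max selected in the forward
        return grad_output * ctx.mask

x = torch.randn(4, 6, requires_grad=True)
y = Hardmax.apply(x)
y.sum().backward()  # x.grad is 1 at each row's argmax, 0 elsewhere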
def forward(self, input):
    # input is of shape (None, C, T, H, W)
    input_shape = pytorch_utils.get_shape(input)
    n, c, t, h, w = input_shape
    assert len(input_shape) == 5

    # permute and reshape to hide the spatial dimensions, exposing only the temporal dimension to the depthwise conv
    tensor = input.permute(0, 3, 4, 1, 2)  # (None, H, W, C, T)
    tensor = tensor.contiguous().view(n * h * w, c, t)  # (None*H*W, C, T)

    # depthwise conv on the temporal dimension
    tensor = self.padding(tensor)
    tensor = self.depthwise_conv(tensor)  # (None*H*W, C, T)

    # reshape to get the spatial dimensions back
    tensor = tensor.view(n, h, w, c, t)  # (None, H, W, C, T)

    # finally, permute to get the desired output shape
    tensor = tensor.permute(0, 3, 4, 1, 2)  # (None, C, T, H, W)

    return tensor
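# An equivalent formulation, sketched for comparison: a grouped nn.Conv3d with
# a (k, 1, 1) kernel is itself a per-channel temporal convolution, so it can
# replace the permute/reshape round trip above. Sizes are illustrative
# assumptions.
import torch
import torch.nn as nn

C, k = 16, 3
conv = nn.Conv3d(C, C, kernel_size=(k, 1, 1), padding=(k // 2, 0, 0), groups=C)

x = torch.randn(2, C, 8, 7, 7)  # (None, C, T, H, W)
y = conv(x)                     # (None, C, T, H, W): depthwise in time only
assert y.shape == x.shape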
def forward(self, input):
    B = pytorch_utils.get_shape(input)[0]
    new_shape = [B] + list(self.shape)
    output = input.view(*new_shape)
    return output