def forward(self, video, audio):
    # Recover the padding mask and per-example sequence lengths from the batch.
    mask = infer_mask_from_batch_data(video)
    lengths = infer_lengths_from_mask(mask)
    # Fuse the two modalities along the feature dimension.
    inputs = torch.cat([video, audio], dim=2)
    # pack_padded_sequence expects sequences sorted by length in descending
    # order; keep the inverse permutation to restore the original order later.
    seq_lengths, perm_idx = lengths.sort(descending=True)
    _, inverse_idx = perm_idx.sort()
    inputs = torch.nn.utils.rnn.pack_padded_sequence(
        inputs[perm_idx], seq_lengths, batch_first=True)
    states = []
    for layer in self._layers:
        new_inputs, (state1, state2) = layer(inputs)
        # Dense connectivity: unpack both the layer's input and output,
        # concatenate them along the feature dimension, and repack, so that
        # each layer sees the outputs of all preceding layers.
        inputs, _ = torch.nn.utils.rnn.pad_packed_sequence(
            inputs, batch_first=True)
        new_inputs, _ = torch.nn.utils.rnn.pad_packed_sequence(
            new_inputs, batch_first=True)
        inputs = torch.cat([inputs, new_inputs], dim=2)
        inputs = torch.nn.utils.rnn.pack_padded_sequence(
            inputs, seq_lengths, batch_first=True)
        # Final hidden and cell states: (1, batch, hidden) -> (batch, hidden).
        state1 = state1.permute(1, 2, 0).squeeze(2)
        state2 = state2.permute(1, 2, 0).squeeze(2)
        state = torch.cat([state1, state2], dim=1)
        states.append(state)
    # Concatenate the states of every layer and undo the length-based sort.
    representations = torch.cat(states, dim=1)[inverse_idx]
    return self._out(representations)
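This forward pass only references self._layers and self._out, so the enclosing module is not shown. A minimal skeleton consistent with its shapes might look like the following; the class name, feature sizes, and layer count are illustrative assumptions, not taken from the original.

import torch
import torch.nn as nn


class DenseLSTMModel(nn.Module):
    # Hypothetical container for the forward pass above; all names and
    # dimensions here are assumptions chosen so the shapes line up.
    def __init__(self, video_dim=1024, audio_dim=128, hidden_dim=512,
                 num_layers=2, num_classes=1000):
        super().__init__()
        layers = []
        input_dim = video_dim + audio_dim
        for _ in range(num_layers):
            # Single-layer, unidirectional LSTMs, so the squeeze(2) on the
            # final states in forward() is valid.
            layers.append(nn.LSTM(input_dim, hidden_dim, batch_first=True))
            # Dense connectivity: the next layer consumes the concatenation
            # of its predecessor's input and output.
            input_dim += hidden_dim
        self._layers = nn.ModuleList(layers)
        # Each layer contributes one (h, c) pair of size hidden_dim.
        self._out = nn.Linear(2 * hidden_dim * num_layers, num_classes)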
def forward(self, video, audio):
    # Recover the padding mask and per-example sequence lengths from the batch.
    mask = infer_mask_from_batch_data(video)
    lengths = infer_lengths_from_mask(mask)
    # Project each modality into its own embedding space.
    video = F.relu(self._first_linear_rgb(video))
    audio = F.relu(self._first_linear_audio(audio))
    # Sort by length for pack_padded_sequence; keep the inverse permutation
    # to restore the original batch order later.
    seq_lengths, perm_idx = lengths.sort(descending=True)
    _, inverse_idx = perm_idx.sort()
    video = torch.nn.utils.rnn.pack_padded_sequence(
        video[perm_idx], seq_lengths, batch_first=True)
    audio = torch.nn.utils.rnn.pack_padded_sequence(
        audio[perm_idx], seq_lengths, batch_first=True)
    # Densely connected recurrent tower over the video stream: each layer's
    # output is concatenated onto its input before the next layer.
    for layer in self._rgb_layers:
        new_video, _ = layer(video)
        video, _ = torch.nn.utils.rnn.pad_packed_sequence(
            video, batch_first=True)
        new_video, _ = torch.nn.utils.rnn.pad_packed_sequence(
            new_video, batch_first=True)
        video = torch.cat([video, new_video], dim=2)
        video = torch.nn.utils.rnn.pack_padded_sequence(
            video, seq_lengths, batch_first=True)
    # The same dense tower over the audio stream.
    for layer in self._audio_layers:
        new_audio, _ = layer(audio)
        audio, _ = torch.nn.utils.rnn.pad_packed_sequence(
            audio, batch_first=True)
        new_audio, _ = torch.nn.utils.rnn.pad_packed_sequence(
            new_audio, batch_first=True)
        audio = torch.cat([audio, new_audio], dim=2)
        audio = torch.nn.utils.rnn.pack_padded_sequence(
            audio, seq_lengths, batch_first=True)
    video, _ = torch.nn.utils.rnn.pad_packed_sequence(video, batch_first=True)
    audio, _ = torch.nn.utils.rnn.pad_packed_sequence(audio, batch_first=True)
    # Attention pooling over time for each modality.  Note that the softmax
    # also runs over padded timesteps, which are zero vectors after
    # pad_packed_sequence.
    rgb_attention_weights = F.softmax(self._rgb_attention(video), dim=1)
    video = (video * rgb_attention_weights).sum(dim=1)
    audio_attention_weights = F.softmax(self._audio_attention(audio), dim=1)
    audio = (audio * audio_attention_weights).sum(dim=1)
    # Concatenate the pooled modalities and undo the length-based sort.
    representations = torch.cat([video, audio], dim=1)[inverse_idx]
    return self._out(self._bn(representations))
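As above, the enclosing module is not shown. A sketch of an __init__ whose shapes are consistent with this forward pass follows; the class name, the choice of identical tower sizes for both modalities, and all dimensions are assumptions for illustration.

import torch
import torch.nn as nn


class TwoTowerAttentionModel(nn.Module):
    # Hypothetical container for the forward pass above; names and sizes
    # are illustrative assumptions.
    def __init__(self, video_dim=1024, audio_dim=128, embed_dim=512,
                 hidden_dim=512, num_layers=2, num_classes=1000):
        super().__init__()
        self._first_linear_rgb = nn.Linear(video_dim, embed_dim)
        self._first_linear_audio = nn.Linear(audio_dim, embed_dim)

        def make_tower():
            # Stack of single-layer LSTMs with dense connectivity; the
            # input width grows by hidden_dim at every layer.
            layers, input_dim = [], embed_dim
            for _ in range(num_layers):
                layers.append(nn.LSTM(input_dim, hidden_dim, batch_first=True))
                input_dim += hidden_dim
            return nn.ModuleList(layers), input_dim

        self._rgb_layers, rgb_out_dim = make_tower()
        self._audio_layers, audio_out_dim = make_tower()
        # One scalar attention logit per timestep for each tower.
        self._rgb_attention = nn.Linear(rgb_out_dim, 1)
        self._audio_attention = nn.Linear(audio_out_dim, 1)
        self._bn = nn.BatchNorm1d(rgb_out_dim + audio_out_dim)
        self._out = nn.Linear(rgb_out_dim + audio_out_dim, num_classes)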
def forward(self, video, audio):
    # Recover per-example sequence lengths, then mean-pool each modality
    # over the valid (non-padded) timesteps only.
    mask = infer_mask_from_batch_data(video)
    lengths = infer_lengths_from_mask(mask)
    batch = []
    for index in range(video.shape[0]):
        mean_video = video[index, :lengths[index]].mean(0)
        mean_audio = audio[index, :lengths[index]].mean(0)
        batch.append(torch.cat([mean_video, mean_audio]).unsqueeze(0))
    return self._impl(torch.cat(batch))
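All three forward passes rely on the helpers infer_mask_from_batch_data and infer_lengths_from_mask, which are not defined in this section. One plausible implementation, assuming padded frames are all-zero feature vectors and that valid frames form a contiguous prefix of each sequence:

import torch


def infer_mask_from_batch_data(batch):
    # Assumption: padded timesteps are all-zero feature vectors, so a
    # timestep is valid iff any of its features is nonzero.
    # batch: (batch, time, features) -> mask: (batch, time) of 0/1.
    return (batch.abs().sum(dim=2) > 0).long()


def infer_lengths_from_mask(mask):
    # Assumption: valid timesteps form a prefix of each sequence, so the
    # length is simply the number of ones in each mask row.
    return mask.sum(dim=1)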