Example #1
    def forward(self, video, audio):
        mask = infer_mask_from_batch_data(video)
        lengths = infer_lengths_from_mask(mask)

        inputs = torch.cat([video, audio], dim=2)

        # sort by length (pack_padded_sequence expects sorted sequences) and
        # keep inverse_idx so the original batch order can be restored later
        seq_lengths, perm_idx = lengths.sort(descending=True)
        _, inverse_idx = perm_idx.sort()
        inputs = torch.nn.utils.rnn.pack_padded_sequence(inputs[perm_idx],
                                                         seq_lengths,
                                                         batch_first=True)

        states = []
        for layer in self._layers:
            new_inputs, (state1, state2) = layer(inputs)
            # unpack to padded tensors so the layer's input and output can be
            # concatenated along the feature dimension (dense connection),
            # then re-pack for the next layer
            inputs, _ = torch.nn.utils.rnn.pad_packed_sequence(
                inputs, batch_first=True)
            new_inputs, _ = torch.nn.utils.rnn.pad_packed_sequence(
                new_inputs, batch_first=True)

            inputs = torch.cat([inputs, new_inputs], dim=2)
            inputs = torch.nn.utils.rnn.pack_padded_sequence(inputs,
                                                             seq_lengths,
                                                             batch_first=True)
            # reshape the final hidden and cell states from (1, batch, hidden)
            # to (batch, hidden) and keep their concatenation for this layer
            state1 = state1.permute(1, 2, 0).squeeze(2)
            state2 = state2.permute(1, 2, 0).squeeze(2)
            state = torch.cat([state1, state2], dim=1)
            states.append(state)

        # concatenate the per-layer states and restore the original batch order
        representations = torch.cat(states, dim=1)[inverse_idx]
        return self._out(representations)
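
The forward pass above relies on helpers and attributes defined elsewhere in the project. The sketch below shows one plausible surrounding context that would make it runnable; the padding convention assumed by the helpers, the feature sizes, the number of layers and the class name are all illustrative assumptions, not taken from the original source.

import torch.nn as nn


def infer_mask_from_batch_data(batch):
    # assumed padding convention: padded frames are all-zero feature vectors
    return batch.abs().sum(dim=2) > 0


def infer_lengths_from_mask(mask):
    # number of valid frames per sequence in the batch
    return mask.long().sum(dim=1)


class DenseLstmModel(nn.Module):
    # the forward method shown above is assumed to be defined on this class
    def __init__(self, video_dim=1024, audio_dim=128, hidden_dim=256,
                 num_layers=2, num_classes=1000):
        super().__init__()
        layers = []
        input_dim = video_dim + audio_dim
        for _ in range(num_layers):
            # single-layer, unidirectional LSTMs, so h_n and c_n have shape
            # (1, batch, hidden_dim), matching the squeeze(2) in forward
            layers.append(nn.LSTM(input_dim, hidden_dim, batch_first=True))
            # dense connectivity: the next layer also sees this layer's output
            input_dim += hidden_dim
        self._layers = nn.ModuleList(layers)
        # forward concatenates h_n and c_n (2 * hidden_dim) from every layer
        self._out = nn.Linear(2 * hidden_dim * num_layers, num_classes)

With these defaults the per-clip representation fed to self._out has size 2 * hidden_dim * num_layers = 1024.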
Example #2
    def forward(self, video, audio):
        mask = infer_mask_from_batch_data(video)
        lengths = infer_lengths_from_mask(mask)

        video = self._first_linear_rgb(video)
        video = F.relu(video)

        audio = self._first_linear_audio(audio)
        audio = F.relu(audio)

        # sort by length for packing and remember the inverse permutation so
        # the original batch order can be restored at the end
        seq_lengths, perm_idx = lengths.sort(descending=True)
        _, inverse_idx = perm_idx.sort()
        video = torch.nn.utils.rnn.pack_padded_sequence(video[perm_idx],
                                                        seq_lengths,
                                                        batch_first=True)
        audio = torch.nn.utils.rnn.pack_padded_sequence(audio[perm_idx],
                                                        seq_lengths,
                                                        batch_first=True)

        # dense recurrent stack over the video stream
        for layer in self._rgb_layers:
            new_video, _ = layer(video)
            video, _ = torch.nn.utils.rnn.pad_packed_sequence(video,
                                                              batch_first=True)
            new_video, _ = torch.nn.utils.rnn.pad_packed_sequence(
                new_video, batch_first=True)

            video = torch.cat([video, new_video], dim=2)
            video = torch.nn.utils.rnn.pack_padded_sequence(video,
                                                            seq_lengths,
                                                            batch_first=True)

        # dense recurrent stack over the audio stream
        for layer in self._audio_layers:
            new_audio, _ = layer(audio)
            audio, _ = torch.nn.utils.rnn.pad_packed_sequence(audio,
                                                              batch_first=True)
            new_audio, _ = torch.nn.utils.rnn.pad_packed_sequence(
                new_audio, batch_first=True)

            audio = torch.cat([audio, new_audio], dim=2)
            audio = torch.nn.utils.rnn.pack_padded_sequence(audio,
                                                            seq_lengths,
                                                            batch_first=True)

        # unpack back to padded tensors before attention pooling
        video, _ = torch.nn.utils.rnn.pad_packed_sequence(video,
                                                          batch_first=True)
        audio, _ = torch.nn.utils.rnn.pad_packed_sequence(audio,
                                                          batch_first=True)

        # attention pooling over time: per-frame weights normalised with
        # softmax, then a weighted sum across frames
        rgb_attention_weights = F.softmax(self._rgb_attention(video), dim=1)
        video = (video * rgb_attention_weights).sum(dim=1)

        audio_attention_weights = F.softmax(self._audio_attention(audio),
                                            dim=1)
        audio = (audio * audio_attention_weights).sum(dim=1)

        # restore the original batch order before batch norm and the output layer
        representations = torch.cat([video, audio], dim=1)[inverse_idx]
        return self._out(self._bn(representations))
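
The constructor is again not part of the example. The sketch below gives one set of definitions consistent with the forward pass above, with a separate projection, dense LSTM stack and attention head per modality; the dimensions, layer count and class name are assumptions.

import torch.nn as nn


class TwoStreamAttentionModel(nn.Module):
    # the forward method shown above is assumed to be defined on this class
    def __init__(self, video_dim=1024, audio_dim=128, proj_dim=256,
                 hidden_dim=256, num_layers=2, num_classes=1000):
        super().__init__()
        self._first_linear_rgb = nn.Linear(video_dim, proj_dim)
        self._first_linear_audio = nn.Linear(audio_dim, proj_dim)

        def dense_stack(input_dim):
            layers, dim = [], input_dim
            for _ in range(num_layers):
                layers.append(nn.LSTM(dim, hidden_dim, batch_first=True))
                dim += hidden_dim  # dense connections grow the feature size
            return nn.ModuleList(layers), dim

        self._rgb_layers, rgb_dim = dense_stack(proj_dim)
        self._audio_layers, audio_out_dim = dense_stack(proj_dim)

        # one attention logit per frame, softmax-normalised over time in forward
        self._rgb_attention = nn.Linear(rgb_dim, 1)
        self._audio_attention = nn.Linear(audio_out_dim, 1)

        self._bn = nn.BatchNorm1d(rgb_dim + audio_out_dim)
        self._out = nn.Linear(rgb_dim + audio_out_dim, num_classes)

Each stream's pooled vector has size proj_dim + num_layers * hidden_dim (768 with these defaults), so the batch-normalised representation passed to self._out has size 1536.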
Example #3
    def forward(self, video, audio):
        mask = infer_mask_from_batch_data(video)
        lengths = infer_lengths_from_mask(mask)

        # average only the valid (non-padded) frames of each clip, then fuse
        # the two modality means into a single vector per clip
        batch = []
        for index in range(video.shape[0]):
            mean_video = video[index, :lengths[index]].mean(0)
            mean_audio = audio[index, :lengths[index]].mean(0)
            batch.append(torch.cat([mean_video, mean_audio]).unsqueeze(0))

        return self._impl(torch.cat(batch))
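
This baseline discards frame order and simply averages the valid frames of each modality. A sketch of the assumed surrounding module follows; the class name, hidden size and output head are illustrative, and the forward method above, together with the helper functions sketched after Example #1, is assumed to complete the class.

import torch.nn as nn


class MeanPoolModel(nn.Module):
    # the forward method shown above is assumed to be defined on this class
    def __init__(self, video_dim=1024, audio_dim=128, hidden_dim=512,
                 num_classes=1000):
        super().__init__()
        # simple feed-forward head over the concatenated modality means
        self._impl = nn.Sequential(
            nn.Linear(video_dim + audio_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, num_classes),
        )

With these defaults, video of shape (batch, frames, 1024) and audio of shape (batch, frames, 128) produce one 1152-dimensional mean vector per clip and logits of size num_classes; because only the first lengths[index] frames are averaged, zero-padded frames do not dilute the means.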