Example #1
0
 def __getitem__(self, idx):
     """Return audio, video, text and the Kinetics label for sample *idx*."""
     audio_path = self.a_paths[idx]

     if self.pretrained_text:
         # Pre-computed text features: all three modalities come from .npy files.
         a = np.load(audio_path)
         v = np.load(self.v_paths[idx])
         t = np.load(self.t_paths[idx])
         a = augment_audio(a)
         v = augment_video(v)
         a = torch.from_numpy(a)
         v = torch.from_numpy(v).to(dtype=torch.float)
         t = torch.from_numpy(t)
         label = self.get_label(audio_path)
         # url = file stem of the text-feature path (strip dirs and extension)
         url = self.t_paths[idx].split('/')[-1].split('.')[0]
         return a, v, t, label, url

     # Raw text: look up the string, tokenize it, and pad to context_length.
     a = np.load(audio_path)
     v = np.load(self.v_paths[idx])
     t = self.text_dict[self.t_paths[idx]]
     a = augment_audio(a)
     v = augment_video(v)
     # NOTE(review): in this branch augment_* appears to return tensors
     # (only .to() is applied), while the branch above still needs
     # torch.from_numpy — confirm the helpers' return types.
     a = a.to(dtype=torch.float)
     v = v.to(dtype=torch.float)
     tokens = list(itertools.chain(*[self.start_token] +
                                   list(self.sp_id_generator(t)) +
                                   [self.end_token]))[:self.context_length]
     t = torch.zeros(self.context_length, dtype=torch.long)
     t[:len(tokens)] = torch.tensor(tokens).flatten()
     label = self.get_label(audio_path)
     return a, v, t, label, self.t_paths[idx]
Example #2
0
    def __getitem__(self, idx):
        """Return audio, video, text features (from npy files), the url and a text mask."""
        a = np.load(self.a_paths[idx])
        v = np.load(self.v_paths[idx])
        a = augment_audio(a).to(dtype=torch.float)
        v = augment_video(v).to(dtype=torch.float)

        if self.pretrained_text:
            # Pre-extracted text features stored as .npy; url is the file stem.
            url = self.t_paths[idx].split('/')[-1].split('.')[0]
            t = torch.from_numpy(np.load(self.t_paths[idx])).to(dtype=torch.float)
            t_mask = 1  # no padding mask for pre-computed features
        else:
            url = self.t_paths[idx]
            # NOTE(review): this lookup's result is immediately overwritten by
            # process_text below; kept because it raises KeyError for unknown
            # urls, which callers may rely on.
            t = self.text_dict[url]
            t, t_mask = self.process_text(url)

        return a, v, t, url, t_mask
Example #3
0
    def __getitem__(self, i):
        """Return a video clip with zero placeholders for audio and text."""
        sample_path = self.data[i]
        # The class name is the parent directory of the .npy file.
        label = self.classes.index(sample_path.split('/')[-2])

        clip = augment_video(np.load(sample_path)).to(dtype=torch.float)
        # Zero tensors stand in for the absent audio and text modalities.
        return torch.zeros(80, 512), clip, torch.zeros(128), label, ""
Example #4
0
    def get_lava_features(self,
                          mp4_path=None,
                          text_input=None,
                          wav_path=None,
                          save=False,
                          save_dir=None,
                          run_lava=True):
        """Extract (and optionally encode / save) audio, video and text features.

        Args:
            mp4_path: (str) path to an mp4 file; its video stream is loaded.
            text_input: (str, optional) text metadata encoded via process_text.
            wav_path: (str, optional) path to a wav file for the audio stream.
                NOTE(review): currently unused — audio loading is disabled
                (see TODO below), so the audio output is always None.
            save: (bool) whether to save each available feature as a .npy file.
            save_dir: (str) directory to save under; required when save=True.
            run_lava: (bool) if True, encode raw inputs with pre-trained LAVA.

        Returns:
            (a, v, t) tuple of features; any modality that was not provided
            (or is disabled) is None. If an input has only video, no audio or
            text files are saved for it.

        Raises:
            ValueError: if save is True but save_dir was not supplied.
        """
        a, v, t = None, None, None

        # TODO: audio loading is disabled; restore get_audio_from_wav /
        # get_audio_from_mp4 here when the audio modality is needed again.

        if mp4_path:
            v = get_video_from_mp4(mp4_path)
            v = torch.from_numpy(augment_video(v, eval=True))

        if text_input:
            t = self.process_text(text_input)

        # Encode each available modality with the pre-trained LAVA encoders.
        if run_lava:
            with torch.no_grad():
                if a is not None:
                    a = self.encode_audio(
                        a.unsqueeze(0)).squeeze().detach().cpu().numpy()
                if v is not None:
                    v = self.encode_video(
                        v.to(dtype=torch.float).unsqueeze(0)
                    ).squeeze().detach().cpu().numpy()
                if t is not None:
                    t = self.encode_text(
                        t.unsqueeze(0)).squeeze().detach().cpu().numpy()

        if save:
            if save_dir is None:
                # Previously a missing save_dir silently wrote under "None/".
                raise ValueError("save_dir must be provided when save=True")
            filename = (mp4_path.split('/')[-1]).split('.')[0]

            # Plain if-statements: the original used conditional expressions
            # purely for their side effects, which hides the control flow.
            if a is not None:
                self.save_npy_file(features=a, dir='{}/audio'.format(save_dir),
                                   filename=filename)
            if v is not None:
                self.save_npy_file(features=v, dir='{}/video'.format(save_dir),
                                   filename=filename)
            if t is not None:
                self.save_npy_file(features=t, dir='{}/text'.format(save_dir),
                                   filename=filename)

        return a, v, t
Example #5
0
    def __getitem__(self, i):
        """Return (video_tensor, label); fall back to zeros on any failure.

        The label is the index within self.classes of the clip's parent
        directory name.
        """
        try:
            path = self.data[i]
            label = self.classes.index(path.split('/')[-2])

            v = np.load(path)
            v = augment_video(v)
            v = v.to(dtype=torch.float)
            return v, label
        except Exception:
            # Best-effort fallback. A bare `except:` would also swallow
            # KeyboardInterrupt/SystemExit; Exception lets those propagate.
            # NOTE(review): failures are silent — consider logging the index.
            return torch.zeros(16, 224, 224, 3), 0
Example #6
0
    def __getitem__(self, idx):
        """Return audio, video, text features extracted directly from an mp4."""
        clip_path = self.paths[idx]
        # Strip the "<prefix>/" head and the 4-character ".mp4" extension.
        url = clip_path.split(f'{self.prefix}/')[-1][:-4]

        audio = get_audio_from_mp4(clip_path, save=False)
        video = get_video_from_mp4(clip_path, save=False)

        audio = augment_audio(audio).to(dtype=torch.float)
        video = augment_video(video).to(dtype=torch.float)
        t, t_mask = self.process_text(url)
        return audio, video, t, url, t_mask