# Example 1
#
# PyTorch Dataset classes for temporal moment localization on YouCookII,
# ActivityNet Captions and Charades-STA.
import json
import math
import os
import pickle
import time

import h5py
import numpy as np
import torch
from torch.utils.data import Dataset

from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.test.utils import get_tmpfile

# Project-local helpers (assumed module name; adjust to this repo's layout):
# `Vocab` builds a token->index vocabulary and `get_embedding_matrix` maps it
# to a tensor of pretrained vectors.
from vocab import Vocab, get_embedding_matrix
class YOUCOOKII(Dataset):
    def __init__(self, features_path, ann_file_path, obj_feat_path,
                 embeddings_path, min_count, train_max_length,
                 test_max_length):

        self.feature_path = features_path
        self.ann_file_path = ann_file_path
        self.obj_feat_path = obj_feat_path

        self.is_training = 'train' in ann_file_path
        print('is_training:', self.is_training)

        print('loading annotations into memory...', end=" ")
        tic = time.time()
        print(ann_file_path)
        with open(ann_file_path, 'r') as f:
            aux = json.load(f)

        self.dataset = aux['annotations']
        self.selected_frames = aux['frame_selected']
        self.mapping_obj = aux['mapping']

        print('Done (t={:0.2f}s)'.format(time.time() - tic))

        self.min_count = min_count
        self.train_max_length = train_max_length
        self.test_max_length = test_max_length

        vocab_file_name = f'youcookII_vocab_{min_count}_{train_max_length}.pickle'

        self.vocab_file_path = vocab_file_name
        self.create_vocab()

        embeddings_file_name = f'youcookII_embeddings_{min_count}_{train_max_length}.pth'
        self.embeddings_file_path = embeddings_file_name
        self.get_embedding_matrix(embeddings_path)

        self.createIndex()
        self.ids = list(self.anns.keys())
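        # Tiny probability mass assigned to non-boundary positions in the
        # soft start/end targets built in __getitem__.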
        self.epsilon = 1E-10

    def create_vocab(self):
        print(self.vocab_file_path, os.path.exists(self.vocab_file_path))
        if self.is_training:
            if not os.path.exists(self.vocab_file_path):
                print("Creating vocab")
                self.vocab = Vocab(add_bos=False,
                                   add_eos=False,
                                   add_padding=False,
                                   min_count=self.min_count)

                for example in self.dataset:
                    self.vocab.add_tokenized_sentence(
                        example['tokens'][:self.train_max_length])

                self.vocab.finish()

                with open(self.vocab_file_path, 'wb') as f:
                    pickle.dump(self.vocab, f)
            else:
                with open(self.vocab_file_path, 'rb') as f:
                    self.vocab = pickle.load(f)

        else:
            print("Cargando vocab")
            with open(self.vocab_file_path, 'rb') as f:
                self.vocab = pickle.load(f)

    def get_embedding_matrix(self, embeddings_path):
        '''
        Builds a torch tensor with the pretrained embeddings at the
        indices given by self.vocab.

        Unknown (unseen) words are each mapped to a distinct random vector.

        :param embeddings_path: path to a GloVe text file or a binary
            word2vec file.
        :return: None; the matrix is stored in self.embedding_matrix.
        '''
        if self.is_training and not os.path.exists(self.embeddings_file_path):
            tic = time.time()

            print('loading embeddings into memory...', end=" ")

            if 'glove' in embeddings_path.lower():
                tmp_file = get_tmpfile("test_word2vec.txt")
                _ = glove2word2vec(embeddings_path, tmp_file)
                embeddings = KeyedVectors.load_word2vec_format(tmp_file)
            else:
                embeddings = KeyedVectors.load_word2vec_format(embeddings_path,
                                                               binary=True)

            print('Done (t={:0.2f}s)'.format(time.time() - tic))

            # Calls the module-level helper, not this method.
            embedding_matrix = get_embedding_matrix(embeddings, self.vocab)

            with open(self.embeddings_file_path, 'wb') as f:
                torch.save(embedding_matrix, f)

        else:
            with open(self.embeddings_file_path, 'rb') as f:
                embedding_matrix = torch.load(f)

        self.embedding_matrix = embedding_matrix

    def createIndex(self):
        print("Creating index..", end=" ")
        anns = {}
        size = len(self.dataset)  # the `* 1.` factor allowed subsampling a fraction
        counter = 0
        for row in self.dataset[:size]:
            # Skip malformed rows where the segment start is after its end.
            if float(row['feature_start']) > float(row['feature_end']):
                print(row)
                continue
            # Clamp feature_end to the last valid feature index.
            if math.floor(float(row['feature_end'])) >= float(
                    row['number_features']):
                row['feature_end'] = float(row['number_features']) - 1

            row['augmentation'] = 0
            anns[counter] = row
            counter += 1

            # Augmentation is disabled for YouCookII. See ANET_CAP.createIndex
            # for the enabled variant, which stores an extra copy of each
            # training row with augmentation=1 (a horizontally flipped
            # features variant was also explored for Charades).

        self.anns = anns
        print(" Ok! {}".format(len(anns.keys())))

    def __getitem__(self, index):
        ann = self.anns[index]
        selected_frames = self.selected_frames[ann['video']]

        object_features = []
        human_features = []

        # Load per-frame detector features and split them into human and
        # non-human (object) groups using the class mapping.
        for selected in selected_frames:
            file_path = os.path.join(
                self.obj_feat_path, ann['subset'], ann['recipe'], ann['video'],
                "{}_{}.pkl".format("image",
                                   str(selected).zfill(5)))
            aux_obj = []
            aux_hum = []
            with open(file_path, "rb") as fo:
                obj_feat = pickle.load(fo, encoding='latin1')
                for indx, obj_type in enumerate(obj_feat['object_class']):
                    if self.mapping_obj[str(obj_type)]['human']:
                        aux_hum.append(obj_feat['features'][indx])
                    else:
                        aux_obj.append(obj_feat['features'][indx])
            aux_obj = np.array(aux_obj)
            aux_hum = np.array(aux_hum)

            object_features.append(aux_obj)
            human_features.append(aux_hum)

        i3dfeat = "{}/{}.npy".format(self.feature_path, ann['video'])
        i3dfeat = np.load(i3dfeat).astype(np.float32)
        i3dfeat = np.squeeze(i3dfeat)
        i3dfeat = torch.from_numpy(i3dfeat)
        feat_length = i3dfeat.shape[0]

        if self.is_training:
            raw_tokens = ann['tokens'][:self.train_max_length]
        else:
            raw_tokens = ann['tokens'][:self.test_max_length]

        indices = self.vocab.tokens2indices(raw_tokens)
        tokens = [self.embedding_matrix[index] for index in indices]
        tokens = torch.stack(tokens)

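        # Temporal augmentation: drop a random prefix of the feature
        # sequence and shift the annotated segment (and derived timestamps)
        # by the same offset, so the model cannot rely on absolute position.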
        if ann['augmentation'] == 1:
            feature_start = ann['feature_start']
            feature_end = ann['feature_end']

            offset = int(math.floor(feature_start))
            if offset != 0:
                offset = np.random.randint(0, int(round(feature_start)))

            new_feature_start = feature_start - offset
            new_feature_end = feature_end - offset

            i3dfeat = i3dfeat[offset:, :]

            feat_length = int(ann['number_features']) - offset  # int length for np.zeros below
            localization = np.zeros(feat_length, dtype=np.float32)

            start = math.floor(new_feature_start)
            end = math.floor(new_feature_end)

            time_start = (new_feature_start * ann['number_frames'] /
                          ann['number_features']) / ann['fps']
            time_end = (new_feature_end * ann['number_frames'] /
                        ann['number_features']) / ann['fps']
            time_offset = (offset * ann['number_frames'] /
                           ann['number_features']) / ann['fps']

        else:
            localization = np.zeros(feat_length, dtype=np.float32)

            start = math.floor(ann['feature_start'])
            end = math.floor(ann['feature_end'])
            time_start = ann['time_start']
            time_end = ann['time_end']


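        # Soft labels for the start/end positions: probability 0.5 at the
        # boundary index, y at each neighbour, and epsilon everywhere else.
        # y is chosen so the distribution sums to 1:
        #   0.5 + 2*y + (feat_length - 3) * epsilon = 1
        #   =>  y = (1 - (feat_length - 3) * epsilon - 0.5) / 2
        # `localization` below is instead a hard 0/1 mask over the segment.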
        loc_start = np.ones(feat_length, dtype=np.float32) * self.epsilon
        loc_end = np.ones(feat_length, dtype=np.float32) * self.epsilon
        y = (1 - (feat_length - 3) * self.epsilon - 0.5) / 2
        if start > 0:
            loc_start[start - 1] = y
        if start < feat_length - 1:
            loc_start[start + 1] = y
        loc_start[start] = 0.5

        if end > 0:
            loc_end[end - 1] = y
        if end < feat_length - 1:
            loc_end[end + 1] = y
        loc_end[end] = 0.5

        y = 1.0
        localization[start:end] = y

        return index, i3dfeat, object_features, human_features, tokens, torch.from_numpy(loc_start), torch.from_numpy(loc_end), \
               torch.from_numpy(localization), time_start, time_end, ann['number_frames']/ann['number_features'], ann['fps']

    def __len__(self):
        return len(self.ids)
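
# For reference, a minimal sketch of what the imported `get_embedding_matrix`
# helper is assumed to do, based on the docstring in the method above;
# `vocab.index2token` is a hypothetical attribute name:
#
#   def get_embedding_matrix(embeddings, vocab):
#       dim = embeddings.vector_size
#       # Random init gives every unknown word its own distinct vector.
#       matrix = torch.randn(len(vocab.index2token), dim)
#       for idx, token in enumerate(vocab.index2token):
#           if token in embeddings:
#               matrix[idx] = torch.from_numpy(embeddings[token].copy())
#       return matrix
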
class ANET_CAP(Dataset):
    def __init__(self, features_path, ann_file_path, embeddings_path,
                 min_count, train_max_length, test_max_length):

        self.feature_path = features_path
        self.ann_file_path = ann_file_path
        self.is_training = 'training' in ann_file_path
        print('is_training:', self.is_training)

        print('loading annotations into memory...', end=" ")
        tic = time.time()
        with open(ann_file_path, 'r') as f:
            self.dataset = json.load(f)

        print('Done (t={:0.2f}s)'.format(time.time() - tic))

        self.min_count = min_count
        self.train_max_length = train_max_length
        self.test_max_length = test_max_length

        vocab_file_name = f'anet_vocab_{min_count}_{train_max_length}.pickle'

        self.vocab_file_path = vocab_file_name
        print(self.vocab_file_path)
        self.create_vocab()

        embeddings_file_name = f'anet_embeddings_{min_count}_{train_max_length}.pth'
        self.embeddings_file_path = embeddings_file_name
        self.get_embedding_matrix(embeddings_path)

        self.createIndex()
        self.ids = list(self.anns.keys())
        self.epsilon = 1E-10

    def tIoU(self, start, end, pred_start, pred_end):
        tt1 = np.maximum(start, pred_start)
        tt2 = np.minimum(end, pred_end)
        # Non-negative length of the intersection.
        segments_intersection = (tt2 - tt1).clip(0)
        # Length of the union of the two segments.
        segments_union = (pred_end - pred_start) \
            + (end - start) - segments_intersection
        # Overlap is the ratio of intersection over union.
        tIoU = segments_intersection.astype(float) / segments_union
        return tIoU
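    # Worked example: tIoU(2, 6, 4, 8) -> intersection = min(6, 8) - max(2, 4)
    # = 2, union = (8 - 4) + (6 - 2) - 2 = 6, so tIoU = 2 / 6 ≈ 0.33.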

    def create_vocab(self):

        if self.is_training:
            if not os.path.exists(self.vocab_file_path):
                print("Creating vocab")
                self.vocab = Vocab(add_bos=False,
                                   add_eos=False,
                                   add_padding=False,
                                   min_count=self.min_count)

                for example in self.dataset:
                    self.vocab.add_tokenized_sentence(
                        example['tokens'][:self.train_max_length])

                self.vocab.finish()

                with open(self.vocab_file_path, 'wb') as f:
                    pickle.dump(self.vocab, f)
            else:
                with open(self.vocab_file_path, 'rb') as f:
                    self.vocab = pickle.load(f)

        else:
            print("Cargando vocab")
            with open(self.vocab_file_path, 'rb') as f:
                self.vocab = pickle.load(f)

    def get_embedding_matrix(self, embeddings_path):
        '''
        Builds a torch tensor with the pretrained embeddings at the
        indices given by self.vocab.

        Unknown (unseen) words are each mapped to a distinct random vector.

        :param embeddings_path: path to a GloVe text file or a binary
            word2vec file.
        :return: None; the matrix is stored in self.embedding_matrix.
        '''
        if self.is_training and not os.path.exists(self.embeddings_file_path):
            tic = time.time()

            print('loading embeddings into memory...', end=" ")

            if 'glove' in embeddings_path.lower():
                tmp_file = get_tmpfile("test_word2vec.txt")
                _ = glove2word2vec(embeddings_path, tmp_file)
                embeddings = KeyedVectors.load_word2vec_format(tmp_file)
            else:
                embeddings = KeyedVectors.load_word2vec_format(embeddings_path,
                                                               binary=True)

            print('Done (t={:0.2f}s)'.format(time.time() - tic))

            embedding_matrix = get_embedding_matrix(embeddings, self.vocab)

            with open(self.embeddings_file_path, 'wb') as f:
                torch.save(embedding_matrix, f)

        else:
            with open(self.embeddings_file_path, 'rb') as f:
                embedding_matrix = torch.load(f)

        self.embedding_matrix = embedding_matrix

    def createIndex(self):
        print("Creating index..", end=" ")
        anns = {}
        size = len(self.dataset)  # the `* 1.` factor allowed subsampling a fraction
        counter = 0
        for row in self.dataset[:size]:

            # tIoU between the annotated segment and the whole video. The
            # oIoU > 0.9 training filter is currently disabled, so the value
            # is unused.
            oIoU = self.tIoU(float(row['feature_start']),
                             float(row['feature_end']), 0,
                             float(row['number_features']))
            if self.is_training:
                # Drop videos with too few or too many feature steps.
                if float(row['number_features']) < 10:
                    continue
                if float(row['number_features']) >= 1200:
                    continue
            # Skip malformed rows where the segment start is after its end.
            if float(row['feature_start']) > float(row['feature_end']):
                continue
            if math.floor(float(row['feature_end'])) >= float(
                    row['number_features']):
                row['feature_end'] = float(row['number_features']) - 1

            if self.is_training:
                # Store an augmented copy of each training annotation; the
                # random temporal offset is applied in __getitem__.
                row['augmentation'] = 1
                anns[counter] = row.copy()
                counter += 1

            row['augmentation'] = 0
            anns[counter] = row
            counter += 1
        self.anns = anns
        print(" Ok! {}".format(len(anns.keys())))

    def __getitem__(self, index):
        ann = self.anns[index]

        i3dfeat = "{}/{}.npy".format(self.feature_path, ann['video'])
        i3dfeat = np.load(i3dfeat).astype(np.float32)  # cast for consistency with YOUCOOKII
        i3dfeat = np.squeeze(i3dfeat)
        i3dfeat = torch.from_numpy(i3dfeat)
        feat_length = i3dfeat.shape[0]

        if self.is_training:
            raw_tokens = ann['tokens'][:self.train_max_length]
        else:
            raw_tokens = ann['tokens'][:self.test_max_length]

        indices = self.vocab.tokens2indices(raw_tokens)
        tokens = [self.embedding_matrix[index] for index in indices]
        tokens = torch.stack(tokens)

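        # Same temporal augmentation as in YOUCOOKII.__getitem__: trim a
        # random prefix of the features and shift the segment accordingly.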
        if ann['augmentation'] == 1:
            feature_start = ann['feature_start']
            feature_end = ann['feature_end']

            offset = int(math.floor(feature_start))
            if offset != 0:
                offset = np.random.randint(0, int(round(feature_start)))

            new_feature_start = feature_start - offset
            new_feature_end = feature_end - offset

            i3dfeat = i3dfeat[offset:, :]

            feat_length = int(ann['number_features']) - offset  # int length for np.zeros below
            localization = np.zeros(feat_length, dtype=np.float32)

            start = math.floor(new_feature_start)
            end = math.floor(new_feature_end)

            time_start = (new_feature_start * ann['number_frames'] /
                          ann['number_features']) / ann['fps']
            time_end = (new_feature_end * ann['number_frames'] /
                        ann['number_features']) / ann['fps']
            time_offset = (offset * ann['number_frames'] /
                           ann['number_features']) / ann['fps']

        else:
            localization = np.zeros(feat_length, dtype=np.float32)

            start = math.floor(ann['feature_start'])
            end = math.floor(ann['feature_end'])
            time_start = ann['time_start']
            time_end = ann['time_end']


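        # Soft start/end targets; see YOUCOOKII.__getitem__ for the
        # derivation of y.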
        loc_start = np.ones(feat_length, dtype=np.float32) * self.epsilon
        loc_end = np.ones(feat_length, dtype=np.float32) * self.epsilon
        y = (1 - (feat_length - 3) * self.epsilon - 0.5) / 2
        if start > 0:
            loc_start[start - 1] = y
        if start < feat_length - 1:
            loc_start[start + 1] = y
        loc_start[start] = 0.5

        if end > 0:
            loc_end[end - 1] = y
        if end < feat_length - 1:
            loc_end[end + 1] = y
        loc_end[end] = 0.5

        y = 1.0
        localization[start:end] = y


        return index, i3dfeat, tokens, torch.from_numpy(loc_start), torch.from_numpy(loc_end), torch.from_numpy(localization),\
               time_start, time_end, ann['number_frames']/ann['number_features'], ann['fps']

    def __len__(self):
        return len(self.ids)
class CHARADES_STA(Dataset):

    def __init__(self, features_path,
                       ann_file_path,
                       embeddings_path,
                       min_count,
                       train_max_length,
                       test_max_length):

        self.feature_path = features_path
        self.ann_file_path = ann_file_path
        self.is_training = 'train' in ann_file_path
        print('is_training:', self.is_training)

        print('loading annotations into memory...', end=" ")
        tic = time.time()
        with open(ann_file_path, 'r') as f:
            self.dataset = json.load(f)

        print('Done (t={:0.2f}s)'.format(time.time() - tic))

        self.min_count = min_count
        self.train_max_length = train_max_length
        self.test_max_length = test_max_length

        vocab_file_name = f'charades_vocab_{min_count}_{train_max_length}.pickle'

        self.vocab_file_path = vocab_file_name
        self.create_vocab()

        embeddings_file_name = f'charades_embeddings_{min_count}_{train_max_length}.pth'
        self.embeddings_file_path = embeddings_file_name
        self.get_embedding_matrix(embeddings_path)

        self.createIndex()
        self.sample_rate = 3  # temporally subsample: keep every 3rd feature step
        self.ids = list(self.anns.keys())
        self.epsilon = 1E-10
        # The HDF5 file is opened per __getitem__ call rather than here, so
        # each DataLoader worker gets its own handle.

    def create_vocab(self):
        print(self.vocab_file_path, os.path.exists(self.vocab_file_path))
        if self.is_training:
            if not os.path.exists(self.vocab_file_path):
                print("Creating vocab")
                self.vocab = Vocab(
                    add_bos=False,
                    add_eos=False,
                    add_padding=False,
                    min_count=self.min_count)

                for example in self.dataset:
                    self.vocab.add_tokenized_sentence(example['tokens'][:self.train_max_length])

                self.vocab.finish()

                with open(self.vocab_file_path, 'wb') as f:
                    pickle.dump(self.vocab, f)
            else:
                with open(self.vocab_file_path, 'rb') as f:
                    self.vocab = pickle.load(f)

        else:
            print("Cargando vocab")
            with open(self.vocab_file_path, 'rb') as f:
                self.vocab = pickle.load(f)


    def get_embedding_matrix(self, embeddings_path):
        '''
        Builds a torch tensor with the pretrained embeddings at the
        indices given by self.vocab.

        Unknown (unseen) words are each mapped to a distinct random vector.

        :param embeddings_path: path to a GloVe text file or a binary
            word2vec file.
        :return: None; the matrix is stored in self.embedding_matrix.
        '''
        if self.is_training and not os.path.exists(self.embeddings_file_path):
            tic = time.time()

            print('loading embeddings into memory...', end=" ")

            if 'glove' in embeddings_path.lower():
                tmp_file = get_tmpfile("test_word2vec.txt")
                _ = glove2word2vec(embeddings_path, tmp_file)
                embeddings = KeyedVectors.load_word2vec_format(tmp_file)
            else:
                embeddings = KeyedVectors.load_word2vec_format(embeddings_path, binary=True)

            print('Done (t={:0.2f}s)'.format(time.time() - tic))

            embedding_matrix = get_embedding_matrix(embeddings, self.vocab)

            with open(self.embeddings_file_path, 'wb') as f:
                torch.save(embedding_matrix, f)

        else:
            with open(self.embeddings_file_path, 'rb') as f:
                embedding_matrix = torch.load(f)

        self.embedding_matrix = embedding_matrix


    def createIndex(self):
        print("Creating index..", end=" ")
        anns = {}
        size = len(self.dataset)  # the `* 1.` factor allowed subsampling a fraction
        counter = 0
        for row in self.dataset[:size]:
            if float(row['feature_start']) > float(row['feature_end']):
                print(row)
                continue

            if math.floor(float(row['feature_end'])) >= float(row['number_features']):
                row['feature_end'] = float(row['number_features']) - 1

            row['augmentation'] = 0
            anns[counter] = row
            counter += 1

        self.anns = anns
        print(" Ok! {}".format(len(anns.keys())))

    def __getitem__(self, index):
        ann = self.anns[index]

        visual_features = h5py.File(self.feature_path, 'r')
        i3dfeat = torch.from_numpy(visual_features[ann['video']][:]).float()
        i3dfeat = i3dfeat[::self.sample_rate]  # temporal subsampling

        feat_length = i3dfeat.shape[0]

        if self.is_training:
            raw_tokens = ann['tokens'][:self.train_max_length]
        else:
            raw_tokens = ann['tokens'][:self.test_max_length]

        indices = self.vocab.tokens2indices(raw_tokens)
        tokens = [self.embedding_matrix[index] for index in indices]
        tokens = torch.stack(tokens)

        localization = np.zeros(feat_length, dtype=np.float32)
        start = math.floor(ann['feature_start'] / self.sample_rate)
        end = math.floor(ann['feature_end'] / self.sample_rate)
        time_start = ann['time_start']
        time_end = ann['time_end']

        loc_start = np.ones(feat_length, dtype=np.float32) * self.epsilon
        loc_end = np.ones(feat_length, dtype=np.float32) * self.epsilon
        # See YOUCOOKII.__getitem__ for the derivation of y.
        y = (1 - (feat_length - 3) * self.epsilon - 0.5) / 2

        if start > 0:
            loc_start[start - 1] = y
        if start < feat_length - 1:
            loc_start[start + 1] = y
        loc_start[start] = 0.5

        if end > 0:
            loc_end[end - 1] = y
        if end < feat_length - 1:
            loc_end[end + 1] = y
        loc_end[end] = 0.5
        # i3dfeat was temporally subsampled, so use its actual length here.
        number_features = feat_length
        y = 1.0
        localization[start:end] = y

        return index, i3dfeat, tokens, torch.from_numpy(loc_start), torch.from_numpy(loc_end), torch.from_numpy(localization),\
               time_start, time_end, ann['number_frames']/number_features, ann['fps'], start,end, 0, ann['video']

    def __len__(self):
        return len(self.ids)
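
# A minimal usage sketch (paths are placeholders, not files shipped with this
# repo). Because token tensors vary in length across items, batching with a
# DataLoader additionally requires a custom collate_fn that pads sequences.
if __name__ == '__main__':
    dataset = CHARADES_STA(
        features_path='data/charades/i3d_features.hdf5',        # placeholder
        ann_file_path='data/charades/charades_sta_train.json',  # placeholder
        embeddings_path='data/glove.840B.300d.txt',             # placeholder
        min_count=1,
        train_max_length=20,
        test_max_length=20)
    print(len(dataset))
    index, i3dfeat, tokens, *rest = dataset[0]
    print(i3dfeat.shape, tokens.shape)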