Example #1
    def construct_samples(self):
        corpus_path = os.path.join(self.corpus_dir,
                                   self.testing_label_filename)

        df = pd.read_json(corpus_path)
        with open('val_ids.txt', 'w') as f:
            f.write('\n'.join(df['id']))  # one video id per line

        for caption_list, vid in zip(
                df['caption'],
                df['id']):  # each JSON entry pairs a caption list with a video id
            path = os.path.join(self.feat_dir, '{}.npy'.format(vid))
            feat = np.load(path)
            self.feat_dict[vid] = feat

            for caption in caption_list:  # a list of strings
                if not all(ord(c) < 128 for c in caption):
                    continue  # abandon captions with unusual chars
                token_list = self.captionToTokenList(caption)

                self.data_obj_list.append(
                    DataObject(myid=vid, caption_list=token_list))

        self.data_obj_list = np.array(self.data_obj_list)
        self.batch_max_size = len(self.data_obj_list)
        print('[Validation] total data size: ' + str(self.batch_max_size))
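Example #1 leans on pieces defined elsewhere in its class: module-level `import os`, `numpy as np`, and `pandas as pd`, a `DataObject` container, and a `captionToTokenList` helper. A minimal sketch of what those two helpers might look like (assumptions only, to make the snippet self-contained):

# Minimal sketch (assumption, not the example's own code): stand-ins for the two
# helpers the snippet relies on, a DataObject container and captionToTokenList.
import string


class DataObject:
    """Hypothetical container pairing a video id with one tokenized caption."""

    def __init__(self, myid, caption_list=None):
        self.myid = myid                  # video id matching a <id>.npy feature file
        self.caption_list = caption_list  # list of tokens for one caption


def captionToTokenList(caption):
    """Assumed behaviour: lowercase, drop punctuation, split on whitespace."""
    table = str.maketrans('', '', string.punctuation)
    return caption.lower().translate(table).split()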
Example #2
    def build_val_data_obj_list(self):

        corpus_path = self.corpus_dir + self.json_filename
        data_file = pd.read_json(corpus_path)
        max_size = 0
        for i in range(0, len(data_file['caption'])):

            myid = data_file['id'][i]
            path = self.feat_dir + myid + '.npy'
            mydat = np.load(path)  # this video's pre-extracted .npy feature array
            str_list = data_file['caption'][i]  # raw caption strings for this video

            tmp_list = []
            cap_len_list = []
            for j in range(0, len(str_list)):
                seq = text_to_word_sequence(str_list[j],
                                            filters=filters,
                                            lower=True,
                                            split=" ")
                tmp_list.append(seq)
                cap_len_list.append(len(seq) + 1)  # added <EOS>
            obj = DataObject(path, myid, tmp_list, cap_len_list)
            self.dat_dict[myid] = mydat
            max_size += 1
            self.data_obj_list.append(obj)
        self.data_obj_list = np.array(self.data_obj_list)
        self.batch_max_size = max_size
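Example #2 tokenizes each caption with `text_to_word_sequence`, the Keras text helper; `filters` is assumed to be a module-level string of characters to strip. A small usage sketch with the Keras default filter set:

from tensorflow.keras.preprocessing.text import text_to_word_sequence

# assumed value of the module-level `filters` (this is the Keras default)
filters = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'

seq = text_to_word_sequence('A man is playing a guitar.',
                            filters=filters, lower=True, split=" ")
print(seq)           # ['a', 'man', 'is', 'playing', 'a', 'guitar']
print(len(seq) + 1)  # caption length counting the appended <EOS>: 7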
Example #3
    def build_train_data_obj_list(self):
        corpus_path = self.corpus_dir + self.json_filename

        data_file = pd.read_json(corpus_path)
        max_size = 0
        for i in range(0, len(data_file['caption'])):

            myid = data_file['id'][i]
            path = self.feat_dir + myid + '.npy'
            mydat = np.load(path)
            str_list = data_file['caption'][i]
            self.dat_dict[myid] = mydat

            for j in range(0, len(str_list)):
                tmp_list = []
                cap_len_list = []

                seq = text_to_word_sequence(str_list[j],
                                            filters=filters,
                                            lower=True,
                                            split=" ")
                join = " ".join(seq)

                tmp_list.append(seq)
                cap_len_list.append(len(seq) + 1)  # added <EOS> !!
                obj = DataObject(path, myid, tmp_list, cap_len_list)
                max_size += 1
                self.data_obj_list.append(obj)

        self.data_obj_list = np.array(self.data_obj_list)
        self.batch_max_size = max_size
        self.perm = np.arange(self.batch_max_size, dtype=np.int64)
        self.shuffle_perm()
        print('[Training] total data size: ' + str(max_size))
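Example #3 finishes by building an index permutation (`self.perm`) and calling `self.shuffle_perm()`, which is not shown. A plausible sketch of that method and of how the permutation can drive shuffled mini-batches (hypothetical, mirroring only the fields set above):

import numpy as np


class BatchingMixin:
    # hypothetical methods; only self.perm and self.data_obj_list come from Example #3
    def shuffle_perm(self):
        np.random.shuffle(self.perm)               # reshuffle the visiting order in place

    def next_batch(self, start, batch_size):
        idx = self.perm[start:start + batch_size]  # a slice of shuffled indices
        return self.data_obj_list[idx]             # fancy indexing on the object array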
Example #4
    def build_test_data_obj_list(self):

        # the id file lists one video id per line
        print('load txt: ' + self.test_dir + self.id_txt)
        max_size = 0

        with open(self.test_dir + self.id_txt, 'r') as txt:
            for line in txt:
                myid = line.rstrip('\n')
                path = self.feat_dir + myid + '.npy'
                mydat = np.load(path)

                obj = DataObject(path, myid)
                self.dat_dict[myid] = mydat
                max_size += 1
                self.data_obj_list.append(obj)

        self.data_obj_list = np.array(self.data_obj_list)
        self.batch_max_size = max_size
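The test loader above expects an id text file with one video id per line and a matching `<id>.npy` feature file for each id. A sketch of that on-disk layout (all paths, ids, and the feature shape are placeholders):

import os
import numpy as np

test_dir = 'testing_data/'         # hypothetical directories
feat_dir = 'testing_data/feat/'
os.makedirs(feat_dir, exist_ok=True)

ids = ['video_001', 'video_002']   # hypothetical video ids
with open(os.path.join(test_dir, 'id.txt'), 'w') as f:
    f.write('\n'.join(ids))

for vid in ids:
    # dummy feature matrix; the real shape depends on the feature extractor
    np.save(os.path.join(feat_dir, vid + '.npy'), np.zeros((80, 4096)))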
    def construct_samples(self):
        """
        collect captions and construct a list of samples
        :return:
        """
        vid_captions_path = os.path.join(self.corpus_dir,
                                         self.training_label_filename)

        df = pd.read_json(
            vid_captions_path
        )  # r'D:\video_captioning\data\MLDS_hw2_data\training_label.json')

        with open('train_ids.txt', 'w') as f:
            f.write('\n'.join(df['id']))  # one video id per line

        train_caption_list = []  # a list of caption-list for each video
        for caption_list, vid in zip(
                df['caption'],
                df['id']):  # train_label is a list of caption-vid dicts

            train_caption_list.append([])
            path = os.path.join(self.feat_dir, '{}.npy'.format(vid))
            feat = np.load(path)
            self.feat_dict[vid] = feat

            for caption in caption_list:  # a list of strings
                if not all(ord(c) < 128 for c in caption):
                    continue  # abandon captions with unusual chars
                token_list = self.captionToTokenList(caption)
                train_caption_list[-1].append(
                    token_list)  # for the last video's caption-list
                self.data_obj_list.append(
                    DataObject(myid=vid, caption_list=token_list))

        self.data_obj_list = np.array(self.data_obj_list)
        self.batch_max_size = len(self.data_obj_list)
        self.perm = np.arange(self.batch_max_size, dtype=np.int64)
        self.shuffle_perm()
        print('[Training] total data size: ' + str(self.batch_max_size))

        # construct vocab
        self.word_freq_dict = defaultdict(int)
        # total_word_count = 0.0
        for caption_list in train_caption_list:
            for caption in caption_list:
                for token in caption:
                    self.word_freq_dict[token] += 1
                    # total_word_count += 1.0

        # also save the testing's vocab
        df = pd.read_json(
            os.path.join(self.corpus_dir, self.testing_label_filename))
        for caption_list in df['caption']:
            for caption in caption_list:
                token_list = self.captionToTokenList(caption)
                for token in token_list:
                    self.word_freq_dict[token] += 1

        # for word in self.word_freq_dict:
        #     self.word_freq_dict[word] /= np.sum(self.word_freq_dict.values())
        # return a new list of k-v tuples, sorted by the freq of word (the value), in the ascending order (reverse)
        # word_freq_list = sorted(iter(self.word_freq_dict.items()), key=lambda k_v: k_v[1], reverse=True)
        self.idx_to_word = self.marker + list(self.word_freq_dict.keys())
        # self.word_index_dict = dict([(self.vocabulary[i], i) for i in range(len(self.vocabulary))])
        self.vocab_indices = {
            word: idx
            for idx, word in enumerate(self.idx_to_word)
        }

        # store in pickle
        with open('vocab_indices.pkl', 'wb') as handle:
            pickle.dump(self.vocab_indices, handle)
        with open('idx_to_word.pkl', 'wb') as handle:
            pickle.dump(self.idx_to_word, handle)
        with open('word_freq.pkl', 'wb') as handle:
            pickle.dump(self.word_freq_dict, handle)

        return len(self.vocab_indices)
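The method above pickles the vocabulary to disk (it assumes `import pickle` and `from collections import defaultdict` at module level). A downstream script might read the pickles back and encode a tokenized caption like this; the `<UNK>` token name is an assumption, since it depends on what `self.marker` contains:

import pickle

with open('vocab_indices.pkl', 'rb') as handle:
    vocab_indices = pickle.load(handle)   # word -> index
with open('idx_to_word.pkl', 'rb') as handle:
    idx_to_word = pickle.load(handle)     # index -> word

caption = ['a', 'man', 'is', 'playing', 'a', 'guitar']
unk = vocab_indices.get('<UNK>', 0)       # fallback index; '<UNK>' is assumed
encoded = [vocab_indices.get(tok, unk) for tok in caption]
decoded = [idx_to_word[i] for i in encoded]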