def construct_samples(self):
    """Collect the validation captions and construct the list of validation samples."""
    corpus_path = os.path.join(self.corpus_dir, self.testing_label_filename)
    df = pd.read_json(corpus_path)
    with open('val_ids.txt', 'w') as f:
        f.writelines(vid + '\n' for vid in df['id'])  # one video id per line
    for caption_list, vid in zip(df['caption'], df['id']):  # one caption list per video id
        path = os.path.join(self.feat_dir, '{}.npy'.format(vid))
        feat = np.load(path)  # video feature matrix
        self.feat_dict[vid] = feat
        for caption in caption_list:  # a list of caption strings
            if not all(ord(c) < 128 for c in caption):
                continue  # discard captions containing non-ASCII characters
            token_list = self.captionToTokenList(caption)
            self.data_obj_list.append(
                DataObject(myid=vid, caption_list=token_list))
    self.data_obj_list = np.array(self.data_obj_list)
    self.batch_max_size = len(self.data_obj_list)
    print('[Validation] total data size: ' + str(self.batch_max_size))
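
# captionToTokenList is defined elsewhere in this class. The standalone sketch below
# only illustrates what it is assumed to do (lowercase, strip punctuation, split on
# whitespace, append an end-of-sentence marker); the '<EOS>' name and the punctuation
# handling are assumptions, not taken from this file.
import string


def caption_to_token_list_sketch(caption):
    """Hypothetical stand-in for the class's captionToTokenList."""
    caption = caption.lower().translate(str.maketrans('', '', string.punctuation))
    tokens = caption.split()
    tokens.append('<EOS>')  # assumed end-of-sentence marker
    return tokens
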
def build_val_data_obj_list(self):
    """Build one DataObject per video for validation, keeping all of its captions."""
    corpus_path = self.corpus_dir + self.json_filename
    data_file = pd.read_json(corpus_path)
    max_size = 0
    for i in range(len(data_file['caption'])):
        myid = data_file['id'][i]
        path = self.feat_dir + myid + '.npy'
        mydat = np.load(path)  # video feature matrix
        str_list = data_file['caption'][i]  # all caption strings of this video
        tmp_list = []
        cap_len_list = []
        for j in range(len(str_list)):
            seq = text_to_word_sequence(str_list[j], filters=filters, lower=True, split=" ")
            tmp_list.append(seq)
            cap_len_list.append(len(seq) + 1)  # +1 for the appended <EOS> token
        obj = DataObject(path, myid, tmp_list, cap_len_list)
        self.dat_dict[myid] = mydat
        max_size += 1
        self.data_obj_list.append(obj)
    self.data_obj_list = np.array(self.data_obj_list)
    self.batch_max_size = max_size
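
# DataObject is imported from elsewhere in the project. The minimal sketch below is
# inferred from the call sites in this file (positional (path, myid, caption_list,
# cap_len_list), keyword (myid=..., caption_list=...), and (path, myid) at test time);
# the field names are assumptions.
class DataObjectSketch(object):
    """Hypothetical stand-in for DataObject: a plain record tying a video id to its
    feature path and, optionally, its tokenized captions and caption lengths."""

    def __init__(self, path=None, myid=None, caption_list=None, cap_len_list=None):
        self.path = path                  # path to the .npy feature file
        self.myid = myid                  # video id
        self.caption_list = caption_list  # tokenized caption(s)
        self.cap_len_list = cap_len_list  # caption lengths including <EOS>
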
def build_train_data_obj_list(self):
    """Build one DataObject per (video, caption) pair for training."""
    corpus_path = self.corpus_dir + self.json_filename
    data_file = pd.read_json(corpus_path)
    max_size = 0
    for i in range(len(data_file['caption'])):
        myid = data_file['id'][i]
        path = self.feat_dir + myid + '.npy'
        mydat = np.load(path)  # video feature matrix
        str_list = data_file['caption'][i]  # all caption strings of this video
        self.dat_dict[myid] = mydat
        for j in range(len(str_list)):
            seq = text_to_word_sequence(str_list[j], filters=filters, lower=True, split=" ")
            tmp_list = [seq]
            cap_len_list = [len(seq) + 1]  # +1 for the appended <EOS> token
            obj = DataObject(path, myid, tmp_list, cap_len_list)
            max_size += 1
            self.data_obj_list.append(obj)
    self.data_obj_list = np.array(self.data_obj_list)
    self.batch_max_size = max_size
    self.perm = np.arange(self.batch_max_size, dtype=np.int64)  # sample-index permutation
    self.shuffle_perm()
    print('[Training] total data size: ' + str(max_size))
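
# shuffle_perm is defined elsewhere in this class; self.perm is the index permutation
# used to draw shuffled minibatches from data_obj_list. The sketches below show the
# assumed shuffling and batch-slicing logic; the function names are hypothetical, only
# perm, data_obj_list and batch_max_size come from the code above.
import numpy as np


def shuffle_perm_sketch(perm):
    """Hypothetical stand-in: shuffle the sample indices in place."""
    np.random.shuffle(perm)
    return perm


def next_batch_sketch(data_obj_list, perm, start, batch_size):
    """Hypothetical stand-in: slice one shuffled minibatch of DataObjects."""
    idx = perm[start:start + batch_size]
    return data_obj_list[idx]  # fancy indexing works because data_obj_list is a NumPy object array
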
def build_test_data_obj_list(self):
    """Build one DataObject per test video id listed in the id text file."""
    id_path = self.test_dir + self.id_txt
    print('load txt: ' + id_path)
    max_size = 0
    with open(id_path, 'r') as txt:
        for line in txt.readlines():
            myid = line.rstrip('\n')  # one video id per line
            path = self.feat_dir + myid + '.npy'
            mydat = np.load(path)  # video feature matrix
            obj = DataObject(path, myid)  # no captions at test time
            self.dat_dict[myid] = mydat
            max_size += 1
            self.data_obj_list.append(obj)
    self.data_obj_list = np.array(self.data_obj_list)
    self.batch_max_size = max_size
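
# The id text file read above is assumed to contain one video id per line, with the
# matching feature matrix stored as <feat_dir>/<id>.npy. The self-contained snippet
# below builds a toy example of that on-disk layout; the directory names, the id value
# and the (80, 4096) feature shape are assumptions for illustration only.
import os
import numpy as np


def make_toy_test_layout(test_dir='toy_test', feat_dir='toy_feat'):
    """Create a minimal id file and dummy feature so the test loader above can run."""
    os.makedirs(test_dir, exist_ok=True)
    os.makedirs(feat_dir, exist_ok=True)
    vid = 'video_0001'  # hypothetical video id
    with open(os.path.join(test_dir, 'id.txt'), 'w') as f:
        f.write(vid + '\n')  # one id per line
    np.save(os.path.join(feat_dir, vid + '.npy'),
            np.zeros((80, 4096), dtype=np.float32))  # dummy feature matrix
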
def construct_samples(self): """ collect captions and construct a list of samples :return: """ vid_captions_path = os.path.join(self.corpus_dir, self.training_label_filename) df = pd.read_json( vid_captions_path ) # r'D:\video_captioning\data\MLDS_hw2_data\training_label.json') with open('train_ids.txt', 'w') as f: f.writelines(df['id']) train_caption_list = [] # a list of caption-list for each video for caption_list, vid in zip( df['caption'], df['id']): # train_label is a list of caption-vid dicts train_caption_list.append([]) path = os.path.join(self.feat_dir, '{}.npy'.format(vid)) feat = np.load(path) self.feat_dict[vid] = feat for caption in caption_list: # a list of strings if not all(ord(c) < 128 for c in caption): continue # abandon captions with unusual chars token_list = self.captionToTokenList(caption) train_caption_list[-1].append( token_list) # for the last video's caption-list self.data_obj_list.append( DataObject(myid=vid, caption_list=token_list)) self.data_obj_list = np.array(self.data_obj_list) self.batch_max_size = len(self.data_obj_list) self.perm = np.arange(self.batch_max_size, dtype=np.int) self.shuffle_perm() print('[Training] total data size: ' + str(self.batch_max_size)) # construct vocab self.word_freq_dict = defaultdict(int) # total_word_count = 0.0 for caption_list in train_caption_list: for caption in caption_list: for token in caption: self.word_freq_dict[token] += 1 # total_word_count += 1.0 # also save the testing's vocab df = pd.read_json( os.path.join(self.corpus_dir, self.testing_label_filename)) for caption_list in df['caption']: for caption in caption_list: token_list = self.captionToTokenList(caption) for token in token_list: self.word_freq_dict[token] += 1 # for word in self.word_freq_dict: # self.word_freq_dict[word] /= np.sum(self.word_freq_dict.values()) # return a new list of k-v tuples, sorted by the freq of word (the value), in the ascending order (reverse) # word_freq_list = sorted(iter(self.word_freq_dict.items()), key=lambda k_v: k_v[1], reverse=True) self.idx_to_word = self.marker + list(self.word_freq_dict.keys()) # self.word_index_dict = dict([(self.vocabulary[i], i) for i in range(len(self.vocabulary))]) self.vocab_indices = { word: idx for idx, word in enumerate(self.idx_to_word) } # store in pickle with open('vocab_indices.pkl', 'wb') as handle: pickle.dump(self.vocab_indices, handle) with open('idx_to_word.pkl', 'wb') as handle: pickle.dump(self.idx_to_word, handle) with open('word_freq.pkl', 'wb') as handle: pickle.dump(self.word_freq_dict, handle) return len(self.vocab_indices)