def _process_input_sind_lmdb(self, data_dir, split='train'):
    # Some lmdb configuration.
    lmdb_dir_fc = os.path.join(data_dir, split, 'fea_vgg16_fc7_lmdb_lmdb')
    lmdb_dir_conv = os.path.join(data_dir, split,
                                 'imgs_resized_vgg16_conv5_3_lmdb_lmdb')
    lmdb_env_fc = lmdb.open(lmdb_dir_fc, readonly=True)
    lmdb_env_conv = lmdb.open(lmdb_dir_conv, readonly=True)

    split_dir = os.path.join(data_dir, split)
    anno_fn = os.path.join(split_dir, 'annotions_filtered.txt')

    # Now load the stories. Each annotation line is:
    # flickr_id <unused> story_id line_id word_1 ... word_n
    dict_story = {}
    with open(anno_fn, 'r') as fid:
        for aline in fid:
            parts = aline.strip().split()
            flickr_id = parts[0]
            sid = int(parts[2])
            slid = int(parts[3])
            if sid not in dict_story:
                dict_story[sid] = {}
            dict_story[sid][slid] = []
            dict_story[sid][slid].append(flickr_id)
            # Word vectors are not needed here; we only keep the indices.
            inp_v = []
            inp_y = [utils.process_word2(word=w,
                                         word2vec=self.word2vec,
                                         vocab=self.vocab,
                                         word_vector_size=self.word_vector_size,
                                         to_return='index',
                                         silent=True) for w in parts[4:]]
            dict_story[sid][slid].append(inp_v)
            dict_story[sid][slid].append(inp_y)

    # Just in case, we sort the lines of each story by line id, then
    # reverse so they end up in descending order.
    for sid in dict_story:
        story = sorted(dict_story[sid].items(), key=lambda x: x[0])
        story = story[::-1]
        dict_story[sid] = story
    return dict_story, lmdb_env_fc, lmdb_env_conv
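# A minimal sketch (not called by the pipeline) of how the fc7 LMDB
# returned above can be read back; the zero-padded key format mirrors what
# _process_batch_sind does below. The method name is illustrative only.
def _example_fetch_fc7(self, lmdb_env_fc, flickr_id, max_key_len=12):
    key = flickr_id.rjust(max_key_len, '0')  # keys are zero-padded image ids
    with lmdb_env_fc.begin() as txn:
        raw = txn.get(key.encode('ascii'))
        datum = caffe.proto.caffe_pb2.Datum()
        datum.ParseFromString(raw)
        # One fc7 vector per image, stored as float32 bytes.
        return np.fromstring(datum.data, dtype=np.float32)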
def _process_batch_sind(self, batch_index, split='train'):
    start_index = self.batch_size * batch_index
    if split == 'train':
        split_lmdb_env_fc = self.train_lmdb_env_fc
        split_lmdb_env_conv = self.train_lmdb_env_conv
        split_story = self.train_story
        split_dict_story = self.train_dict_story
    else:
        split_lmdb_env_fc = self.test_lmdb_env_fc
        split_lmdb_env_conv = self.test_lmdb_env_conv
        split_story = self.test_story
        split_dict_story = self.test_dict_story

    # Make sure the index is smaller than the number of stories, and that
    # there are enough stories left for a full batch.
    start_index = start_index % len(split_story)
    start_index = min(start_index, len(split_story) - self.batch_size)

    # Now, we select the stories.
    stories = split_story[start_index:start_index + self.batch_size]

    max_inp_len = 0
    max_q_len = 1  # always 1 in this setting.
    max_ans_len = 0
    for sid in stories:
        max_inp_len = max(max_inp_len, len(split_dict_story[sid]) - 1)
        for line in split_dict_story[sid]:
            # line is (line_id, [flickr_id, inp_v, inp_y]); line[-1][-1]
            # is the list of word indices for that caption.
            max_ans_len = max(max_ans_len, len(line[-1][-1]))
    max_ans_len += 1  # this is for the start token.

    # In our case, this is pretty similar to the word-level DMN.
    questions = []  # batch x story_len x fea
    inputs = []
    answers = []
    answers_inp = []
    answers_mask = []
    max_key_len = 12

    with split_lmdb_env_fc.begin() as txn_fc:
        with split_lmdb_env_conv.begin() as txn_conv:
            for sid in stories:
                inp = []  # story_len x patches x fea
                question = []
                answer = []
                answer_mask = []
                answer_inp = []
                for input_anno in split_dict_story[sid]:
                    img_id = input_anno[1][0]
                    # LMDB keys are image ids zero-padded to max_key_len.
                    while len(img_id) < max_key_len:
                        img_id = '0' + img_id

                    # fc7 feature of the current image: the question.
                    fc_raw = txn_fc.get(img_id.encode('ascii'))
                    fc_fea = caffe.proto.caffe_pb2.Datum()
                    fc_fea.ParseFromString(fc_raw)
                    question.append(np.fromstring(fc_fea.data,
                                                  dtype=np.float32))

                    # conv5_3 features: the inputs. Here we could also use
                    # images other than the current one.
                    conv_raw = txn_conv.get(img_id.encode('ascii'))
                    conv_datum = caffe.proto.caffe_pb2.Datum()
                    conv_datum.ParseFromString(conv_raw)
                    conv_fea = np.fromstring(conv_datum.data,
                                             dtype=np.float32)
                    x = conv_fea.reshape(conv_datum.channels,
                                         conv_datum.height,
                                         conv_datum.width)  # 512 x 14 x 14
                    x = x.reshape(conv_datum.channels,
                                  conv_datum.height * conv_datum.width)
                    x = x.swapaxes(0, 1)  # patches x fea
                    inp.append(x)

                    # Now for the answer.
                    a = [self.vocab_size]  # start token.
                    a.extend(input_anno[1][2])  # caption word indices.
                    a_inp = np.zeros((max_ans_len, self.word_vector_size),
                                     dtype=floatX)
                    # Add the start token first.
                    a_inp[0, :] = utils.process_word2(
                        word="#START#",
                        word2vec=self.word2vec,
                        vocab=self.vocab,
                        word_vector_size=self.word_vector_size,
                        to_return='word2vec')
                    for ans_idx, w_idx in enumerate(a[1:]):
                        a_inp[ans_idx + 1, :] = utils.process_word2(
                            word=self.ivocab[w_idx],
                            word2vec=self.word2vec,
                            vocab=self.vocab,
                            word_vector_size=self.word_vector_size,
                            to_return='word2vec')
                    a_mask = [1 for i in range(len(a) - 1)]
                    while len(a) < max_ans_len:
                        a.append(-1)  # padding value; it does not matter.
                        a_mask.append(0)
                    a = a[1:]  # drop the start token from the target.
                    answer.append(np.array(a).astype(np.int32))
                    answer_mask.append(np.array(a_mask).astype(np.int32))
                    answer_inp.append(a_inp)

                questions.append(np.stack(question, axis=0))
                inputs.append(np.stack(inp, axis=0))  # story_len x patches x fea
                answers.append(np.stack(answer, axis=0))  # story_len x (max_ans_len - 1)
                answers_mask.append(np.stack(answer_mask, axis=0))  # story_len x (max_ans_len - 1)
                answers_inp.append(np.stack(answer_inp, axis=0))  # story_len x max_ans_len

    # Finally, we transform them into numpy arrays and fold the batch and
    # story dimensions together.
    inputs = np.stack(inputs, axis=0).astype(floatX)
    questions = np.stack(questions, axis=0).astype(floatX)
    answers = np.array(answers).astype(np.int32)
    answers_mask = np.array(answers_mask).astype(floatX)
    answers_inp = np.stack(answers_inp, axis=0)
    questions = np.reshape(questions,
                           (questions.shape[0] * questions.shape[1],
                            questions.shape[2]))
    answers = np.reshape(answers,
                         (answers.shape[0] * answers.shape[1],
                          answers.shape[2]))
    answers_inp = np.reshape(answers_inp,
                             (answers_inp.shape[0] * answers_inp.shape[1],
                              answers_inp.shape[2],
                              answers_inp.shape[3]))
    answers_mask = np.reshape(answers_mask,
                              (answers_mask.shape[0] * answers_mask.shape[1],
                               answers_mask.shape[2]))
    return inputs, questions, answers, answers_inp, answers_mask
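# A minimal sketch (not part of the training code) documenting the shapes
# _process_batch_sind returns, with B = batch_size, S = story length,
# P = conv patches (14 * 14), A = max answer length including the start token.
def _example_batch_shapes(self, batch_index):
    inputs, questions, answers, answers_inp, answers_mask = \
        self._process_batch_sind(batch_index, split='train')
    # inputs:       (B, S, P, cnn_dim)           conv5_3 facts per image
    # questions:    (B * S, fc7_dim)             one fc7 vector per line
    # answers:      (B * S, A - 1)               target indices, -1 padded
    # answers_inp:  (B * S, A, word_vector_size) teacher-forcing vectors
    # answers_mask: (B * S, A - 1)               1 = real token, 0 = padding
    logging.info('inputs %s, questions %s, answers %s',
                 inputs.shape, questions.shape, answers.shape)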
def _process_input_sind(self, data_dir, split='train'):
    split_dir = os.path.join(data_dir, split)
    fea_dir = os.path.join(split_dir, 'fea_vgg16_fc7')
    anno_fn = os.path.join(split_dir, 'annotions_filtered_fixed.txt')

    # Now load the stories (same annotation format as the lmdb variant).
    dict_story = {}
    with open(anno_fn, 'r') as fid:
        for aline in fid:
            parts = aline.strip().split()
            flickr_id = parts[0]
            sid = int(parts[2])
            slid = int(parts[3])
            if sid not in dict_story:
                dict_story[sid] = {}
            dict_story[sid][slid] = []
            dict_story[sid][slid].append(flickr_id)
            # Word vectors are not needed here; we only keep the indices.
            inp_v = []
            inp_y = [utils.process_word2(word=w,
                                         word2vec=self.word2vec,
                                         vocab=self.vocab,
                                         word_vector_size=self.word_vector_size,
                                         to_return='index',
                                         silent=True) for w in parts[4:]]
            dict_story[sid][slid].append(inp_v)
            dict_story[sid][slid].append(inp_y)

    # Just in case, we sort the lines of each story by line id.
    for sid in dict_story:
        story = sorted(dict_story[sid].items(), key=lambda x: x[0])
        dict_story[sid] = story

    # Load all features into memory. First pass: count the feature rows so
    # we can pre-allocate a single array.
    features = None
    num_imgs = 0
    fns_dict = {}
    total_fea = 0
    total_fns = 0
    for root, dirs, fns in os.walk(fea_dir, followlinks=True):
        for fn in fns:
            full_fn = os.path.join(root, fn)
            hdf_f = h5py.File(full_fn, 'r')
            fea = hdf_f['fea'][:]
            h5_fns = hdf_f['fns'][:]
            total_fea += fea.shape[0]
            total_fns += h5_fns.shape[0]
            assert fea.shape[0] == h5_fns.shape[0], \
                "Should not happen; we have re-run the feature extraction."
            hdf_f.close()
    logging.info('total fea = %d, fns = %d', total_fea, total_fns)

    # Second pass: fill the array and remember the row index of each image.
    for root, dirs, fns in os.walk(fea_dir, followlinks=True):
        for fn in fns:
            full_fn = os.path.join(root, fn)
            hdf_f = h5py.File(full_fn, 'r')
            fea = hdf_f['fea'][:]
            h5_fns = hdf_f['fns'][:]
            hdf_f.close()
            if features is None:
                shape = [total_fea]
                self.cnn_dim = fea.size // fea.shape[0]
                shape.extend(fea.shape[1:])
                features = np.zeros(shape)
            features[num_imgs:num_imgs + fea.shape[0], :] = fea
            for i in range(h5_fns.shape[0]):
                bfn = os.path.basename(h5_fns[i])
                key = os.path.splitext(bfn)[0]
                key = key.split('_')[0]
                fns_dict[key] = num_imgs
                num_imgs += 1
    logging.info("Done loading features from %s", fea_dir)
    return dict_story, features, fns_dict, num_imgs
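# A minimal sketch (illustrative only) of how the three structures returned
# by _process_input_sind fit together: gather the fc7 rows for one story.
# It assumes the keys in fns_dict are flickr ids, as parsed from the h5
# file names above.
def _example_story_features(self, dict_story, features, fns_dict, sid):
    rows = []
    for slid, anno in dict_story[sid]:  # anno = [flickr_id, inp_v, inp_y]
        rows.append(features[fns_dict[anno[0]]])
    return np.stack(rows, axis=0)  # story_len x cnn_dim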