def __init__(self,
             questions_h5,
             vocab,
             num_frames=1,
             split='train',
             gpu_id=0,
             input_type='ques',
             max_threads_per_gpu=10,
             map_resolution=1000):
    self.questions_h5 = questions_h5
    self.vocab = load_vocab(vocab)
    np.random.seed()

    self.split = split
    self.gpu_id = gpu_id
    self.num_frames = num_frames
    self.input_type = input_type
    self.max_threads_per_gpu = max_threads_per_gpu
    self.map_resolution = map_resolution

    print('Reading question data into memory')
    self.questions = _dataset_to_tensor(questions_h5['questions'])
    self.answers = _dataset_to_tensor(questions_h5['answers'])
    self.actions = _dataset_to_tensor(questions_h5['actions'])
    self.actions = self.actions.unsqueeze(2)
    self.robot_positions = _dataset_to_tensor(
        questions_h5['robot_positions'], dtype=np.float32)
    self.action_images = questions_h5['images']
    self.action_lengths = _dataset_to_tensor(
        questions_h5['action_lengths'])
    self.action_masks = _dataset_to_tensor(questions_h5['mask'])

    # if input_type != 'ques':
    '''
    If training, randomly sample and load a subset of environments, train on
    those, and then cycle through to load the rest. On the validation and
    test sets, load in order and cycle through. For both, add optional
    caching so that once all environments have been cycled through, the
    cache can be used instead of re-loading.
    '''
    cnn_kwargs = {'num_classes': 191, 'pretrained': True}
    self.cnn = MultitaskCNN(**cnn_kwargs)
    self.cnn.eval()
    self.cnn.cuda()
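# ---------------------------------------------------------------------------
# Note: the constructors in this section rely on a _dataset_to_tensor helper
# that is not shown here. The sketch below is inferred from the call sites
# above (integer default, optional dtype=np.float32); it is an assumption,
# not the project's verified implementation.
import numpy as np
import torch


def _dataset_to_tensor(dset, dtype=np.int64):
    # h5py datasets can be materialised with np.asarray, which reads the
    # whole dataset into memory before wrapping it in a torch tensor.
    arr = np.asarray(dset, dtype=dtype)
    if np.issubdtype(dtype, np.floating):
        return torch.FloatTensor(arr)
    return torch.LongTensor(arr)
# ---------------------------------------------------------------------------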
def __init__(self,
             questions_h5,
             vocab,
             num_frames=1,
             split='train',
             gpu_id=0,
             input_type='ques',
             max_threads_per_gpu=10,
             map_resolution=1000):
    self.questions_h5 = questions_h5
    self.vocab = load_vocab(vocab)
    np.random.seed()

    self.split = split
    self.gpu_id = gpu_id
    self.num_frames = num_frames
    self.input_type = input_type
    self.max_threads_per_gpu = max_threads_per_gpu
    self.map_resolution = map_resolution

    print('Reading question data into memory')
    self.questions = _dataset_to_tensor(questions_h5['questions'])
    self.answers = _dataset_to_tensor(questions_h5['answers'])
    self.actions = _dataset_to_tensor(questions_h5['actions'])
    self.actions = self.actions.unsqueeze(2)
    self.robot_positions = _dataset_to_tensor(
        questions_h5['robot_positions'], dtype=np.float32)
    self.action_images = questions_h5['images']
    self.action_maps = questions_h5['heatmaps']
    self.action_lengths = _dataset_to_tensor(
        questions_h5['action_lengths'])
    self.action_masks = _dataset_to_tensor(questions_h5['mask'])

    cnn_kwargs = {'num_classes': 191, 'pretrained': True}
    self.cnn = MultitaskCNN(**cnn_kwargs)
    self.cnn.eval()
    self.cnn.cuda()
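# ---------------------------------------------------------------------------
# Both constructors above freeze a MultitaskCNN (num_classes=191, pretrained)
# in eval mode on the GPU and use it purely as a feature extractor. The
# snippet below restates that pattern with current PyTorch (torch.no_grad);
# the import path for MultitaskCNN is an assumption.
import torch
from models import MultitaskCNN  # hypothetical import path


def build_frozen_cnn():
    cnn = MultitaskCNN(num_classes=191, pretrained=True)
    cnn.eval()   # fix batch-norm / dropout behaviour
    cnn.cuda()
    return cnn


def extract_feats(cnn, frames):
    # frames: FloatTensor of shape (N, 3, 224, 224), values in [0, 1]
    with torch.no_grad():  # no gradients are needed for a frozen extractor
        return cnn(frames.cuda()).cpu().numpy()
# ---------------------------------------------------------------------------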
def test(rank):
    nav_model_kwargs = {'question_vocab': load_vocab(args.vocab_json)}
    nav_model = NavPlannerControllerModel(**nav_model_kwargs)
    nav_checkpoint = torch.load(args.nav_weight)  # load checkpoint weights
    nav_model.load_state_dict(nav_checkpoint['state'])  # create nav model
    print('--- nav_model loaded checkpoint ---')

    cnn_kwargs = {'num_classes': 191, 'pretrained': True}
    cnn = MultitaskCNN(**cnn_kwargs)
    cnn.eval()
    cnn.cuda()  # create cnn model

    vqa_model_kwargs = {'vocab': load_vocab(args.vocab_json)}
    vqa_model = VqaLstmCnnAttentionModel(**vqa_model_kwargs)
    vqa_checkpoint = torch.load(args.vqa_weight)  # load checkpoint weights
    vqa_model.load_state_dict(vqa_checkpoint['state'])
    print('--- vqa_model loaded checkpoint ---')

    # create the simulation environment
    scene = "test-10-obj-100.txt"
    my_env = enviroment.Environment(is_testing=0, testing_file=scene)
    object_exist_list = my_env.ur5.object_type
    print("Objects that exist: ")
    print(object_exist_list)

    # create the testing questions
    my_question = Qusetion(object_exist_list)
    testing_questions = my_question.createQueue()
    vocab = my_question.create_vocab()

    for question in testing_questions:
        planner_hidden = None
        max_action = 30
        position = [0, 0]
        action_in_raw = [0]  # start action
        actions = []
        print(question['question'])

        # encode the question
        questionTokens = my_question.tokenize(
            question['question'], punctToRemove=['?'], addStartToken=False)
        encoded_question_raw = my_question.encode(questionTokens,
                                                  vocab['questionTokenToIdx'])
        encoded_question_raw.append(0)
        encoded_question_raw = np.array(encoded_question_raw)
        encoded_question_tensor = _dataset_to_tensor(encoded_question_raw)
        encoded_question = Variable(encoded_question_tensor)
        encoded_question = encoded_question.unsqueeze(0)
        print(encoded_question)

        action_times = 0
        push_signal = 0
        push_point = 0
        while action_times < max_action:
            # print(planner_img_feats_var.size())
            action_in_tensor = _dataset_to_tensor(action_in_raw)
            action_in = Variable(action_in_tensor)
            action_in = action_in.unsqueeze(0)
            action_in = action_in.unsqueeze(0)
            _, rgb_image_raw = my_env.camera.get_camera_data()  # before actions
            position_in, planner_img_feats_var = data2input(
                position, rgb_image_raw, cnn)
            output_data, planner_hidden = nav_model.planner_step(
                encoded_question, planner_img_feats_var, action_in,
                position_in, planner_hidden)
            planner_possi = F.log_softmax(output_data, dim=1)
            planner_data = planner_possi.data.numpy()
            planner_data = planner_data[0]
            action_out = np.where(planner_data == np.max(planner_data))
            action_out = action_out[0][0]
            actions.append(action_out)
            action_in_raw = [action_out]
            if action_out == 9:
                print('stop')
                break
            elif action_out == 0:
                push_signal = 1
                push_point = action_times
            else:
                dx, dy = order2action(action_out)
                position[0] += dx
                position[1] += dy
            action_times += 1

        if len(actions) > 2 and push_signal == 0:
            action_position = position + position
            my_env.UR5_action(action_position, 2)  # sucking
        elif len(actions) > 2 and push_signal == 1:  # pushing
            position_start = [0, 0]
            position_end = [0, 0]
            for i in range(len(actions)):
                if i <= push_point:  # the first segment, before the push signal
                    dx, dy = order2action(actions[i])
                    position_start[0] += dx
                    position_start[1] += dy
                    position_end[0] += dx
                    position_end[1] += dy
                else:  # the second segment, after the push signal
                    dx, dy = order2action(actions[i])
                    position_end[0] += dx
                    position_end[1] += dy
            action_position = position_start + position_end
            my_env.UR5_action(action_position, 1)  # pushing

        # get the image after the actions and feed it to the VQA model
        _, rgb_image_after = my_env.camera.get_camera_data()
        shrink = cv.resize(rgb_image_after, (224, 224),
                           interpolation=cv.INTER_AREA)
        shrink = np.array(shrink)
        shrink = shrink.transpose((2, 0, 1))
        shrink = shrink.reshape(1, 3, 224, 224)
        shrink = (shrink / 255.0).astype(np.float32)
        images = torch.FloatTensor(shrink)
        images = Variable(images)
        images = images.unsqueeze(0)

        # answer the question with the VQA model (encoded_question already built)
        scores, _ = vqa_model(images, encoded_question)
        scores = scores.data.numpy()
        scores = scores[0]
        answer_predict = np.where(scores == np.max(scores))
        answer_predict = answer_predict[0][0]
        if answer_predict == 0:
            print('--- Predict: does not exist')
        elif answer_predict == 1:
            print('--- Predict: exists')
        else:
            raise Exception('Prediction neither 0 nor 1')
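# ---------------------------------------------------------------------------
# The loop above picks the next action with
# np.where(planner_data == np.max(planner_data))[0][0], which is simply an
# argmax over the planner logits. The helper below restates that greedy
# rollout in a compact, illustrative form; nav_model.planner_step, the stop
# id 9 and the push id 0 come from the code above, while get_obs and
# order2action are supplied by the caller, so nothing else is assumed about
# the project's API.
import torch
import torch.nn.functional as F


def greedy_rollout(nav_model, encoded_question, get_obs, order2action,
                   max_action=30):
    """Illustrative sketch of the greedy planner decoding used above."""
    planner_hidden = None
    action_in = torch.zeros(1, 1, dtype=torch.long)  # initial action token
    position, actions = [0, 0], []
    for _ in range(max_action):
        position_in, img_feats = get_obs(position)  # caller-supplied observation
        logits, planner_hidden = nav_model.planner_step(
            encoded_question, img_feats, action_in, position_in, planner_hidden)
        action = int(F.log_softmax(logits, dim=1).argmax(dim=1))  # == np.where(... == max)
        actions.append(action)
        action_in = torch.tensor([[action]])
        if action == 9:      # stop token
            break
        if action != 0:      # 0 only flags a push; it does not move the position
            dx, dy = order2action(action)
            position[0] += dx
            position[1] += dy
    return actions, position
# ---------------------------------------------------------------------------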
def test(rank): act_model_kwargs = {'question_vocab': load_vocab(args.vocab_json)} act_model = actPlannerBaseModel(**act_model_kwargs) act_checkpoint = torch.load(args.nav_weight) #load checkpoint weights act_model.load_state_dict(act_checkpoint['state']) #create model print('--- act_model loaded checkpoint ---') res_model_dir = os.path.abspath("../train/models/resnet101.pth") my_map_cnn = mapCNN(checkpoint_path=res_model_dir) map_checkpoint = torch.load('mapcnn.pt', map_location='cpu') #load checkpoint weights my_map_cnn.load_state_dict(map_checkpoint['state']) #create map model print('--- map_model loaded checkpoint ---') cnn_model_dir = os.path.abspath("../train/models/03_13_h3d_hybrid_cnn.pt") cnn_kwargs = { 'num_classes': 191, 'pretrained': True, 'checkpoint_path': cnn_model_dir } cnn = MultitaskCNN(**cnn_kwargs) cnn.eval() vocab_dir = os.path.abspath("vocab.json") vocab_file = open(vocab_dir, 'r', encoding='utf-8') vocab = json.load(vocab_file) planner_hidden = None max_action = 30 position = [0, 0] action_in_raw = [0] #start action_in actions = [] question = args.question print(question) questionTokens = tokenize(question, punctToRemove=['?'], addStartToken=False) encoded_question_raw = encode(questionTokens, vocab['questionTokenToIdx']) while (len(encoded_question_raw) < 10): encoded_question_raw.append(0) #encode question encoded_question_raw = np.array(encoded_question_raw) encoded_question_tensor = _dataset_to_tensor(encoded_question_raw) encoded_question = Variable(encoded_question_tensor) encoded_question = encoded_question.unsqueeze(0) #print(encoded_question) action_times = 0 push_signal = 0 push_point = 0 crop_w_offset = 470 crop_w = 840 crop_h_offset = 280 crop_h = 690 rgb_before = cv.imread(args.rgb_image_before_dir) depth_before = cv.imread(args.depth_image_before_dir) rgb_before_crop = rgb_before[crop_h_offset:crop_h_offset + crop_h, crop_w_offset:crop_w_offset + crop_w] depth_before_crop = depth_before[crop_h_offset:crop_h_offset + crop_h, crop_w_offset:crop_w_offset + crop_w] depth_before_crop = depth_before_crop[0] cv.imwrite(args.rgb_crop_before_dir_, rgb_before_crop) rgb_dim = rgb_before.shape rgb_crop_dim = rgb_before_crop.shape # print(depth_dim) # print(depth_crop_dim) depth_before = depth_before[0] rgb_before_resize = cv.resize(rgb_before_crop, (256, 256), interpolation=cv.INTER_AREA) depth_before_resize = cv.resize(depth_before_crop, (256, 256), interpolation=cv.INTER_AREA) ''' print(depth_np.max()) print(depth_np[0][5]) print('.....') print(depth_np[1][5]) print('.....') print(depth_np[2][5]) print('.....') ''' rgb_tensor, depth_tensor = rgbd2tensor( rgb_before_resize, depth_before_resize) #output_heatmap heatmap_output = rgbd2heatmap(rgb_tensor, depth_tensor, my_map_cnn) f = h5py.File(args.heatmap_output_dir, 'w') f['heatmap'] = heatmap_output cv.imwrite(args.rgb_image_before_dir_, rgb_before_resize) cv.imwrite(args.depth_image_before_dir_, depth_before_resize) while (action_times < max_action): #print(planner_img_feats_var.size()) action_in_tensor = _dataset_to_tensor(action_in_raw) action_in = Variable(action_in_tensor) action_in = action_in.unsqueeze(0) action_in = action_in.unsqueeze(0) position_in, planner_img_feats_var = data2input( position, rgb_before_resize, cnn) output_data, planner_hidden = act_model.planner_step( encoded_question, planner_img_feats_var, action_in, position_in, planner_hidden) planner_possi = F.log_softmax(output_data, dim=1) planner_data = planner_possi.data.numpy() planner_data = planner_data[0] action_out = 
np.where(planner_data == np.max(planner_data)) action_out = action_out[0][0] actions.append(action_out) action_in_raw = [action_out] if action_out == 9: #print('stop') break elif action_out == 0: push_signal = 1 push_point = action_times else: dx, dy = order2action(action_out) position[0] += dx position[1] += dy action_times += 1 if len(actions) > 2 and push_signal == 0: #action_position = position+position print('\n -- suck in {} '.format(position)) # convert to correct position # crop_position = (int(position[0] / 256 * rgb_dim[1] - crop_w_offset), # int(position[1] / 256 * rgb_dim[0] - crop_h_offset)) crop_position = (int(position[0] / 256 * rgb_crop_dim[1]), int(position[1] / 256 * rgb_crop_dim[0])) # draw a red cross at position on cropped rgb rgb_before_crop = cv.drawMarker(rgb_before_crop, crop_position, (0, 0, 255), markerType=cv.MARKER_CROSS, markerSize=50, thickness=5, line_type=cv.LINE_AA) cv.imwrite(args.rgb_crop_before_dir_, rgb_before_crop) # draw the same on rgb_before_resize for comparison # convert to correct position crop_position_ = (int(position[0]), int(position[1])) # draw a red cross at position on cropped rgb rgb_before_resize = cv.drawMarker(rgb_before_resize, crop_position_, (0, 0, 255), markerType=cv.MARKER_CROSS, markerSize=10, thickness=5, line_type=cv.LINE_AA) cv.imwrite(args.rgb_image_before_dir_, rgb_before_resize) elif len(actions) > 2 and push_signal == 1: #pushing position_start = [0, 0] position_end = [0, 0] for i in range(len(actions)): if i <= push_point: #the first step dx, dy = order2action(actions[i]) position_start[0] += dx position_start[1] += dy position_end[0] += dx position_end[1] += dy else: #the second step dx, dy = order2action(actions[i]) position_end[0] += dx position_end[1] += dy #action_position = position_start+position_end print('\n -- Push from {} to {}'.format(position_start, position_end)) # convert to correct position # crop_position_start = (int(position_start[0] / 256 * rgb_dim[1] - crop_w_offset), # int(position_start[1] / 256 * rgb_dim[0] - crop_h_offset)) # crop_position_end = (int(position_end[0] / 256 * rgb_dim[1] - crop_w_offset), # int(position_end[1] / 256 * rgb_dim[0] - crop_h_offset)) crop_position_start = (int(position_start[0] / 256 * rgb_crop_dim[1]), int(position_start[1] / 256 * rgb_crop_dim[0])) crop_position_end = (int(position_end[0] / 256 * rgb_crop_dim[1]), int(position_end[1] / 256 * rgb_crop_dim[0])) # draw a red, 10pt arrow from position_start to position_end on cropped rgb rgb_before_crop = cv.arrowedLine(rgb_before_crop, crop_position_start, crop_position_end, (0, 0, 255), thickness=3, line_type=cv.LINE_AA) cv.imwrite(args.rgb_crop_before_dir_, rgb_before_crop) # draw the same on rgb_before_resize for comparison # convert to correct position crop_position_start_ = (int(position_start[0]), int(position_start[1])) crop_position_end_ = (int(position_end[0]), int(position_end[1])) # draw a red cross at position on cropped rgb rgb_before_resize = cv.arrowedLine(rgb_before_resize, crop_position_start_, crop_position_end_, (0, 0, 255), thickness=3, line_type=cv.LINE_AA) cv.imwrite(args.rgb_image_before_dir_, rgb_before_resize) else: print('\n -- No action')
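# ---------------------------------------------------------------------------
# The drawing code above maps positions predicted on the 256x256 resized view
# back into pixel coordinates of the cropped frame before calling
# cv.drawMarker / cv.arrowedLine. A small helper capturing that conversion
# (the 690x840 crop size comes from crop_h / crop_w above; the function name
# is illustrative):


def resized_to_crop(pos, crop_shape, resized=256):
    # pos is (x, y) on the resized image; crop_shape is the cropped image's
    # numpy shape (height, width, channels).
    h, w = crop_shape[:2]
    return (int(pos[0] / resized * w), int(pos[1] / resized * h))


# Example: the centre of the resized view maps to the centre of a 690x840 crop.
print(resized_to_crop((128, 128), (690, 840, 3)))  # -> (420, 345)
# ---------------------------------------------------------------------------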
class EqaDataset(Dataset): def __init__(self, questions_h5, vocab, num_frames=1, data_json=False, split='train', gpu_id=0, input_type='ques', max_threads_per_gpu=10, to_cache=False, target_obj_conn_map_dir=False, map_resolution=1000, overfit=False, max_controller_actions=5, max_actions=None): self.questions_h5 = questions_h5 self.vocab = load_vocab(vocab) self.num_frames = num_frames self.max_controller_actions = max_controller_actions np.random.seed() self.data_json = data_json self.split = split self.gpu_id = gpu_id self.input_type = input_type self.max_threads_per_gpu = max_threads_per_gpu self.target_obj_conn_map_dir = target_obj_conn_map_dir self.map_resolution = map_resolution self.overfit = overfit self.to_cache = to_cache self.img_data_cache = {} print('Reading question data into memory from', questions_h5) self.idx = _dataset_to_tensor(questions_h5['idx']) self.questions = _dataset_to_tensor(questions_h5['questions']) self.answers = _dataset_to_tensor(questions_h5['answers']) self.actions = _dataset_to_tensor(questions_h5['action_labels']) self.action_lengths = _dataset_to_tensor( questions_h5['action_lengths']) print('... finished running dataset_to_tensor operations from', questions_h5) if max_actions: #max actions will allow us to create arrays of a certain length. Helpful if you only want to train with 10 actions. print('... entering max_actions conditions block from', questions_h5) assert isinstance(max_actions, int) num_data_items = self.actions.shape[0] new_actions = np.zeros((num_data_items, max_actions + 2), dtype=np.int64) new_lengths = np.ones( (num_data_items, ), dtype=np.int64) * max_actions for i in range(num_data_items): action_length = int(self.action_lengths[i]) new_actions[i, 0] = 1 new_actions[i, 1:max_actions + 1] = self.actions[ i, action_length - max_actions:action_length].numpy() self.actions = torch.LongTensor(new_actions) self.action_lengths = torch.LongTensor(new_lengths) print('... finished running max_actions conditions block from', questions_h5) if self.data_json != False: print('... entering data_json false condition block from', questions_h5) data = json.load(open(self.data_json, 'r')) self.envs = data['envs'] self.env_idx = data[self.split + '_env_idx'] self.env_list = [self.envs[x] for x in self.env_idx] self.env_set = list(set(self.env_list)) self.env_set.sort() if self.overfit: self.env_idx = self.env_idx[:1] self.env_set = self.env_list = [ self.envs[x] for x in self.env_idx ] print('Trying to overfit to [house %s]' % self.env_set[0]) logging.info('Trying to overfit to [house {}]'.format( self.env_set[0])) print(questions_h5, 'Total envs: %d' % len(list(set(self.envs)))) print( questions_h5, 'Envs in %s: %d' % (self.split, len(list(set(self.env_idx))))) if input_type != 'ques': '''' If training, randomly sample and load a subset of environments, train on those, and then cycle through to load the rest. On the validation and test set, load in order, and cycle through. For both, add optional caching so that if all environments have been cycled through once, then no need to re-load and instead, just the cache can be used. ''' self.api_threads = [] self._load_envs(start_idx=0, in_order=True) cnn_kwargs = {'num_classes': 191, 'pretrained': True} self.cnn = MultitaskCNN(**cnn_kwargs) self.cnn.eval() self.cnn.cuda() self.pos_queue = data[self.split + '_pos_queue'] self.boxes = data[self.split + '_boxes'] if max_actions: for i in range(len(self.pos_queue)): self.pos_queue[i] = self.pos_queue[i][-1 * max_actions:] print('... 
finished running data_json false condition block from', questions_h5) if input_type == 'pacman': print('... entering input_type pacman condition block from', questions_h5) self.planner_actions = self.actions.clone().fill_(0) self.controller_actions = self.actions.clone().fill_(-1) self.planner_action_lengths = self.action_lengths.clone().fill_(0) self.controller_action_lengths = self.action_lengths.clone().fill_( 0) self.planner_hidden_idx = self.actions.clone().fill_(0) self.planner_pos_queue_idx, self.controller_pos_queue_idx = [], [] # parsing flat actions to planner-controller hierarchy for i in tqdm(range(len(self.actions))): pa, ca, pq_idx, cq_idx, ph_idx = flat_to_hierarchical_actions( actions=self.actions[i][:self.action_lengths[i] + 1], controller_action_lim=max_controller_actions) self.planner_actions[i][:len(pa)] = torch.Tensor(pa) self.controller_actions[i][:len(ca)] = torch.Tensor(ca) self.planner_action_lengths[i] = len(pa) - 1 self.controller_action_lengths[i] = len(ca) self.planner_pos_queue_idx.append(pq_idx) self.controller_pos_queue_idx.append(cq_idx) self.planner_hidden_idx[i][:len(ca)] = torch.Tensor(ph_idx) print( '... finished running input_type pacman condition block from', questions_h5) print('... finished instantiating EqaDataset from', questions_h5) def _pick_envs_to_load(self, split='train', max_envs=10, start_idx=0, in_order=False): if split in ['val', 'test'] or in_order == True: pruned_env_set = self.env_set[start_idx:start_idx + max_envs] else: if max_envs < len(self.env_set): env_inds = np.random.choice(len(self.env_set), max_envs, replace=False) else: env_inds = np.random.choice(len(self.env_set), max_envs, replace=True) pruned_env_set = [self.env_set[x] for x in env_inds] return pruned_env_set def _load_envs(self, start_idx=-1, in_order=False): #self._clear_memory() if start_idx == -1: start_idx = self.env_set.index(self.pruned_env_set[-1]) + 1 # Pick envs self.pruned_env_set = self._pick_envs_to_load( split=self.split, max_envs=self.max_threads_per_gpu, start_idx=start_idx, in_order=in_order) if len(self.pruned_env_set) == 0: return # Load api threads start = time.time() if len(self.api_threads) == 0: for i in range(self.max_threads_per_gpu): self.api_threads.append( objrender.RenderAPIThread(w=224, h=224, device=self.gpu_id)) self.cfg = load_config('../../House3D/tests/config.json') print('[%.02f] Loaded %d api threads' % (time.time() - start, len(self.api_threads))) start = time.time() # Load houses from multiprocessing import Pool _args = ([h, self.cfg, self.map_resolution] for h in self.pruned_env_set) with Pool(len(self.pruned_env_set)) as pool: self.all_houses = pool.starmap(local_create_house, _args) print('[%.02f] Loaded %d houses' % (time.time() - start, len(self.all_houses))) start = time.time() # Load envs self.env_loaded = {} for i in range(len(self.all_houses)): print('[%02d/%d][split:%s][gpu:%d][house:%s]' % (i + 1, len(self.all_houses), self.split, self.gpu_id, self.all_houses[i].house['id'])) environment = Environment(self.api_threads[i], self.all_houses[i], self.cfg) self.env_loaded[self.all_houses[i].house['id']] = House3DUtils( environment, target_obj_conn_map_dir=self.target_obj_conn_map_dir, build_graph=False) # [TODO] Unused till now self.env_ptr = -1 print('[%.02f] Loaded %d house3d envs' % (time.time() - start, len(self.env_loaded))) # Mark available data indices self.available_idx = [ i for i, v in enumerate(self.env_list) if v in self.env_loaded ] # [TODO] only keeping legit sequences # needed for things to play well with old 
data temp_available_idx = self.available_idx.copy() for i in range(len(temp_available_idx)): if self.action_lengths[temp_available_idx[i]] < 5: self.available_idx.remove(temp_available_idx[i]) print('Available inds: %d' % len(self.available_idx)) # Flag to check if loaded envs have been cycled through or not # [TODO] Unused till now self.all_envs_loaded = False def _clear_api_threads(self): for i in range(len(self.api_threads)): del self.api_threads[0] self.api_threads = [] def _clear_memory(self): if hasattr(self, 'episode_house'): del self.episode_house if hasattr(self, 'env_loaded'): del self.env_loaded if hasattr(self, 'api_threads'): del self.api_threads self.api_threads = [] def _check_if_all_envs_loaded(self): print('[CHECK][Cache:%d][Total:%d]' % (len(self.img_data_cache), len(self.env_list))) if len(self.img_data_cache) == len(self.env_list): self.available_idx = [i for i, v in enumerate(self.env_list)] return True else: return False def set_camera(self, e, pos, robot_height=1.0): assert len(pos) == 4 e.env.cam.pos.x = pos[0] e.env.cam.pos.y = robot_height e.env.cam.pos.z = pos[2] e.env.cam.yaw = pos[3] e.env.cam.updateDirection() def render(self, e): return e.env.render() def get_frames(self, e, pos_queue, preprocess=True): if isinstance(pos_queue, list) == False: pos_queue = [pos_queue] res = [] for i in range(len(pos_queue)): self.set_camera(e, pos_queue[i]) img = np.array(self.render(e), copy=False, dtype=np.float32) if preprocess == True: img = img.transpose(2, 0, 1) img = img / 255.0 res.append(img) return np.array(res) def get_hierarchical_features_till_spawn(self, actions, backtrack_steps=0, max_controller_actions=5): action_length = len(actions) - 1 pa, ca, pq_idx, cq_idx, ph_idx = flat_to_hierarchical_actions( actions=actions, controller_action_lim=max_controller_actions) # count how many actions of same type have been encountered pefore starting navigation backtrack_controller_steps = actions[1:action_length - backtrack_steps + 1:][::-1] counter = 0 # Removed try/except here to try to tease out pdb-related errors in Abhishek's code that are firing in other # parts of training for me as well. # try: if len(backtrack_controller_steps) > 0: # Edited condition: counter <= len(backtrack_controller_steps to strictly less than to avoid out of bounds # error on following loop; unsure what cascading problems that might cause since I don't know the downsteam # logic for how counter is used, but the loop as written was asking for a bug in execution get getting it. # I also reversed the order of the conditions so that the index check is -after- the verification that # counter is within bounds, since otherwise it doesn't fire until after the out of bounds error has # happened (tho, again, maybe this will cause downstream issues if counter is supposed to be allowed to # float up to value len(backtrack_controller_steps) + 1, which is now higher than it can reach). while ((counter <= self.max_controller_actions) and (counter < len(backtrack_controller_steps)) and (backtrack_controller_steps[counter] == backtrack_controller_steps[0])): counter += 1 # except: # import pdb; # pdb.set_trace() #If you have breakpoint here, you probably found an error in the logit above to figure out the correct counter step. Still working on this and checking. 
target_pos_idx = action_length - backtrack_steps controller_step = True if target_pos_idx in pq_idx: controller_step = False pq_idx_pruned = [v for v in pq_idx if v <= target_pos_idx] pa_pruned = pa[:len(pq_idx_pruned) + 1] images = self.get_frames(self.episode_house, self.episode_pos_queue, preprocess=True) raw_img_feats = self.cnn(Variable( torch.FloatTensor(images).cuda())).data.cpu().numpy().copy() controller_img_feat = torch.from_numpy( raw_img_feats[target_pos_idx].copy()) controller_action_in = pa_pruned[-1] - 2 planner_img_feats = torch.from_numpy( raw_img_feats[pq_idx_pruned].copy()) planner_actions_in = torch.from_numpy(np.array(pa_pruned[:-1]) - 1) return planner_actions_in, planner_img_feats, controller_step, controller_action_in, \ controller_img_feat, self.episode_pos_queue[target_pos_idx], counter def __getitem__(self, index): # [VQA] question-only if self.input_type == 'ques': idx = self.idx[index] question = self.questions[index] answer = self.answers[index] return (idx, question, answer) # [VQA] question+image elif self.input_type == 'ques,image': index = self.available_idx[index] idx = self.idx[index] question = self.questions[index] answer = self.answers[index] action_length = self.action_lengths[index] actions = self.actions[index] actions_in = actions[action_length - self.num_frames:action_length] actions_out = actions[action_length - self.num_frames + 1:action_length + 1] if self.to_cache == True and index in self.img_data_cache: images = self.img_data_cache[index] else: pos_queue = self.pos_queue[index][ -self.num_frames:] # last 5 frames images = self.get_frames(self.env_loaded[self.env_list[index]], pos_queue, preprocess=True) if self.to_cache == True: self.img_data_cache[index] = images.copy() return (idx, question, answer, images, actions_in, actions_out, action_length) # [NAV] question+cnn elif self.input_type in ['cnn', 'cnn+q']: index = self.available_idx[index] idx = self.idx[index] question = self.questions[index] answer = self.answers[index] action_length = self.action_lengths[index] actions = self.actions[index] if self.to_cache == True and index in self.img_data_cache: img_feats = self.img_data_cache[index] else: pos_queue = self.pos_queue[index] images = self.get_frames(self.env_loaded[self.env_list[index]], pos_queue, preprocess=True) img_feats = self.cnn(Variable( torch.FloatTensor( images).cuda())).data.cpu().numpy().copy() if self.to_cache == True: self.img_data_cache[index] = img_feats # for val or test (evaluation), or # when target_obj_conn_map_dir is defined (reinforce), # load entire shortest path navigation trajectory # and load connectivity map for intermediate rewards if self.split in ['val', 'test' ] or self.target_obj_conn_map_dir != False: target_obj_id, target_room = False, False bbox_obj = [ x for x in self.boxes[index] if x['type'] == 'object' and x['target'] == True ][0]['box'] for obj_id in self.env_loaded[self.env_list[index]].objects: box2 = self.env_loaded[ self.env_list[index]].objects[obj_id]['bbox'] if all([bbox_obj['min'][x] == box2['min'][x] for x in range(3)]) == True and \ all([bbox_obj['max'][x] == box2['max'][x] for x in range(3)]) == True: target_obj_id = obj_id break bbox_room = [ x for x in self.boxes[index] if x['type'] == 'room' and x['target'] == False ][0] for room in self.env_loaded[ self.env_list[index]].env.house.all_rooms: if all([room['bbox']['min'][i] == bbox_room['box']['min'][i] for i in range(3)]) and \ all([room['bbox']['max'][i] == bbox_room['box']['max'][i] for i in range(3)]): target_room = room break 
assert target_obj_id != False assert target_room != False self.env_loaded[self.env_list[index]].set_target_object( self.env_loaded[ self.env_list[index]].objects[target_obj_id], target_room) # [NOTE] only works for batch size = 1 self.episode_pos_queue = self.pos_queue[index] self.episode_house = self.env_loaded[self.env_list[index]] self.target_room = target_room self.target_obj = self.env_loaded[ self.env_list[index]].objects[target_obj_id] actions_in = actions[:action_length] actions_out = actions[1:action_length + 1] - 2 return (idx, question, answer, img_feats, actions_in, actions_out, action_length) # if action_length is n # images.shape[0] is also n # actions[0] is <START> # actions[n] is <END> # grab 5 random frames # [NOTE]: this'll break for longer-than-5 navigation sequences start_idx = np.random.choice(img_feats.shape[0] + 1 - self.num_frames) img_feats = img_feats[start_idx:start_idx + self.num_frames] actions_in = actions[start_idx:start_idx + self.num_frames] actions_out = actions[start_idx + self.num_frames] - 2 return (idx, question, answer, img_feats, actions_in, actions_out, action_length) # [NAV] question+lstm elif self.input_type in ['lstm', 'lstm+q']: index = self.available_idx[index] idx = self.idx[index] question = self.questions[index] answer = self.answers[index] action_length = self.action_lengths[index] actions = self.actions[index] if self.split == 'train': if self.to_cache == True and index in self.img_data_cache: img_feats = self.img_data_cache[index] else: pos_queue = self.pos_queue[index] images = self.get_frames( self.env_loaded[self.env_list[index]], pos_queue, preprocess=True) raw_img_feats = self.cnn( Variable(torch.FloatTensor( images).cuda())).data.cpu().numpy().copy() img_feats = np.zeros( (self.actions.shape[1], raw_img_feats.shape[1]), dtype=np.float32) img_feats[:raw_img_feats.shape[0], :] = raw_img_feats.copy( ) if self.to_cache == True: self.img_data_cache[index] = img_feats actions_in = actions.clone() - 1 actions_out = actions[1:].clone() - 2 actions_in[action_length:].fill_(0) mask = actions_out.clone().gt(-1) if len(actions_out) > action_length: actions_out[action_length:].fill_(0) # for val or test (evaluation), or # when target_obj_conn_map_dir is defined (reinforce), # load entire shortest path navigation trajectory # and load connectivity map for intermediate rewards if self.split in ['val', 'test' ] or self.target_obj_conn_map_dir != False: target_obj_id, target_room = False, False bbox_obj = [ x for x in self.boxes[index] if x['type'] == 'object' and x['target'] == True ][0]['box'] for obj_id in self.env_loaded[self.env_list[index]].objects: box2 = self.env_loaded[ self.env_list[index]].objects[obj_id]['bbox'] if all([bbox_obj['min'][x] == box2['min'][x] for x in range(3)]) == True and \ all([bbox_obj['max'][x] == box2['max'][x] for x in range(3)]) == True: target_obj_id = obj_id break bbox_room = [ x for x in self.boxes[index] if x['type'] == 'room' and x['target'] == False ][0] for room in self.env_loaded[ self.env_list[index]].env.house.all_rooms: if all([room['bbox']['min'][i] == bbox_room['box']['min'][i] for i in range(3)]) and \ all([room['bbox']['max'][i] == bbox_room['box']['max'][i] for i in range(3)]): target_room = room break assert target_obj_id != False assert target_room != False self.env_loaded[self.env_list[index]].set_target_object( self.env_loaded[ self.env_list[index]].objects[target_obj_id], target_room) # [NOTE] only works for batch size = 1 self.episode_pos_queue = self.pos_queue[index] self.episode_house = 
self.env_loaded[self.env_list[index]] self.target_room = target_room self.target_obj = self.env_loaded[ self.env_list[index]].objects[target_obj_id] return (idx, question, answer, False, actions_in, actions_out, action_length, mask) return (idx, question, answer, img_feats, actions_in, actions_out, action_length, mask) # [NAV] planner-controller elif self.input_type in ['pacman']: index = self.available_idx[index] idx = self.idx[index] question = self.questions[index] answer = self.answers[index] action_length = self.action_lengths[index] actions = self.actions[index] planner_actions = self.planner_actions[index] controller_actions = self.controller_actions[index] planner_action_length = self.planner_action_lengths[index] controller_action_length = self.controller_action_lengths[index] planner_hidden_idx = self.planner_hidden_idx[index] if self.split == 'train': if self.to_cache == True and index in self.img_data_cache: img_feats = self.img_data_cache[index] else: pos_queue = self.pos_queue[index] images = self.get_frames( self.env_loaded[self.env_list[index]], pos_queue, preprocess=True) raw_img_feats = self.cnn( Variable(torch.FloatTensor( images).cuda())).data.cpu().numpy().copy() img_feats = np.zeros( (self.actions.shape[1], raw_img_feats.shape[1]), dtype=np.float32) img_feats[:raw_img_feats.shape[0], :] = raw_img_feats.copy( ) if self.to_cache == True: self.img_data_cache[index] = img_feats if self.split in ['val', 'test' ] or self.target_obj_conn_map_dir != False: target_obj_id, target_room = False, False bbox_obj = [ x for x in self.boxes[index] if x['type'] == 'object' and x['target'] == True ][0]['box'] for obj_id in self.env_loaded[self.env_list[index]].objects: box2 = self.env_loaded[ self.env_list[index]].objects[obj_id]['bbox'] if all([bbox_obj['min'][x] == box2['min'][x] for x in range(3)]) == True and \ all([bbox_obj['max'][x] == box2['max'][x] for x in range(3)]) == True: target_obj_id = obj_id break bbox_room = [ x for x in self.boxes[index] if x['type'] == 'room' and x['target'] == False ][0] for room in self.env_loaded[ self.env_list[index]].env.house.all_rooms: if all([room['bbox']['min'][i] == bbox_room['box']['min'][i] for i in range(3)]) and \ all([room['bbox']['max'][i] == bbox_room['box']['max'][i] for i in range(3)]): target_room = room break if target_obj_id == False or target_room == False: return None self.env_loaded[self.env_list[index]].set_target_object( self.env_loaded[ self.env_list[index]].objects[target_obj_id], target_room) # [NOTE] only works for batch size = 1 self.episode_pos_queue = self.pos_queue[index] self.episode_house = self.env_loaded[self.env_list[index]] self.target_room = target_room self.target_obj = self.env_loaded[ self.env_list[index]].objects[target_obj_id] return (idx, question, answer, actions, action_length) planner_pos_queue_idx = self.planner_pos_queue_idx[index] controller_pos_queue_idx = self.controller_pos_queue_idx[index] planner_img_feats = np.zeros( (self.actions.shape[1], img_feats.shape[1]), dtype=np.float32) planner_img_feats[:planner_action_length] = img_feats[ planner_pos_queue_idx] planner_actions_in = planner_actions.clone() - 1 planner_actions_out = planner_actions[1:].clone() - 2 planner_actions_in[planner_action_length:].fill_(0) planner_mask = planner_actions_out.clone().gt(-1) if len(planner_actions_out) > planner_action_length: planner_actions_out[planner_action_length:].fill_(0) controller_img_feats = np.zeros( (self.actions.shape[1], img_feats.shape[1]), dtype=np.float32) 
controller_img_feats[:controller_action_length] = img_feats[ controller_pos_queue_idx] controller_actions_in = actions[1:].clone() - 2 if len(controller_actions_in) > controller_action_length: controller_actions_in[controller_action_length:].fill_(0) controller_out = controller_actions controller_mask = controller_out.clone().gt(-1) if len(controller_out) > controller_action_length: controller_out[controller_action_length:].fill_(0) # zero out forced controller return for i in range(controller_action_length): if i >= self.max_controller_actions - 1 and controller_out[i] == 0 and \ (self.max_controller_actions == 1 or controller_out[i - self.max_controller_actions + 1:i].sum() == self.max_controller_actions - 1): controller_mask[i] = 0 return (idx, question, answer, planner_img_feats, planner_actions_in, planner_actions_out, planner_action_length, planner_mask, controller_img_feats, controller_actions_in, planner_hidden_idx, controller_out, controller_action_length, controller_mask) def __len__(self): if self.input_type == 'ques': return len(self.questions) else: return len(self.available_idx)
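# ---------------------------------------------------------------------------
# A hedged usage sketch for the EqaDataset defined above in its simplest,
# question-only ('ques') configuration: that __getitem__ branch returns
# (idx, question, answer), so a standard DataLoader yields batched tensors.
# The file paths and batch size below are placeholders, not values taken from
# the project's training scripts.
import h5py
from torch.utils.data import DataLoader

questions_h5 = h5py.File('data/train_questions.h5', 'r')   # placeholder path
dataset = EqaDataset(questions_h5, 'data/vocab.json',
                     split='train', input_type='ques')
loader = DataLoader(dataset, batch_size=20, shuffle=True)

for idx, question, answer in loader:
    # question: LongTensor (batch, max_question_len); answer: LongTensor (batch,)
    break
# ---------------------------------------------------------------------------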
class EqaDataset(Dataset): def __init__(self, questions_h5, vocab, num_frames=1, data_json=False, split='train', gpu_id=0, input_type='ques', max_threads_per_gpu=10, to_cache=False, target_obj_conn_map_dir=False, map_resolution=1000): self.questions_h5 = questions_h5 self.vocab = load_vocab(vocab) self.num_frames = num_frames np.random.seed() self.data_json = data_json self.split = split self.gpu_id = gpu_id self.input_type = input_type self.max_threads_per_gpu = max_threads_per_gpu self.target_obj_conn_map_dir = target_obj_conn_map_dir self.map_resolution = map_resolution self.to_cache = to_cache self.img_data_cache = {} if self.data_json != False: data = json.load(open(self.data_json, 'r')) self.envs = data['envs'] self.env_idx = data[self.split + '_env_idx'] self.env_list = [self.envs[x] for x in self.env_idx] self.env_set = list(set(self.env_list)) self.env_set.sort() print('Total envs: %d' % len(list(set(self.envs)))) print('Envs in %s: %d' % (self.split, len(list(set(self.env_idx))))) if input_type != 'ques': '''' If training, randomly sample and load a subset of environments, train on those, and then cycle through to load the rest. On the validation and test set, load in order, and cycle through. For both, add optional caching so that if all environments have been cycled through once, then no need to re-load and instead, just the cache can be used. ''' self.api_threads = [] self._load_envs(start_idx=0, in_order=True) cnn_kwargs = {'num_classes': 191, 'pretrained': True} self.cnn = MultitaskCNN(**cnn_kwargs) self.cnn.eval() self.cnn.cuda() self.pos_queue = data[self.split + '_pos_queue'] self.boxes = data[self.split + '_boxes'] print('Reading question data into memory') self.idx = _dataset_to_tensor(questions_h5['idx']) self.questions = _dataset_to_tensor(questions_h5['questions']) self.answers = _dataset_to_tensor(questions_h5['answers']) self.actions = _dataset_to_tensor(questions_h5['action_labels']) self.action_lengths = _dataset_to_tensor( questions_h5['action_lengths']) if input_type == 'pacman': self.planner_actions = self.actions.clone().fill_(0) self.controller_actions = self.actions.clone().fill_(-1) self.planner_action_lengths = self.action_lengths.clone().fill_(0) self.controller_action_lengths = self.action_lengths.clone().fill_( 0) self.planner_hidden_idx = self.actions.clone().fill_(0) self.planner_pos_queue_idx, self.controller_pos_queue_idx = [], [] # parsing flat actions to planner-controller hierarchy for i in tqdm(range(len(self.actions))): pa, ca, pq_idx, cq_idx, ph_idx = flat_to_hierarchical_actions(self.actions[i][:self.action_lengths[i]+1]) self.planner_actions[i][:len(pa)] = torch.Tensor(pa) self.controller_actions[i][:len(ca)] = torch.Tensor(ca) self.planner_action_lengths[i] = len(pa)-1 self.controller_action_lengths[i] = len(ca) self.planner_pos_queue_idx.append(pq_idx) self.controller_pos_queue_idx.append(cq_idx) self.planner_hidden_idx[i][:len(ca)] = torch.Tensor(ph_idx) def _pick_envs_to_load(self, split='train', max_envs=10, start_idx=0, in_order=False): if split in ['val', 'test'] or in_order == True: pruned_env_set = self.env_set[start_idx:start_idx + max_envs] else: if max_envs < len(self.env_set): env_inds = np.random.choice( len(self.env_set), max_envs, replace=False) else: env_inds = np.random.choice( len(self.env_set), max_envs, replace=True) pruned_env_set = [self.env_set[x] for x in env_inds] return pruned_env_set def _load_envs(self, start_idx=-1, in_order=False): if start_idx == -1: start_idx = self.env_set.index(self.pruned_env_set[-1]) + 1 
# Pick envs self.pruned_env_set = self._pick_envs_to_load( split=self.split, max_envs=self.max_threads_per_gpu, start_idx=start_idx, in_order=in_order) if len(self.pruned_env_set) == 0: return # Load api threads start = time.time() if len(self.api_threads) == 0: for i in range(len(self.pruned_env_set)): self.api_threads.append( objrender.RenderAPIThread( w=224, h=224, device=self.gpu_id)) self.cfg = load_config('../House3D/tests/config.json') print('[%.02f] Loaded %d api threads' % (time.time() - start, len(self.api_threads))) start = time.time() # Load houses from multiprocessing import Pool _args = ([h, self.cfg, self.map_resolution] for h in self.pruned_env_set) with Pool(len(self.pruned_env_set)) as pool: self.all_houses = pool.starmap(local_create_house, _args) print('[%.02f] Loaded %d houses' % (time.time() - start, len(self.all_houses))) start = time.time() # Load envs self.env_loaded = {} for i in range(len(self.all_houses)): print('[%02d/%d][split:%s][gpu:%d][house:%s]' % (i + 1, len(self.all_houses), self.split, self.gpu_id, self.all_houses[i].house['id'])) self.env_loaded[self.all_houses[i].house['id']] = House3DUtils( Environment(self.api_threads[i], self.all_houses[i], self.cfg), target_obj_conn_map_dir=self.target_obj_conn_map_dir, build_graph=False) # [TODO] Unused till now self.env_ptr = -1 print('[%.02f] Loaded %d house3d envs' % (time.time() - start, len(self.env_loaded))) # Mark available data indices self.available_idx = [ i for i, v in enumerate(self.env_list) if v in self.env_loaded ] print('Available inds: %d' % len(self.available_idx)) # Flag to check if loaded envs have been cycled through or not # [TODO] Unused till now self.all_envs_loaded = False def _clear_api_threads(self): for i in range(len(self.api_threads)): del self.api_threads[0] self.api_threads = [] def _check_if_all_envs_loaded(self): print('[CHECK][Cache:%d][Total:%d]' % (len(self.img_data_cache), len(self.env_list))) if len(self.img_data_cache) == len(self.env_list): self.available_idx = [i for i, v in enumerate(self.env_list)] return True else: return False def set_camera(self, e, pos, robot_height=1.0): assert len(pos) == 4 e.env.cam.pos.x = pos[0] e.env.cam.pos.y = robot_height e.env.cam.pos.z = pos[2] e.env.cam.yaw = pos[3] e.env.cam.updateDirection() def render(self, e): return e.env.render() def get_frames(self, e, pos_queue, preprocess=True): if isinstance(pos_queue, list) == False: pos_queue = [pos_queue] res = [] for i in range(len(pos_queue)): self.set_camera(e, pos_queue[i]) img = np.array(self.render(e), copy=False, dtype=np.float32) if preprocess == True: img = img.transpose(2, 0, 1) img = img / 255.0 res.append(img) return np.array(res) def get_hierarchical_features_till_spawn(self, actions, backtrack_steps=0): action_length = len(actions)-1 pa, ca, pq_idx, cq_idx, ph_idx = flat_to_hierarchical_actions(actions) target_pos_idx = action_length - backtrack_steps controller_step = True if target_pos_idx in pq_idx: controller_step = False pq_idx_pruned = [v for v in pq_idx if v <= target_pos_idx] pa_pruned = pa[:len(pq_idx_pruned)+1] images = self.get_frames( self.episode_house, self.episode_pos_queue, preprocess=True) raw_img_feats = self.cnn( Variable(torch.FloatTensor(images) .cuda())).data.cpu().numpy().copy() controller_img_feat, controller_action_in = False, False if controller_step == True: controller_img_feat = torch.from_numpy(raw_img_feats[target_pos_idx].copy()) controller_action_in = pa_pruned[-1] - 2 planner_img_feats = torch.from_numpy(raw_img_feats[pq_idx_pruned].copy()) 
planner_actions_in = torch.from_numpy(np.array(pa_pruned[:-1]) - 1) return planner_actions_in, planner_img_feats, controller_step, controller_action_in, controller_img_feat, self.episode_pos_queue[target_pos_idx] def __getitem__(self, index): # [VQA] question-only if self.input_type == 'ques': idx = self.idx[index] question = self.questions[index] answer = self.answers[index] return (idx, question, answer) # [VQA] question+image elif self.input_type == 'ques,image': index = self.available_idx[index] idx = self.idx[index] question = self.questions[index] answer = self.answers[index] action_length = self.action_lengths[index] actions = self.actions[index] actions_in = actions[action_length - self.num_frames:action_length] actions_out = actions[action_length - self.num_frames + 1: action_length + 1] if self.to_cache == True and index in self.img_data_cache: images = self.img_data_cache[index] else: pos_queue = self.pos_queue[index][ -self.num_frames:] # last 5 frames images = self.get_frames( self.env_loaded[self.env_list[index]], pos_queue, preprocess=True) if self.to_cache == True: self.img_data_cache[index] = images.copy() return (idx, question, answer, images, actions_in, actions_out, action_length) # [NAV] question+cnn elif self.input_type in ['cnn', 'cnn+q']: index = self.available_idx[index] idx = self.idx[index] question = self.questions[index] answer = self.answers[index] action_length = self.action_lengths[index] actions = self.actions[index] if self.to_cache == True and index in self.img_data_cache: img_feats = self.img_data_cache[index] else: pos_queue = self.pos_queue[index] images = self.get_frames( self.env_loaded[self.env_list[index]], pos_queue, preprocess=True) img_feats = self.cnn( Variable(torch.FloatTensor(images) .cuda())).data.cpu().numpy().copy() if self.to_cache == True: self.img_data_cache[index] = img_feats # for val or test (evaluation), or # when target_obj_conn_map_dir is defined (reinforce), # load entire shortest path navigation trajectory # and load connectivity map for intermediate rewards if self.split in ['val', 'test' ] or self.target_obj_conn_map_dir != False: target_obj_id, target_room = False, False bbox_obj = [ x for x in self.boxes[index] if x['type'] == 'object' and x['target'] == True ][0]['box'] for obj_id in self.env_loaded[self.env_list[index]].objects: box2 = self.env_loaded[self.env_list[index]].objects[ obj_id]['bbox'] if all([bbox_obj['min'][x] == box2['min'][x] for x in range(3)]) == True and \ all([bbox_obj['max'][x] == box2['max'][x] for x in range(3)]) == True: target_obj_id = obj_id break bbox_room = [ x for x in self.boxes[index] if x['type'] == 'room' and x['target'] == False ][0] for room in self.env_loaded[self.env_list[ index]].env.house.all_rooms: if all([room['bbox']['min'][i] == bbox_room['box']['min'][i] for i in range(3)]) and \ all([room['bbox']['max'][i] == bbox_room['box']['max'][i] for i in range(3)]): target_room = room break assert target_obj_id != False assert target_room != False self.env_loaded[self.env_list[index]].set_target_object( self.env_loaded[self.env_list[index]].objects[ target_obj_id], target_room) # [NOTE] only works for batch size = 1 self.episode_pos_queue = self.pos_queue[index] self.episode_house = self.env_loaded[self.env_list[index]] self.target_room = target_room self.target_obj = self.env_loaded[self.env_list[ index]].objects[target_obj_id] actions_in = actions[:action_length] actions_out = actions[1:action_length + 1] - 2 return (idx, question, answer, img_feats, actions_in, actions_out, 
action_length) # if action_length is n # images.shape[0] is also n # actions[0] is <START> # actions[n] is <END> # grab 5 random frames # [NOTE]: this'll break for longer-than-5 navigation sequences start_idx = np.random.choice(img_feats.shape[0] + 1 - self.num_frames) img_feats = img_feats[start_idx:start_idx + self.num_frames] actions_in = actions[start_idx:start_idx + self.num_frames] actions_out = actions[start_idx + self.num_frames] - 2 return (idx, question, answer, img_feats, actions_in, actions_out, action_length) # [NAV] question+lstm elif self.input_type in ['lstm', 'lstm+q']: index = self.available_idx[index] idx = self.idx[index] question = self.questions[index] answer = self.answers[index] action_length = self.action_lengths[index] actions = self.actions[index] if self.split == 'train': if self.to_cache == True and index in self.img_data_cache: img_feats = self.img_data_cache[index] else: pos_queue = self.pos_queue[index] images = self.get_frames( self.env_loaded[self.env_list[index]], pos_queue, preprocess=True) raw_img_feats = self.cnn( Variable(torch.FloatTensor(images) .cuda())).data.cpu().numpy().copy() img_feats = np.zeros( (self.actions.shape[1], raw_img_feats.shape[1]), dtype=np.float32) img_feats[:raw_img_feats.shape[ 0], :] = raw_img_feats.copy() if self.to_cache == True: self.img_data_cache[index] = img_feats actions_in = actions.clone() - 1 actions_out = actions[1:].clone() - 2 actions_in[action_length:].fill_(0) mask = actions_out.clone().gt(-1) if len(actions_out) > action_length: actions_out[action_length:].fill_(0) # for val or test (evaluation), or # when target_obj_conn_map_dir is defined (reinforce), # load entire shortest path navigation trajectory # and load connectivity map for intermediate rewards if self.split in ['val', 'test' ] or self.target_obj_conn_map_dir != False: target_obj_id, target_room = False, False bbox_obj = [ x for x in self.boxes[index] if x['type'] == 'object' and x['target'] == True ][0]['box'] for obj_id in self.env_loaded[self.env_list[index]].objects: box2 = self.env_loaded[self.env_list[index]].objects[ obj_id]['bbox'] if all([bbox_obj['min'][x] == box2['min'][x] for x in range(3)]) == True and \ all([bbox_obj['max'][x] == box2['max'][x] for x in range(3)]) == True: target_obj_id = obj_id break bbox_room = [ x for x in self.boxes[index] if x['type'] == 'room' and x['target'] == False ][0] for room in self.env_loaded[self.env_list[ index]].env.house.all_rooms: if all([room['bbox']['min'][i] == bbox_room['box']['min'][i] for i in range(3)]) and \ all([room['bbox']['max'][i] == bbox_room['box']['max'][i] for i in range(3)]): target_room = room break assert target_obj_id != False assert target_room != False self.env_loaded[self.env_list[index]].set_target_object( self.env_loaded[self.env_list[index]].objects[ target_obj_id], target_room) # [NOTE] only works for batch size = 1 self.episode_pos_queue = self.pos_queue[index] self.episode_house = self.env_loaded[self.env_list[index]] self.target_room = target_room self.target_obj = self.env_loaded[self.env_list[ index]].objects[target_obj_id] return (idx, question, answer, False, actions_in, actions_out, action_length, mask) return (idx, question, answer, img_feats, actions_in, actions_out, action_length, mask) # [NAV] planner-controller elif self.input_type in ['pacman']: index = self.available_idx[index] idx = self.idx[index] question = self.questions[index] answer = self.answers[index] action_length = self.action_lengths[index] actions = self.actions[index] planner_actions = 
self.planner_actions[index] controller_actions = self.controller_actions[index] planner_action_length = self.planner_action_lengths[index] controller_action_length = self.controller_action_lengths[index] planner_hidden_idx = self.planner_hidden_idx[index] if self.split == 'train': if self.to_cache == True and index in self.img_data_cache: img_feats = self.img_data_cache[index] else: pos_queue = self.pos_queue[index] images = self.get_frames( self.env_loaded[self.env_list[index]], pos_queue, preprocess=True) raw_img_feats = self.cnn( Variable(torch.FloatTensor(images) .cuda())).data.cpu().numpy().copy() img_feats = np.zeros( (self.actions.shape[1], raw_img_feats.shape[1]), dtype=np.float32) img_feats[:raw_img_feats.shape[ 0], :] = raw_img_feats.copy() if self.to_cache == True: self.img_data_cache[index] = img_feats if self.split in ['val', 'test' ] or self.target_obj_conn_map_dir != False: target_obj_id, target_room = False, False bbox_obj = [ x for x in self.boxes[index] if x['type'] == 'object' and x['target'] == True ][0]['box'] for obj_id in self.env_loaded[self.env_list[index]].objects: box2 = self.env_loaded[self.env_list[index]].objects[ obj_id]['bbox'] if all([bbox_obj['min'][x] == box2['min'][x] for x in range(3)]) == True and \ all([bbox_obj['max'][x] == box2['max'][x] for x in range(3)]) == True: target_obj_id = obj_id break bbox_room = [ x for x in self.boxes[index] if x['type'] == 'room' and x['target'] == False ][0] for room in self.env_loaded[self.env_list[ index]].env.house.all_rooms: if all([room['bbox']['min'][i] == bbox_room['box']['min'][i] for i in range(3)]) and \ all([room['bbox']['max'][i] == bbox_room['box']['max'][i] for i in range(3)]): target_room = room break assert target_obj_id != False assert target_room != False self.env_loaded[self.env_list[index]].set_target_object( self.env_loaded[self.env_list[index]].objects[ target_obj_id], target_room) # [NOTE] only works for batch size = 1 self.episode_pos_queue = self.pos_queue[index] self.episode_house = self.env_loaded[self.env_list[index]] self.target_room = target_room self.target_obj = self.env_loaded[self.env_list[ index]].objects[target_obj_id] return (idx, question, answer, actions, action_length) planner_pos_queue_idx = self.planner_pos_queue_idx[index] controller_pos_queue_idx = self.controller_pos_queue_idx[index] planner_img_feats = np.zeros( (self.actions.shape[1], img_feats.shape[1]), dtype=np.float32) planner_img_feats[:planner_action_length] = img_feats[ planner_pos_queue_idx] planner_actions_in = planner_actions.clone() - 1 planner_actions_out = planner_actions[1:].clone() - 2 planner_actions_in[planner_action_length:].fill_(0) planner_mask = planner_actions_out.clone().gt(-1) if len(planner_actions_out) > planner_action_length: planner_actions_out[planner_action_length:].fill_(0) controller_img_feats = np.zeros( (self.actions.shape[1], img_feats.shape[1]), dtype=np.float32) controller_img_feats[:controller_action_length] = img_feats[ controller_pos_queue_idx] controller_actions_in = actions[1:].clone() - 2 if len(controller_actions_in) > controller_action_length: controller_actions_in[controller_action_length:].fill_(0) controller_out = controller_actions controller_mask = controller_out.clone().gt(-1) if len(controller_out) > controller_action_length: controller_out[controller_action_length:].fill_(0) return (idx, question, answer, planner_img_feats, planner_actions_in, planner_actions_out, planner_action_length, planner_mask, controller_img_feats, controller_actions_in, planner_hidden_idx, 
                    controller_out, controller_action_length, controller_mask)

    def __len__(self):
        if self.input_type == 'ques':
            return len(self.questions)
        else:
            return len(self.available_idx)
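# ---------------------------------------------------------------------------
# Illustration only (not part of the original dataset code): the lstm/pacman
# branches above shift the raw action sequence twice -- the inputs by -1 and
# the targets by -2 -- and build a mask with .gt(-1) so that everything after
# action_length is ignored. The toy tensor below uses made-up action ids just
# to show the mechanics; real ids come from the questions HDF5 file.
# ---------------------------------------------------------------------------
import torch

actions = torch.LongTensor([1, 3, 4, 3, 2, 0, 0])  # hypothetical padded episode
action_length = 4                                  # four real steps after <START>

actions_in = actions.clone() - 1                   # inputs shifted down by 1
actions_out = actions[1:].clone() - 2              # targets shifted down by 2
actions_in[action_length:].fill_(0)                # zero the padded inputs
mask = actions_out.clone().gt(-1)                  # nonzero only for real targets
if len(actions_out) > action_length:
    actions_out[action_length:].fill_(0)           # zero the padded targets

print(actions_in.tolist())                         # [0, 2, 3, 2, 0, 0, 0]
print(actions_out.tolist())                        # [1, 2, 1, 0, 0, 0]
print(mask.tolist())                               # four valid targets, then masked out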
class EqaDataset(Dataset): def __init__(self, questions_h5, vocab, num_frames=1, data_json=False, split='train', gpu_id=0, input_type='ques', max_threads_per_gpu=10, to_cache=False, target_obj_conn_map_dir=False, map_resolution=1000, overfit=False, max_controller_actions=5, max_actions=None): self.questions_h5 = questions_h5 self.vocab = load_vocab(vocab) self.num_frames = num_frames self.max_controller_actions = max_controller_actions np.random.seed() self.data_json = data_json self.split = split self.device = torch.device( 'cuda' if torch.cuda.is_available() else 'cpu') self.gpu_id = gpu_id self.input_type = input_type self.max_threads_per_gpu = max_threads_per_gpu self.target_obj_conn_map_dir = target_obj_conn_map_dir self.map_resolution = map_resolution self.overfit = overfit self.to_cache = to_cache self.img_data_cache = {} print('Reading question data into memory') # self.idx -> Object ID self.idx = _dataset_to_tensor(questions_h5['idx']) self.questions = _dataset_to_tensor(questions_h5['questions']) self.answers = _dataset_to_tensor(questions_h5['answers']) self.actions = _dataset_to_tensor(questions_h5['action_labels']) self.action_lengths = _dataset_to_tensor( questions_h5['action_lengths']) self.cfg = load_config('../../House3D/tests/config.json') # Saty: max_actions is None! if max_actions: # max actions will allow us to create arrays of a certain length. Helpful if you only want to train with 10 actions. assert isinstance(max_actions, int) num_data_items = self.actions.shape[0] new_actions = np.zeros( (num_data_items, max_actions + 2), dtype=np.int64) #Saty: WHY +2? -> for <start> and <end> new_lengths = np.ones( (num_data_items, ), dtype=np.int64) * max_actions for i in range(num_data_items): action_length = int(self.action_lengths[i]) new_actions[i, 0] = 1 new_actions[i, 1:max_actions + 1] = self.actions[ i, action_length - max_actions:action_length].numpy() self.actions = torch.LongTensor(new_actions) self.action_lengths = torch.LongTensor(new_lengths) if self.data_json != False: data = json.load(open(self.data_json, 'r')) self.envs = data['envs'] #Satyen: Gold mine! self.env_idx = data[self.split + '_env_idx'] self.env_list = [self.envs[x] for x in self.env_idx] self.env_set = list(set(self.env_list)) self.env_set.sort() if self.overfit == True: self.env_idx = self.env_idx[:1] self.env_set = self.env_list = [ self.envs[x] for x in self.env_idx ] print('Trying to overfit to [house %s]' % self.env_set[0]) logging.info('Trying to overfit to [house {}]'.format( self.env_set[0])) print('Total envs: %d' % len(list(set(self.envs)))) print('Envs in %s: %d' % (self.split, len(list(set(self.env_idx))))) if input_type != 'ques': '''' If training, randomly sample and load a subset of environments, train on those, and then cycle through to load the rest. On the validation and test set, load in order, and cycle through. For both, add optional caching so that if all environments have been cycled through once, then no need to re-load and instead, just the cache can be used. 
''' self.api_threads = [] self._load_envs(start_idx=0, in_order=True) cnn_kwargs = {'num_classes': 191, 'pretrained': True} self.cnn = MultitaskCNN(**cnn_kwargs) self.cnn.eval() self.cnn.to(self.device) self.pos_queue = data[self.split + '_pos_queue'] self.boxes = data[self.split + '_boxes'] if max_actions: for i in range(len(self.pos_queue)): self.pos_queue[i] = self.pos_queue[i][-1 * max_actions:] if input_type == 'pacman': self.planner_actions = self.actions.clone().fill_(0) self.controller_actions = self.actions.clone().fill_(-1) self.planner_action_lengths = self.action_lengths.clone().fill_(0) self.controller_action_lengths = self.action_lengths.clone().fill_( 0) self.planner_hidden_idx = self.actions.clone().fill_(0) self.planner_pos_queue_idx, self.controller_pos_queue_idx = [], [] # parsing flat actions to planner-controller hierarchy print(" Parsing flat actions to planner-controller hierarchy") for i in tqdm(range(len(self.actions))): pa, ca, pq_idx, cq_idx, ph_idx = flat_to_hierarchical_actions( actions=self.actions[i] [:self.action_lengths[i] + 1], # Saty: Take all actions essentially ; This doesn't have <start> and <end> controller_action_lim=max_controller_actions ) # saty: This is 5 self.planner_actions[i][:len(pa)] = torch.Tensor(pa) self.controller_actions[i][:len(ca)] = torch.Tensor(ca) self.planner_action_lengths[i] = len(pa) - 1 self.controller_action_lengths[i] = len(ca) self.planner_pos_queue_idx.append(pq_idx) self.controller_pos_queue_idx.append( cq_idx) # Saty: This is just [1,2,3,4/.. , len(actions)] self.planner_hidden_idx[i][:len(ca)] = torch.Tensor(ph_idx) def _pick_envs_to_load(self, split='train', max_envs=10, start_idx=0, in_order=False): if split in ['val', 'test'] or in_order == True: pruned_env_set = self.env_set[start_idx:start_idx + max_envs] else: if max_envs < len(self.env_set): env_inds = np.random.choice(len(self.env_set), max_envs, replace=False) else: env_inds = np.random.choice(len(self.env_set), max_envs, replace=True) pruned_env_set = [self.env_set[x] for x in env_inds] return pruned_env_set def _load_envs(self, start_idx=-1, in_order=False): self._clear_api_threads() self._clear_memory() if start_idx == -1: start_idx = self.env_set.index(self.pruned_env_set[-1]) + 1 # Pick envs self.pruned_env_set = self._pick_envs_to_load( split=self.split, max_envs=self.max_threads_per_gpu, start_idx=start_idx, in_order=in_order) if len(self.pruned_env_set) == 0: return # Load api threads #self._clear_api_threads() start = time.time() if len(self.api_threads) == 0: for i in range(self.max_threads_per_gpu): api_temp = None api_temp = objrender.RenderAPIThread(w=224, h=224, device=self.gpu_id) self.api_threads.append(api_temp) #try: # self.cfg = load_config('/home/satyen/GitHub_repos/our_EQA/House3D/tests/config.json') # self.cfg = load_config('../House3D/tests/config.json') #except: # self.cfg = load_config('../../House3D/tests/config.json') print('[%.02f] Loaded %d api threads' % (time.time() - start, len(self.api_threads))) start = time.time() # Load houses from multiprocessing import Pool _args = ([h, self.cfg, self.map_resolution] for h in self.pruned_env_set) with Pool(len(self.pruned_env_set)) as pool: self.all_houses = pool.starmap(local_create_house, _args) print('[%.02f] Loaded %d houses' % (time.time() - start, len(self.all_houses))) start = time.time() # Load envs self.env_loaded = {} for i in range(len(self.all_houses)): print('[%02d/%d][split:%s][gpu:%d][house:%s]' % (i + 1, len(self.all_houses), self.split, self.gpu_id, 
self.all_houses[i].house['id'])) environment = Environment(self.api_threads[i], self.all_houses[i], self.cfg) self.env_loaded[self.all_houses[i].house['id']] = House3DUtils( environment, target_obj_conn_map_dir=self.target_obj_conn_map_dir, build_graph=False) # [TODO] Unused till now self.env_ptr = -1 print('[%.02f] Loaded %d house3d envs' % (time.time() - start, len(self.env_loaded))) # CM: has to be 770 # Mark available data indices self.available_idx = [ i for i, v in enumerate(self.env_list) if v in self.env_loaded ] # [TODO] only keeping legit sequences # needed for things to play well with old data temp_available_idx = self.available_idx.copy() for i in range(len(temp_available_idx)): if self.action_lengths[temp_available_idx[i]] < 5: self.available_idx.remove(temp_available_idx[i]) print('Available inds: %d' % len(self.available_idx)) # Flag to check if loaded envs have been cycled through or not # [TODO] Unused till now self.all_envs_loaded = False def _clear_api_threads(self): for i in range(len(self.api_threads)): del self.api_threads[0] self.api_threads = [] def _clear_memory(self): if hasattr(self, 'episode_house'): del self.episode_house if hasattr(self, 'env_loaded'): del self.env_loaded #if hasattr(self, 'api_threads'): # del self.api_threads #self.api_threads = [] def _check_if_all_envs_loaded(self): print('[CHECK][Cache:%d][Total:%d]' % (len(self.img_data_cache), len(self.env_list))) if len(self.img_data_cache) == len(self.env_list): self.available_idx = [i for i, v in enumerate(self.env_list)] return True else: return False def set_camera(self, e, pos, robot_height=1.0): assert len(pos) == 4 e.env.cam.pos.x = pos[0] e.env.cam.pos.y = robot_height e.env.cam.pos.z = pos[2] e.env.cam.yaw = pos[3] e.env.cam.updateDirection() def render(self, e): return e.env.render() def get_frames(self, e, pos_queue, preprocess=True): if isinstance(pos_queue, list) == False: pos_queue = [pos_queue] res = [] for i in range(len(pos_queue)): self.set_camera(e, pos_queue[i]) img = np.array(self.render(e), copy=False, dtype=np.float32) if preprocess == True: img = img.transpose(2, 0, 1) img = img / 255.0 res.append(img) return np.array(res) # Confused about this function! def get_hierarchical_features_till_spawn(self, actions, backtrack_steps=0, max_controller_actions=5): action_length = len(actions) - 1 pa, ca, pq_idx, cq_idx, ph_idx = flat_to_hierarchical_actions( actions=actions, controller_action_lim=max_controller_actions) # count how many actions of same type have been encountered pefore starting navigation # Not used in train_eval -> train() backtrack_controller_steps = actions[1:action_length - backtrack_steps + 1:][::-1] counter = 0 try: if len(backtrack_controller_steps) > 0: while (counter <= self.max_controller_actions) and ( counter < len(backtrack_controller_steps)) and ( backtrack_controller_steps[counter] == backtrack_controller_steps[0]): counter += 1 except: import pdb pdb.set_trace( ) #If you have breakpoint here, you probably found an error in the logit above to figure out the correct counter step. Still working on this and checking. 
##################################################################################### target_pos_idx = action_length - backtrack_steps controller_step = True if target_pos_idx in pq_idx: controller_step = False pq_idx_pruned = [v for v in pq_idx if v <= target_pos_idx] pa_pruned = pa[:len(pq_idx_pruned) + 1] images = self.get_frames(self.episode_house, self.episode_pos_queue, preprocess=True) raw_img_feats = self.cnn( Variable(torch.FloatTensor(images).to( self.device))).data.cpu().numpy().copy() controller_img_feat = torch.from_numpy( raw_img_feats[target_pos_idx].copy()) # Last action taken by the planner! controller_action_in = pa_pruned[-1] - 2 planner_img_feats = torch.from_numpy( raw_img_feats[pq_idx_pruned].copy()) planner_actions_in = torch.from_numpy(np.array(pa_pruned[:-1]) - 1) return planner_actions_in, planner_img_feats, controller_step, controller_action_in, \ controller_img_feat, self.episode_pos_queue[target_pos_idx], counter def __getitem__(self, index): # [VQA] question-only if self.input_type == 'ques': idx = self.idx[index] question = self.questions[index] answer = self.answers[index] return (idx, question, answer) # [VQA] question+image elif self.input_type == 'ques,image': index = self.available_idx[index] idx = self.idx[index] question = self.questions[index] answer = self.answers[index] action_length = self.action_lengths[index] actions = self.actions[index] actions_in = actions[action_length - self.num_frames:action_length] actions_out = actions[action_length - self.num_frames + 1:action_length + 1] if self.to_cache == True and index in self.img_data_cache: images = self.img_data_cache[index] else: pos_queue = self.pos_queue[index][ -self.num_frames:] # last 5 frames images = self.get_frames(self.env_loaded[self.env_list[index]], pos_queue, preprocess=True) if self.to_cache == True: self.img_data_cache[index] = images.copy() return (idx, question, answer, images, actions_in, actions_out, action_length) # [NAV] question+cnn elif self.input_type in ['cnn', 'cnn+q']: index = self.available_idx[index] idx = self.idx[index] question = self.questions[index] answer = self.answers[index] action_length = self.action_lengths[index] actions = self.actions[index] if self.to_cache == True and index in self.img_data_cache: img_feats = self.img_data_cache[index] else: pos_queue = self.pos_queue[index] images = self.get_frames(self.env_loaded[self.env_list[index]], pos_queue, preprocess=True) img_feats = self.cnn( Variable(torch.FloatTensor(images).to( self.device))).data.cpu().numpy().copy() if self.to_cache == True: self.img_data_cache[index] = img_feats # for val or test (evaluation), or # when target_obj_conn_map_dir is defined (reinforce), # load entire shortest path navigation trajectory # and load connectivity map for intermediate rewards if self.split in ['val', 'test' ] or self.target_obj_conn_map_dir != False: target_obj_id, target_room = False, False bbox_obj = [ x for x in self.boxes[index] if x['type'] == 'object' and x['target'] == True ][0]['box'] for obj_id in self.env_loaded[self.env_list[index]].objects: box2 = self.env_loaded[ self.env_list[index]].objects[obj_id]['bbox'] if all([bbox_obj['min'][x] == box2['min'][x] for x in range(3)]) == True and \ all([bbox_obj['max'][x] == box2['max'][x] for x in range(3)]) == True: target_obj_id = obj_id break bbox_room = [ x for x in self.boxes[index] if x['type'] == 'room' and x['target'] == False ][0] for room in self.env_loaded[ self.env_list[index]].env.house.all_rooms: if all([room['bbox']['min'][i] == 
bbox_room['box']['min'][i] for i in range(3)]) and \ all([room['bbox']['max'][i] == bbox_room['box']['max'][i] for i in range(3)]): target_room = room break assert target_obj_id != False assert target_room != False self.env_loaded[self.env_list[index]].set_target_object( self.env_loaded[ self.env_list[index]].objects[target_obj_id], target_room) # [NOTE] only works for batch size = 1 self.episode_pos_queue = self.pos_queue[index] self.episode_house = self.env_loaded[self.env_list[index]] self.target_room = target_room self.target_obj = self.env_loaded[ self.env_list[index]].objects[target_obj_id] actions_in = actions[:action_length] actions_out = actions[1:action_length + 1] - 2 return (idx, question, answer, img_feats, actions_in, actions_out, action_length) # if action_length is n # images.shape[0] is also n # actions[0] is <START> # actions[n] is <END> # grab 5 random frames # [NOTE]: this'll break for longer-than-5 navigation sequences start_idx = np.random.choice(img_feats.shape[0] + 1 - self.num_frames) img_feats = img_feats[start_idx:start_idx + self.num_frames] actions_in = actions[start_idx:start_idx + self.num_frames] actions_out = actions[start_idx + self.num_frames] - 2 return (idx, question, answer, img_feats, actions_in, actions_out, action_length) # [NAV] question+lstm elif self.input_type in ['lstm', 'lstm+q']: index = self.available_idx[index] idx = self.idx[index] question = self.questions[index] answer = self.answers[index] action_length = self.action_lengths[index] actions = self.actions[index] if self.split == 'train': if self.to_cache == True and index in self.img_data_cache: img_feats = self.img_data_cache[index] else: pos_queue = self.pos_queue[index] images = self.get_frames( self.env_loaded[self.env_list[index]], pos_queue, preprocess=True) raw_img_feats = self.cnn( Variable(torch.FloatTensor(images).to( self.device))).data.cpu().numpy().copy() img_feats = np.zeros( (self.actions.shape[1], raw_img_feats.shape[1]), dtype=np.float32) img_feats[:raw_img_feats.shape[0], :] = raw_img_feats.copy( ) if self.to_cache == True: self.img_data_cache[index] = img_feats actions_in = actions.clone() - 1 actions_out = actions[1:].clone() - 2 actions_in[action_length:].fill_(0) mask = actions_out.clone().gt(-1) if len(actions_out) > action_length: actions_out[action_length:].fill_(0) # for val or test (evaluation), or # when target_obj_conn_map_dir is defined (reinforce), # load entire shortest path navigation trajectory # and load connectivity map for intermediate rewards if self.split in ['val', 'test' ] or self.target_obj_conn_map_dir != False: target_obj_id, target_room = False, False bbox_obj = [ x for x in self.boxes[index] if x['type'] == 'object' and x['target'] == True ][0]['box'] for obj_id in self.env_loaded[self.env_list[index]].objects: box2 = self.env_loaded[ self.env_list[index]].objects[obj_id]['bbox'] if all([bbox_obj['min'][x] == box2['min'][x] for x in range(3)]) == True and \ all([bbox_obj['max'][x] == box2['max'][x] for x in range(3)]) == True: target_obj_id = obj_id break bbox_room = [ x for x in self.boxes[index] if x['type'] == 'room' and x['target'] == False ][0] for room in self.env_loaded[ self.env_list[index]].env.house.all_rooms: if all([room['bbox']['min'][i] == bbox_room['box']['min'][i] for i in range(3)]) and \ all([room['bbox']['max'][i] == bbox_room['box']['max'][i] for i in range(3)]): target_room = room break assert target_obj_id != False assert target_room != False self.env_loaded[self.env_list[index]].set_target_object( self.env_loaded[ 
self.env_list[index]].objects[target_obj_id], target_room) # [NOTE] only works for batch size = 1 self.episode_pos_queue = self.pos_queue[index] self.episode_house = self.env_loaded[self.env_list[index]] self.target_room = target_room self.target_obj = self.env_loaded[ self.env_list[index]].objects[target_obj_id] return (idx, question, answer, False, actions_in, actions_out, action_length, mask) return (idx, question, answer, img_feats, actions_in, actions_out, action_length, mask) # [NAV] planner-controller elif self.input_type in ['pacman']: index = self.available_idx[index] idx = self.idx[index] question = self.questions[index] answer = self.answers[index] action_length = self.action_lengths[index] actions = self.actions[index] planner_actions = self.planner_actions[index] controller_actions = self.controller_actions[index] planner_action_length = self.planner_action_lengths[index] controller_action_length = self.controller_action_lengths[index] planner_hidden_idx = self.planner_hidden_idx[index] if self.split == 'train': if self.to_cache == True and index in self.img_data_cache: img_feats = self.img_data_cache[index] else: pos_queue = self.pos_queue[index] images = self.get_frames( self.env_loaded[self.env_list[index]], pos_queue, preprocess=True) raw_img_feats = self.cnn( Variable(torch.FloatTensor(images).to( self.device))).data.cpu().numpy().copy() # Saty: Actions or which there are no image features? # Raw img_feats.shape[1] = 3200 img_feats = np.zeros( (self.actions.shape[1], raw_img_feats.shape[1]), dtype=np.float32) img_feats[:raw_img_feats.shape[0], :] = raw_img_feats.copy( ) if self.to_cache == True: self.img_data_cache[index] = img_feats # LOL! Goes into this- since we're giving target+obj_conn_map_dir if self.split in ['val', 'test' ] or self.target_obj_conn_map_dir != False: target_obj_id, target_room = False, False min_ = 1 bbox_obj = [ x for x in self.boxes[index] if x['type'] == 'object' and x['target'] == True ][0] trg_obj_name = bbox_obj['name'] bbox_obj = bbox_obj['box'] bbox_obj_min = np.array([bbox_obj['min'][x] for x in range(3)]) bbox_obj_max = np.array([bbox_obj['max'][x] for x in range(3)]) min_print = 0 max_print = 0 # print("target obj bbox:", bbox_obj_min, bbox_obj_max) for obj_id in self.env_loaded[self.env_list[index]].objects: box2 = self.env_loaded[ self.env_list[index]].objects[obj_id]['bbox'] ############################ SATYEN ###################################### # print("BBOX_OBJ:", [bbox_obj['min'][x] for x in range(3)]) # print("BOX2 min:", [box2['min'][x] for x in range(3)]) # print("BBOX_OBJ:max", [bbox_obj['max'][x] for x in range(3)]) # print("Box2:max", [box2['max'][x] for x in range(3)]) box2_min = np.array([box2['min'][x] for x in range(3)]) box2_max = np.array([box2['max'][x] for x in range(3)]) diff_min = np.mean(abs(bbox_obj_min - box2_min)) diff_max = np.mean(abs(bbox_obj_max - box2_max)) if abs(diff_min + diff_max) / 2 < min_: min_ = (diff_min + diff_max) / 2 target_obj_id = obj_id min_print = box2_min max_print = box2_max #obj_iter_id = self.env_loaded[self.env_list[index]].objects[obj_id]['id'] #sys.exit() # target_obj_id = obj_id # break ############################################################################ #print("targetObj:{}".format(trg_obj_name)) #print("Target obj_iter",target_obj_id) #print("env obj bbox", min_print, max_print) ### Satyen: TARGET ROOM #################3333 bbox_room = [ x for x in self.boxes[index] if x['type'] == 'room' and x['target'] == False ][0] min_ = 1 bbox_room_min = np.array( 
[bbox_room['box']['min'][x] for x in range(3)]) bbox_room_max = np.array( [bbox_room['box']['max'][x] for x in range(3)]) for room in self.env_loaded[ self.env_list[index]].env.house.all_rooms: ################ SATYEN ############################################### # print("Room min", [room['bbox']['min'][x] for x in range(3)]) # print("BBox min", [bbox_room['box']['min'][x] for x in range(3)]) # print("Room max", [room['bbox']['max'][x] for x in range(3)]) # print("BBox max", [bbox_room['box']['max'][x] for x in range(3)]) # print([ all(math.isclose(bbox_obj['max'][x], box2['max'][x], abs_tol=1e-6) for x in range(3) )]) room_min = np.array( [room['bbox']['min'][x] for x in range(3)]) room_max = np.array( [room['bbox']['max'][x] for x in range(3)]) #bbox_room_min = np.array([bbox_room['box']['min'][x] for x in range(3)]) #bbox_room_max = np.array([bbox_room['box']['max'][x] for x in range(3)]) diff_min = np.mean(abs(room_min - bbox_room_min)) diff_max = np.mean(abs(room_max - bbox_room_max)) if abs(diff_min + diff_max) / 2 < min_: min_ = (diff_min + diff_max) / 2 target_room = room #elif min_ == 1: # target_room = room #if all([math.isclose(room['bbox']['min'][x], bbox_room['box']['min'][x], abs_tol = 0.6) for x in range(3)]) == True and \ # all([math.isclose(room['bbox']['max'][x], bbox_room['box']['max'][x], abs_tol = 0.6) for x in range(3)]) == True: # target_room = room # break ######################################################################### assert target_obj_id != False assert target_room != False self.env_loaded[self.env_list[index]].set_target_object( self.env_loaded[ self.env_list[index]].objects[target_obj_id], target_room) # [NOTE] only works for batch size = 1 self.episode_pos_queue = self.pos_queue[index] self.episode_house = self.env_loaded[self.env_list[index]] self.target_room = target_room self.target_obj = self.env_loaded[ self.env_list[index]].objects[target_obj_id] #print("Target OBJ!!:from env ", self.target_obj) return (idx, question, answer, actions, action_length) planner_pos_queue_idx = self.planner_pos_queue_idx[index] controller_pos_queue_idx = self.controller_pos_queue_idx[index] # Saty: Get img_feats only for the places wherePLNR makes a prediction. Stored in planner_pos_queue_idx planner_img_feats = np.zeros( (self.actions.shape[1], img_feats.shape[1]), dtype=np.float32) planner_img_feats[:planner_action_length] = img_feats[ planner_pos_queue_idx] planner_actions_in = planner_actions.clone( ) - 1 # planner actions \in [0,4] # WHY[0,4]? Where forward = 1? planner_actions_in[planner_action_length:].fill_( 0) # mask the elements after planner action length = 0 planner_actions_out = planner_actions[1:].clone( ) - 2 # planner actions_out \in [-1.3] # -1 -> START 0-> forward.. etc; ^ALSO! Shifted by -> Makes sense planner_mask = planner_actions_out.clone().gt( -1) # gt -> Greater than! :| if len(planner_actions_out) > planner_action_length: planner_actions_out[planner_action_length:].fill_(0) controller_img_feats = np.zeros( (self.actions.shape[1], img_feats.shape[1]), dtype=np.float32) controller_img_feats[:controller_action_length] = img_feats[ controller_pos_queue_idx] controller_actions_in = actions[1:].clone( ) - 2 # passed into the controller itself! 
            if len(controller_actions_in) > controller_action_length:
                controller_actions_in[controller_action_length:].fill_(0)

            controller_out = controller_actions
            controller_mask = controller_out.clone().gt(-1)
            if len(controller_out) > controller_action_length:
                controller_out[controller_action_length:].fill_(0)

            # zero out forced controller return
            for i in range(controller_action_length):
                if i >= self.max_controller_actions - 1 and controller_out[i] == 0 and \
                        (self.max_controller_actions == 1 or
                         controller_out[i - self.max_controller_actions + 1:i].sum()
                         == self.max_controller_actions - 1):
                    controller_mask[i] = 0

            return (idx, question, answer, planner_img_feats, planner_actions_in,
                    planner_actions_out, planner_action_length, planner_mask,
                    controller_img_feats, controller_actions_in, planner_hidden_idx,
                    controller_out, controller_action_length, controller_mask)

    def __len__(self):
        if self.input_type == 'ques':
            return len(self.questions)
        else:
            return len(self.available_idx)
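# ---------------------------------------------------------------------------
# Hedged usage sketch (not from the original training scripts): wrapping the
# dataset above in a DataLoader. The file paths are placeholders, and running
# this for real additionally assumes the House3D config and assets that
# __init__ loads are available. batch_size stays at 1 because, as noted in
# __getitem__, the episode_house / episode_pos_queue bookkeeping only supports
# one episode at a time.
# ---------------------------------------------------------------------------
import h5py
from torch.utils.data import DataLoader

train_h5 = h5py.File('questions_train.h5', 'r')         # placeholder path
dataset = EqaDataset(
    train_h5,
    vocab='vocab.json',                                  # placeholder path
    num_frames=5,
    data_json='eqa_data.json',                           # placeholder path
    split='train',
    input_type='pacman',
    to_cache=True,
    max_controller_actions=5)
loader = DataLoader(dataset, batch_size=1, shuffle=True)

for batch in loader:
    # for input_type='pacman' on the train split, each item is the 14-tuple
    # returned at the end of __getitem__ above
    (idx, question, answer,
     planner_img_feats, planner_actions_in, planner_actions_out,
     planner_action_length, planner_mask,
     controller_img_feats, controller_actions_in, planner_hidden_idx,
     controller_out, controller_action_length, controller_mask) = batch
    break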
class EqaDataset(Dataset): def __init__(self, questions_h5, vocab, num_frames=1, split='train', gpu_id=0, input_type='ques', max_threads_per_gpu=10, map_resolution=1000): self.questions_h5 = questions_h5 self.vocab = load_vocab(vocab) np.random.seed() self.split = split self.gpu_id = gpu_id self.num_frames = num_frames self.input_type = input_type self.max_threads_per_gpu = max_threads_per_gpu self.map_resolution = map_resolution print('Reading question data into memory') self.questions = _dataset_to_tensor(questions_h5['questions']) self.answers = _dataset_to_tensor(questions_h5['answers']) self.actions = _dataset_to_tensor(questions_h5['actions']) self.actions = self.actions.unsqueeze(2) self.robot_positions = _dataset_to_tensor( questions_h5['robot_positions'], dtype=np.float32) self.action_images = questions_h5['images'] self.action_lengths = _dataset_to_tensor( questions_h5['action_lengths']) self.action_masks = _dataset_to_tensor(questions_h5['mask']) #if input_type != 'ques': ''' If training, randomly sample and load a subset of environments, train on those, and then cycle through to load the rest. On the validation and test set, load in order, and cycle through. For both, add optional caching so that if all environments have been cycled through once, then no need to re-load and instead, just the cache can be used. ''' cnn_kwargs = {'num_classes': 191, 'pretrained': True} self.cnn = MultitaskCNN(**cnn_kwargs) self.cnn.eval() self.cnn.cuda() def __getitem__(self, index): # [VQA] question-only if self.input_type in ['pacman']: idx = index question = self.questions[index] #answer = self.answers[index] answer = self.answers[index] actions = self.actions[index] actions_masks = self.action_masks[index] robot_positions = self.robot_positions[index] action_lengths = self.action_lengths[index] if self.split in ['val', 'test']: #return the data directly return (idx, question, answer, actions, robot_positions, action_lengths) if self.split == 'train': #get iamge from data_set planner_images = self.action_images[index] planner_img_feats = self.cnn( Variable(torch.FloatTensor( planner_images).cuda())).data.cpu().numpy().copy() actions_in = actions.clone() actions_out = actions[1:].clone() actions_masks = actions_masks[:39].clone().gt(0) robot_positions = robot_positions.clone() return (idx, question, answer, planner_img_feats, actions_in, actions_out, robot_positions, actions_masks, action_lengths) elif self.input_type == 'ques,image': idx = index question = self.questions[index] answer = self.answers[index] action_length = self.action_lengths[index] actions = self.actions[index] actions_in = actions[action_length - self.num_frames:action_length] actions_out = actions[action_length - self.num_frames + 1:action_length + 1] images = self.action_images[index][ action_length - self.num_frames:action_length].astype( np.float32) return (idx, question, answer, images, actions_in, actions_out, action_length) def __len__(self): if self.input_type == 'ques': return len(self.questions) else: return len(self.questions)
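# ---------------------------------------------------------------------------
# Hedged sketch of the HDF5 layout this simpler EqaDataset reads, based only
# on the keys used in __init__ above ('questions', 'answers', 'actions',
# 'robot_positions', 'images', 'action_lengths', 'mask'). Every shape and
# dtype below is an illustrative assumption, not the repo's actual schema.
# ---------------------------------------------------------------------------
import h5py
import numpy as np

N, T = 8, 40                                     # hypothetical: 8 episodes, 40 action slots
with h5py.File('toy_questions.h5', 'w') as f:    # placeholder path
    f['questions'] = np.zeros((N, 10), dtype=np.int64)            # encoded question tokens
    f['answers'] = np.zeros((N,), dtype=np.int64)                 # answer token index
    f['actions'] = np.zeros((N, T), dtype=np.int64)               # padded action ids
    f['robot_positions'] = np.zeros((N, T, 2), dtype=np.float32)  # (x, y) per step
    f['images'] = np.zeros((N, T, 3, 224, 224), dtype=np.float32) # per-step RGB frames
    f['action_lengths'] = np.full((N,), 5, dtype=np.int64)        # real steps per episode
    f['mask'] = np.ones((N, T), dtype=np.int64)                   # valid-step mask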
def test(rank, test_model_dir):
    model_kwargs = {'question_vocab': load_vocab(args.vocab_json)}
    model = NavPlannerControllerModel(**model_kwargs)
    checkpoint = torch.load(test_model_dir)  # load checkpoint weights
    model.load_state_dict(checkpoint['state'])  # create model

    cnn_kwargs = {'num_classes': 191, 'pretrained': True}
    cnn = MultitaskCNN(**cnn_kwargs)
    cnn.eval()
    cnn.cuda()  # create cnn model

    scene = "test-10-obj-00.txt"
    my_env = enviroment.Environment(is_testing=1, testing_file=scene)
    object_exist_list = my_env.ur5.object_type
    print("Objects that exist:")
    print(object_exist_list)  # create simulation environment

    my_question = Qusetion(object_exist_list)  # create testing question
    testing_questions = my_question.createQueue()
    vocab = my_question.create_vocab()

    for question in testing_questions:
        planner_hidden = None
        max_action = 30
        position = [0, 0]
        action_in_raw = [0]  # start action_in
        actions = []
        print(question['question'])  # question

        questionTokens = my_question.tokenize(
            question['question'], punctToRemove=['?'], addStartToken=False)
        encoded_question_raw = my_question.encode(questionTokens,
                                                  vocab['questionTokenToIdx'])
        encoded_question_raw.append(0)  # encode question
        encoded_question_raw = np.array(encoded_question_raw)
        encoded_question_tensor = _dataset_to_tensor(encoded_question_raw)
        encoded_question = Variable(encoded_question_tensor)
        encoded_question = encoded_question.unsqueeze(0)

        action_times = 0
        while action_times < max_action:
            #print(planner_img_feats_var.size())
            action_in_tensor = _dataset_to_tensor(action_in_raw)
            action_in = Variable(action_in_tensor)
            action_in = action_in.unsqueeze(0)
            action_in = action_in.unsqueeze(0)
            _, rgb_image_raw = my_env.camera.get_camera_data()
            position_in, planner_img_feats_var = data2input(
                position, rgb_image_raw, cnn)
            output_data, planner_hidden = model.planner_step(
                encoded_question, planner_img_feats_var, action_in, position_in,
                planner_hidden)
            planner_possi = F.log_softmax(output_data, dim=1)
            planner_data = planner_possi.data.numpy()
            planner_data = planner_data[0]
            action_out = np.where(planner_data == np.max(planner_data))
            action_out = action_out[0][0]
            actions.append(action_out)
            action_in_raw = [action_out]
            if action_out == 9:
                print('stop')
                break
            else:
                dx, dy = order2action(action_out)
                position[0] += dx
                position[1] += dy
            action_times += 1

        if len(actions) > 2 and len(actions) < 20:
            action_position = position + position
            my_env.UR5_action(action_position, 2)  # sucking
        elif len(actions) >= 20:  # pushing
            position_start = [0, 0]
            position_end = [0, 0]
            for i in range(len(actions)):
                if i < len(actions) / 2:  # the first step
                    dx, dy = order2action(actions[i])
                    position_start[0] += dx
                    position_start[1] += dy
                    position_end[0] += dx
                    position_end[1] += dy
                else:  # the second step
                    dx, dy = order2action(actions[i])
                    position_end[0] += dx
                    position_end[1] += dy
            action_position = position_start + position_end
            my_env.UR5_action(action_position, 1)  # pushing
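# ---------------------------------------------------------------------------
# Side note (illustration only): the action selection inside the loop above --
# log_softmax, convert to numpy, then np.where(... == np.max(...)) -- is a
# greedy argmax over the planner logits. The helper below shows the same
# selection more directly on toy logits; id 9 meaning 'stop' is taken from the
# branch above, everything else here is made up.
# ---------------------------------------------------------------------------
import torch
import torch.nn.functional as F

def greedy_action(planner_logits):
    """Pick the most likely action id from a (1, num_actions) logit tensor."""
    log_probs = F.log_softmax(planner_logits, dim=1)
    return int(torch.argmax(log_probs, dim=1).item())

toy_logits = torch.zeros(1, 10)   # 10 hypothetical actions
toy_logits[0, 9] = 5.0            # make id 9 the most likely
assert greedy_action(toy_logits) == 9   # would trigger the 'stop' branch above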
def test(rank): cnn_model_dir = os.path.abspath("../train/models/03_13_h3d_hybrid_cnn.pt") vqa_model_kwargs = { 'vocab': load_vocab(args.vocab_json), 'checkpoint_path': cnn_model_dir } vqa_model = VqaLstmCnnAttentionModel(**vqa_model_kwargs) vqa_checkpoint = torch.load(args.vqa_weight) #load checkpoint weights vqa_model.load_state_dict(vqa_checkpoint['state']) print('--- vqa_model loaded checkpoint ---') res_model_dir = os.path.abspath("../train/models/resnet101.pth") my_map_cnn = mapCNN(checkpoint_path=res_model_dir) map_checkpoint = torch.load('mapcnn.pt', map_location='cpu') #load checkpoint weights my_map_cnn.load_state_dict(map_checkpoint['state']) #create map model print('--- map_model loaded checkpoint ---') cnn_kwargs = { 'num_classes': 191, 'pretrained': True, 'checkpoint_path': cnn_model_dir } cnn = MultitaskCNN(**cnn_kwargs) cnn.eval() vocab_dir = os.path.abspath("vocab.json") vocab_file = open(vocab_dir, 'r', encoding='utf-8') vocab = json.load(vocab_file) question = args.question print(question) questionTokens = tokenize(question, punctToRemove=['?'], addStartToken=False) encoded_question_raw = encode(questionTokens, vocab['questionTokenToIdx']) while (len(encoded_question_raw) < 10): encoded_question_raw.append(0) #encode question encoded_question_raw = np.array(encoded_question_raw) encoded_question_tensor = _dataset_to_tensor(encoded_question_raw) encoded_question = Variable(encoded_question_tensor) encoded_question = encoded_question.unsqueeze(0) rgb_before = cv.imread(args.rgb_image_before_dir) rgb_after = cv.imread(args.rgb_image_after_dir) depth_after = cv.imread(args.depth_image_after_dir) depth_after = depth_after[0] depth_dim = depth_after.shape print(depth_dim) rgb_after_resize = cv.resize(rgb_after, (256, 256), interpolation=cv.INTER_AREA) # crop and add marking depth_after_resize = cv.resize(depth_after, (256, 256), interpolation=cv.INTER_AREA) # crop and add marking rgb_tensor, depth_tensor = rgbd2tensor(rgb_after_resize, depth_after_resize) #output_heatmap heatmap_output = rgbd2heatmap(rgb_tensor, depth_tensor, my_map_cnn) f = h5py.File(args.heatmap_output_dir, 'w') f['heatmap'] = heatmap_output cv.imwrite(args.rgb_image_after_dir, rgb_after_resize) cv.imwrite(args.depth_image_after_dir, depth_after_resize) before_image_feat = data2input(rgb_before) after_image_feat = data2input(rgb_after_resize) input_image = [before_image_feat, after_image_feat] input_image_feats = Variable(torch.FloatTensor(input_image)) input_image_feats = input_image_feats.view(1, 2, 3, 224, 224) # print(input_image_feats.size()) #print(input_image.size()) #print(before_image_feat.size()) scores, _ = vqa_model(input_image_feats, encoded_question) scores = scores.data.numpy() scores = scores[0] answer_predict = np.where(scores == np.max(scores)) answer_predict = answer_predict[0][0] answer_dic = vocab["answerTokenToIdx"] answer = [k for k, v in answer_dic.items() if v == answer_predict] print(answer[0])
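# ---------------------------------------------------------------------------
# Side note (illustration only): the list comprehension above recovers the
# answer string by scanning vocab['answerTokenToIdx'] for the predicted index.
# Building the inverse mapping once, sketched below with a made-up vocabulary,
# gives the same lookup in constant time.
# ---------------------------------------------------------------------------
answer_token_to_idx = {'yes': 0, 'no': 1, 'red': 2, 'blue': 3}   # toy vocabulary
idx_to_answer_token = {v: k for k, v in answer_token_to_idx.items()}

answer_predict = 2                            # e.g. the argmax over the VQA scores
print(idx_to_answer_token[answer_predict])    # -> 'red'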
class EqaDataset(Dataset): def __init__(self, questions_h5, vocab, num_frames=1, split='train', gpu_id=0, input_type='ques', max_threads_per_gpu=10, map_resolution=1000): self.questions_h5 = questions_h5 self.vocab = load_vocab(vocab) np.random.seed() self.split = split self.gpu_id = gpu_id self.num_frames = num_frames self.input_type = input_type self.max_threads_per_gpu = max_threads_per_gpu self.map_resolution = map_resolution print('Reading question data into memory') self.questions = _dataset_to_tensor(questions_h5['questions']) self.answers = _dataset_to_tensor(questions_h5['answers']) self.actions = _dataset_to_tensor(questions_h5['actions']) self.actions = self.actions.unsqueeze(2) self.robot_positions = _dataset_to_tensor(questions_h5['robot_positions'],dtype = np.float32) self.action_images = questions_h5['images'] self.action_maps = questions_h5['heatmaps'] self.action_lengths = _dataset_to_tensor(questions_h5['action_lengths']) self.action_masks = _dataset_to_tensor(questions_h5['mask']) cnn_kwargs = {'num_classes': 191, 'pretrained': True} self.cnn = MultitaskCNN(**cnn_kwargs) self.cnn.eval() self.cnn.cuda() def __getitem__(self, index): # [VQA] question-only if self.input_type in ['nomap']: idx = index question = self.questions[index] #answer = self.answers[index] answer = self.answers[index] if answer > 13: answer = answer - 1 actions = self.actions[index] actions_masks = self.action_masks[index] robot_positions = self.robot_positions[index] action_lengths = self.action_lengths[index] if self.split in ['val', 'test']: #return the data directly return (idx, question, answer, actions, robot_positions,action_lengths) if self.split == 'train': #get iamge from data_set planner_images = self.action_images[index][0] planner_var = Variable(torch.FloatTensor(planner_images) .cuda()) planner_var = planner_var.unsqueeze(0) planner_img_feats = self.cnn(planner_var).data.cpu().numpy().copy() actions_in = actions.clone() actions_out = actions[1:].clone() actions_masks = actions_masks[:39].clone().gt(0) robot_positions = robot_positions.clone() return (idx, question, answer, planner_img_feats, actions_in, actions_out, robot_positions, actions_masks,action_lengths) elif self.input_type == 'addmap': idx = index question = self.questions[index] #answer = self.answers[index] answer = self.answers[index] if answer > 13: answer = answer - 1 actions = self.actions[index] actions_masks = self.action_masks[index] robot_positions = self.robot_positions[index] action_lengths = self.action_lengths[index] if self.split in ['val', 'test']: #return the data directly return (idx, question, answer, actions, robot_positions,action_lengths) if self.split == 'train': #get iamge from data_set planner_images = self.action_images[index][0] planner_var = Variable(torch.FloatTensor(planner_images) .cuda()) planner_var = planner_var.unsqueeze(0) planner_img_feats = self.cnn(planner_var).data.cpu().numpy().copy() planner_maps = self.action_maps[index][0] planner_maps_feats = Variable(torch.FloatTensor(planner_maps) .cuda()) #planner_maps_feats = planner_maps_var.view(-1,32*32*20) actions_in = actions.clone() actions_out = actions[1:].clone() actions_masks = actions_masks[:39].clone().gt(0) robot_positions = robot_positions.clone() return (idx, question, answer, planner_img_feats, planner_maps_feats,actions_in, actions_out, robot_positions, actions_masks,action_lengths) elif self.input_type == 'ques,image': idx = index question = self.questions[index] answer = self.answers[index] if answer > 13: answer = answer - 1 
            action_length = self.action_lengths[index]
            actions = self.actions[index]
            actions_in = actions[action_length - self.num_frames:action_length]
            actions_out = actions[action_length - self.num_frames + 1:
                                  action_length + 1]
            images = self.action_images[index][0:2].astype(np.float32)
            return (idx, question, answer, images, actions_in, actions_out,
                    action_length)

    def __len__(self):
        if self.input_type == 'ques':
            return len(self.questions)
        else:
            return len(self.questions)
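# ---------------------------------------------------------------------------
# Hedged shape sketch (illustration only): the 'nomap'/'addmap' branches above
# push a single stored frame through MultitaskCNN after unsqueezing a batch
# dimension, and the commented-out view() hints at 20x32x32 heatmaps. The
# 224x224 frame size matches the resizes used elsewhere in this code, and the
# 3200-dim CNN feature size is the one noted in a comment in the pacman
# dataset above; the exact heatmap layout here is an assumption.
# ---------------------------------------------------------------------------
import torch

planner_image = torch.zeros(3, 224, 224)            # one stored RGB frame (toy values)
planner_var = planner_image.unsqueeze(0)            # (1, 3, 224, 224), batch of one
# feats = cnn(planner_var)                          # with MultitaskCNN: roughly (1, 3200)

planner_maps = torch.zeros(20, 32, 32)              # assumed heatmap stack
planner_maps_flat = planner_maps.view(-1, 32 * 32 * 20)  # (1, 20480), as in the view() above
print(planner_var.shape, planner_maps_flat.shape)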