Example #1
def test(rank):
    nav_model_kwargs = {'question_vocab': load_vocab(args.vocab_json)}
    nav_model = NavPlannerControllerModel(**nav_model_kwargs)
    nav_checkpoint = torch.load(args.nav_weight)  #load checkpoint weights
    nav_model.load_state_dict(nav_checkpoint['state'])  #load pretrained weights into the model
    print('--- nav_model loaded checkpoint ---')

    cnn_kwargs = {'num_classes': 191, 'pretrained': True}
    cnn = MultitaskCNN(**cnn_kwargs)
    cnn.eval()
    cnn.cuda()  #move the CNN to the GPU

    vqa_model_kwargs = {'vocab': load_vocab(args.vocab_json)}
    vqa_model = VqaLstmCnnAttentionModel(**vqa_model_kwargs)
    vqa_checkpoint = torch.load(args.vqa_weight)  #load checkpoint weights
    vqa_model.load_state_dict(vqa_checkpoint['state'])
    print('--- vqa_model loaded checkpoint ---')

    # need cnn?

    scene = "test-10-obj-100.txt"
    my_env = enviroment.Environment(is_testing=0, testing_file=scene)
    object_exist_list = my_env.ur5.object_type
    print("Objetcts that exist: ")
    print(object_exist_list)  #create simulation enviroment

    my_question = Qusetion(object_exist_list)  #create testing question
    testing_questions = my_question.createQueue()
    vocab = my_question.create_vocab()

    for question in testing_questions:
        planner_hidden = None
        max_action = 30
        position = [0, 0]
        action_in_raw = [0]  #initial input action
        actions = []

        print(question['question'])  #question
        questionTokens = my_question.tokenize(question['question'],
                                              punctToRemove=['?'],
                                              addStartToken=False)
        encoded_question_raw = my_question.encode(questionTokens,
                                                  vocab['questionTokenToIdx'])
        encoded_question_raw.append(0)  #encode question
        encoded_question_raw = np.array(encoded_question_raw)
        encoded_question_tensor = _dataset_to_tensor(encoded_question_raw)
        encoded_question = Variable(encoded_question_tensor)
        encoded_question = encoded_question.unsqueeze(0)
        print(encoded_question)
        action_times = 0
        push_signal = 0
        push_point = 0

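        # Roll out the planner step by step: feed back the previous action and the
        # current camera features until it emits the stop action (9) or max_action
        # steps have been taken; action 0 marks the switch point for a push.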
        while (action_times < max_action):

            #print(planner_img_feats_var.size())
            action_in_tensor = _dataset_to_tensor(action_in_raw)
            action_in = Variable(action_in_tensor)
            action_in = action_in.unsqueeze(0)
            action_in = action_in.unsqueeze(0)

            _, rgb_image_raw = my_env.camera.get_camera_data()  #RGB image before acting
            position_in, planner_img_feats_var = data2input(
                position, rgb_image_raw, cnn)

            output_data, planner_hidden = nav_model.planner_step(
                encoded_question, planner_img_feats_var, action_in,
                position_in, planner_hidden)
            planner_possi = F.log_softmax(output_data, dim=1)
            planner_data = planner_possi.data.numpy()
            planner_data = planner_data[0]
            action_out = np.where(planner_data == np.max(planner_data))
            action_out = action_out[0][0]

            actions.append(action_out)
            action_in_raw = [action_out]
            if action_out == 9:
                print('stop')
                break
            elif action_out == 0:
                push_signal = 1
                push_point = action_times
            else:
                dx, dy = order2action(action_out)
                position[0] += dx
                position[1] += dy
            action_times += 1

        if len(actions) > 2 and push_signal == 0:
            action_position = position + position
            my_env.UR5_action(action_position, 2)  #sucking
        elif len(actions) > 2 and push_signal == 1:  #pushing
            position_start = [0, 0]
            position_end = [0, 0]
            for i in range(len(actions)):
                if i <= push_point:  #the first step
                    dx, dy = order2action(actions[i])
                    position_start[0] += dx
                    position_start[1] += dy
                    position_end[0] += dx
                    position_end[1] += dy
                else:  #the second step
                    dx, dy = order2action(actions[i])
                    position_end[0] += dx
                    position_end[1] += dy
            action_position = position_start + position_end
            my_env.UR5_action(action_position, 1)  #pushing

        # get the image after actions and resize it for the VQA model
        _, rgb_image_after = my_env.camera.get_camera_data()
        shrink = cv.resize(rgb_image_after, (224, 224),
                           interpolation=cv.INTER_AREA)
        shrink = np.array(shrink)
        shrink = shrink.transpose((2, 0, 1))
        shrink = shrink.reshape(1, 3, 224, 224)
        shrink = (shrink / 255.0).astype(np.float32)
        images = torch.FloatTensor(shrink)
        images = Variable(images)
        images = images.unsqueeze(0)

        # process images

        # answer question in vqa now
        # encoded_question already done

        scores, _ = vqa_model(images, encoded_question)
        scores = scores.data.numpy()
        scores = scores[0]
        answer_predict = np.where(scores == np.max(scores))
        answer_predict = answer_predict[0][0]
        if answer_predict == 0:
            print('--- Predict: Does not exist')
        elif answer_predict == 1:
            print('--- Predict: Exists')
        else:
            raise Exception('Prediction neither 0 nor 1')
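
A minimal sketch of how this routine might be invoked, assuming the surrounding script builds a module-level `args` namespace with `vocab_json`, `nav_weight`, and `vqa_weight` (names inferred from the attribute accesses above); the actual entry point is not shown in the example, so the paths and the single-process call below are illustrative only.

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--vocab_json', default='data/vocab.json')    # assumed path
    parser.add_argument('--nav_weight', default='ckpt/nav_model.pt')  # assumed path
    parser.add_argument('--vqa_weight', default='ckpt/vqa_model.pt')  # assumed path
    args = parser.parse_args()

    # run a single evaluation process; `rank` is unused inside test()
    test(rank=0)
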
Example #2
class EqaDataset(Dataset):
    def __init__(self,
                 questions_h5,
                 vocab,
                 num_frames=1,
                 data_json=False,
                 split='train',
                 gpu_id=0,
                 input_type='ques',
                 max_threads_per_gpu=10,
                 to_cache=False,
                 target_obj_conn_map_dir=False,
                 map_resolution=1000,
                 overfit=False,
                 max_controller_actions=5,
                 max_actions=None):

        self.questions_h5 = questions_h5
        self.vocab = load_vocab(vocab)
        self.num_frames = num_frames
        self.max_controller_actions = max_controller_actions

        np.random.seed()

        self.data_json = data_json
        self.split = split
        self.gpu_id = gpu_id

        self.input_type = input_type

        self.max_threads_per_gpu = max_threads_per_gpu

        self.target_obj_conn_map_dir = target_obj_conn_map_dir
        self.map_resolution = map_resolution
        self.overfit = overfit

        self.to_cache = to_cache
        self.img_data_cache = {}

        print('Reading question data into memory from', questions_h5)
        self.idx = _dataset_to_tensor(questions_h5['idx'])
        self.questions = _dataset_to_tensor(questions_h5['questions'])
        self.answers = _dataset_to_tensor(questions_h5['answers'])
        self.actions = _dataset_to_tensor(questions_h5['action_labels'])
        self.action_lengths = _dataset_to_tensor(
            questions_h5['action_lengths'])
        print('... finished running dataset_to_tensor operations from',
              questions_h5)

        if max_actions:  #max_actions lets us truncate the action arrays to a fixed length, e.g. to train with only the last 10 actions
            print('... entering max_actions conditions block from',
                  questions_h5)
            assert isinstance(max_actions, int)
            num_data_items = self.actions.shape[0]
            new_actions = np.zeros((num_data_items, max_actions + 2),
                                   dtype=np.int64)
            new_lengths = np.ones(
                (num_data_items, ), dtype=np.int64) * max_actions
            for i in range(num_data_items):
                action_length = int(self.action_lengths[i])
                new_actions[i, 0] = 1
                new_actions[i, 1:max_actions + 1] = self.actions[
                    i, action_length - max_actions:action_length].numpy()
            self.actions = torch.LongTensor(new_actions)
            self.action_lengths = torch.LongTensor(new_lengths)

            print('... finished running max_actions conditions block from',
                  questions_h5)

        if self.data_json != False:
            print('... entering data_json condition block from',
                  questions_h5)
            data = json.load(open(self.data_json, 'r'))
            self.envs = data['envs']

            self.env_idx = data[self.split + '_env_idx']
            self.env_list = [self.envs[x] for x in self.env_idx]
            self.env_set = list(set(self.env_list))
            self.env_set.sort()

            if self.overfit:
                self.env_idx = self.env_idx[:1]
                self.env_set = self.env_list = [
                    self.envs[x] for x in self.env_idx
                ]
                print('Trying to overfit to [house %s]' % self.env_set[0])
                logging.info('Trying to overfit to [house {}]'.format(
                    self.env_set[0]))

            print(questions_h5, 'Total envs: %d' % len(list(set(self.envs))))
            print(
                questions_h5,
                'Envs in %s: %d' % (self.split, len(list(set(self.env_idx)))))

            if input_type != 'ques':
                '''
                If training, randomly sample and load a subset of environments,
                train on those, and then cycle through to load the rest.

                On the validation and test set, load in order, and cycle through.

                For both, add optional caching so that if all environments
                have been cycled through once, then no need to re-load and
                instead, just the cache can be used.
                '''

                self.api_threads = []
                self._load_envs(start_idx=0, in_order=True)

                cnn_kwargs = {'num_classes': 191, 'pretrained': True}
                self.cnn = MultitaskCNN(**cnn_kwargs)
                self.cnn.eval()
                self.cnn.cuda()

            self.pos_queue = data[self.split + '_pos_queue']
            self.boxes = data[self.split + '_boxes']

            if max_actions:
                for i in range(len(self.pos_queue)):
                    self.pos_queue[i] = self.pos_queue[i][-1 * max_actions:]

            print('... finished running data_json condition block from',
                  questions_h5)

        if input_type == 'pacman':
            print('... entering input_type pacman condition block from',
                  questions_h5)

            self.planner_actions = self.actions.clone().fill_(0)
            self.controller_actions = self.actions.clone().fill_(-1)

            self.planner_action_lengths = self.action_lengths.clone().fill_(0)
            self.controller_action_lengths = self.action_lengths.clone().fill_(
                0)

            self.planner_hidden_idx = self.actions.clone().fill_(0)

            self.planner_pos_queue_idx, self.controller_pos_queue_idx = [], []

            # parsing flat actions to planner-controller hierarchy
            for i in tqdm(range(len(self.actions))):

                pa, ca, pq_idx, cq_idx, ph_idx = flat_to_hierarchical_actions(
                    actions=self.actions[i][:self.action_lengths[i] + 1],
                    controller_action_lim=max_controller_actions)

                self.planner_actions[i][:len(pa)] = torch.Tensor(pa)
                self.controller_actions[i][:len(ca)] = torch.Tensor(ca)

                self.planner_action_lengths[i] = len(pa) - 1
                self.controller_action_lengths[i] = len(ca)

                self.planner_pos_queue_idx.append(pq_idx)
                self.controller_pos_queue_idx.append(cq_idx)

                self.planner_hidden_idx[i][:len(ca)] = torch.Tensor(ph_idx)

            print(
                '... finished running input_type pacman condition block from',
                questions_h5)

        print('... finished instantiating EqaDataset from', questions_h5)

    def _pick_envs_to_load(self,
                           split='train',
                           max_envs=10,
                           start_idx=0,
                           in_order=False):
        if split in ['val', 'test'] or in_order == True:
            pruned_env_set = self.env_set[start_idx:start_idx + max_envs]
        else:
            if max_envs < len(self.env_set):
                env_inds = np.random.choice(len(self.env_set),
                                            max_envs,
                                            replace=False)
            else:
                env_inds = np.random.choice(len(self.env_set),
                                            max_envs,
                                            replace=True)
            pruned_env_set = [self.env_set[x] for x in env_inds]
        return pruned_env_set

    def _load_envs(self, start_idx=-1, in_order=False):
        #self._clear_memory()
        if start_idx == -1:
            start_idx = self.env_set.index(self.pruned_env_set[-1]) + 1

        # Pick envs
        self.pruned_env_set = self._pick_envs_to_load(
            split=self.split,
            max_envs=self.max_threads_per_gpu,
            start_idx=start_idx,
            in_order=in_order)

        if len(self.pruned_env_set) == 0:
            return

        # Load api threads
        start = time.time()
        if len(self.api_threads) == 0:
            for i in range(self.max_threads_per_gpu):
                self.api_threads.append(
                    objrender.RenderAPIThread(w=224, h=224,
                                              device=self.gpu_id))

        self.cfg = load_config('../../House3D/tests/config.json')

        print('[%.02f] Loaded %d api threads' %
              (time.time() - start, len(self.api_threads)))
        start = time.time()

        # Load houses
        from multiprocessing import Pool
        _args = ([h, self.cfg, self.map_resolution]
                 for h in self.pruned_env_set)
        with Pool(len(self.pruned_env_set)) as pool:
            self.all_houses = pool.starmap(local_create_house, _args)

        print('[%.02f] Loaded %d houses' %
              (time.time() - start, len(self.all_houses)))
        start = time.time()

        # Load envs
        self.env_loaded = {}
        for i in range(len(self.all_houses)):
            print('[%02d/%d][split:%s][gpu:%d][house:%s]' %
                  (i + 1, len(self.all_houses), self.split, self.gpu_id,
                   self.all_houses[i].house['id']))
            environment = Environment(self.api_threads[i], self.all_houses[i],
                                      self.cfg)
            self.env_loaded[self.all_houses[i].house['id']] = House3DUtils(
                environment,
                target_obj_conn_map_dir=self.target_obj_conn_map_dir,
                build_graph=False)

        # [TODO] Unused till now
        self.env_ptr = -1

        print('[%.02f] Loaded %d house3d envs' %
              (time.time() - start, len(self.env_loaded)))

        # Mark available data indices
        self.available_idx = [
            i for i, v in enumerate(self.env_list) if v in self.env_loaded
        ]

        # [TODO] only keeping legit sequences
        # needed for things to play well with old data
        temp_available_idx = self.available_idx.copy()
        for i in range(len(temp_available_idx)):
            if self.action_lengths[temp_available_idx[i]] < 5:
                self.available_idx.remove(temp_available_idx[i])

        print('Available inds: %d' % len(self.available_idx))

        # Flag to check if loaded envs have been cycled through or not
        # [TODO] Unused till now
        self.all_envs_loaded = False

    def _clear_api_threads(self):
        for i in range(len(self.api_threads)):
            del self.api_threads[0]
        self.api_threads = []

    def _clear_memory(self):
        if hasattr(self, 'episode_house'):
            del self.episode_house
        if hasattr(self, 'env_loaded'):
            del self.env_loaded
        if hasattr(self, 'api_threads'):
            del self.api_threads
        self.api_threads = []

    def _check_if_all_envs_loaded(self):
        print('[CHECK][Cache:%d][Total:%d]' %
              (len(self.img_data_cache), len(self.env_list)))
        if len(self.img_data_cache) == len(self.env_list):
            self.available_idx = [i for i, v in enumerate(self.env_list)]
            return True
        else:
            return False

    def set_camera(self, e, pos, robot_height=1.0):
        assert len(pos) == 4

        e.env.cam.pos.x = pos[0]
        e.env.cam.pos.y = robot_height
        e.env.cam.pos.z = pos[2]
        e.env.cam.yaw = pos[3]

        e.env.cam.updateDirection()

    def render(self, e):
        return e.env.render()

    def get_frames(self, e, pos_queue, preprocess=True):
        if isinstance(pos_queue, list) == False:
            pos_queue = [pos_queue]

        res = []
        for i in range(len(pos_queue)):
            self.set_camera(e, pos_queue[i])
            img = np.array(self.render(e), copy=False, dtype=np.float32)

            if preprocess == True:
                img = img.transpose(2, 0, 1)
                img = img / 255.0

            res.append(img)

        return np.array(res)

    def get_hierarchical_features_till_spawn(self,
                                             actions,
                                             backtrack_steps=0,
                                             max_controller_actions=5):

        action_length = len(actions) - 1
        pa, ca, pq_idx, cq_idx, ph_idx = flat_to_hierarchical_actions(
            actions=actions, controller_action_lim=max_controller_actions)

        # count how many actions of the same type were encountered before starting navigation
        backtrack_controller_steps = actions[1:action_length -
                                             backtrack_steps + 1:][::-1]
        counter = 0
        # Removed try/except here to try to tease out pdb-related errors in Abhishek's code that are firing in other
        # parts of training for me as well.
        # try:
        if len(backtrack_controller_steps) > 0:
            # Edited condition: changed counter <= len(backtrack_controller_steps) to strictly less than, to avoid an
            # out-of-bounds error in the following loop; unsure what cascading problems that might cause since I don't
            # know the downstream logic for how counter is used, but the loop as written was asking for a bug and
            # getting it. I also reordered the conditions so that the index check comes after the verification that
            # counter is within bounds, since otherwise it doesn't fire until after the out-of-bounds error has
            # happened (though, again, this may cause downstream issues if counter is supposed to be allowed to
            # float up to len(backtrack_controller_steps) + 1, which is now higher than it can reach).
            while ((counter <= self.max_controller_actions)
                   and (counter < len(backtrack_controller_steps))
                   and (backtrack_controller_steps[counter]
                        == backtrack_controller_steps[0])):
                counter += 1
        # except:
        # import pdb;
        # pdb.set_trace() #If you hit a breakpoint here, you probably found an error in the logic above for computing the correct counter step.  Still working on this and checking.

        target_pos_idx = action_length - backtrack_steps

        controller_step = True
        if target_pos_idx in pq_idx:
            controller_step = False

        pq_idx_pruned = [v for v in pq_idx if v <= target_pos_idx]
        pa_pruned = pa[:len(pq_idx_pruned) + 1]

        images = self.get_frames(self.episode_house,
                                 self.episode_pos_queue,
                                 preprocess=True)
        raw_img_feats = self.cnn(Variable(
            torch.FloatTensor(images).cuda())).data.cpu().numpy().copy()

        controller_img_feat = torch.from_numpy(
            raw_img_feats[target_pos_idx].copy())
        controller_action_in = pa_pruned[-1] - 2

        planner_img_feats = torch.from_numpy(
            raw_img_feats[pq_idx_pruned].copy())
        planner_actions_in = torch.from_numpy(np.array(pa_pruned[:-1]) - 1)

        return planner_actions_in, planner_img_feats, controller_step, controller_action_in, \
            controller_img_feat, self.episode_pos_queue[target_pos_idx], counter

    def __getitem__(self, index):
        # [VQA] question-only
        if self.input_type == 'ques':
            idx = self.idx[index]
            question = self.questions[index]
            answer = self.answers[index]

            return (idx, question, answer)

        # [VQA] question+image
        elif self.input_type == 'ques,image':
            index = self.available_idx[index]

            idx = self.idx[index]
            question = self.questions[index]
            answer = self.answers[index]

            action_length = self.action_lengths[index]
            actions = self.actions[index]

            actions_in = actions[action_length - self.num_frames:action_length]
            actions_out = actions[action_length - self.num_frames +
                                  1:action_length + 1]

            if self.to_cache == True and index in self.img_data_cache:
                images = self.img_data_cache[index]
            else:
                pos_queue = self.pos_queue[index][
                    -self.num_frames:]  # last 5 frames
                images = self.get_frames(self.env_loaded[self.env_list[index]],
                                         pos_queue,
                                         preprocess=True)
                if self.to_cache == True:
                    self.img_data_cache[index] = images.copy()

            return (idx, question, answer, images, actions_in, actions_out,
                    action_length)

        # [NAV] question+cnn
        elif self.input_type in ['cnn', 'cnn+q']:

            index = self.available_idx[index]

            idx = self.idx[index]
            question = self.questions[index]
            answer = self.answers[index]

            action_length = self.action_lengths[index]
            actions = self.actions[index]

            if self.to_cache == True and index in self.img_data_cache:
                img_feats = self.img_data_cache[index]
            else:
                pos_queue = self.pos_queue[index]
                images = self.get_frames(self.env_loaded[self.env_list[index]],
                                         pos_queue,
                                         preprocess=True)
                img_feats = self.cnn(Variable(
                    torch.FloatTensor(
                        images).cuda())).data.cpu().numpy().copy()
                if self.to_cache == True:
                    self.img_data_cache[index] = img_feats

            # for val or test (evaluation), or
            # when target_obj_conn_map_dir is defined (reinforce),
            # load entire shortest path navigation trajectory
            # and load connectivity map for intermediate rewards
            if self.split in ['val', 'test'
                              ] or self.target_obj_conn_map_dir != False:
                target_obj_id, target_room = False, False
                bbox_obj = [
                    x for x in self.boxes[index]
                    if x['type'] == 'object' and x['target'] == True
                ][0]['box']
                for obj_id in self.env_loaded[self.env_list[index]].objects:
                    box2 = self.env_loaded[
                        self.env_list[index]].objects[obj_id]['bbox']
                    if all([bbox_obj['min'][x] == box2['min'][x] for x in range(3)]) == True and \
                        all([bbox_obj['max'][x] == box2['max'][x] for x in range(3)]) == True:
                        target_obj_id = obj_id
                        break
                bbox_room = [
                    x for x in self.boxes[index]
                    if x['type'] == 'room' and x['target'] == False
                ][0]
                for room in self.env_loaded[
                        self.env_list[index]].env.house.all_rooms:
                    if all([room['bbox']['min'][i] == bbox_room['box']['min'][i] for i in range(3)]) and \
                        all([room['bbox']['max'][i] == bbox_room['box']['max'][i] for i in range(3)]):
                        target_room = room
                        break
                assert target_obj_id != False
                assert target_room != False
                self.env_loaded[self.env_list[index]].set_target_object(
                    self.env_loaded[
                        self.env_list[index]].objects[target_obj_id],
                    target_room)

                # [NOTE] only works for batch size = 1
                self.episode_pos_queue = self.pos_queue[index]
                self.episode_house = self.env_loaded[self.env_list[index]]
                self.target_room = target_room
                self.target_obj = self.env_loaded[
                    self.env_list[index]].objects[target_obj_id]

                actions_in = actions[:action_length]
                actions_out = actions[1:action_length + 1] - 2

                return (idx, question, answer, img_feats, actions_in,
                        actions_out, action_length)

            # if action_length is n
            # images.shape[0] is also n
            # actions[0] is <START>
            # actions[n] is <END>

            # grab 5 random frames
            # [NOTE]: this'll break for longer-than-5 navigation sequences
            start_idx = np.random.choice(img_feats.shape[0] + 1 -
                                         self.num_frames)
            img_feats = img_feats[start_idx:start_idx + self.num_frames]

            actions_in = actions[start_idx:start_idx + self.num_frames]
            actions_out = actions[start_idx + self.num_frames] - 2

            return (idx, question, answer, img_feats, actions_in, actions_out,
                    action_length)

        # [NAV] question+lstm
        elif self.input_type in ['lstm', 'lstm+q']:

            index = self.available_idx[index]

            idx = self.idx[index]
            question = self.questions[index]
            answer = self.answers[index]

            action_length = self.action_lengths[index]
            actions = self.actions[index]

            if self.split == 'train':
                if self.to_cache == True and index in self.img_data_cache:
                    img_feats = self.img_data_cache[index]
                else:
                    pos_queue = self.pos_queue[index]
                    images = self.get_frames(
                        self.env_loaded[self.env_list[index]],
                        pos_queue,
                        preprocess=True)
                    raw_img_feats = self.cnn(
                        Variable(torch.FloatTensor(
                            images).cuda())).data.cpu().numpy().copy()
                    img_feats = np.zeros(
                        (self.actions.shape[1], raw_img_feats.shape[1]),
                        dtype=np.float32)
                    img_feats[:raw_img_feats.shape[0], :] = raw_img_feats.copy(
                    )
                    if self.to_cache == True:
                        self.img_data_cache[index] = img_feats

            actions_in = actions.clone() - 1
            actions_out = actions[1:].clone() - 2

            actions_in[action_length:].fill_(0)
            mask = actions_out.clone().gt(-1)
            if len(actions_out) > action_length:
                actions_out[action_length:].fill_(0)

            # for val or test (evaluation), or
            # when target_obj_conn_map_dir is defined (reinforce),
            # load entire shortest path navigation trajectory
            # and load connectivity map for intermediate rewards
            if self.split in ['val', 'test'
                              ] or self.target_obj_conn_map_dir != False:
                target_obj_id, target_room = False, False
                bbox_obj = [
                    x for x in self.boxes[index]
                    if x['type'] == 'object' and x['target'] == True
                ][0]['box']
                for obj_id in self.env_loaded[self.env_list[index]].objects:
                    box2 = self.env_loaded[
                        self.env_list[index]].objects[obj_id]['bbox']
                    if all([bbox_obj['min'][x] == box2['min'][x] for x in range(3)]) == True and \
                        all([bbox_obj['max'][x] == box2['max'][x] for x in range(3)]) == True:
                        target_obj_id = obj_id
                        break
                bbox_room = [
                    x for x in self.boxes[index]
                    if x['type'] == 'room' and x['target'] == False
                ][0]
                for room in self.env_loaded[
                        self.env_list[index]].env.house.all_rooms:
                    if all([room['bbox']['min'][i] == bbox_room['box']['min'][i] for i in range(3)]) and \
                        all([room['bbox']['max'][i] == bbox_room['box']['max'][i] for i in range(3)]):
                        target_room = room
                        break
                assert target_obj_id != False
                assert target_room != False
                self.env_loaded[self.env_list[index]].set_target_object(
                    self.env_loaded[
                        self.env_list[index]].objects[target_obj_id],
                    target_room)

                # [NOTE] only works for batch size = 1
                self.episode_pos_queue = self.pos_queue[index]
                self.episode_house = self.env_loaded[self.env_list[index]]
                self.target_room = target_room
                self.target_obj = self.env_loaded[
                    self.env_list[index]].objects[target_obj_id]

                return (idx, question, answer, False, actions_in, actions_out,
                        action_length, mask)

            return (idx, question, answer, img_feats, actions_in, actions_out,
                    action_length, mask)

        # [NAV] planner-controller
        elif self.input_type in ['pacman']:

            index = self.available_idx[index]

            idx = self.idx[index]
            question = self.questions[index]
            answer = self.answers[index]

            action_length = self.action_lengths[index]
            actions = self.actions[index]

            planner_actions = self.planner_actions[index]
            controller_actions = self.controller_actions[index]

            planner_action_length = self.planner_action_lengths[index]
            controller_action_length = self.controller_action_lengths[index]

            planner_hidden_idx = self.planner_hidden_idx[index]

            if self.split == 'train':
                if self.to_cache == True and index in self.img_data_cache:
                    img_feats = self.img_data_cache[index]
                else:
                    pos_queue = self.pos_queue[index]
                    images = self.get_frames(
                        self.env_loaded[self.env_list[index]],
                        pos_queue,
                        preprocess=True)
                    raw_img_feats = self.cnn(
                        Variable(torch.FloatTensor(
                            images).cuda())).data.cpu().numpy().copy()
                    img_feats = np.zeros(
                        (self.actions.shape[1], raw_img_feats.shape[1]),
                        dtype=np.float32)
                    img_feats[:raw_img_feats.shape[0], :] = raw_img_feats.copy(
                    )
                    if self.to_cache == True:
                        self.img_data_cache[index] = img_feats

            if self.split in ['val', 'test'
                              ] or self.target_obj_conn_map_dir != False:
                target_obj_id, target_room = False, False
                bbox_obj = [
                    x for x in self.boxes[index]
                    if x['type'] == 'object' and x['target'] == True
                ][0]['box']
                for obj_id in self.env_loaded[self.env_list[index]].objects:
                    box2 = self.env_loaded[
                        self.env_list[index]].objects[obj_id]['bbox']
                    if all([bbox_obj['min'][x] == box2['min'][x] for x in range(3)]) == True and \
                        all([bbox_obj['max'][x] == box2['max'][x] for x in range(3)]) == True:
                        target_obj_id = obj_id
                        break
                bbox_room = [
                    x for x in self.boxes[index]
                    if x['type'] == 'room' and x['target'] == False
                ][0]
                for room in self.env_loaded[
                        self.env_list[index]].env.house.all_rooms:
                    if all([room['bbox']['min'][i] == bbox_room['box']['min'][i] for i in range(3)]) and \
                        all([room['bbox']['max'][i] == bbox_room['box']['max'][i] for i in range(3)]):
                        target_room = room
                        break
                if target_obj_id == False or target_room == False:
                    return None
                self.env_loaded[self.env_list[index]].set_target_object(
                    self.env_loaded[
                        self.env_list[index]].objects[target_obj_id],
                    target_room)

                # [NOTE] only works for batch size = 1
                self.episode_pos_queue = self.pos_queue[index]
                self.episode_house = self.env_loaded[self.env_list[index]]
                self.target_room = target_room
                self.target_obj = self.env_loaded[
                    self.env_list[index]].objects[target_obj_id]

                return (idx, question, answer, actions, action_length)

            planner_pos_queue_idx = self.planner_pos_queue_idx[index]
            controller_pos_queue_idx = self.controller_pos_queue_idx[index]

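            # Scatter the per-frame CNN features into fixed-width arrays aligned
            # with the padded planner/controller action tensors.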
            planner_img_feats = np.zeros(
                (self.actions.shape[1], img_feats.shape[1]), dtype=np.float32)
            planner_img_feats[:planner_action_length] = img_feats[
                planner_pos_queue_idx]

            planner_actions_in = planner_actions.clone() - 1
            planner_actions_out = planner_actions[1:].clone() - 2

            planner_actions_in[planner_action_length:].fill_(0)
            planner_mask = planner_actions_out.clone().gt(-1)
            if len(planner_actions_out) > planner_action_length:
                planner_actions_out[planner_action_length:].fill_(0)

            controller_img_feats = np.zeros(
                (self.actions.shape[1], img_feats.shape[1]), dtype=np.float32)
            controller_img_feats[:controller_action_length] = img_feats[
                controller_pos_queue_idx]

            controller_actions_in = actions[1:].clone() - 2
            if len(controller_actions_in) > controller_action_length:
                controller_actions_in[controller_action_length:].fill_(0)

            controller_out = controller_actions
            controller_mask = controller_out.clone().gt(-1)
            if len(controller_out) > controller_action_length:
                controller_out[controller_action_length:].fill_(0)

            # mask out controller returns (0) that were forced by hitting max_controller_actions
            for i in range(controller_action_length):
                if i >= self.max_controller_actions - 1 and controller_out[i] == 0 and \
                        (self.max_controller_actions == 1 or
                         controller_out[i - self.max_controller_actions + 1:i].sum()
                         == self.max_controller_actions - 1):
                    controller_mask[i] = 0

            return (idx, question, answer, planner_img_feats,
                    planner_actions_in, planner_actions_out,
                    planner_action_length, planner_mask, controller_img_feats,
                    controller_actions_in, planner_hidden_idx, controller_out,
                    controller_action_length, controller_mask)

    def __len__(self):
        if self.input_type == 'ques':
            return len(self.questions)
        else:
            return len(self.available_idx)
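
A minimal usage sketch, assuming the questions file is an HDF5 file opened with h5py that exposes the datasets read above ('idx', 'questions', 'answers', 'action_labels', 'action_lengths') and a vocab JSON compatible with load_vocab; the file paths are placeholders, and the question-only setting is used so no House3D environments are loaded.

import h5py
from torch.utils.data import DataLoader

questions_h5 = h5py.File('data/train_questions.h5', 'r')  # assumed path
dataset = EqaDataset(questions_h5,
                     vocab='data/vocab.json',  # assumed path
                     split='train',
                     input_type='ques')  # question-only: __getitem__ returns (idx, question, answer)
loader = DataLoader(dataset, batch_size=20, shuffle=True)

for idx, question, answer in loader:
    # question: LongTensor of token ids, answer: LongTensor of answer ids
    pass
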
Example #3
class EqaDataset(Dataset):
    def __init__(self,
                 questions_h5,
                 vocab,
                 num_frames=1,
                 data_json=False,
                 split='train',
                 gpu_id=0,
                 input_type='ques',
                 max_threads_per_gpu=10,
                 to_cache=False,
                 target_obj_conn_map_dir=False,
                 map_resolution=1000):
        self.questions_h5 = questions_h5
        self.vocab = load_vocab(vocab)
        self.num_frames = num_frames

        np.random.seed()

        self.data_json = data_json
        self.split = split
        self.gpu_id = gpu_id

        self.input_type = input_type

        self.max_threads_per_gpu = max_threads_per_gpu

        self.target_obj_conn_map_dir = target_obj_conn_map_dir
        self.map_resolution = map_resolution

        self.to_cache = to_cache
        self.img_data_cache = {}

        if self.data_json != False:
            data = json.load(open(self.data_json, 'r'))
            self.envs = data['envs']

            self.env_idx = data[self.split + '_env_idx']
            self.env_list = [self.envs[x] for x in self.env_idx]
            self.env_set = list(set(self.env_list))
            self.env_set.sort()

            print('Total envs: %d' % len(list(set(self.envs))))
            print('Envs in %s: %d' % (self.split,
                                      len(list(set(self.env_idx)))))

            if input_type != 'ques':
                '''
                If training, randomly sample and load a subset of environments,
                train on those, and then cycle through to load the rest.

                On the validation and test set, load in order, and cycle through.

                For both, add optional caching so that if all environments
                have been cycled through once, then no need to re-load and
                instead, just the cache can be used.
                '''

                self.api_threads = []
                self._load_envs(start_idx=0, in_order=True)

                cnn_kwargs = {'num_classes': 191, 'pretrained': True}
                self.cnn = MultitaskCNN(**cnn_kwargs)
                self.cnn.eval()
                self.cnn.cuda()

            self.pos_queue = data[self.split + '_pos_queue']
            self.boxes = data[self.split + '_boxes']

        print('Reading question data into memory')
        self.idx = _dataset_to_tensor(questions_h5['idx'])
        self.questions = _dataset_to_tensor(questions_h5['questions'])
        self.answers = _dataset_to_tensor(questions_h5['answers'])
        self.actions = _dataset_to_tensor(questions_h5['action_labels'])
        self.action_lengths = _dataset_to_tensor(
            questions_h5['action_lengths'])

        if input_type == 'pacman':

            self.planner_actions = self.actions.clone().fill_(0)
            self.controller_actions = self.actions.clone().fill_(-1)

            self.planner_action_lengths = self.action_lengths.clone().fill_(0)
            self.controller_action_lengths = self.action_lengths.clone().fill_(
                0)

            self.planner_hidden_idx = self.actions.clone().fill_(0)

            self.planner_pos_queue_idx, self.controller_pos_queue_idx = [], []

            # parsing flat actions to planner-controller hierarchy
            for i in tqdm(range(len(self.actions))):

                pa, ca, pq_idx, cq_idx, ph_idx = flat_to_hierarchical_actions(self.actions[i][:self.action_lengths[i]+1])

                self.planner_actions[i][:len(pa)] = torch.Tensor(pa)
                self.controller_actions[i][:len(ca)] = torch.Tensor(ca)

                self.planner_action_lengths[i] = len(pa)-1
                self.controller_action_lengths[i] = len(ca)

                self.planner_pos_queue_idx.append(pq_idx)
                self.controller_pos_queue_idx.append(cq_idx)

                self.planner_hidden_idx[i][:len(ca)] = torch.Tensor(ph_idx)

    def _pick_envs_to_load(self,
                           split='train',
                           max_envs=10,
                           start_idx=0,
                           in_order=False):
        if split in ['val', 'test'] or in_order == True:
            pruned_env_set = self.env_set[start_idx:start_idx + max_envs]
        else:
            if max_envs < len(self.env_set):
                env_inds = np.random.choice(
                    len(self.env_set), max_envs, replace=False)
            else:
                env_inds = np.random.choice(
                    len(self.env_set), max_envs, replace=True)
            pruned_env_set = [self.env_set[x] for x in env_inds]
        return pruned_env_set

    def _load_envs(self, start_idx=-1, in_order=False):
        if start_idx == -1:
            start_idx = self.env_set.index(self.pruned_env_set[-1]) + 1

        # Pick envs
        self.pruned_env_set = self._pick_envs_to_load(
            split=self.split,
            max_envs=self.max_threads_per_gpu,
            start_idx=start_idx,
            in_order=in_order)

        if len(self.pruned_env_set) == 0:
            return

        # Load api threads
        start = time.time()
        if len(self.api_threads) == 0:
            for i in range(len(self.pruned_env_set)):
                self.api_threads.append(
                    objrender.RenderAPIThread(
                        w=224, h=224, device=self.gpu_id))

        self.cfg = load_config('../House3D/tests/config.json')

        print('[%.02f] Loaded %d api threads' % (time.time() - start,
                                                 len(self.api_threads)))
        start = time.time()

        # Load houses
        from multiprocessing import Pool
        _args = ([h, self.cfg, self.map_resolution]
                 for h in self.pruned_env_set)
        with Pool(len(self.pruned_env_set)) as pool:
            self.all_houses = pool.starmap(local_create_house, _args)

        print('[%.02f] Loaded %d houses' % (time.time() - start,
                                            len(self.all_houses)))
        start = time.time()

        # Load envs
        self.env_loaded = {}
        for i in range(len(self.all_houses)):
            print('[%02d/%d][split:%s][gpu:%d][house:%s]' %
                  (i + 1, len(self.all_houses), self.split, self.gpu_id,
                   self.all_houses[i].house['id']))
            self.env_loaded[self.all_houses[i].house['id']] = House3DUtils(
                Environment(self.api_threads[i], self.all_houses[i], self.cfg),
                target_obj_conn_map_dir=self.target_obj_conn_map_dir,
                build_graph=False)

        # [TODO] Unused till now
        self.env_ptr = -1

        print('[%.02f] Loaded %d house3d envs' % (time.time() - start,
                                                  len(self.env_loaded)))

        # Mark available data indices
        self.available_idx = [
            i for i, v in enumerate(self.env_list) if v in self.env_loaded
        ]
        print('Available inds: %d' % len(self.available_idx))

        # Flag to check if loaded envs have been cycled through or not
        # [TODO] Unused till now
        self.all_envs_loaded = False

    def _clear_api_threads(self):
        for i in range(len(self.api_threads)):
            del self.api_threads[0]
        self.api_threads = []

    def _check_if_all_envs_loaded(self):
        print('[CHECK][Cache:%d][Total:%d]' % (len(self.img_data_cache),
                                               len(self.env_list)))
        if len(self.img_data_cache) == len(self.env_list):
            self.available_idx = [i for i, v in enumerate(self.env_list)]
            return True
        else:
            return False

    def set_camera(self, e, pos, robot_height=1.0):
        assert len(pos) == 4

        e.env.cam.pos.x = pos[0]
        e.env.cam.pos.y = robot_height
        e.env.cam.pos.z = pos[2]
        e.env.cam.yaw = pos[3]

        e.env.cam.updateDirection()

    def render(self, e):
        return e.env.render()

    def get_frames(self, e, pos_queue, preprocess=True):
        if isinstance(pos_queue, list) == False:
            pos_queue = [pos_queue]

        res = []
        for i in range(len(pos_queue)):
            self.set_camera(e, pos_queue[i])
            img = np.array(self.render(e), copy=False, dtype=np.float32)

            if preprocess == True:
                img = img.transpose(2, 0, 1)
                img = img / 255.0

            res.append(img)

        return np.array(res)

    def get_hierarchical_features_till_spawn(self, actions, backtrack_steps=0):

        action_length = len(actions)-1
        pa, ca, pq_idx, cq_idx, ph_idx = flat_to_hierarchical_actions(actions)

        target_pos_idx = action_length - backtrack_steps

        controller_step = True
        if target_pos_idx in pq_idx:
            controller_step = False

        pq_idx_pruned = [v for v in pq_idx if v <= target_pos_idx]
        pa_pruned = pa[:len(pq_idx_pruned)+1]

        images = self.get_frames(
            self.episode_house,
            self.episode_pos_queue,
            preprocess=True)
        raw_img_feats = self.cnn(
            Variable(torch.FloatTensor(images)
                     .cuda())).data.cpu().numpy().copy()

        controller_img_feat, controller_action_in = False, False
        if controller_step == True:
            controller_img_feat = torch.from_numpy(raw_img_feats[target_pos_idx].copy())
            controller_action_in = pa_pruned[-1] - 2

        planner_img_feats = torch.from_numpy(raw_img_feats[pq_idx_pruned].copy())
        planner_actions_in = torch.from_numpy(np.array(pa_pruned[:-1]) - 1)

        return planner_actions_in, planner_img_feats, controller_step, controller_action_in, controller_img_feat, self.episode_pos_queue[target_pos_idx]


    def __getitem__(self, index):
        # [VQA] question-only
        if self.input_type == 'ques':
            idx = self.idx[index]
            question = self.questions[index]
            answer = self.answers[index]

            return (idx, question, answer)

        # [VQA] question+image
        elif self.input_type == 'ques,image':
            index = self.available_idx[index]

            idx = self.idx[index]
            question = self.questions[index]
            answer = self.answers[index]

            action_length = self.action_lengths[index]
            actions = self.actions[index]

            actions_in = actions[action_length - self.num_frames:action_length]
            actions_out = actions[action_length - self.num_frames + 1:
                                  action_length + 1]

            if self.to_cache == True and index in self.img_data_cache:
                images = self.img_data_cache[index]
            else:
                pos_queue = self.pos_queue[index][
                    -self.num_frames:]  # last 5 frames
                images = self.get_frames(
                    self.env_loaded[self.env_list[index]],
                    pos_queue,
                    preprocess=True)
                if self.to_cache == True:
                    self.img_data_cache[index] = images.copy()

            return (idx, question, answer, images, actions_in, actions_out,
                    action_length)

        # [NAV] question+cnn
        elif self.input_type in ['cnn', 'cnn+q']:

            index = self.available_idx[index]

            idx = self.idx[index]
            question = self.questions[index]
            answer = self.answers[index]

            action_length = self.action_lengths[index]
            actions = self.actions[index]

            if self.to_cache == True and index in self.img_data_cache:
                img_feats = self.img_data_cache[index]
            else:
                pos_queue = self.pos_queue[index]
                images = self.get_frames(
                    self.env_loaded[self.env_list[index]],
                    pos_queue,
                    preprocess=True)
                img_feats = self.cnn(
                    Variable(torch.FloatTensor(images)
                             .cuda())).data.cpu().numpy().copy()
                if self.to_cache == True:
                    self.img_data_cache[index] = img_feats

            # for val or test (evaluation), or
            # when target_obj_conn_map_dir is defined (reinforce),
            # load entire shortest path navigation trajectory
            # and load connectivity map for intermediate rewards
            if self.split in ['val', 'test'
                              ] or self.target_obj_conn_map_dir != False:
                target_obj_id, target_room = False, False
                bbox_obj = [
                    x for x in self.boxes[index]
                    if x['type'] == 'object' and x['target'] == True
                ][0]['box']
                for obj_id in self.env_loaded[self.env_list[index]].objects:
                    box2 = self.env_loaded[self.env_list[index]].objects[
                        obj_id]['bbox']
                    if all([bbox_obj['min'][x] == box2['min'][x] for x in range(3)]) == True and \
                        all([bbox_obj['max'][x] == box2['max'][x] for x in range(3)]) == True:
                        target_obj_id = obj_id
                        break
                bbox_room = [
                    x for x in self.boxes[index]
                    if x['type'] == 'room' and x['target'] == False
                ][0]
                for room in self.env_loaded[self.env_list[
                        index]].env.house.all_rooms:
                    if all([room['bbox']['min'][i] == bbox_room['box']['min'][i] for i in range(3)]) and \
                        all([room['bbox']['max'][i] == bbox_room['box']['max'][i] for i in range(3)]):
                        target_room = room
                        break
                assert target_obj_id != False
                assert target_room != False
                self.env_loaded[self.env_list[index]].set_target_object(
                    self.env_loaded[self.env_list[index]].objects[
                        target_obj_id], target_room)

                # [NOTE] only works for batch size = 1
                self.episode_pos_queue = self.pos_queue[index]
                self.episode_house = self.env_loaded[self.env_list[index]]
                self.target_room = target_room
                self.target_obj = self.env_loaded[self.env_list[
                    index]].objects[target_obj_id]

                actions_in = actions[:action_length]
                actions_out = actions[1:action_length + 1] - 2

                return (idx, question, answer, img_feats, actions_in,
                        actions_out, action_length)

            # if action_length is n
            # images.shape[0] is also n
            # actions[0] is <START>
            # actions[n] is <END>

            # grab 5 random frames
            # [NOTE]: this'll break for longer-than-5 navigation sequences
            start_idx = np.random.choice(img_feats.shape[0] + 1 -
                                         self.num_frames)
            img_feats = img_feats[start_idx:start_idx + self.num_frames]

            actions_in = actions[start_idx:start_idx + self.num_frames]
            actions_out = actions[start_idx + self.num_frames] - 2

            return (idx, question, answer, img_feats, actions_in, actions_out,
                    action_length)

        # [NAV] question+lstm
        elif self.input_type in ['lstm', 'lstm+q']:

            index = self.available_idx[index]

            idx = self.idx[index]
            question = self.questions[index]
            answer = self.answers[index]

            action_length = self.action_lengths[index]
            actions = self.actions[index]

            if self.split == 'train':
                if self.to_cache == True and index in self.img_data_cache:
                    img_feats = self.img_data_cache[index]
                else:
                    pos_queue = self.pos_queue[index]
                    images = self.get_frames(
                        self.env_loaded[self.env_list[index]],
                        pos_queue,
                        preprocess=True)
                    raw_img_feats = self.cnn(
                        Variable(torch.FloatTensor(images)
                                 .cuda())).data.cpu().numpy().copy()
                    img_feats = np.zeros(
                        (self.actions.shape[1], raw_img_feats.shape[1]),
                        dtype=np.float32)
                    img_feats[:raw_img_feats.shape[
                        0], :] = raw_img_feats.copy()
                    if self.to_cache == True:
                        self.img_data_cache[index] = img_feats

            actions_in = actions.clone() - 1
            actions_out = actions[1:].clone() - 2

            actions_in[action_length:].fill_(0)
            mask = actions_out.clone().gt(-1)
            if len(actions_out) > action_length:
                actions_out[action_length:].fill_(0)

            # for val or test (evaluation), or
            # when target_obj_conn_map_dir is defined (reinforce),
            # load entire shortest path navigation trajectory
            # and load connectivity map for intermediate rewards
            if self.split in ['val', 'test'
                              ] or self.target_obj_conn_map_dir != False:
                target_obj_id, target_room = False, False
                bbox_obj = [
                    x for x in self.boxes[index]
                    if x['type'] == 'object' and x['target'] == True
                ][0]['box']
                for obj_id in self.env_loaded[self.env_list[index]].objects:
                    box2 = self.env_loaded[self.env_list[index]].objects[
                        obj_id]['bbox']
                    if all([bbox_obj['min'][x] == box2['min'][x] for x in range(3)]) == True and \
                        all([bbox_obj['max'][x] == box2['max'][x] for x in range(3)]) == True:
                        target_obj_id = obj_id
                        break
                bbox_room = [
                    x for x in self.boxes[index]
                    if x['type'] == 'room' and x['target'] == False
                ][0]
                for room in self.env_loaded[self.env_list[
                        index]].env.house.all_rooms:
                    if all([room['bbox']['min'][i] == bbox_room['box']['min'][i] for i in range(3)]) and \
                        all([room['bbox']['max'][i] == bbox_room['box']['max'][i] for i in range(3)]):
                        target_room = room
                        break
                assert target_obj_id != False
                assert target_room != False
                self.env_loaded[self.env_list[index]].set_target_object(
                    self.env_loaded[self.env_list[index]].objects[
                        target_obj_id], target_room)

                # [NOTE] only works for batch size = 1
                self.episode_pos_queue = self.pos_queue[index]
                self.episode_house = self.env_loaded[self.env_list[index]]
                self.target_room = target_room
                self.target_obj = self.env_loaded[self.env_list[
                    index]].objects[target_obj_id]

                return (idx, question, answer, False, actions_in, actions_out,
                        action_length, mask)

            return (idx, question, answer, img_feats, actions_in, actions_out,
                    action_length, mask)

        # [NAV] planner-controller
        elif self.input_type in ['pacman']:

            index = self.available_idx[index]

            idx = self.idx[index]
            question = self.questions[index]
            answer = self.answers[index]

            action_length = self.action_lengths[index]
            actions = self.actions[index]

            planner_actions = self.planner_actions[index]
            controller_actions = self.controller_actions[index]

            planner_action_length = self.planner_action_lengths[index]
            controller_action_length = self.controller_action_lengths[index]

            planner_hidden_idx = self.planner_hidden_idx[index]

            if self.split == 'train':
                if self.to_cache == True and index in self.img_data_cache:
                    img_feats = self.img_data_cache[index]
                else:
                    pos_queue = self.pos_queue[index]
                    images = self.get_frames(
                        self.env_loaded[self.env_list[index]],
                        pos_queue,
                        preprocess=True)
                    raw_img_feats = self.cnn(
                        Variable(torch.FloatTensor(images)
                                 .cuda())).data.cpu().numpy().copy()
                    img_feats = np.zeros(
                        (self.actions.shape[1], raw_img_feats.shape[1]),
                        dtype=np.float32)
                    img_feats[:raw_img_feats.shape[
                        0], :] = raw_img_feats.copy()
                    if self.to_cache == True:
                        self.img_data_cache[index] = img_feats

            if self.split in ['val', 'test'] or self.target_obj_conn_map_dir != False:
                target_obj_id, target_room = False, False
                bbox_obj = [
                    x for x in self.boxes[index]
                    if x['type'] == 'object' and x['target'] == True
                ][0]['box']
                for obj_id in self.env_loaded[self.env_list[index]].objects:
                    box2 = self.env_loaded[self.env_list[index]].objects[
                        obj_id]['bbox']
                    if all([bbox_obj['min'][x] == box2['min'][x] for x in range(3)]) == True and \
                        all([bbox_obj['max'][x] == box2['max'][x] for x in range(3)]) == True:
                        target_obj_id = obj_id
                        break
                bbox_room = [
                    x for x in self.boxes[index]
                    if x['type'] == 'room' and x['target'] == False
                ][0]
                for room in self.env_loaded[self.env_list[
                        index]].env.house.all_rooms:
                    if all([room['bbox']['min'][i] == bbox_room['box']['min'][i] for i in range(3)]) and \
                        all([room['bbox']['max'][i] == bbox_room['box']['max'][i] for i in range(3)]):
                        target_room = room
                        break
                assert target_obj_id != False
                assert target_room != False
                self.env_loaded[self.env_list[index]].set_target_object(
                    self.env_loaded[self.env_list[index]].objects[
                        target_obj_id], target_room)

                # [NOTE] only works for batch size = 1
                self.episode_pos_queue = self.pos_queue[index]
                self.episode_house = self.env_loaded[self.env_list[index]]
                self.target_room = target_room
                self.target_obj = self.env_loaded[self.env_list[
                    index]].objects[target_obj_id]

                return (idx, question, answer, actions, action_length)

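            # slice the cached full-trajectory image features into separate
            # planner and controller streams via their position-queue indices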
            planner_pos_queue_idx = self.planner_pos_queue_idx[index]
            controller_pos_queue_idx = self.controller_pos_queue_idx[index]

            planner_img_feats = np.zeros(
                (self.actions.shape[1], img_feats.shape[1]), dtype=np.float32)
            planner_img_feats[:planner_action_length] = img_feats[
                planner_pos_queue_idx]

            planner_actions_in = planner_actions.clone() - 1
            planner_actions_out = planner_actions[1:].clone() - 2

            planner_actions_in[planner_action_length:].fill_(0)
            planner_mask = planner_actions_out.clone().gt(-1)
            if len(planner_actions_out) > planner_action_length:
                planner_actions_out[planner_action_length:].fill_(0)

            controller_img_feats = np.zeros(
                (self.actions.shape[1], img_feats.shape[1]), dtype=np.float32)
            controller_img_feats[:controller_action_length] = img_feats[
                controller_pos_queue_idx]

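            # controller inputs reuse the (shifted) full action sequence; the
            # controller targets are presumably its binary keep-going labels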
            controller_actions_in = actions[1:].clone() - 2
            if len(controller_actions_in) > controller_action_length:
                controller_actions_in[controller_action_length:].fill_(0)

            controller_out = controller_actions
            controller_mask = controller_out.clone().gt(-1)
            if len(controller_out) > controller_action_length:
                controller_out[controller_action_length:].fill_(0)

            return (idx, question, answer, planner_img_feats,
                    planner_actions_in, planner_actions_out,
                    planner_action_length, planner_mask, controller_img_feats,
                    controller_actions_in, planner_hidden_idx, controller_out,
                    controller_action_length, controller_mask)

    def __len__(self):
        if self.input_type == 'ques':
            return len(self.questions)
        else:
            return len(self.available_idx)
Ejemplo n.º 4
0
class EqaDataset(Dataset):
    def __init__(self,
                 questions_h5,
                 vocab,
                 num_frames=1,
                 split='train',
                 gpu_id=0,
                 input_type='ques',
                 max_threads_per_gpu=10,
                 map_resolution=1000):

        self.questions_h5 = questions_h5
        self.vocab = load_vocab(vocab)
        np.random.seed()

        self.split = split
        self.gpu_id = gpu_id
        self.num_frames = num_frames

        self.input_type = input_type

        self.max_threads_per_gpu = max_threads_per_gpu
        self.map_resolution = map_resolution

        print('Reading question data into memory')
        self.questions = _dataset_to_tensor(questions_h5['questions'])
        self.answers = _dataset_to_tensor(questions_h5['answers'])
        self.actions = _dataset_to_tensor(questions_h5['actions'])
        self.actions = self.actions.unsqueeze(2)
        self.robot_positions = _dataset_to_tensor(
            questions_h5['robot_positions'], dtype=np.float32)
        self.action_images = questions_h5['images']
        self.action_lengths = _dataset_to_tensor(
            questions_h5['action_lengths'])
        self.action_masks = _dataset_to_tensor(questions_h5['mask'])

        #if input_type != 'ques':
        '''
        If training, randomly sample and load a subset of environments,
        train on those, and then cycle through to load the rest.

        On the validation and test set, load in order, and cycle through.

        For both, add optional caching so that if all environments
        have been cycled through once, then no need to re-load and
        instead, just the cache can be used.
        '''
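        # Note: this loader reads everything it needs from the HDF5 file
        # above, so the environment cycling/caching described in the
        # docstring appears to be unused in this variant.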
        cnn_kwargs = {'num_classes': 191, 'pretrained': True}
        self.cnn = MultitaskCNN(**cnn_kwargs)
        self.cnn.eval()
        self.cnn.cuda()

    def __getitem__(self, index):
        # [VQA] question-only
        if self.input_type in ['pacman']:

            idx = index
            question = self.questions[index]
            #answer = self.answers[index]
            answer = self.answers[index]
            actions = self.actions[index]
            actions_masks = self.action_masks[index]
            robot_positions = self.robot_positions[index]
            action_lengths = self.action_lengths[index]

            if self.split in ['val', 'test']:  #return the data directly
                return (idx, question, answer, actions, robot_positions,
                        action_lengths)

            if self.split == 'train':  #get image from dataset
                planner_images = self.action_images[index]
                planner_img_feats = self.cnn(
                    Variable(torch.FloatTensor(
                        planner_images).cuda())).data.cpu().numpy().copy()
                actions_in = actions.clone()
                actions_out = actions[1:].clone()
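                # 39 is assumed to be the fixed maximum number of action
                # steps stored per episode in the HDF5 file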
                actions_masks = actions_masks[:39].clone().gt(0)
                robot_positions = robot_positions.clone()

            return (idx, question, answer, planner_img_feats, actions_in,
                    actions_out, robot_positions, actions_masks,
                    action_lengths)

        elif self.input_type == 'ques,image':
            idx = index
            question = self.questions[index]
            answer = self.answers[index]

            action_length = self.action_lengths[index]
            actions = self.actions[index]

            actions_in = actions[action_length - self.num_frames:action_length]
            actions_out = actions[action_length - self.num_frames +
                                  1:action_length + 1]

            images = self.action_images[index][
                action_length - self.num_frames:action_length].astype(
                    np.float32)

            return (idx, question, answer, images, actions_in, actions_out,
                    action_length)

    def __len__(self):
        if self.input_type == 'ques':
            return len(self.questions)
        else:
            return len(self.questions)
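
A minimal usage sketch for the loader above (hedged: the file name eqa_train.h5, the vocab path, and the unpacking order are assumptions inferred from the 'pacman' branch of __getitem__, not a confirmed entry point):

import h5py
from torch.utils.data import DataLoader

questions_h5 = h5py.File('eqa_train.h5', 'r')  #assumed file name
dataset = EqaDataset(questions_h5, 'vocab.json', split='train',
                     input_type='pacman')
loader = DataLoader(dataset, batch_size=1, shuffle=True)  #batch size 1 for simplicity

for (idx, question, answer, planner_img_feats, actions_in, actions_out,
     robot_positions, actions_masks, action_lengths) in loader:
    pass  #feed question, planner_img_feats, actions_in, robot_positions to the planner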
Ejemplo n.º 5
0
def test(rank, test_model_dir):
    model_kwargs = {'question_vocab': load_vocab(args.vocab_json)}
    model = NavPlannerControllerModel(**model_kwargs)
    checkpoint = torch.load(test_model_dir)  #load checkpoint weights
    model.load_state_dict(checkpoint['state'])  #create model

    cnn_kwargs = {'num_classes': 191, 'pretrained': True}
    cnn = MultitaskCNN(**cnn_kwargs)
    cnn.eval()
    cnn.cuda()  #create cnn model

    scene = "test-10-obj-00.txt"
    my_env = enviroment.Environment(is_testing=1, testing_file=scene)
    object_exist_list = my_env.ur5.object_type
    print("the objetct which is exist:")
    print(object_exist_list)  #create simulation enviroment

    my_question = Qusetion(object_exist_list)  #create testing question
    testing_questions = my_question.createQueue()
    vocab = my_question.create_vocab()

    for question in testing_questions:
        planner_hidden = None
        max_action = 30
        position = [0, 0]
        action_in_raw = [0]  #start action_in
        actions = []

        print(question['question'])  #question
        questionTokens = my_question.tokenize(question['question'],
                                              punctToRemove=['?'],
                                              addStartToken=False)
        encoded_question_raw = my_question.encode(questionTokens,
                                                  vocab['questionTokenToIdx'])
        encoded_question_raw.append(0)  #encode question
        encoded_question_raw = np.array(encoded_question_raw)
        encoded_question_tensor = _dataset_to_tensor(encoded_question_raw)
        encoded_question = Variable(encoded_question_tensor)
        encoded_question = encoded_question.unsqueeze(0)
        action_times = 0

        while (action_times < max_action):

            #print(planner_img_feats_var.size())
            action_in_tensor = _dataset_to_tensor(action_in_raw)
            action_in = Variable(action_in_tensor)
            action_in = action_in.unsqueeze(0)
            action_in = action_in.unsqueeze(0)

            _, rgb_image_raw = my_env.camera.get_camera_data()
            position_in, planner_img_feats_var = data2input(
                position, rgb_image_raw, cnn)

            output_data, planner_hidden = model.planner_step(
                encoded_question, planner_img_feats_var, action_in,
                position_in, planner_hidden)
            planner_possi = F.log_softmax(output_data, dim=1)
            planner_data = planner_possi.data.numpy()
            planner_data = planner_data[0]
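            # greedy decoding: pick the action with the highest log-probability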
            action_out = np.where(planner_data == np.max(planner_data))
            action_out = action_out[0][0]

            actions.append(action_out)
            action_in_raw = [action_out]
            if action_out == 9:
                print('stop')
                break
            else:
                dx, dy = order2action(action_out)
                position[0] += dx
                position[1] += dy
            action_times += 1

        if len(actions) > 2 and len(actions) < 20:
            action_position = position + position
            my_env.UR5_action(action_position, 2)  #sucking
        elif len(actions) >= 20:  #pushing
            position_start = [0, 0]
            position_end = [0, 0]
            for i in range(len(actions)):
                if i < len(actions) / 2:  #the first step
                    dx, dy = order2action(actions[i])
                    position_start[0] += dx
                    position_start[1] += dy
                    position_end[0] += dx
                    position_end[1] += dy
                else:  #the second step
                    dx, dy = order2action(actions[i])
                    position_end[0] += dx
                    position_end[1] += dy
            action_position = position_start + position_end
            my_env.UR5_action(action_position, 1)  #pushing
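
The push branch above turns one long action sequence into a start/end pair: the first half of the steps moves both endpoints, the second half moves only the end point. A small self-contained sketch of that interpretation (order2action_stub and actions_to_push are hypothetical names; the real order2action mapping lives elsewhere in this repo):

def order2action_stub(action):
    #assumed mapping of action ids 1-8 to unit steps, for illustration only
    steps = {1: (1, 0), 2: (-1, 0), 3: (0, 1), 4: (0, -1),
             5: (1, 1), 6: (1, -1), 7: (-1, 1), 8: (-1, -1)}
    return steps.get(action, (0, 0))

def actions_to_push(actions):
    position_start, position_end = [0, 0], [0, 0]
    for i, a in enumerate(actions):
        dx, dy = order2action_stub(a)
        if i < len(actions) / 2:  #first half moves both endpoints
            position_start[0] += dx
            position_start[1] += dy
        position_end[0] += dx  #every step moves the end point
        position_end[1] += dy
    return position_start + position_end  #[x0, y0, x1, y1] for UR5_action

print(actions_to_push([1, 1, 3, 3]))  #-> [2, 0, 2, 2]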
Ejemplo n.º 6
0
class EqaDataset(Dataset):
    def __init__(self,
                 questions_h5,
                 vocab,
                 num_frames=1,
                 split='train',
                 gpu_id=0,
                 input_type='ques',
                 max_threads_per_gpu=10,
                 map_resolution=1000):

        self.questions_h5 = questions_h5
        self.vocab = load_vocab(vocab)
        np.random.seed()
        
        self.split = split
        self.gpu_id = gpu_id
        self.num_frames = num_frames

        self.input_type = input_type

        self.max_threads_per_gpu = max_threads_per_gpu
        self.map_resolution = map_resolution


        print('Reading question data into memory')
        self.questions = _dataset_to_tensor(questions_h5['questions'])
        self.answers = _dataset_to_tensor(questions_h5['answers'])
        self.actions = _dataset_to_tensor(questions_h5['actions'])
        self.actions = self.actions.unsqueeze(2)
        self.robot_positions = _dataset_to_tensor(
            questions_h5['robot_positions'], dtype=np.float32)
        self.action_images = questions_h5['images']
        self.action_maps = questions_h5['heatmaps']
        self.action_lengths = _dataset_to_tensor(questions_h5['action_lengths'])
        self.action_masks = _dataset_to_tensor(questions_h5['mask'])


        cnn_kwargs = {'num_classes': 191, 'pretrained': True}
        self.cnn = MultitaskCNN(**cnn_kwargs)
        self.cnn.eval()
        self.cnn.cuda()

    def __getitem__(self, index):
        # [VQA] question-only
        if self.input_type in ['nomap']:

            idx = index
            question = self.questions[index]
            #answer = self.answers[index]
            answer = self.answers[index]
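            # assumption: answer ids above 13 are shifted down by one to skip
            # an unused entry in the answer vocabulary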
            if answer > 13:
                answer = answer - 1
            actions = self.actions[index]
            actions_masks = self.action_masks[index]
            robot_positions = self.robot_positions[index]
            action_lengths = self.action_lengths[index]


            if self.split in ['val', 'test']:  #return the data directly
                return (idx, question, answer, actions, robot_positions,
                        action_lengths)

            if self.split == 'train':  #get image from dataset
                planner_images = self.action_images[index][0]
                planner_var = Variable(
                    torch.FloatTensor(planner_images).cuda())
                planner_var = planner_var.unsqueeze(0)

                planner_img_feats = self.cnn(
                    planner_var).data.cpu().numpy().copy()

                actions_in = actions.clone()
                actions_out = actions[1:].clone()
                actions_masks = actions_masks[:39].clone().gt(0)
                robot_positions = robot_positions.clone()

            return (idx, question, answer, planner_img_feats, actions_in,
                    actions_out, robot_positions, actions_masks,
                    action_lengths)

        elif self.input_type == 'addmap':

            idx = index
            question = self.questions[index]
            #answer = self.answers[index]
            answer = self.answers[index]
            if answer > 13:
                answer = answer - 1
            actions = self.actions[index]
            actions_masks = self.action_masks[index]
            robot_positions = self.robot_positions[index]
            action_lengths = self.action_lengths[index]


            if self.split in ['val', 'test']:  #return the data directly
                return (idx, question, answer, actions, robot_positions,
                        action_lengths)

            if self.split == 'train':  #get image from dataset
                planner_images = self.action_images[index][0]
                planner_var = Variable(
                    torch.FloatTensor(planner_images).cuda())
                planner_var = planner_var.unsqueeze(0)

                planner_img_feats = self.cnn(
                    planner_var).data.cpu().numpy().copy()

                planner_maps = self.action_maps[index][0]
                planner_maps_feats = Variable(
                    torch.FloatTensor(planner_maps).cuda())
                

                #planner_maps_feats = planner_maps_var.view(-1,32*32*20)
                                            
                actions_in = actions.clone()
                actions_out = actions[1:].clone()
                actions_masks = actions_masks[:39].clone().gt(0)
                robot_positions = robot_positions.clone()

            return (idx, question, answer, planner_img_feats,
                    planner_maps_feats, actions_in, actions_out,
                    robot_positions, actions_masks, action_lengths)


        elif self.input_type == 'ques,image':
            idx = index
            question = self.questions[index]
            answer = self.answers[index]
            if answer > 13:
                answer = answer - 1

            action_length = self.action_lengths[index]
            actions = self.actions[index]

            actions_in = actions[action_length - self.num_frames:action_length]
            actions_out = actions[action_length - self.num_frames + 1:
                                  action_length + 1]

            images = self.action_images[index][0:2].astype(np.float32)

            return (idx, question, answer, images, actions_in, actions_out,
                    action_length)

    def __len__(self):
        if self.input_type == 'ques':
            return len(self.questions)
        else:
            return len(self.questions)