Code Example #1
    def roll_out_in_env(self,
                        start,
                        goal,
                        ultimate_goal,
                        horizon,
                        mode='test'):
        s_u_goal = self.grid.lls2hls(ultimate_goal)
        roll_out = Batch()
        break_var = False
        s = start
        d = False

        s_list = []
        a_list = []
        r_list = []

        for step_i in range(horizon):
            self.episode_steps += 1
            s_bar = self.grid.lls2hls(s)

            # GET THE TARGET VECTOR
            target_vec = (goal[0] - s[0], goal[1] - s[1])
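            # Policy input: the low-level state without its x-y position, the cached
            # features of the current grid cell, and the offset to the current sub-goal.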
            s_save = copy.deepcopy(
                np.array(
                    list(s[2:]) +
                    list(self.grid.state_cache[s_bar[0], s_bar[1]]) +
                    list(target_vec)))
            s_pos_save = copy.deepcopy(np.array(s[:2]))
            s_list.append(np.concatenate((s_pos_save, s_save)))
            s = torch.tensor(s_save, dtype=torch.float).unsqueeze(0)
            a = self.policy.select_action(s)

            s_new, r, d, _, info = self.grid.step(action=10 * a)
            s_bar_cand = self.grid.lls2hls(s_new)

            d = (s_bar_cand[0] == goal[0]) and (s_bar_cand[1] == goal[1])
            if d:
                info = False
                r = 1.0

            success_var = ((s_bar_cand[0] == s_u_goal[0])
                           and (s_bar_cand[1] == s_u_goal[1]))
            if success_var:
                info = False
                break_var = True

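            # Terminate when a goal is reached, the environment signals the end of an
            # episode, the local horizon runs out, or the global step budget is spent.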
            break_var = break_var or (not info) or (step_i + 1 == horizon) or (
                self.episode_steps == self.config.time_horizon)

            a_list.append(a)
            r_list.append(r)
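            # Transition fields (inferred from usage): action, state, reward,
            # next state, continuation mask, not-done flag, and a constant weight.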
            roll_out.append(
                Batch([a.astype(np.float32)], [s_save.astype(np.float32)], [r],
                      [s_new.astype(np.float32)], [0 if break_var else 1],
                      [info], [1.0]))
            s = s_new
            if break_var:
                break

        return roll_out, s_list, a_list, r_list, d, s_new, s_bar_cand
Code Example #2
    def simulate_env(self, mode):
        batch = Batch()
        num_roll_outs = 0
        num_steps = 0
        total_success = 0

        if mode == 'train':
            while num_steps < self.iter_size:

                roll_out, steps, states, actions, rewards, success, start_pos, goal_pos = self.roll_out_in_env(
                    horizon=self.max_iter,
                    mode='train')

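                # After the initial warm-up period, run 40 manager-policy updates for
                # every rollout collected.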
                if self.total_steps > self.start_time_steps:
                    for _ in range(40):
                        self.manager_policy.train(self.replay_buffer, self.batch_size)

                batch.append(roll_out)
                num_roll_outs += 1
                num_steps += steps

                total_success += success

            return batch, total_success / num_roll_outs, num_steps / num_roll_outs

        else:
            _, steps, states, actions, rewards, success, start_pos, goal_pos = self.roll_out_in_env(
                horizon=self.max_iter,
                mode='test')

        return success
Code Example #3
    def simulate_env(self, mode):
        batch = Batch()
        num_roll_outs = 0
        num_steps = 0
        total_success = 0
        total_wp_success = 0
        j = 0.
        jwp = 0.

        if mode == 'train':

            while num_steps < self.batch_size:

                self.grid.reset_env_terrain()
                start_pos = self.grid.sample_random_start_terrain(number=1)[0]
                goal_pos = self.grid.sample_random_goal_terrain(number=1)[0]
                s_goal = self.grid.lls2hls(goal_pos)
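                # lls2hls maps a low-level position to its high-level grid cell.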

                roll_out, states, actions, rewards, success, l_state = self.roll_out_in_env(start=start_pos,
                                                                                            goal=goal_pos,
                                                                                            ultimate_goal=goal_pos,
                                                                                            horizon=self.max_iter)

                st_bar = self.grid.lls2hls(l_state)
                success = ((st_bar[0] == s_goal[0]) and (st_bar[1] == s_goal[1]))

                jwp = 1.
                num_roll_outs += 1
                num_steps += roll_out.length()
                batch.append(roll_out)

                total_success += success
                j += 1

            return batch, total_success / j, total_wp_success / jwp, num_steps / j, num_steps / num_roll_outs

        else:
            self.grid.reset_env_terrain()
            start_pos = self.grid.sample_random_start_terrain(number=1)[0]
            goal_pos = self.grid.sample_random_goal_terrain(number=1)[0]
            s_goal = self.grid.lls2hls(goal_pos)

            roll_out, states, actions, rewards, success, l_state = self.roll_out_in_env(start=start_pos,
                                                                                        goal=goal_pos,
                                                                                        ultimate_goal=goal_pos,
                                                                                        horizon=self.max_iter)

            st_bar = self.grid.lls2hls(l_state)
            success = ((st_bar[0] == s_goal[0]) and (st_bar[1] == s_goal[1]))

            num_roll_outs += 1
            num_steps += roll_out.length()
            batch.append(roll_out)

            total_success += success
            j += 1

            return success
Code Example #4
    def roll_out_in_env(self, start, horizon):
        roll_out = Batch()
        s = start
        s_list = []
        a_list = []
        r_list = []

        success_var = False
        oob = False
        break_var = False

        for step_i in range(horizon):

            target_vec = s['desired_goal'] - s['observation']
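            # Goal-conditioned policy input: offset to the desired goal together with
            # the relevant components of the raw observation.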
            s_save = copy.deepcopy(
                np.array(
                    list(target_vec[:2]) + list(s['observation'][2:4]) +
                    list(target_vec[2:4])))

            s_list.append(copy.deepcopy(s['observation']))
            s_tensor = torch.tensor(s_save, dtype=torch.float).unsqueeze(0)
            a = self.policy.select_action(s_tensor)

            s_new, r, d, info = self.grid.env.step(a)
            success_var = info["is_success"]
            info = not info["is_success"]

            r = 0.0
            if success_var:
                r = 1.0
                info = False
                break_var = True

            break_var = break_var or (not info) or (step_i + 1 == horizon)

            ib = self.grid.check_in_bounds(s_new['observation'])
            if not ib:
                oob = True

            a_list.append(a)
            r_list.append(r)
            roll_out.append(
                Batch([a.astype(np.float32)], [s_save.astype(np.float32)], [r],
                      [s_new['observation'].astype(np.float32)],
                      [0 if (break_var or oob) else 1], [info], [1.0]))

            s = s_new
            if break_var or oob:
                break

        s_list.append(copy.deepcopy(s['observation']))

        return roll_out, s_list, a_list, r_list, success_var
Code Example #5
    def roll_out_in_env(self, start, goal, ultimate_goal, horizon):
        roll_out = Batch()
        s_u_goal = self.grid.lls2hls(ultimate_goal)
        s = self.grid.reset(start, goal)
        s_list = []
        a_list = []
        r_list = []
        d = False

        for step_i in range(horizon):
            s_bar = self.grid.lls2hls(s)
            target_vec = goal - s[:2]
            s_save = copy.deepcopy(np.array(list(s[2:]) + list(self.grid.state_cache[s_bar[0], s_bar[1]]) +
                                            list(target_vec)))
            s_pos_save = copy.deepcopy(np.array(s[:2]))
            s_list.append(np.concatenate((s_pos_save, s_save)))
            s = torch.tensor(s_save, dtype=torch.float).unsqueeze(0)
            a = self.policy.select_action(s)

            s_new, r, d, d_wp, info = self.grid.step(action=10*a)

            s_bar_cand = self.grid.lls2hls(s_new)

            break_var = False
            success_var = ((s_bar_cand[0] == s_u_goal[0]) and (s_bar_cand[1] == s_u_goal[1]))

            if success_var:
                info = False
                break_var = True

            a_list.append(a)
            r_list.append(r)
            roll_out.append(
                Batch([a.astype(np.float32)],
                      [s_save.astype(np.float32)],
                      [r],
                      [s_new.astype(np.float32)],
                      [0 if ((not info) or (step_i + 1 == horizon) or break_var) else 1],
                      [info],
                      [1.0]))
            s = s_new
            if (not info) or break_var:
                break
        return roll_out, s_list, a_list, r_list, d, s_new
Code Example #6
    def simulate_env(self, mode):
        batch = Batch()
        num_roll_outs = 0
        num_steps = 0
        total_success = 0
        total_wp_success = 0
        j = 0.
        jwp = 0.

        if mode == 'train':

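            # Collect rollouts until the on-policy batch holds at least batch_size transitions.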
            while num_steps < self.batch_size:
                """ INITIALIZE THE ENVIRONMENT """
                s_init = self.grid.reset()

                roll_out, _, _, _, success = self.roll_out_in_env(
                    start=s_init, horizon=self.max_iter)

                jwp = 1.
                num_roll_outs += 1
                num_steps += roll_out.length()
                batch.append(roll_out)

                total_success += success
                j += 1

            return batch, total_success / j, total_wp_success / jwp, num_steps / j, num_steps / num_roll_outs

        else:
            """ INITIALIZE THE ENVIRONMENT """
            s_init = self.grid.reset()

            roll_out, state_list, action_list, reward_list, success = self.roll_out_in_env(
                start=s_init, horizon=self.max_iter)

            num_roll_outs += 1
            num_steps += roll_out.length()
            batch.append(roll_out)

            total_success += success
            j += 1

        return success
Code Example #7
    def simulate_env(self, mode):
        batch = Batch()
        num_roll_outs = 0
        num_steps = 0
        total_success = 0
        total_wp_success = 0
        j = 0.
        jwp = 0.

        if mode == 'train':

            while num_steps < self.config.policy_batch_size:
                """ INITIALIZE THE ENVIRONMENT """
                self.grid.reset_env_terrain()
                start_pos = self.grid.sample_random_start_terrain(number=1)[0]
                goal_pos = self.grid.sample_random_goal_terrain(number=1)[0]
                s_goal = self.grid.lls2hls(goal_pos)
                s_init = self.grid.reset(start_pos, goal_pos)
                self.episode_steps = 0
                """ V MAP """
                if self.config.optimistic_model:
                    self.vi.update_p_table_optimistic(
                        occupancy_map=self.grid.occupancy_map, walls=True)
                else:
                    self.vi.update_p_table(occupancy_map=self.grid.terrain_map,
                                           walls=True)

                v, pi = self.vi.run_vi(grid=self.grid,
                                       goal=(s_goal[0], s_goal[1]))
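                # Plan on the high-level grid with value iteration: v is the value map,
                # pi the greedy high-level policy towards the goal cell.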
                """ START THE EPISODE """
                horizon_left = self.config.time_horizon
                st = start_pos
                success = False

                s_bar = self.grid.lls2hls(st)
                hl_s_list = []
                hl_a_list = []
                hl_r_list = []
                hl_d_list = []
                hl_s_list.append(s_bar)

                while (horizon_left > 0) and not success:

                    # GET THE TARGET VECTOR
                    self.dqn_steps += 1
                    self.eps = 0.01 + 0.99 * math.exp(
                        -1. * self.dqn_steps / 10000)

                    s_bar = self.grid.lls2hls(st)

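                    # Epsilon-greedy high-level action: follow the VI policy with
                    # probability 1 - eps, otherwise pick one of the 8 moves at random
                    # (eps decays exponentially from ~1.0 towards 0.01).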
                    if torch.rand(1)[0] > self.eps:
                        a_bar = int(pi[s_bar[0], s_bar[1]])
                    else:
                        a_bar = randint(0, 7)

                    self.vi.set_target(s_bar, a_bar)
                    curr_goal = self.vi.get_target()
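                    # The selected high-level move defines the next waypoint; the
                    # low-level policy is rolled out towards it for up to time_scale steps.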

                    roll_out, _, _, _, wp_success, l_state, s_bar_p = self.roll_out_in_env(
                        start=s_init,
                        goal=curr_goal,
                        horizon=self.time_scale,
                        ultimate_goal=goal_pos,
                        mode='train')

                    hl_s_list.append(s_bar_p)
                    hl_a_list.append(a_bar)

                    st = l_state[:2]
                    s_init = l_state

                    num_roll_outs += 1
                    num_steps += roll_out.length()
                    horizon_left -= roll_out.length()

                    total_wp_success += wp_success
                    jwp += 1

                    st_bar = self.grid.lls2hls(l_state)
                    success = ((st_bar[0] == s_goal[0])
                               and (st_bar[1] == s_goal[1]))
                    if success:
                        hl_r_list.append(0)
                        hl_d_list.append(True)
                    else:
                        hl_r_list.append(-1)
                        hl_d_list.append(False)

                    batch.append(roll_out)

                total_success += success
                j += 1

                if not self.config.optimistic_model:
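                    # With a learned (non-optimistic) model, store the visited high-level
                    # transitions and refit the network used by the value-iteration planner.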
                    x_temp, y_temp, w_temp = self.vi.generate_dataset_flat(
                        self.grid.terrain_map, hl_s_list, hl_a_list)

                    for bi in range(x_temp.shape[0]):
                        self.buffer.add(x_temp[bi], y_temp[bi], w_temp[bi])

                    self.vi.train_net(buffer=self.buffer,
                                      bs=128,
                                      opt_iterations=40,
                                      rw=True)

            return batch, total_success / j, total_wp_success / jwp, num_steps / j, num_steps / num_roll_outs

        else:
            self.grid.reset_env_terrain()
            start_pos = self.grid.sample_random_start_terrain(number=1)[0]
            goal_pos = self.grid.sample_random_goal_terrain(number=1)[0]
            s_goal = self.grid.lls2hls(goal_pos)
            s_init = self.grid.reset(start_pos, goal_pos)
            self.episode_steps = 0
            """ V MAP """
            if self.config.optimistic_model:
                self.vi.update_p_table_optimistic(
                    occupancy_map=self.grid.occupancy_map, walls=True)
            else:
                self.vi.update_p_table(occupancy_map=self.grid.terrain_map,
                                       walls=True)

            v, pi = self.vi.run_vi(grid=self.grid, goal=(s_goal[0], s_goal[1]))

            horizon_left = self.config.time_horizon
            st = start_pos
            success = False

            s_bar = self.grid.lls2hls(st)

            while (horizon_left > 0) and not success:

                # GET THE TARGET VECTOR
                a_bar = int(pi[s_bar[0], s_bar[1]])
                self.vi.set_target(s_bar, a_bar)
                curr_goal = self.vi.get_target()

                roll_out, states, actions, rewards, wp_success, l_state, _ = self.roll_out_in_env(
                    start=s_init,
                    goal=curr_goal,
                    horizon=self.time_scale,
                    ultimate_goal=goal_pos,
                    mode='test')

                st = l_state[:2]
                s_bar = self.grid.lls2hls(st)
                s_init = l_state

                num_roll_outs += 1
                num_steps += roll_out.length()
                horizon_left -= roll_out.length()

                total_wp_success += wp_success
                jwp += 1

                st_bar = self.grid.lls2hls(l_state)
                success = ((st_bar[0] == s_goal[0])
                           and (st_bar[1] == s_goal[1]))

            return success
Code Example #8
    def simulate_env(self, mode):
        batch = Batch()
        num_roll_outs = 0
        num_steps = 0
        total_success = 0
        total_wp_success = 0
        j = 0.
        jwp = 0.

        if mode == 'train':

            while num_steps < self.config.policy_batch_size:
                """ INITIALIZE THE ENVIRONMENT """
                a, b, c, d, e, f, g, h, p = self.grid.reset_env_random()
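                # Deep-copy the freshly sampled layout and rebuild the MuJoCo ant
                # environment from it.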
                structure = copy.deepcopy(a)
                objects = copy.deepcopy(b)
                obstacle_list = copy.deepcopy(c)
                occupancy_map = copy.deepcopy(d)
                default_starts = copy.deepcopy(e)
                state_cache = copy.deepcopy(f)
                occupancy_map_padded = copy.deepcopy(g)
                occupancy_map_un_padded = copy.deepcopy(h)
                occupancy_map_original = copy.deepcopy(p)

                self.grid = Ant44Env0(
                    mj_ant_path=self.config.mj_ant_path,
                    re_init=True,
                    structure=structure,
                    objects=objects,
                    obstacle_list=obstacle_list,
                    occupancy_map=occupancy_map,
                    default_starts=default_starts,
                    state_cache=state_cache,
                    occupancy_map_padded=occupancy_map_padded,
                    occupancy_map_un_padded=occupancy_map_un_padded,
                    occupancy_map_original=occupancy_map_original)

                start_pos = self.grid.sample_random_pos(number=1)[0]
                goal_pos = self.grid.sample_random_pos(number=1)[0]
                s_goal = self.grid.lls2hls(goal_pos)
                self.episode_steps = 0
                s_init = self.grid.reset(start_pos)
                """ IMAGE INPUT """
                image = np.zeros((1, 2, self.grid.x_size, self.grid.y_size))
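                # Channel 0: occupancy map; channel 1: goal indicator (0 at the goal
                # cell, -1 everywhere else).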
                image[0, 0, :, :] = self.grid.occupancy_map_un_padded
                image[0, 1, :, :] = -1 * np.ones(
                    (self.grid.x_size, self.grid.y_size))
                image[0, 1, s_goal[0], s_goal[1]] = 0
                image = torch.from_numpy(image).float().cuda()

                with torch.no_grad():
                    v = self.mvprop_optimizer.mvprop(image)
                    v = v.cpu().detach()
                """ START THE EPISODE """
                horizon_left = self.config.time_horizon
                st = start_pos
                success = False

                s_bar = self.grid.lls2hls(st)
                hl_s_list = []
                hl_a_list = []
                hl_r_list = []
                hl_d_list = []
                hl_s_list.append(s_bar)

                while (horizon_left > 0) and not success:

                    # GET THE TARGET VECTOR
                    self.dqn_steps += 1
                    if self.config.mvprop_decay_type == 'exp':
                        self.eps = 0.01 + 0.99 * math.exp(
                            -1. * self.dqn_steps / self.config.mvprop_decay)

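                    # Epsilon-greedy over 9 options: stay in the current cell or move to
                    # one of its 8 neighbors, ranked by the propagated MVProp value map.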
                    if torch.rand(1)[0] > self.eps:
                        with torch.no_grad():
                            options_x = [
                                s_bar[0], s_bar[0], s_bar[0] + 1, s_bar[0] + 1,
                                s_bar[0] + 1, s_bar[0], s_bar[0] - 1,
                                s_bar[0] - 1, s_bar[0] - 1
                            ]
                            options_y = [
                                s_bar[1], s_bar[1] + 1, s_bar[1] + 1, s_bar[1],
                                s_bar[1] - 1, s_bar[1] - 1, s_bar[1] - 1,
                                s_bar[1], s_bar[1] + 1
                            ]
                            options_x = [self.bound_x(e) for e in options_x]
                            options_y = [self.bound_y(e) for e in options_y]
                            v_options = v[0, 0, options_x, options_y]
                            option = np.argmax(v_options)
                    else:
                        option = randint(0, 8)

                    if option == 0:
                        target = (s_bar[0], s_bar[1])
                    elif option == 1:
                        target = (s_bar[0], s_bar[1] + 1)
                    elif option == 2:
                        target = (s_bar[0] + 1, s_bar[1] + 1)
                    elif option == 3:
                        target = (s_bar[0] + 1, s_bar[1])
                    elif option == 4:
                        target = (s_bar[0] + 1, s_bar[1] - 1)
                    elif option == 5:
                        target = (s_bar[0], s_bar[1] - 1)
                    elif option == 6:
                        target = (s_bar[0] - 1, s_bar[1] - 1)
                    elif option == 7:
                        target = (s_bar[0] - 1, s_bar[1])
                    elif option == 8:
                        target = (s_bar[0] - 1, s_bar[1] + 1)
                    target = (max(0, min(target[0], self.grid.x_size - 1)),
                              max(0, min(target[1], self.grid.y_size - 1)))

                    roll_out, _, _, _, wp_success, l_state, s_bar_p = self.roll_out_in_env(
                        start=s_init,
                        goal=target,
                        horizon=self.time_scale,
                        ultimate_goal=goal_pos)

                    s_init = l_state

                    hl_s_list.append(s_bar_p)
                    hl_a_list.append(option)

                    st = l_state[:2]
                    s_bar = self.grid.lls2hls(st)

                    num_roll_outs += 1
                    num_steps += roll_out.length()
                    horizon_left -= roll_out.length()

                    total_wp_success += wp_success
                    jwp += 1

                    st_bar = self.grid.lls2hls(l_state)
                    success = ((st_bar[0] == s_goal[0])
                               and (st_bar[1] == s_goal[1]))
                    if success:
                        hl_r_list.append(0)
                        hl_d_list.append(True)
                    else:
                        hl_r_list.append(-1)
                        hl_d_list.append(False)

                    batch.append(roll_out)

                total_success += success
                j += 1

                ### ADD TRANSITIONS TO BUFFER
                for ep_idx in range(len(hl_a_list)):

                    self.memory.add(hl_s_list[ep_idx], hl_a_list[ep_idx],
                                    hl_s_list[ep_idx + 1], hl_r_list[ep_idx],
                                    hl_d_list[ep_idx], image)

                    if self.config.mvprop_her:
                        ### GET THE HINDSIGHT GOAL TRANSITION
                        image_her = np.zeros(
                            (1, 2, self.grid.x_size, self.grid.y_size))
                        image_her[0,
                                  0, :, :] = self.grid.occupancy_map_un_padded
                        image_her[0, 1, :, :] = -1 * np.ones(
                            (self.grid.x_size, self.grid.y_size))
                        image_her[0, 1, hl_s_list[-1][0], hl_s_list[-1][1]] = 0
                        image_her = torch.from_numpy(image_her).float().cuda()

                        if (hl_s_list[ep_idx + 1][0] == hl_s_list[-1][0]) and (
                                hl_s_list[ep_idx + 1][1] == hl_s_list[-1][1]):
                            hgt_reward = 0
                            hgt_done = True
                        else:
                            hgt_reward = -1
                            hgt_done = False

                        self.memory.add(hl_s_list[ep_idx], hl_a_list[ep_idx],
                                        hl_s_list[ep_idx + 1], hgt_reward,
                                        hgt_done, image_her)

                ### OPTIMIZE NETWORK PARAMETERS
                for _ in range(40):
                    self.mvprop_optimizer.train(self.config.time_horizon /
                                                self.time_scale)

                # TARGET NET UPDATE
                if self.dqn_steps % self.config.mvprop_target_update_frequency == 0:
                    tau = 0.05
                    for param, target_param in zip(
                            self.mvprop_optimizer.mvprop.parameters(),
                            self.mvprop_optimizer.target_mvprop.parameters()):
                        target_param.data.copy_(tau * param.data +
                                                (1 - tau) * target_param.data)

                ant_path = self.grid.mj_ant_path + 'ant_copy.xml'
                tree = ET.parse(ant_path)
                tree.write(self.grid.mj_ant_path + 'ant.xml')

            return batch, total_success / j, total_wp_success / jwp, num_steps / j, num_steps / num_roll_outs

        else:
            """ INITIALIZE THE ENVIRONMENT """
            a, b, c, d, e, f, g, h, p = self.grid.reset_env_random()
            structure = copy.deepcopy(a)
            objects = copy.deepcopy(b)
            obstacle_list = copy.deepcopy(c)
            occupancy_map = copy.deepcopy(d)
            default_starts = copy.deepcopy(e)
            state_cache = copy.deepcopy(f)
            occupancy_map_padded = copy.deepcopy(g)
            occupancy_map_un_padded = copy.deepcopy(h)
            occupancy_map_original = copy.deepcopy(p)

            self.grid = Ant44Env0(
                mj_ant_path=self.config.mj_ant_path,
                re_init=True,
                structure=structure,
                objects=objects,
                obstacle_list=obstacle_list,
                occupancy_map=occupancy_map,
                default_starts=default_starts,
                state_cache=state_cache,
                occupancy_map_padded=occupancy_map_padded,
                occupancy_map_un_padded=occupancy_map_un_padded,
                occupancy_map_original=occupancy_map_original)

            start_pos = self.grid.sample_random_pos(number=1)[0]
            goal_pos = self.grid.sample_random_pos(number=1)[0]
            s_goal = self.grid.lls2hls(goal_pos)
            self.episode_steps = 0
            s_init = self.grid.reset(start_pos)

            image = np.zeros((1, 2, self.grid.x_size, self.grid.y_size))
            image[0, 0, :, :] = self.grid.occupancy_map_un_padded
            image[0, 1, :, :] = -1 * np.ones(
                (self.grid.x_size, self.grid.y_size))
            image[0, 1, s_goal[0], s_goal[1]] = 0
            image = torch.from_numpy(image).float().cuda()

            with torch.no_grad():
                v = self.mvprop_optimizer.target_mvprop(image)
                v = v.cpu().detach()

            horizon_left = self.config.time_horizon
            st = start_pos
            success = False

            s_bar = self.grid.lls2hls(st)

            while (horizon_left > 0) and not success:

                # GET THE TARGET VECTOR
                with torch.no_grad():
                    options_x = [
                        s_bar[0], s_bar[0], s_bar[0] + 1, s_bar[0] + 1,
                        s_bar[0] + 1, s_bar[0], s_bar[0] - 1, s_bar[0] - 1,
                        s_bar[0] - 1
                    ]
                    options_y = [
                        s_bar[1], s_bar[1] + 1, s_bar[1] + 1, s_bar[1],
                        s_bar[1] - 1, s_bar[1] - 1, s_bar[1] - 1, s_bar[1],
                        s_bar[1] + 1
                    ]
                    options_x = [self.bound_x(e) for e in options_x]
                    options_y = [self.bound_y(e) for e in options_y]
                    v_options = v[0, 0, options_x, options_y]
                    option = np.argmax(v_options)

                if option == 0:
                    target = (s_bar[0], s_bar[1])
                elif option == 1:
                    target = (s_bar[0], s_bar[1] + 1)
                elif option == 2:
                    target = (s_bar[0] + 1, s_bar[1] + 1)
                elif option == 3:
                    target = (s_bar[0] + 1, s_bar[1])
                elif option == 4:
                    target = (s_bar[0] + 1, s_bar[1] - 1)
                elif option == 5:
                    target = (s_bar[0], s_bar[1] - 1)
                elif option == 6:
                    target = (s_bar[0] - 1, s_bar[1] - 1)
                elif option == 7:
                    target = (s_bar[0] - 1, s_bar[1])
                elif option == 8:
                    target = (s_bar[0] - 1, s_bar[1] + 1)
                target = (max(0, min(target[0], self.grid.x_size - 1)),
                          max(0, min(target[1], self.grid.y_size - 1)))

                roll_out, states, actions, rewards, wp_success, l_state, _ = self.roll_out_in_env(
                    start=s_init,
                    goal=target,
                    horizon=self.time_scale,
                    ultimate_goal=goal_pos)

                s_init = l_state

                st = l_state[:2]
                s_bar = self.grid.lls2hls(st)

                num_roll_outs += 1
                num_steps += roll_out.length()
                horizon_left -= roll_out.length()

                total_wp_success += wp_success
                jwp += 1

                st_bar = self.grid.lls2hls(l_state)
                success = ((st_bar[0] == s_goal[0])
                           and (st_bar[1] == s_goal[1]))

            ant_path = self.grid.mj_ant_path + 'ant_copy.xml'
            tree = ET.parse(ant_path)
            tree.write(self.grid.mj_ant_path + 'ant.xml')

            return success
Code Example #9
    def roll_out_in_env(self, start, goal, horizon, mode='train'):
        roll_out = Batch()
        done = False
        s = self.grid.reset(start, goal)
        s_list = []
        a_list = []
        r_list = []

        state_seq = []
        action_seq = []

        s_bar = self.grid.lls2hls(s)
        s_manager = copy.deepcopy(
            np.array(
                list(s) + list(self.grid.state_cache[s_bar[0], s_bar[1]]) +
                list(goal)))
        r_manager = 0.
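        # Manager sub-goal: random in [-2, 2]^2 during the warm-up phase, afterwards the
        # manager policy's action plus clipped Gaussian exploration noise (train mode only).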
        if mode == 'train':
            if self.total_steps < self.start_time_steps:
                sub_goal = (4 * np.random.random((2, ))) - 2
            else:
                sub_goal = (self.manager_policy.select_action(s_manager) +
                            np.random.normal(0, self.max_action * self.expl_noise, size=self.action_dim)).\
                    clip(-self.max_action, self.max_action)
        else:
            sub_goal = self.manager_policy.select_action(s_manager)

        self.manager.update(s=copy.deepcopy(s), sg=sub_goal)

        s_goal = self.grid.lls2hls(goal)

        for step_i in range(horizon):

            if mode == 'train':
                self.total_steps += 1

            s_bar = self.grid.lls2hls(s)
            s_save = copy.deepcopy(
                np.array(
                    list(s) + list(self.grid.state_cache[s_bar[0], s_bar[1]]) +
                    list(self.manager.target(s))))
            s_list.append(s_save)

            s_tensor = torch.tensor(s_save, dtype=torch.float).unsqueeze(0)
            a = self.policy.select_action(s_tensor)

            state_seq.append(s_save)
            action_seq.append(a)

            s_new, _, _, _, _ = self.grid.step(action=10 * a)
            r = self.manager.reward(s_new)
            a_list.append(a)
            r_list.append(r)

            s_new_bar = self.grid.lls2hls(s_new)
            done = ((s_new_bar[0] == s_goal[0])
                    and (s_new_bar[1] == s_goal[1]))
            r_manager += -1 * float(not done)

            manager_update = (step_i + 1) % self.manager_time_scale == 0
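            # The manager re-selects a sub-goal every manager_time_scale low-level steps.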

            roll_out.append(
                Batch([a.astype(np.float32)], [s_save.astype(np.float32)], [
                    r
                ], [s_new.astype(np.float32)], [
                    0 if
                    ((step_i + 1 == horizon) or done or manager_update) else 1
                ], [not done], [1.0]))

            s = s_new

            if manager_update or done:
                self.total_steps += 1
                s_new_manager = copy.deepcopy(
                    np.array(
                        list(s) + list(self.grid.state_cache[s_new_bar[0],
                                                             s_new_bar[1]]) +
                        list(goal)))
                if mode == 'train':
                    self.replay_buffer.add(s_manager, self.manager.sg,
                                           s_new_manager, r_manager, done,
                                           np.array(state_seq),
                                           np.array(action_seq))
                    if self.her_var:
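                        # Hindsight relabeling: rewrite the manager transition as if the
                        # position actually reached had been the goal.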
                        s_manager_her = np.concatenate((s_manager[:12], s[:2]))
                        s_new_manager_her = np.concatenate(
                            (s_new_manager[:12], s[:2]))
                        r_manager_her = -len(state_seq) + 1
                        self.replay_buffer.add(s_manager_her, self.manager.sg,
                                               s_new_manager_her,
                                               r_manager_her, True,
                                               np.array(state_seq),
                                               np.array(action_seq))

                    state_seq = []
                    action_seq = []

                s_manager = s_new_manager
                r_manager = 0.

                if mode == 'train':
                    if self.total_steps < self.start_time_steps:
                        sub_goal = (4 * np.random.random((2, ))) - 2
                    else:
                        sub_goal = (self.manager_policy.select_action(s_manager) +
                                    np.random.normal(0, self.max_action * self.expl_noise, size=self.action_dim)). \
                            clip(-self.max_action, self.max_action)
                else:
                    sub_goal = self.manager_policy.select_action(s_manager)

                self.manager.update(s=copy.deepcopy(s), sg=sub_goal)

            if done:
                break
        s_save = copy.deepcopy(
            np.array(
                list(s) + list(self.grid.state_cache[s_bar[0], s_bar[1]]) +
                list(self.manager.target(s))))
        s_list.append(s_save)
        return roll_out, step_i + 1, s_list, a_list, r_list, done
Code Example #10
    def roll_out_in_env(self, horizon, mode='train'):
        roll_out = Batch()
        oob = False
        s = self.grid.reset()
        s_list = []
        a_list = []
        r_list = []

        state_seq = []
        action_seq = []

        ultimate_start_bar = self.grid.lls2hls(s['observation'])
        goal_bar = self.grid.lls2hls(s['desired_goal'])

        s_manager = copy.deepcopy(np.concatenate((s['observation'], s['desired_goal'])))
        r_manager = 0.
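        # Manager sub-goal: during warm-up, a planar offset in [-0.04, 0.04]^2 plus a
        # heading angle in [-pi, pi]; afterwards the manager policy plus clipped noise.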
        if mode == 'train':
            if self.total_steps < self.start_time_steps:
                sub_goal = np.concatenate((0.08 * np.random.random((2, )) - 0.04, 2 * math.pi * np.random.random((1, )) - math.pi))
            else:
                sub_goal = (self.manager_policy.select_action(s_manager) +
                            np.random.normal(0, self.max_action * self.expl_noise, size=self.action_dim)).\
                    clip(-self.max_action, self.max_action)
        else:
            sub_goal = self.manager_policy.select_action(s_manager)

        self.manager.update(s=copy.deepcopy(s['observation']), sg=sub_goal)

        for step_i in range(horizon):

            if mode == 'train':
                self.total_steps += 1

            manager_target = self.manager.target(s['observation'])
            s_save = copy.deepcopy(
                np.array(list(manager_target[:2]) + list(s['observation'][2:4]) + list(manager_target[2:4])))
            s_list.append(s_save)

            s_tensor = torch.tensor(s_save, dtype=torch.float).unsqueeze(0)
            a = self.policy.select_action(s_tensor)

            state_seq.append(copy.deepcopy(np.array(list(manager_target[:2]) + list(s['observation'][2:4]) +
                                                    self.manager.target_ang(s['observation']))))

            action_seq.append(a)

            s_new, r, d, info = self.grid.env.step(a)
            info = info["is_success"]
            r = self.manager.reward(s_new['observation'])
            a_list.append(a)
            r_list.append(r)
            r_manager += float(info)

            manager_update = (step_i + 1) % self.manager_time_scale == 0

            ib = self.grid.check_in_bounds(s_new['observation'])
            if not ib:
                oob = True

            roll_out.append(
                Batch([a.astype(np.float32)],
                      [s_save.astype(np.float32)],
                      [r],
                      [s_new['observation'].astype(np.float32)],
                      [0 if ((step_i + 1 == horizon) or info or oob or manager_update) else 1],
                      [not info],
                      [1.0]))

            s = s_new

            if manager_update or info:
                self.total_steps += 1
                s_new_manager = copy.deepcopy(np.concatenate((s['observation'], s['desired_goal'])))

                if mode == 'train':
                    self.replay_buffer.add(s_manager, self.manager.a, s_new_manager, r_manager, info,
                                           np.array(state_seq), np.array(action_seq))

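                    # Hindsight relabeling for the manager: treat the observation actually
                    # reached as the goal and store the segment as a successful transition.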
                    s_manager_her = np.concatenate((s_manager[:6], s['observation']))
                    s_new_manager_her = np.concatenate((s_new_manager[:6], s['observation']))
                    self.replay_buffer.add(s_manager_her, self.manager.a, s_new_manager_her, 1.0, True,
                                           np.array(state_seq), np.array(action_seq))

                    state_seq = []
                    action_seq = []

                s_manager = s_new_manager
                r_manager = 0.

                if mode == 'train':
                    if self.total_steps < self.start_time_steps:
                        sub_goal = np.concatenate(
                            (0.08 * np.random.random((2,)) - 0.04, 2 * math.pi * np.random.random((1,)) - math.pi))
                    else:
                        sub_goal = (self.manager_policy.select_action(s_manager) +
                                    np.random.normal(0, self.max_action * self.expl_noise, size=self.action_dim)). \
                            clip(-self.max_action, self.max_action)
                else:
                    sub_goal = self.manager_policy.select_action(s_manager)

                self.manager.update(s=copy.deepcopy(s['observation']), sg=sub_goal)

            if info or oob:
                break
        manager_target = self.manager.target(s['observation'])
        s_save = copy.deepcopy(np.array(list(manager_target[:2]) + list(s['observation'][2:4]) +
                                        list(manager_target[2:4])))
        s_list.append(s_save)
        return roll_out, step_i + 1, s_list, a_list, r_list, info, ultimate_start_bar, goal_bar
Code Example #11
    def simulate_env(self, mode):
        batch = Batch()
        num_roll_outs = 0
        num_steps = 0
        total_success = 0
        total_wp_success = 0
        j = 0.
        jwp = 0.

        if mode == 'train':

            while num_steps < self.batch_size:

                """ INITIALIZE THE ENVIRONMENT """
                s_init = self.grid.reset()
                s_start = self.grid.lls2hls(s_init['observation'])
                s_goal = self.grid.lls2hls(s_init['desired_goal'])
                self.episode_steps = 0

                """ IMAGE INPUT """
                image = np.zeros((1, 2, self.grid.x_size, self.grid.y_size, 8))
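                # The planner volume is 3-D: (x, y, 8 orientation bins); channel 0 is
                # constant (all ones) and channel 1 marks the goal cell and orientation.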
                image[0, 0, :, :, :] = np.ones((self.grid.x_size, self.grid.y_size, 8))
                image[0, 1, :, :, :] = -1 * np.ones((self.grid.x_size, self.grid.y_size, 8))
                image[0, 1, s_goal[0], s_goal[1], s_goal[2]] = 0
                image = torch.from_numpy(image).float().cuda()

                with torch.no_grad():
                    v = self.mvprop_optimizer.mvprop(image)
                    v = v.cpu().detach()

                """ START THE EPISODE """
                horizon_left = self.max_iter
                success = False

                s_bar = self.grid.lls2hls(s_init['observation'])
                hl_s_list = []
                hl_a_list = []
                hl_r_list = []
                hl_d_list = []
                hl_s_list.append(s_bar)

                while (horizon_left > 0) and not success:

                    # GET THE TARGET VECTOR
                    self.dqn_steps += 1
                    self.eps = 0.01 + 0.99 * math.exp(-1. * self.dqn_steps / 10000)
                    s_bar = self.grid.lls2hls(s_init['observation'])
                    hl_s_list.append(s_bar)

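                    # Epsilon-greedy over 72 options: 9 planar moves (stay + 8 neighbors)
                    # for each of the 8 target orientation bins.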
                    if torch.rand(1)[0] > self.eps:
                        with torch.no_grad():
                            options_x = [s_bar[0], s_bar[0], s_bar[0] + 1, s_bar[0] + 1, s_bar[0] + 1, s_bar[0],
                                         s_bar[0] - 1, s_bar[0] - 1, s_bar[0] - 1, s_bar[0], s_bar[0], s_bar[0] + 1,
                                         s_bar[0] + 1, s_bar[0] + 1, s_bar[0], s_bar[0] - 1, s_bar[0] - 1, s_bar[0] - 1,
                                         s_bar[0], s_bar[0], s_bar[0] + 1, s_bar[0] + 1, s_bar[0] + 1, s_bar[0],
                                         s_bar[0] - 1, s_bar[0] - 1, s_bar[0] - 1, s_bar[0], s_bar[0], s_bar[0] + 1,
                                         s_bar[0] + 1, s_bar[0] + 1, s_bar[0], s_bar[0] - 1, s_bar[0] - 1, s_bar[0] - 1,
                                         s_bar[0], s_bar[0], s_bar[0] + 1, s_bar[0] + 1, s_bar[0] + 1, s_bar[0],
                                         s_bar[0] - 1, s_bar[0] - 1, s_bar[0] - 1, s_bar[0], s_bar[0], s_bar[0] + 1,
                                         s_bar[0] + 1, s_bar[0] + 1, s_bar[0], s_bar[0] - 1, s_bar[0] - 1, s_bar[0] - 1,
                                         s_bar[0], s_bar[0], s_bar[0] + 1, s_bar[0] + 1, s_bar[0] + 1, s_bar[0],
                                         s_bar[0] - 1, s_bar[0] - 1, s_bar[0] - 1, s_bar[0], s_bar[0], s_bar[0] + 1,
                                         s_bar[0] + 1, s_bar[0] + 1, s_bar[0], s_bar[0] - 1, s_bar[0] - 1, s_bar[0] - 1]
                            options_y = [s_bar[1], s_bar[1] + 1, s_bar[1] + 1, s_bar[1], s_bar[1] - 1, s_bar[1] - 1,
                                         s_bar[1] - 1, s_bar[1], s_bar[1] + 1, s_bar[1], s_bar[1] + 1, s_bar[1] + 1,
                                         s_bar[1], s_bar[1] - 1, s_bar[1] - 1, s_bar[1] - 1, s_bar[1], s_bar[1] + 1,
                                         s_bar[1], s_bar[1] + 1, s_bar[1] + 1, s_bar[1], s_bar[1] - 1, s_bar[1] - 1,
                                         s_bar[1] - 1, s_bar[1], s_bar[1] + 1, s_bar[1], s_bar[1] + 1, s_bar[1] + 1,
                                         s_bar[1], s_bar[1] - 1, s_bar[1] - 1, s_bar[1] - 1, s_bar[1], s_bar[1] + 1,
                                         s_bar[1], s_bar[1] + 1, s_bar[1] + 1, s_bar[1], s_bar[1] - 1, s_bar[1] - 1,
                                         s_bar[1] - 1, s_bar[1], s_bar[1] + 1, s_bar[1], s_bar[1] + 1, s_bar[1] + 1,
                                         s_bar[1], s_bar[1] - 1, s_bar[1] - 1, s_bar[1] - 1, s_bar[1], s_bar[1] + 1,
                                         s_bar[1], s_bar[1] + 1, s_bar[1] + 1, s_bar[1], s_bar[1] - 1, s_bar[1] - 1,
                                         s_bar[1] - 1, s_bar[1], s_bar[1] + 1, s_bar[1], s_bar[1] + 1, s_bar[1] + 1,
                                         s_bar[1], s_bar[1] - 1, s_bar[1] - 1, s_bar[1] - 1, s_bar[1], s_bar[1] + 1]
                            options_z = [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,
                                         2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5,
                                         5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7]
                            options_x = [self.bound_x(e) for e in options_x]
                            options_y = [self.bound_y(e) for e in options_y]
                            v_options = v[0, 0, options_x, options_y, options_z]
                            option = np.argmax(v_options)
                    else:
                        option = randint(0, 71)

                    option_o = np.floor(option / 9)
                    option_p = option % 9
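                    # Decode the flat option index: option_o is the orientation bin,
                    # option_p the planar move.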

                    if option_p == 0:
                        target_p = (s_bar[0], s_bar[1])
                    elif option_p == 1:
                        target_p = (s_bar[0], s_bar[1] + 1)
                    elif option_p == 2:
                        target_p = (s_bar[0] + 1, s_bar[1] + 1)
                    elif option_p == 3:
                        target_p = (s_bar[0] + 1, s_bar[1])
                    elif option_p == 4:
                        target_p = (s_bar[0] + 1, s_bar[1] - 1)
                    elif option_p == 5:
                        target_p = (s_bar[0], s_bar[1] - 1)
                    elif option_p == 6:
                        target_p = (s_bar[0] - 1, s_bar[1] - 1)
                    elif option_p == 7:
                        target_p = (s_bar[0] - 1, s_bar[1])
                    elif option_p == 8:
                        target_p = (s_bar[0] - 1, s_bar[1] + 1)
                    target_p = (max(0, min(target_p[0], self.grid.x_size - 1)),
                                max(0, min(target_p[1], self.grid.y_size - 1)))
                    target = (target_p[0], target_p[1], int(option_o))

                    roll_out, _, _, _, wp_success, success, l_state, s_bar_p, oob = self.roll_out_in_env(
                        horizon=self.time_scale,
                        start=s_init,
                        target=target)

                    s_init = l_state

                    hl_s_list.append(s_bar_p)
                    hl_a_list.append(option)

                    num_roll_outs += 1
                    num_steps += roll_out.length()
                    horizon_left -= roll_out.length()

                    total_wp_success += wp_success
                    jwp += 1

                    st_bar = self.grid.lls2hls(l_state['observation'])
                    success_tile = ((st_bar[0] == s_goal[0]) and (st_bar[1] == s_goal[1]) and (st_bar[2] == s_goal[2]))
                    if success_tile:
                        hl_r_list.append(0)
                        hl_d_list.append(True)
                    else:
                        hl_r_list.append(-1)
                        hl_d_list.append(False)

                    batch.append(roll_out)

                    if oob:
                        break

                total_success += success
                j += 1

                ### ADD TRANSITIONS TO BUFFER
                for ep_idx in range(len(hl_a_list)):

                    self.memory.add(hl_s_list[ep_idx], hl_a_list[ep_idx], hl_s_list[ep_idx + 1], hl_r_list[ep_idx],
                                    hl_d_list[ep_idx], image)

                    if True:
                        ##### GET THE HINDSIGHT GOAL TRANSITION
                        image_her = np.zeros((1, 2, self.grid.x_size, self.grid.y_size, 8))
                        image_her[0, 0, :, :, :] = np.ones((self.grid.x_size, self.grid.y_size, 8))
                        image_her[0, 1, :, :, :] = -1 * np.ones((self.grid.x_size, self.grid.y_size, 8))
                        image_her[0, 1, hl_s_list[-1][0], hl_s_list[-1][1], hl_s_list[-1][2]] = 0
                        image_her = torch.from_numpy(image_her).float().cuda()

                        if (hl_s_list[ep_idx + 1][0] == hl_s_list[-1][0]) and \
                                (hl_s_list[ep_idx + 1][1] == hl_s_list[-1][1]) and \
                                (hl_s_list[ep_idx + 1][2] == hl_s_list[-1][2]):
                            hgt_reward = 0
                            hgt_done = True
                        else:
                            hgt_reward = -1
                            hgt_done = False

                        self.memory.add(hl_s_list[ep_idx], hl_a_list[ep_idx], hl_s_list[ep_idx + 1], hgt_reward,
                                        hgt_done, image_her)

                ### OPTIMIZE NETWORK PARAMETERS
                for _ in range(40):
                    self.mvprop_optimizer.train(self.max_iter / self.time_scale)

                # TARGET NET UPDATE
                if self.dqn_steps % 1 == 0:
                    tau = 0.05
                    for param, target_param in zip(self.mvprop_optimizer.mvprop.parameters(),
                                                   self.mvprop_optimizer.target_mvprop.parameters()):
                        target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

            return batch, total_success / j, total_wp_success / jwp, num_steps / j, num_steps / num_roll_outs

        else:

            """ INITIALIZE THE ENVIRONMENT """
            s_init = self.grid.reset()
            s_start = self.grid.lls2hls(s_init['observation'])
            s_goal = self.grid.lls2hls(s_init['desired_goal'])
            self.episode_steps = 0

            """ IMAGE INPUT """
            image = np.zeros((1, 2, self.grid.x_size, self.grid.y_size, 8))
            image[0, 0, :, :, :] = np.ones((self.grid.x_size, self.grid.y_size, 8))
            image[0, 1, :, :, :] = -1 * np.ones((self.grid.x_size, self.grid.y_size, 8))
            image[0, 1, s_goal[0], s_goal[1], s_goal[2]] = 0
            image = torch.from_numpy(image).float().cuda()

            with torch.no_grad():
                v = self.mvprop_optimizer.target_mvprop(image)
                v = v.cpu().detach()

            """ START THE EPISODE """
            horizon_left = self.max_iter
            success = False

            s_bar = self.grid.lls2hls(s_init['observation'])
            hl_s_list = []
            hl_a_list = []
            hl_r_list = []
            hl_d_list = []
            hl_s_list.append(s_bar)

            while (horizon_left > 0) and not success:

                # GET THE TARGET VECTOR
                s_bar = self.grid.lls2hls(s_init['observation'])
                hl_s_list.append(s_bar)

                with torch.no_grad():
                    options_x = [s_bar[0], s_bar[0], s_bar[0] + 1, s_bar[0] + 1, s_bar[0] + 1, s_bar[0],
                                 s_bar[0] - 1, s_bar[0] - 1, s_bar[0] - 1, s_bar[0], s_bar[0], s_bar[0] + 1,
                                 s_bar[0] + 1, s_bar[0] + 1, s_bar[0], s_bar[0] - 1, s_bar[0] - 1, s_bar[0] - 1,
                                 s_bar[0], s_bar[0], s_bar[0] + 1, s_bar[0] + 1, s_bar[0] + 1, s_bar[0],
                                 s_bar[0] - 1, s_bar[0] - 1, s_bar[0] - 1, s_bar[0], s_bar[0], s_bar[0] + 1,
                                 s_bar[0] + 1, s_bar[0] + 1, s_bar[0], s_bar[0] - 1, s_bar[0] - 1, s_bar[0] - 1,
                                 s_bar[0], s_bar[0], s_bar[0] + 1, s_bar[0] + 1, s_bar[0] + 1, s_bar[0],
                                 s_bar[0] - 1, s_bar[0] - 1, s_bar[0] - 1, s_bar[0], s_bar[0], s_bar[0] + 1,
                                 s_bar[0] + 1, s_bar[0] + 1, s_bar[0], s_bar[0] - 1, s_bar[0] - 1, s_bar[0] - 1,
                                 s_bar[0], s_bar[0], s_bar[0] + 1, s_bar[0] + 1, s_bar[0] + 1, s_bar[0],
                                 s_bar[0] - 1, s_bar[0] - 1, s_bar[0] - 1, s_bar[0], s_bar[0], s_bar[0] + 1,
                                 s_bar[0] + 1, s_bar[0] + 1, s_bar[0], s_bar[0] - 1, s_bar[0] - 1, s_bar[0] - 1]
                    options_y = [s_bar[1], s_bar[1] + 1, s_bar[1] + 1, s_bar[1], s_bar[1] - 1, s_bar[1] - 1,
                                 s_bar[1] - 1, s_bar[1], s_bar[1] + 1, s_bar[1], s_bar[1] + 1, s_bar[1] + 1,
                                 s_bar[1], s_bar[1] - 1, s_bar[1] - 1, s_bar[1] - 1, s_bar[1], s_bar[1] + 1,
                                 s_bar[1], s_bar[1] + 1, s_bar[1] + 1, s_bar[1], s_bar[1] - 1, s_bar[1] - 1,
                                 s_bar[1] - 1, s_bar[1], s_bar[1] + 1, s_bar[1], s_bar[1] + 1, s_bar[1] + 1,
                                 s_bar[1], s_bar[1] - 1, s_bar[1] - 1, s_bar[1] - 1, s_bar[1], s_bar[1] + 1,
                                 s_bar[1], s_bar[1] + 1, s_bar[1] + 1, s_bar[1], s_bar[1] - 1, s_bar[1] - 1,
                                 s_bar[1] - 1, s_bar[1], s_bar[1] + 1, s_bar[1], s_bar[1] + 1, s_bar[1] + 1,
                                 s_bar[1], s_bar[1] - 1, s_bar[1] - 1, s_bar[1] - 1, s_bar[1], s_bar[1] + 1,
                                 s_bar[1], s_bar[1] + 1, s_bar[1] + 1, s_bar[1], s_bar[1] - 1, s_bar[1] - 1,
                                 s_bar[1] - 1, s_bar[1], s_bar[1] + 1, s_bar[1], s_bar[1] + 1, s_bar[1] + 1,
                                 s_bar[1], s_bar[1] - 1, s_bar[1] - 1, s_bar[1] - 1, s_bar[1], s_bar[1] + 1]
                    options_z = [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,
                                 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5,
                                 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7]
                    options_x = [self.bound_x(e) for e in options_x]
                    options_y = [self.bound_y(e) for e in options_y]
                    v_options = v[0, 0, options_x, options_y, options_z]
                    option = np.argmax(v_options)

                option_o = np.floor(option / 9)
                option_p = option % 9

                if option_p == 0:
                    target_p = (s_bar[0], s_bar[1])
                elif option_p == 1:
                    target_p = (s_bar[0], s_bar[1] + 1)
                elif option_p == 2:
                    target_p = (s_bar[0] + 1, s_bar[1] + 1)
                elif option_p == 3:
                    target_p = (s_bar[0] + 1, s_bar[1])
                elif option_p == 4:
                    target_p = (s_bar[0] + 1, s_bar[1] - 1)
                elif option_p == 5:
                    target_p = (s_bar[0], s_bar[1] - 1)
                elif option_p == 6:
                    target_p = (s_bar[0] - 1, s_bar[1] - 1)
                elif option_p == 7:
                    target_p = (s_bar[0] - 1, s_bar[1])
                elif option_p == 8:
                    target_p = (s_bar[0] - 1, s_bar[1] + 1)
                target_p = (max(0, min(target_p[0], self.grid.x_size - 1)),
                            max(0, min(target_p[1], self.grid.y_size - 1)))
                target = (target_p[0], target_p[1], int(option_o))

                roll_out, _, _, _, wp_success, success, l_state, s_bar_p, oob = self.roll_out_in_env(
                    horizon=self.time_scale,
                    start=s_init,
                    target=target)

                s_init = l_state

                hl_s_list.append(s_bar_p)
                hl_a_list.append(option)

                num_roll_outs += 1
                num_steps += roll_out.length()
                horizon_left -= roll_out.length()

                total_wp_success += wp_success
                jwp += 1

                st_bar = self.grid.lls2hls(l_state['observation'])
                success_tile = ((st_bar[0] == s_goal[0]) and (st_bar[1] == s_goal[1]) and (st_bar[2] == s_goal[2]))
                if success_tile:
                    hl_r_list.append(0)
                    hl_d_list.append(True)
                else:
                    hl_r_list.append(-1)
                    hl_d_list.append(False)

                if oob:
                    break

            j = 1.

            return success
Code Example #12
    def roll_out_in_env(self, start, target, horizon):
        roll_out = Batch()
        s = start
        s_list = []
        a_list = []
        r_list = []

        d = False
        oob = False
        break_var = False
        target_vec = target_2_target_vec(target, s['observation'])
        goal = self.grid.lls2hls(s['observation'] + np.array(list(target_vec[:2]) + [0, 0] + list(target_vec[2:4])))
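        # The waypoint cell is obtained by applying the target offset to the current
        # observation and discretizing the result.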

        for step_i in range(horizon):
            self.episode_steps += 1

            target_vec = target_2_target_vec(target, s['observation'])
            s_save = copy.deepcopy(np.array(list(target_vec[:2]) + list(s['observation'][2:4]) + list(target_vec[2:4])))

            s_list.append(copy.deepcopy(s['observation']))
            s_tensor = torch.tensor(s_save, dtype=torch.float).unsqueeze(0)
            a = self.policy.select_action(s_tensor)

            s_new, r, d, info = self.grid.env.step(a)
            s_new_bar = self.grid.lls2hls(s_new['observation'])
            success_var = info["is_success"]
            info = not info["is_success"]

            d = (s_new_bar[0] == goal[0]) and (s_new_bar[1] == goal[1]) and (s_new_bar[2] == goal[2])
            r = 0.0
            if d:
                r = 1.0
                info = False

            if success_var:
                info = False
                break_var = True

            break_var = break_var or (not info) or (step_i + 1 == horizon) or (self.episode_steps == self.max_iter)

            ib = self.grid.check_in_bounds(s_new['observation'])
            if not ib:
                oob = True

            a_list.append(a)
            r_list.append(r)
            roll_out.append(
                Batch([a.astype(np.float32)],
                      [s_save.astype(np.float32)],
                      [r],
                      [s_new['observation'].astype(np.float32)],
                      [0 if (break_var or oob) else 1],
                      [info],
                      [1.0]))

            s = s_new
            if break_var or oob:
                break

        s_list.append(copy.deepcopy(s['observation']))

        return roll_out, s_list, a_list, r_list, d, success_var, s_new, s_new_bar, oob
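
The roll-out and simulate_env methods in this listing all collect transitions in a Batch container whose definition is not shown: it is built with no arguments, filled with positional lists of actions, states, rewards, next states, continuation masks, info flags and importance weights, grown with append(), and measured with length(). A minimal sketch consistent with that usage; the field names are assumptions, not taken from the original code:

class Batch:
    """Sketch of the transition container assumed by the examples above.
    Only the no-argument constructor, the seven positional lists, append()
    and length() are visible in the listing; the field names are guesses."""

    def __init__(self, actions=None, states=None, rewards=None,
                 next_states=None, masks=None, infos=None, weights=None):
        self.actions = list(actions) if actions is not None else []
        self.states = list(states) if states is not None else []
        self.rewards = list(rewards) if rewards is not None else []
        self.next_states = list(next_states) if next_states is not None else []
        self.masks = list(masks) if masks is not None else []
        self.infos = list(infos) if infos is not None else []
        self.weights = list(weights) if weights is not None else []

    def append(self, other):
        # Concatenate another Batch's transitions onto this one.
        self.actions += other.actions
        self.states += other.states
        self.rewards += other.rewards
        self.next_states += other.next_states
        self.masks += other.masks
        self.infos += other.infos
        self.weights += other.weights

    def length(self):
        # Number of stored transitions.
        return len(self.rewards)
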
Code Example #13
    def simulate_env(self, mode):
        batch = Batch()
        num_roll_outs = 0
        num_steps = 0
        total_success = 0
        total_wp_success = 0
        j = 0.
        jwp = 0.

        if mode == 'train':

            while num_steps < self.batch_size:
                """ INITIALIZE THE ENVIRONMENT """
                s_init = self.grid.reset()
                s_start = self.grid.lls2hls(s_init['observation'])
                s_goal = self.grid.lls2hls(s_init['desired_goal'])
                self.episode_steps = 0
                """ V MAP """
                if self.optimistic_model:
                    self.vi.update_p_table_optimistic(
                        occupancy_map=self.grid.occupancy_map, walls=False)
                else:
                    self.vi.update_p_table(
                        occupancy_map=self.grid.occupancy_map, walls=True)

                v, pi = self.vi.run_vi(grid=self.grid,
                                       goal=(s_goal[0], s_goal[1], s_goal[2]))
                """ START THE EPISODE """
                horizon_left = self.max_iter
                success = False

                hl_s_list = []
                hl_s_new_list = []
                hl_a_list = []
                hl_r_list = []
                hl_d_list = []

                while (horizon_left > 0) and not success:

                    # GET THE TARGET VECTOR
                    self.dqn_steps += 1
                    self.eps = 0.01 + 0.99 * math.exp(
                        -1. * self.dqn_steps / 10000)

                    s_bar = self.grid.lls2hls(s_init['observation'])
                    hl_s_list.append(s_bar)

                    if torch.rand(1)[0] > self.eps:
                        a_bar = pi[s_bar[0], s_bar[1], s_bar[2]]
                    else:
                        a_bar = (randint(0, 7), randint(0, 7))

                    self.vi.set_target(s_bar, a_bar)

                    roll_out, _, _, _, wp_success, success, l_state, s_bar_p, oob = self.roll_out_in_env(
                        horizon=self.time_scale, start=s_init)

                    hl_s_new_list.append(s_bar_p)
                    hl_a_list.append(a_bar)

                    s_init = l_state

                    num_roll_outs += 1
                    num_steps += roll_out.length()
                    horizon_left -= roll_out.length()

                    total_wp_success += wp_success
                    jwp += 1

                    if success:
                        hl_r_list.append(0)
                        hl_d_list.append(True)
                    else:
                        hl_r_list.append(-1)
                        hl_d_list.append(False)

                    batch.append(roll_out)

                    if oob:
                        break

                total_success += success
                j += 1

                if not self.optimistic_model:
                    x_temp, y_temp = self.vi.generate_dataset_flat(
                        hl_s_list, hl_a_list, hl_s_new_list)

                    for bi in range(x_temp.shape[0]):
                        self.buffer.add(x_temp[bi], y_temp[bi])

                    self.vi.train_net(buffer=self.buffer,
                                      bs=128,
                                      opt_iterations=40)

            return batch, total_success / j, total_wp_success / jwp, num_steps / j, num_steps / num_roll_outs

        else:
            """ INITIALIZE THE ENVIRONMENT """
            s_init = self.grid.reset()
            s_start = self.grid.lls2hls(s_init['observation'])
            s_goal = self.grid.lls2hls(s_init['desired_goal'])
            self.episode_steps = 0
            """ V MAP """
            if self.optimistic_model:
                self.vi.update_p_table_optimistic(
                    occupancy_map=self.grid.occupancy_map, walls=False)
            else:
                self.vi.update_p_table(occupancy_map=self.grid.occupancy_map,
                                       walls=True)

            v, pi = self.vi.run_vi(grid=self.grid,
                                   goal=(s_goal[0], s_goal[1], s_goal[2]))
            """ START THE EPISODE """
            horizon_left = self.max_iter
            success = False

            while (horizon_left > 0) and not success:

                # GET THE TARGET VECTOR
                s_bar = self.grid.lls2hls(s_init['observation'])
                a_bar = pi[s_bar[0], s_bar[1], s_bar[2]]

                self.vi.set_target(s_bar, a_bar)

                roll_out, states, actions, rewards, wp_success, success, l_state, s_bar_p, oob = self.roll_out_in_env(
                    horizon=self.time_scale, start=s_init)

                s_init = l_state

                num_roll_outs += 1
                num_steps += roll_out.length()
                horizon_left -= roll_out.length()

                total_wp_success += wp_success
                jwp += 1

                if oob:
                    break

            return success

    def simulate_env(self, mode):
        batch = Batch()
        num_roll_outs = 0
        num_steps = 0
        total_success = 0
        total_wp_success = 0
        j = 0.
        jwp = 0.

        if mode == 'train':

            while num_steps < self.config.policy_batch_size:

                """ INITIALIZE THE ENVIRONMENT """
                self.grid.reset_env_terrain()
                start_pos = self.grid.sample_random_start_terrain(number=1)[0]
                goal_pos = self.grid.sample_random_goal_terrain(number=1)[0]
                s_goal = self.grid.lls2hls(goal_pos)
                s_init = self.grid.reset(start_pos, goal_pos)
                self.episode_steps = 0

                path_segment_len_list = self.grid.goal_management.reset(start_pos, goal_pos, self.grid)
                self.grid.old_distance = self.grid.goal_management.path_segment_len_list[0]

                """ START THE EPISODE """
                horizon_left = self.config.time_horizon
                success = False

                while (horizon_left > 0) and not success:

                    # GET THE TARGET VECTOR
                    curr_goal = \
                        self.grid.goal_management.way_points[self.grid.goal_management.way_point_current]

                    roll_out, _, _, _, wp_success, l_state, s_bar_p = self.roll_out_in_env(
                        start=s_init,
                        goal=curr_goal,
                        horizon=horizon_left,
                        ultimate_goal=goal_pos,
                        mode='train'
                    )

                    s_init = l_state

                    num_roll_outs += 1
                    num_steps += roll_out.length()
                    horizon_left -= roll_out.length()

                    total_wp_success += wp_success
                    jwp += 1

                    st_bar = self.grid.lls2hls(l_state)
                    success = ((st_bar[0] == s_goal[0]) and (st_bar[1] == s_goal[1]))

                    batch.append(roll_out)

                total_success += success
                j += 1

            return batch, total_success / j, total_wp_success / jwp, num_steps / j, num_steps / num_roll_outs

        else:
            self.grid.reset_env_terrain()
            start_pos = self.grid.sample_random_start_terrain(number=1)[0]
            goal_pos = self.grid.sample_random_goal_terrain(number=1)[0]
            s_goal = self.grid.lls2hls(goal_pos)
            s_init = self.grid.reset(start_pos, goal_pos)
            self.episode_steps = 0

            path_segment_len_list = self.grid.goal_management.reset(start_pos, goal_pos, self.grid)
            self.grid.old_distance = self.grid.goal_management.path_segment_len_list[0]

            """ START THE EPISODE """
            horizon_left = self.config.time_horizon
            success = False

            while (horizon_left > 0) and not success:
                # GET THE TARGET VECTOR
                curr_goal = \
                    self.grid.goal_management.way_points[self.grid.goal_management.way_point_current]

                roll_out, states, actions, rewards, wp_success, l_state, _ = self.roll_out_in_env(
                    start=s_init,
                    goal=curr_goal,
                    horizon=horizon_left,
                    ultimate_goal=goal_pos,
                    mode='test'
                )

                s_init = l_state

                num_roll_outs += 1
                num_steps += roll_out.length()
                horizon_left -= roll_out.length()

                total_wp_success += wp_success
                jwp += 1

                st_bar = self.grid.lls2hls(l_state)
                success = ((st_bar[0] == s_goal[0]) and (st_bar[1] == s_goal[1]))

            return success

    def simulate_env(self, mode):
        batch = Batch()
        num_roll_outs = 0
        num_steps = 0
        total_success = 0
        total_wp_success = 0
        j = 0.
        jwp = 0.

        if mode == 'train':

            while num_steps < self.batch_size:
                """ INITIALIZE THE ENVIRONMENT """
                a, b, c, d, e, f, g, h, p = self.grid.reset_env_random()
                structure = copy.deepcopy(a)
                objects = copy.deepcopy(b)
                obstacle_list = copy.deepcopy(c)
                occupancy_map = copy.deepcopy(d)
                default_starts = copy.deepcopy(e)
                state_cache = copy.deepcopy(f)
                occupancy_map_padded = copy.deepcopy(g)
                occupancy_map_un_padded = copy.deepcopy(h)
                occupancy_map_original = copy.deepcopy(p)

                self.grid = Ant44Env0(
                    mj_ant_path=self.mj_ant_path,
                    re_init=True,
                    structure=structure,
                    objects=objects,
                    obstacle_list=obstacle_list,
                    occupancy_map=occupancy_map,
                    default_starts=default_starts,
                    state_cache=state_cache,
                    occupancy_map_padded=occupancy_map_padded,
                    occupancy_map_un_padded=occupancy_map_un_padded,
                    occupancy_map_original=occupancy_map_original,
                    waypoint=True)

                start_pos = self.grid.sample_random_pos(number=1)[0]
                goal_pos = self.grid.sample_random_pos(number=1)[0]
                s_goal = self.grid.lls2hls(goal_pos)
                s_init = self.grid.reset(start_pos)

                horizon_left = self.max_iter
                success = False

                path_segment_len_list = self.grid.goal_management.reset(
                    start_pos, goal_pos, self.grid)
                self.grid.old_distance = self.grid.goal_management.path_segment_len_list[
                    0]

                while (horizon_left > 0) and not success:

                    curr_goal = \
                        self.grid.goal_management.way_points[self.grid.goal_management.way_point_current]

                    roll_out, states, actions, rewards, wp_success, l_state = self.roll_out_in_env(
                        start=s_init,
                        goal=curr_goal,
                        ultimate_goal=goal_pos,
                        horizon=horizon_left)

                    s_init = l_state

                    num_roll_outs += 1
                    num_steps += roll_out.length()
                    horizon_left -= roll_out.length()

                    total_wp_success += wp_success
                    jwp += 1

                    st_bar = self.grid.lls2hls(l_state)
                    success = ((st_bar[0] == s_goal[0])
                               and (st_bar[1] == s_goal[1]))

                    batch.append(roll_out)

                total_success += success
                j += 1

                ant_path = self.grid.mj_ant_path + 'ant_copy.xml'
                tree = ET.parse(ant_path)
                tree.write(self.grid.mj_ant_path + 'ant.xml')

            return batch, total_success / j, total_wp_success / jwp, num_steps / j, num_steps / num_roll_outs

        else:
            """ INITIALIZE THE ENVIRONMENT """
            a, b, c, d, e, f, g, h, p = self.grid.reset_env_random()
            structure = copy.deepcopy(a)
            objects = copy.deepcopy(b)
            obstacle_list = copy.deepcopy(c)
            occupancy_map = copy.deepcopy(d)
            default_starts = copy.deepcopy(e)
            state_cache = copy.deepcopy(f)
            occupancy_map_padded = copy.deepcopy(g)
            occupancy_map_un_padded = copy.deepcopy(h)
            occupancy_map_original = copy.deepcopy(p)

            self.grid = Ant44Env0(
                mj_ant_path=self.mj_ant_path,
                re_init=True,
                structure=structure,
                objects=objects,
                obstacle_list=obstacle_list,
                occupancy_map=occupancy_map,
                default_starts=default_starts,
                state_cache=state_cache,
                occupancy_map_padded=occupancy_map_padded,
                occupancy_map_un_padded=occupancy_map_un_padded,
                occupancy_map_original=occupancy_map_original,
                waypoint=True)

            start_pos = self.grid.sample_random_pos(number=1)[0]
            goal_pos = self.grid.sample_random_pos(number=1)[0]
            s_goal = self.grid.lls2hls(goal_pos)
            s_init = self.grid.reset(start_pos)

            horizon_left = self.max_iter
            success = False

            path_segment_len_list = self.grid.goal_management.reset(
                start_pos, goal_pos, self.grid)
            self.grid.old_distance = self.grid.goal_management.path_segment_len_list[
                0]

            while (horizon_left > 0) and not success:

                curr_goal = \
                    self.grid.goal_management.way_points[self.grid.goal_management.way_point_current]

                roll_out, states, actions, rewards, wp_success, l_state = self.roll_out_in_env(
                    start=s_init,
                    goal=curr_goal,
                    ultimate_goal=goal_pos,
                    horizon=horizon_left)

                s_init = l_state

                num_roll_outs += 1
                num_steps += roll_out.length()
                horizon_left -= roll_out.length()

                total_wp_success += wp_success
                jwp += 1

                st_bar = self.grid.lls2hls(l_state)
                success = ((st_bar[0] == s_goal[0])
                           and (st_bar[1] == s_goal[1]))

                batch.append(roll_out)

            total_success += success
            j += 1

            ant_path = self.grid.mj_ant_path + 'ant_copy.xml'
            tree = ET.parse(ant_path)
            tree.write(self.grid.mj_ant_path + 'ant.xml')

            return success

    def roll_out_in_env(self, start, goal, ultimate_goal, horizon):
        roll_out = Batch()
        s_u_goal = self.grid.lls2hls(ultimate_goal)
        s = start

        s_list = []
        a_list = []
        r_list = []
        d = False

        for step_i in range(horizon):
            s_bar = self.grid.lls2hls(s)
            target_vec = self.grid.goal_management.get_target_vec(s[:2])

            s_save = copy.deepcopy(
                np.array(
                    list(s[2:]) +
                    list(self.grid.state_cache[s_bar[0], s_bar[1]]) +
                    list(target_vec)))
            s_pos_save = copy.deepcopy(np.array(s[:2]))
            s_list.append(np.concatenate((s_pos_save, s_save)))
            s = torch.tensor(s_save, dtype=torch.float).unsqueeze(0)
            a = self.policy.select_action(s)

            s_new, r, d, info = self.grid.step(a, goal)
            info = info['no_goal_reached']

            # Distance to the current waypoint; both the waypoint and the
            # local goal use a 0.5 reach radius.
            wp = self.grid.goal_management.way_points[
                self.grid.goal_management.way_point_current]
            dist_wp = np.sqrt((s_new[0] - wp[0])**2 + (s_new[1] - wp[1])**2)
            d_wp = dist_wp < 0.5
            new_distance = dist_wp
            d = np.sqrt((s_new[0] - goal[0])**2 +
                        (s_new[1] - goal[1])**2) < 0.5
            info = not d
            r = d_wp  # + self.grid.old_distance - 0.99 * new_distance
            self.grid.old_distance = new_distance

            s_bar_cand = self.grid.lls2hls(s_new)

            break_var = self.grid.goal_management.update_way_point(
                self.grid, (s_new[0], s_new[1]), d)
            success_var = ((s_bar_cand[0] == s_u_goal[0])
                           and (s_bar_cand[1] == s_u_goal[1]))
            if success_var:
                info = False
                break_var = True

            a_list.append(a)
            r_list.append(r)
            roll_out.append(
                Batch([a.astype(np.float32)], [s_save.astype(np.float32)], [r],
                      [s_new.astype(np.float32)], [
                          0 if ((not info) or
                                (step_i + 1 == horizon) or break_var) else 1
                      ], [info], [1.0]))
            s = s_new
            if (not info) or break_var:
                break
        return roll_out, s_list, a_list, r_list, d, s_new
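
Both DQN-style manager loops (in Code Example #13 above and Code Example #17 below) anneal their exploration rate with the same exponential schedule, self.eps = 0.01 + 0.99 * math.exp(-self.dqn_steps / 10000). A standalone sketch of that schedule; only the constants come from the listing, the function and parameter names are mine:

import math

def exploration_eps(dqn_steps, eps_min=0.01, eps_span=0.99, decay=10000.0):
    # Starts at 1.0 on the first decision step and decays exponentially
    # toward eps_min with time constant `decay` (in high-level steps).
    return eps_min + eps_span * math.exp(-dqn_steps / decay)

# exploration_eps(0)     -> 1.00
# exploration_eps(10000) -> ~0.37
# exploration_eps(50000) -> ~0.02
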
Code Example #17
    def simulate_env(self, mode):
        batch = Batch()
        num_roll_outs = 0
        num_steps = 0
        total_success = 0
        total_wp_success = 0
        j = 0.
        jwp = 0.

        if mode == 'train':

            while num_steps < self.config.policy_batch_size:
                """ INITIALIZE THE ENVIRONMENT """
                a, b, c, d, e, f, g, h, p = self.grid.reset_env_random()
                structure = copy.deepcopy(a)
                objects = copy.deepcopy(b)
                obstacle_list = copy.deepcopy(c)
                occupancy_map = copy.deepcopy(d)
                default_starts = copy.deepcopy(e)
                state_cache = copy.deepcopy(f)
                occupancy_map_padded = copy.deepcopy(g)
                occupancy_map_un_padded = copy.deepcopy(h)
                occupancy_map_original = copy.deepcopy(p)

                self.grid = Ant44Env0(
                    mj_ant_path=self.config.mj_ant_path,
                    re_init=True,
                    structure=structure,
                    objects=objects,
                    obstacle_list=obstacle_list,
                    occupancy_map=occupancy_map,
                    default_starts=default_starts,
                    state_cache=state_cache,
                    occupancy_map_padded=occupancy_map_padded,
                    occupancy_map_un_padded=occupancy_map_un_padded,
                    occupancy_map_original=occupancy_map_original)

                start_pos = self.grid.sample_random_pos(number=1)[0]
                goal_pos = self.grid.sample_random_pos(number=1)[0]
                s_goal = self.grid.lls2hls(goal_pos)
                self.episode_steps = 0
                s_init = self.grid.reset(start_pos)
                """ V MAP """
                if self.config.optimistic_model:
                    self.vi.update_p_table_optimistic(
                        occupancy_map=self.grid.occupancy_map, walls=True)
                else:
                    self.vi.update_p_table(
                        occupancy_map=self.grid.occupancy_map, walls=True)

                v, pi = self.vi.run_vi(grid=self.grid,
                                       goal=(s_goal[0], s_goal[1]))
                """ START THE EPISODE """
                horizon_left = self.config.time_horizon
                st = start_pos
                success = False

                s_bar = self.grid.lls2hls(st)
                hl_s_list = []
                hl_a_list = []
                hl_r_list = []
                hl_d_list = []
                hl_s_list.append(s_bar)

                while (horizon_left > 0) and not success:

                    # GET THE TARGET VECTOR
                    self.dqn_steps += 1
                    self.eps = 0.01 + 0.99 * math.exp(
                        -1. * self.dqn_steps / 10000)
                    s_bar = self.grid.lls2hls(st)

                    if torch.rand(1)[0] > self.eps:
                        a_bar = int(pi[s_bar[0], s_bar[1]])
                    else:
                        a_bar = randint(0, 7)

                    self.vi.set_target(s_bar, a_bar)
                    curr_goal = self.vi.get_target()

                    roll_out, _, _, _, wp_success, l_state, s_bar_p = self.roll_out_in_env(
                        start=s_init,
                        goal=curr_goal,
                        horizon=self.time_scale,
                        ultimate_goal=goal_pos)

                    hl_s_list.append(s_bar_p)
                    hl_a_list.append(a_bar)

                    st = l_state[:2]
                    s_init = l_state

                    num_roll_outs += 1
                    num_steps += roll_out.length()
                    horizon_left -= roll_out.length()

                    total_wp_success += wp_success
                    jwp += 1

                    st_bar = self.grid.lls2hls(l_state)
                    success = ((st_bar[0] == s_goal[0])
                               and (st_bar[1] == s_goal[1]))
                    if success:
                        hl_r_list.append(0)
                        hl_d_list.append(True)
                    else:
                        hl_r_list.append(-1)
                        hl_d_list.append(False)

                    batch.append(roll_out)

                total_success += success
                j += 1

                if not self.config.optimistic_model:
                    x_temp, y_temp, w_temp = self.vi.generate_dataset_flat(
                        self.grid.occupancy_map, hl_s_list, hl_a_list)

                    for bi in range(x_temp.shape[0]):
                        self.buffer.add(x_temp[bi], y_temp[bi], w_temp[bi])

                    self.vi.train_net(buffer=self.buffer,
                                      bs=128,
                                      opt_iterations=40,
                                      rw=True)

                ant_path = self.grid.mj_ant_path + 'ant_copy.xml'
                tree = ET.parse(ant_path)
                tree.write(self.grid.mj_ant_path + 'ant.xml')

            return batch, total_success / j, total_wp_success / jwp, num_steps / j, num_steps / num_roll_outs

        else:
            """ INITIALIZE THE ENVIRONMENT """
            a, b, c, d, e, f, g, h, p = self.grid.reset_env_random()
            structure = copy.deepcopy(a)
            objects = copy.deepcopy(b)
            obstacle_list = copy.deepcopy(c)
            occupancy_map = copy.deepcopy(d)
            default_starts = copy.deepcopy(e)
            state_cache = copy.deepcopy(f)
            occupancy_map_padded = copy.deepcopy(g)
            occupancy_map_un_padded = copy.deepcopy(h)
            occupancy_map_original = copy.deepcopy(p)

            self.grid = Ant44Env0(
                mj_ant_path=self.config.mj_ant_path,
                re_init=True,
                structure=structure,
                objects=objects,
                obstacle_list=obstacle_list,
                occupancy_map=occupancy_map,
                default_starts=default_starts,
                state_cache=state_cache,
                occupancy_map_padded=occupancy_map_padded,
                occupancy_map_un_padded=occupancy_map_un_padded,
                occupancy_map_original=occupancy_map_original)

            start_pos = self.grid.sample_random_pos(number=1)[0]
            goal_pos = self.grid.sample_random_pos(number=1)[0]
            s_goal = self.grid.lls2hls(goal_pos)
            self.episode_steps = 0
            s_init = self.grid.reset(start_pos)
            """ V MAP """
            if self.config.optimistic_model:
                self.vi.update_p_table_optimistic(
                    occupancy_map=self.grid.occupancy_map, walls=True)
            else:
                self.vi.update_p_table(occupancy_map=self.grid.occupancy_map,
                                       walls=True)

            v, pi = self.vi.run_vi(grid=self.grid, goal=(s_goal[0], s_goal[1]))

            horizon_left = self.config.time_horizon
            st = start_pos
            success = False

            s_bar = self.grid.lls2hls(st)

            while (horizon_left > 0) and not success:

                # GET THE TARGET VECTOR
                a_bar = int(pi[s_bar[0], s_bar[1]])
                self.vi.set_target(s_bar, a_bar)
                curr_goal = self.vi.get_target()

                roll_out, states, actions, rewards, wp_success, l_state, _ = self.roll_out_in_env(
                    start=s_init,
                    goal=curr_goal,
                    horizon=self.time_scale,
                    ultimate_goal=goal_pos)

                st = l_state[:2]
                s_bar = self.grid.lls2hls(st)
                s_init = l_state

                num_steps += roll_out.length()
                horizon_left -= roll_out.length()

                st_bar = self.grid.lls2hls(l_state)
                success = ((st_bar[0] == s_goal[0])
                           and (st_bar[1] == s_goal[1]))

            ant_path = self.grid.mj_ant_path + 'ant_copy.xml'
            tree = ET.parse(ant_path)
            tree.write(self.grid.mj_ant_path + 'ant.xml')

            return success
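
Every example above relies on grid.lls2hls to project a continuous low-level state onto a discrete high-level tile: two indices in the waypoint and flat value-iteration examples, three in the orientation-aware example at the top of this listing. Its implementation is not shown; a minimal sketch of the assumed behaviour, where the tile size and floor rounding are assumptions and only the interface is visible here:

import numpy as np

def lls2hls(observation, tile_size=1.0):
    # Discretise the leading (x, y) coordinates of a low-level state into
    # integer tile indices; the orientation-aware variant would append a
    # third, discretised heading index in the same way.
    x, y = float(observation[0]), float(observation[1])
    return int(np.floor(x / tile_size)), int(np.floor(y / tile_size))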