Example #1
    def __init__(self, params):
        self.buffer = Buffer()
        self.close_buff, self.far_buff = Buffer(), Buffer()
        self.batch_num = 0

        self._vis_dirs = deque([])

        self.num_trajectories_to_process = 20
        self.complete_trajectories = deque([])

        self.params = params
Example #2
    def test_dist_training(self):
        t = Timing()

        def make_env():
            return make_doom_env(doom_env_by_name(TEST_ENV_NAME))

        params = AgentTMAX.Params('__test_dist_train__')
        params.distance_target_buffer_size = 1000

        with t.timeit('generate_data'):
            # first: generate fake random data
            buffer = Buffer()

            obs1 = np.full([84, 84, 3], 0, dtype=np.uint8)
            obs1[:, :, 1] = 255
            obs2 = np.full([84, 84, 3], 0, dtype=np.uint8)
            obs2[:, :, 2] = 255

            data_size = params.distance_target_buffer_size
            for i in range(data_size):
                same = i % 2 == 0
                if same:
                    if random.random() < 0.5:
                        obs_first = obs_second = obs1
                    else:
                        obs_first = obs_second = obs2
                else:
                    obs_first, obs_second = obs1, obs2
                    if random.random() < 0.5:
                        obs_second, obs_first = obs_first, obs_second

                buffer.add(obs_first=obs_first,
                           obs_second=obs_second,
                           labels=0 if same else 1)

        with t.timeit('init'):
            agent = AgentTMAX(make_env, params)
            agent.initialize()

            params.distance_train_epochs = 1
            params.distance_batch_size = 256
            agent.distance.train(buffer, 1, agent)

        with t.timeit('train'):
            params.distance_train_epochs = 2
            params.distance_batch_size = 64
            agent.distance.train(buffer, 1, agent, t)

        agent.finalize()

        log.info('Timing: %s', t)
        shutil.rmtree(params.experiment_dir())
Example #3
    def test_buffer_shuffle(self):
        b = Buffer()
        b.add_many(a=np.arange(10000), b=np.arange(10000))

        for i in range(5):
            self.assertTrue(np.array_equal(b.a, b.b))
            b.shuffle_data()
Example #4
class DistanceBuffer:
    """Training data for the distance network (observation pairs and labels)."""
    def __init__(self, params):
        self.buffer = Buffer()
        self.close_buff, self.far_buff = Buffer(), Buffer()
        self.batch_num = 0

        self._vis_dirs = deque([])

        self.num_trajectories_to_process = 20
        self.complete_trajectories = deque([])

        self.params = params

    def extract_data(self, trajectories):
        timing = Timing()

        if len(self.buffer) > self.params.distance_target_buffer_size:
            # already enough data
            return

        close, far = self.params.close_threshold, self.params.far_threshold

        num_close, num_far = 0, 0
        data_added = 0

        with timing.timeit('trajectories'):
            for trajectory in trajectories:
                check_tmax = isinstance(trajectory, TmaxTrajectory)

                obs = trajectory.obs

                indices = list(range(len(trajectory)))
                np.random.shuffle(indices)

                for i in indices:
                    if len(self.buffer) > self.params.distance_target_buffer_size // 2:
                        # limit memory usage: stop once enough new pairs were added this call
                        if data_added > self.params.distance_target_buffer_size // 4:
                            break

                    if len(self.buffer) > self.params.distance_target_buffer_size:
                        break

                    close_i = min(i + close, len(trajectory))
                    far_i = min(i + far, len(trajectory))

                    # sample close observation pair
                    first_idx = i
                    second_idx = np.random.randint(i, close_i)

                    # in TMAX we do some additional checks
                    add_close = True
                    if check_tmax:
                        both_frames_random = trajectory.is_random[first_idx] and trajectory.is_random[second_idx]
                        first_exploration = trajectory.mode[first_idx] == TmaxMode.EXPLORATION
                        second_exploration = trajectory.mode[second_idx] == TmaxMode.EXPLORATION
                        add_close = both_frames_random or (first_exploration and second_exploration)

                    if add_close:
                        if self.params.distance_symmetric and random.random() < 0.5:
                            first_idx, second_idx = second_idx, first_idx

                        self.buffer.add(obs_first=obs[first_idx],
                                        obs_second=obs[second_idx],
                                        labels=0)
                        data_added += 1
                        num_close += 1

                    # sample far observation pair
                    if far_i < len(trajectory):
                        first_idx = i
                        second_idx = np.random.randint(far_i, len(trajectory))

                        add_far = True
                        if check_tmax:
                            both_frames_random = trajectory.is_random[first_idx] and trajectory.is_random[second_idx]
                            first_exploration = trajectory.mode[first_idx] == TmaxMode.EXPLORATION
                            second_exploration = trajectory.mode[second_idx] == TmaxMode.EXPLORATION
                            add_far = both_frames_random or (first_exploration and second_exploration)

                        if add_far:
                            if self.params.distance_symmetric and random.random() < 0.5:
                                first_idx, second_idx = second_idx, first_idx

                            self.buffer.add(obs_first=obs[first_idx],
                                            obs_second=obs[second_idx],
                                            labels=1)
                            data_added += 1
                            num_far += 1

        with timing.timeit('finalize'):
            self.buffer.trim_at(self.params.distance_target_buffer_size)

        if self.batch_num % 20 == 0:
            with timing.timeit('visualize'):
                self._visualize_data()

        self.batch_num += 1
        log.info('num close %d, num far %d, distance net timing %s', num_close,
                 num_far, timing)

    def has_enough_data(self):
        len_data, min_data = len(self.buffer), self.params.distance_target_buffer_size // 3
        if len_data < min_data:
            log.info('Need to gather more data to train distance net, %d/%d',
                     len_data, min_data)
            return False
        return True

    def shuffle_data(self):
        self.buffer.shuffle_data()

    def reset(self):
        self.buffer.clear()

    def _visualize_data(self):
        min_vis = 10
        if len(self.buffer) < min_vis:
            return

        close_examples, far_examples = [], []
        labels = self.buffer.labels
        obs_first, obs_second = self.buffer.obs_first, self.buffer.obs_second

        for i in range(len(labels)):
            if labels[i] == 0 and len(close_examples) < min_vis:
                close_examples.append((obs_first[i], obs_second[i]))
            elif labels[i] == 1 and len(far_examples) < min_vis:
                far_examples.append((obs_first[i], obs_second[i]))

        if len(close_examples) < min_vis or len(far_examples) < min_vis:
            return

        img_folder = vis_dir(self.params.experiment_dir())
        img_folder = ensure_dir_exists(join(img_folder, 'dist'))
        img_folder = ensure_dir_exists(join(img_folder, f'dist_{time.time()}'))

        def save_images(examples, close_or_far):
            for visualize_i in range(len(examples)):
                img_first_name = join(img_folder, f'{close_or_far}_{visualize_i}_first.png')
                img_second_name = join(img_folder, f'{close_or_far}_{visualize_i}_second.png')
                cv2.imwrite(img_first_name, examples[visualize_i][0])
                cv2.imwrite(img_second_name, examples[visualize_i][1])

        save_images(close_examples, 'close')
        save_images(far_examples, 'far')

        self._vis_dirs.append(img_folder)
        while len(self._vis_dirs) > 20:
            dir_name = self._vis_dirs.popleft()
            if os.path.isdir(dir_name):
                shutil.rmtree(dir_name)
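A minimal usage sketch (not part of the original example) showing how the DistanceBuffer methods above fit together; `params` and `collect_trajectories()` are hypothetical stand-ins for the real experiment configuration and rollout collection:

# hedged sketch: assumes `params` carries the fields referenced above
# (distance_target_buffer_size, close/far thresholds, distance_symmetric, experiment_dir)
# and that collect_trajectories() returns a list of trajectory objects
distance_buffer = DistanceBuffer(params)
while not distance_buffer.has_enough_data():
    distance_buffer.extract_data(collect_trajectories())

distance_buffer.shuffle_data()
training_pairs = distance_buffer.buffer  # Buffer with obs_first, obs_second, labels columns
distance_buffer.reset()  # clear before the next round of data collection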
class LocomotionBuffer:
    """
    Training data for hindsight experience replay (used to train the locomotion policy).
    """
    def __init__(self, params):
        self.params = params
        self.batch_num = 0
        self.buffer = Buffer()
        self._vis_dirs = deque([])

    def extract_data(self, trajectories):
        timing = Timing()

        if len(trajectories) <= 0:
            return

        if len(self.buffer) > self.params.locomotion_experience_replay_buffer:
            return

        with timing.timeit('trajectories'):
            max_trajectory = self.params.locomotion_max_trajectory

            data_so_far = 0

            trajectories = [
                t for t in trajectories
                if len(t) > self.params.locomotion_max_trajectory
            ]

            # train only on random frames
            random_frames = [
                [i for i, is_random in enumerate(t.is_random) if is_random]
                for t in trajectories
            ]

            total_experience = sum(len(frames) for frames in random_frames)
            max_total_experience = 0.75 * total_experience  # max fraction of experience to use
            max_num_segments = int(max_total_experience / max_trajectory)

            log.info(
                '%d total experience from %d trajectories (%d segments)',
                max_total_experience,
                len(trajectories),
                max_num_segments,
            )

            attempts = 0

            while data_so_far < max_total_experience:
                attempts += 1
                if attempts > 100 * max_total_experience:  # just in case
                    break

                trajectory_idx = random.choice(range(len(trajectories)))
                trajectory = trajectories[trajectory_idx]
                if len(random_frames[trajectory_idx]) <= max_trajectory:
                    continue

                first_random_frame = random_frames[trajectory_idx][0]
                if len(trajectory) - first_random_frame < max_trajectory:
                    continue

                # sample random interval in trajectory, treat the last frame as "imaginary" goal, use actions as
                # ground truth
                start_idx = random.randint(first_random_frame,
                                           len(trajectory) - 2)
                goal_idx = min(start_idx + max_trajectory, len(trajectory) - 1)
                assert start_idx < goal_idx

                if not trajectory.is_random[start_idx]:
                    continue
                if not trajectory.is_random[goal_idx]:
                    continue

                for i in range(start_idx, goal_idx):
                    if not trajectory.is_random[i]:
                        continue

                    assert 0 < goal_idx - i <= max_trajectory
                    self.buffer.add(
                        obs_prev=trajectory.obs[max(0, i - 1)],
                        obs_curr=trajectory.obs[i],
                        obs_goal=trajectory.obs[goal_idx],
                        actions=trajectory.actions[i],
                        mode=trajectory.mode[i],
                        diff=goal_idx - i,
                    )
                    data_so_far += 1

                if len(self.buffer) > self.params.locomotion_experience_replay_buffer:
                    break

        # if self.batch_num % 10 == 0:
        #     with timing.timeit('vis'):
        #         self._visualize_data(training_data)

        # with timing.timeit('finalize'):
        #     for traj_buffer in training_data:
        #         self.buffer.add_buff(traj_buffer)

        # self.shuffle_data()
        # self.buffer.trim_at(self.params.locomotion_experience_replay_buffer)

        self.batch_num += 1
        log.info('Locomotion, buffer size: %d, timing: %s', len(self.buffer),
                 timing)

    def has_enough_data(self):
        len_data, min_data = len(self.buffer), self.params.locomotion_experience_replay_buffer // 3
        if len_data < min_data:
            log.info('Need to gather more data to train locomotion net, %d/%d',
                     len_data, min_data)
            return False
        return True

    def shuffle_data(self):
        permutation = self.buffer.shuffle_data(return_permutation=True)
        return permutation

    def reset(self):
        self.buffer.clear()
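A similar hedged sketch for LocomotionBuffer, again with `params` and `collect_trajectories()` as hypothetical stand-ins; note that shuffle_data() returns the permutation so callers can keep external arrays aligned with the shuffled buffer:

# hedged sketch, mirroring the distance-buffer usage above
locomotion_buffer = LocomotionBuffer(params)
while not locomotion_buffer.has_enough_data():
    locomotion_buffer.extract_data(collect_trajectories())

permutation = locomotion_buffer.shuffle_data()  # the same permutation is applied to every buffer column
locomotion_buffer.reset()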
def train_loop(agent, multi_env):
    params = agent.params

    observations = main_observation(multi_env.reset())
    infos = multi_env.info()

    trajectory_buffer = TrajectoryBuffer(multi_env.num_envs)

    step, env_steps = agent.session.run([agent.curiosity.distance.step, agent.total_env_steps])

    loop_time = deque([], maxlen=2500)
    advanced_steps = deque([], maxlen=2500)

    t = Timing()

    complete_trajectories = []
    num_to_process = 20

    test_buffer = Buffer()
    num_test_data = 5000

    while True:
        with t.timeit('loop'):
            with t.timeit('step'):
                actions = np.random.randint(0, agent.actor_critic.num_actions, params.num_envs)
                new_obs, rewards, dones, new_infos = multi_env.step(actions)

            with t.timeit('misc'):
                trajectory_buffer.add(observations, actions, infos, dones)

                observations = main_observation(new_obs)
                infos = new_infos

                num_steps_delta = num_env_steps(infos)
                env_steps += num_steps_delta

                complete_trajectories.extend(trajectory_buffer.complete_trajectories)
                trajectory_buffer.reset_trajectories()

            with t.timeit('train'):
                while len(complete_trajectories) > num_to_process:
                    buffer = generate_training_data(complete_trajectories[:num_to_process], params)
                    complete_trajectories = complete_trajectories[num_to_process:]

                    if len(test_buffer) <= 0:
                        buffer.shuffle_data()

                        test_buffer = Buffer()
                        test_buffer.add_buff(buffer, max_to_add=num_test_data)
                    else:
                        step = agent.curiosity.distance.train(buffer, env_steps, agent)

                    agent.curiosity.distance.calc_test_error(test_buffer, env_steps, agent)

            if t.train > 1.0:
                log.debug('Training time: %s', t)

        loop_time.append(t.loop)
        advanced_steps.append(num_steps_delta)

        if env_steps % 100 == 0:
            avg_fps = sum(advanced_steps) / sum(loop_time)
            log.info('Step %d, avg. fps %.1f, training steps %d, timing: %s', env_steps, avg_fps, step, t)
def generate_training_data(trajectories, params):
    timing = Timing()
    with timing.timeit('trajectories'):
        close, far = params.close_threshold, params.far_threshold

        trajectory_joined = Trajectory(0)

        trajectory_idx = []
        episode_ends = []
        for i, t in enumerate(trajectories):
            trajectory_joined.add_trajectory(t)
            trajectory_idx.extend([i] * len(t))
            episode_ends.append(len(trajectory_joined))

        obs = trajectory_joined.obs

        indices = list(range(len(trajectory_joined)))
        np.random.shuffle(indices)

        buffer = Buffer()
        num_close, num_far = 0, 0

        for i in indices:
            # sample close observation pair
            close_i = min(i + close, len(trajectory_joined))
            first_idx = i
            second_idx = np.random.randint(i, close_i)

            if trajectory_idx[first_idx] == trajectory_idx[second_idx]:
                if params.distance_symmetric and random.random() < 0.5:
                    first_idx, second_idx = second_idx, first_idx

                buffer.add(obs_first=obs[first_idx], obs_second=obs[second_idx], labels=0)
                num_close += 1

            # sample far observation pair
            next_episode_end = 0
            for next_episode_end in episode_ends:
                if next_episode_end > i:
                    break

            if random.random() < 0.3:
                max_len = len(trajectory_joined)
            else:
                max_len = next_episode_end

            far_i = min(i + far, max_len)

            if far_i < max_len:
                first_idx = i
                second_idx = np.random.randint(far_i, max_len)
                if params.distance_symmetric and random.random() < 0.5:
                    first_idx, second_idx = second_idx, first_idx

                buffer.add(obs_first=obs[first_idx], obs_second=obs[second_idx], labels=1)
                num_far += 1

    log.info(
        'Processed %d trajectories, total %d, close %d, far %d, timing: %s',
        len(trajectories), len(buffer), num_close, num_far, timing,
    )

    return buffer
    def test_buffer_performance(self):
        small_buffer = Buffer()
        small_buffer.add_many(obs=np.zeros([1000, 84, 84, 3], dtype=np.uint8))

        buffer = Buffer()

        t = Timing()

        with t.timeit('add'):
            for i in range(100):
                buffer.add_buff(small_buffer)

        huge_buffer = Buffer()
        with t.timeit('add_huge'):
            huge_buffer.add_buff(buffer)
            huge_buffer.add_buff(buffer)

        with t.timeit('single_add_small'):
            huge_buffer.add_buff(small_buffer)

        with t.timeit('clear_and_add'):
            huge_buffer.clear()
            huge_buffer.add_buff(buffer)
            huge_buffer.add_buff(buffer)

        with t.timeit('shuffle_and_add'):
            huge_buffer.clear()
            huge_buffer.add_buff(buffer)
            huge_buffer.add_buff(small_buffer)
            with t.timeit('shuffle'):
                huge_buffer.shuffle_data()

        log.debug('Timing: %s', t)
    def test_buffer(self):
        buff = Buffer()

        buff.add(a=1, b='b', c=None, d=3.14)
        self.assertEqual(len(buff), 1)
        self.assertGreaterEqual(buff._capacity, 1)

        self.assertEqual(buff.a[0], 1)
        self.assertEqual(buff.b[0], 'b')

        buff.add_many(a=[2, 3], b=['c', 'd'], c=[None, list()], d=[2.71, 1.62])
        self.assertEqual(len(buff), 3)
        self.assertGreaterEqual(buff._capacity, 3)

        self.assertTrue(np.array_equal(buff.a, [1, 2, 3]))
        self.assertTrue(np.array_equal(buff.b, ['b', 'c', 'd']))

        buff.trim_at(5)
        self.assertTrue(np.array_equal(buff.a, [1, 2, 3]))

        buff.trim_at(2)
        self.assertTrue(np.array_equal(buff.a, [1, 2]))

        buff.add_many(a=[2, 3], b=['c', 'd'], c=[None, list()], d=[2.71, 1.62])

        buff.shuffle_data()
        buff.shuffle_data()
        buff.shuffle_data()

        buff.trim_at(1)
        self.assertIn(buff.a[0], [1, 2, 3])

        self.assertEqual(len(buff), 1)
        self.assertGreaterEqual(buff._capacity, 4)

        buff_temp = Buffer()
        buff_temp.add(a=10, b='e', c=dict(), d=9.81)

        buff.add_buff(buff_temp)

        self.assertEqual(len(buff), 2)

        buff.clear()
        self.assertEqual(len(buff), 0)