def __init__(self, params):
    """Set up empty storage for distance-net training pairs."""
    self.params = params
    self.batch_num = 0
    # main pool of labeled observation pairs
    self.buffer = Buffer()
    # auxiliary pools for close/far pairs
    self.close_buff, self.far_buff = Buffer(), Buffer()
    # trajectories waiting to be converted into training pairs
    self.complete_trajectories = deque([])
    self.num_trajectories_to_process = 20
    # visualization output directories, oldest evicted first
    self._vis_dirs = deque([])
def test_dist_training(self):
    """End-to-end smoke test: train the distance net on easily separable synthetic frames."""
    timing = Timing()

    def make_env():
        return make_doom_env(doom_env_by_name(TEST_ENV_NAME))

    params = AgentTMAX.Params('__test_dist_train__')
    params.distance_target_buffer_size = 1000

    with timing.timeit('generate_data'):
        # two trivially distinguishable observations: all-green and all-blue frames
        green = np.full([84, 84, 3], 0, dtype=np.uint8)
        green[:, :, 1] = 255
        blue = np.full([84, 84, 3], 0, dtype=np.uint8)
        blue[:, :, 2] = 255

        buffer = Buffer()
        for sample_i in range(params.distance_target_buffer_size):
            same = sample_i % 2 == 0
            if same:
                # identical pair, color chosen at random
                if random.random() < 0.5:
                    obs_first = obs_second = green
                else:
                    obs_first = obs_second = blue
            else:
                # different pair, order randomized
                obs_first, obs_second = green, blue
                if random.random() < 0.5:
                    obs_second, obs_first = obs_first, obs_second

            buffer.add(obs_first=obs_first, obs_second=obs_second, labels=0 if same else 1)

    with timing.timeit('init'):
        agent = AgentTMAX(make_env, params)
        agent.initialize()

    # short warmup pass with a large batch
    params.distance_train_epochs = 1
    params.distance_batch_size = 256
    agent.distance.train(buffer, 1, agent)

    with timing.timeit('train'):
        params.distance_train_epochs = 2
        params.distance_batch_size = 64
        agent.distance.train(buffer, 1, agent, timing)

    agent.finalize()

    log.info('Timing: %s', timing)
    shutil.rmtree(params.experiment_dir())
def test_buffer_shuffle(self):
    """Shuffling must apply one and the same permutation to every column."""
    buf = Buffer()
    buf.add_many(a=np.arange(10000), b=np.arange(10000))

    # columns start identical; repeated shuffles must keep them in lockstep
    for _ in range(5):
        self.assertTrue(np.array_equal(buf.a, buf.b))
        buf.shuffle_data()
class DistanceBuffer:
    """Training data for the distance network (observation pairs and labels)."""

    def __init__(self, params):
        # main pool of labeled (obs_first, obs_second, labels) training pairs
        self.buffer = Buffer()
        # auxiliary pools for close/far pairs (not populated in the visible code)
        self.close_buff, self.far_buff = Buffer(), Buffer()
        # counts extract_data() calls; used to trigger periodic visualization
        self.batch_num = 0
        # visualization output directories, oldest evicted first
        self._vis_dirs = deque([])
        self.num_trajectories_to_process = 20
        self.complete_trajectories = deque([])
        self.params = params

    def extract_data(self, trajectories):
        """Sample close (label 0) and far (label 1) observation pairs from trajectories.

        Indices within each trajectory are visited in random order; for index i,
        the close partner is drawn from [i, i+close_threshold) and the far
        partner from [i+far_threshold, end) — half-open ranges per np.random.randint.
        """
        timing = Timing()

        if len(self.buffer) > self.params.distance_target_buffer_size:
            # already enough data
            return

        close, far = self.params.close_threshold, self.params.far_threshold

        num_close, num_far = 0, 0
        data_added = 0

        with timing.timeit('trajectories'):
            for trajectory in trajectories:
                # TMAX trajectories carry extra per-frame metadata (is_random, mode)
                check_tmax = isinstance(trajectory, TmaxTrajectory)

                obs = trajectory.obs

                indices = list(range(len(trajectory)))
                np.random.shuffle(indices)

                for i in indices:
                    if len(self.buffer) > self.params.distance_target_buffer_size // 2:
                        if data_added > self.params.distance_target_buffer_size // 4:  # to limit memory usage
                            break

                    if len(self.buffer) > self.params.distance_target_buffer_size:
                        break

                    close_i = min(i + close, len(trajectory))
                    far_i = min(i + far, len(trajectory))

                    # sample close observation pair
                    first_idx = i
                    second_idx = np.random.randint(i, close_i)

                    # in TMAX we do some additional checks
                    add_close = True
                    if check_tmax:
                        # accept the pair only if both frames are random-policy frames,
                        # or both were collected in exploration mode
                        both_frames_random = trajectory.is_random[first_idx] and trajectory.is_random[second_idx]
                        first_exploration = trajectory.mode[first_idx] == TmaxMode.EXPLORATION
                        second_exploration = trajectory.mode[second_idx] == TmaxMode.EXPLORATION
                        if both_frames_random or (first_exploration and second_exploration):
                            add_close = True
                        else:
                            add_close = False

                    if add_close:
                        # randomly swap the pair so the net does not learn temporal order
                        if self.params.distance_symmetric and random.random() < 0.5:
                            first_idx, second_idx = second_idx, first_idx

                        self.buffer.add(obs_first=obs[first_idx], obs_second=obs[second_idx], labels=0)
                        data_added += 1
                        num_close += 1

                    # sample far observation pair
                    if far_i < len(trajectory):
                        first_idx = i
                        second_idx = np.random.randint(far_i, len(trajectory))

                        add_far = True
                        if check_tmax:
                            # same acceptance rule as for the close pair
                            both_frames_random = trajectory.is_random[first_idx] and trajectory.is_random[second_idx]
                            first_exploration = trajectory.mode[first_idx] == TmaxMode.EXPLORATION
                            second_exploration = trajectory.mode[second_idx] == TmaxMode.EXPLORATION
                            if both_frames_random or (first_exploration and second_exploration):
                                add_far = True
                            else:
                                add_far = False

                        if add_far:
                            if self.params.distance_symmetric and random.random() < 0.5:
                                first_idx, second_idx = second_idx, first_idx

                            self.buffer.add(obs_first=obs[first_idx], obs_second=obs[second_idx], labels=1)
                            data_added += 1
                            num_far += 1

        with timing.timeit('finalize'):
            self.buffer.trim_at(self.params.distance_target_buffer_size)

        # periodically dump example pairs to disk for manual inspection
        if self.batch_num % 20 == 0:
            with timing.timeit('visualize'):
                self._visualize_data()

        self.batch_num += 1
        log.info('num close %d, num far %d, distance net timing %s', num_close, num_far, timing)

    def has_enough_data(self):
        """Return True once the buffer holds at least a third of the target size."""
        len_data, min_data = len(self.buffer), self.params.distance_target_buffer_size // 3
        if len_data < min_data:
            log.info('Need to gather more data to train distance net, %d/%d', len_data, min_data)
            return False
        return True

    def shuffle_data(self):
        self.buffer.shuffle_data()

    def reset(self):
        self.buffer.clear()

    def _visualize_data(self):
        """Save up to 10 close and 10 far example pairs as PNGs under the experiment dir."""
        min_vis = 10
        if len(self.buffer) < min_vis:
            return

        close_examples, far_examples = [], []
        labels = self.buffer.labels
        obs_first, obs_second = self.buffer.obs_first, self.buffer.obs_second

        for i in range(len(labels)):
            if labels[i] == 0 and len(close_examples) < min_vis:
                close_examples.append((obs_first[i], obs_second[i]))
            elif labels[i] == 1 and len(far_examples) < min_vis:
                far_examples.append((obs_first[i], obs_second[i]))

        # only visualize when we have a full set of both kinds
        if len(close_examples) < min_vis or len(far_examples) < min_vis:
            return

        img_folder = vis_dir(self.params.experiment_dir())
        img_folder = ensure_dir_exists(join(img_folder, 'dist'))
        img_folder = ensure_dir_exists(join(img_folder, f'dist_{time.time()}'))

        def save_images(examples, close_or_far):
            # write each pair as <kind>_<i>_first.png / <kind>_<i>_second.png
            for visualize_i in range(len(examples)):
                img_first_name = join(img_folder, f'{close_or_far}_{visualize_i}_first.png')
                img_second_name = join(img_folder, f'{close_or_far}_{visualize_i}_second.png')
                cv2.imwrite(img_first_name, examples[visualize_i][0])
                cv2.imwrite(img_second_name, examples[visualize_i][1])

        save_images(close_examples, 'close')
        save_images(far_examples, 'far')

        # keep only the 20 most recent visualization dirs on disk
        self._vis_dirs.append(img_folder)
        while len(self._vis_dirs) > 20:
            dir_name = self._vis_dirs.popleft()
            if os.path.isdir(dir_name):
                shutil.rmtree(dir_name)
def __init__(self, params):
    """Create an empty locomotion experience-replay buffer."""
    self.batch_num = 0
    self.params = params
    # visualization output directories (bounded elsewhere)
    self._vis_dirs = deque([])
    # replay data storage
    self.buffer = Buffer()
class LocomotionBuffer:
    """
    Training data for the hindsight experience replay (for locomotion policy).
    """

    def __init__(self, params):
        self.params = params
        # counts extract_data() calls
        self.batch_num = 0
        # replay data storage
        self.buffer = Buffer()
        self._vis_dirs = deque([])

    def extract_data(self, trajectories):
        """Sample trajectory segments and store them as (prev, curr, goal) HER tuples.

        Segments are drawn by rejection sampling from random-policy frames only;
        the segment's last frame is treated as an "imaginary" goal and the taken
        actions as ground truth.
        """
        timing = Timing()

        if len(trajectories) <= 0:
            return

        if len(self.buffer) > self.params.locomotion_experience_replay_buffer:
            return

        with timing.timeit('trajectories'):
            max_trajectory = self.params.locomotion_max_trajectory
            data_so_far = 0

            # only trajectories long enough to contain a full segment
            trajectories = [t for t in trajectories if len(t) > self.params.locomotion_max_trajectory]

            # train only on random frames
            random_frames = [[i for i, is_random in enumerate(t.is_random) if is_random] for t in trajectories]

            total_experience = sum(len(frames) for frames in random_frames)
            max_total_experience = 0.75 * total_experience  # max fraction of experience to use
            max_num_segments = int(max_total_experience / max_trajectory)

            log.info(
                '%d total experience from %d trajectories (%d segments)',
                max_total_experience, len(trajectories), max_num_segments,
            )

            attempts = 0

            while data_so_far < max_total_experience:
                attempts += 1
                if attempts > 100 * max_total_experience:  # just in case
                    break

                # pick a trajectory uniformly at random and try to sample a segment from it
                trajectory_idx = random.choice(range(len(trajectories)))
                trajectory = trajectories[trajectory_idx]

                if len(random_frames[trajectory_idx]) <= max_trajectory:
                    continue

                first_random_frame = random_frames[trajectory_idx][0]
                if len(trajectory) - first_random_frame < max_trajectory:
                    continue

                # sample random interval in trajectory, treat the last frame as "imaginary" goal, use actions as
                # ground truth
                start_idx = random.randint(first_random_frame, len(trajectory) - 2)
                goal_idx = min(start_idx + max_trajectory, len(trajectory) - 1)
                assert start_idx < goal_idx

                # reject segments whose endpoints are not random-policy frames
                if not trajectory.is_random[start_idx]:
                    continue
                if not trajectory.is_random[goal_idx]:
                    continue

                for i in range(start_idx, goal_idx):
                    # skip non-random frames inside the segment (endpoints already checked)
                    if not trajectory.is_random[i]:
                        continue

                    assert 0 < goal_idx - i <= max_trajectory
                    self.buffer.add(
                        obs_prev=trajectory.obs[max(0, i - 1)],
                        obs_curr=trajectory.obs[i],
                        obs_goal=trajectory.obs[goal_idx],
                        actions=trajectory.actions[i],
                        mode=trajectory.mode[i],
                        diff=goal_idx - i,  # distance-to-goal in frames
                    )
                    data_so_far += 1

                if len(self.buffer) > self.params.locomotion_experience_replay_buffer:
                    break

            # if self.batch_num % 10 == 0:
            #     with timing.timeit('vis'):
            #         self._visualize_data(training_data)

        # with timing.timeit('finalize'):
        #     for traj_buffer in training_data:
        #         self.buffer.add_buff(traj_buffer)

        # self.shuffle_data()
        # self.buffer.trim_at(self.params.locomotion_experience_replay_buffer)

        self.batch_num += 1
        log.info('Locomotion, buffer size: %d, timing: %s', len(self.buffer), timing)

    def has_enough_data(self):
        """Return True once the buffer holds at least a third of the replay target size."""
        len_data, min_data = len(self.buffer), self.params.locomotion_experience_replay_buffer // 3
        if len_data < min_data:
            log.info('Need to gather more data to train locomotion net, %d/%d', len_data, min_data)
            return False
        return True

    def shuffle_data(self):
        # returns the permutation so callers can reorder parallel structures consistently
        permutation = self.buffer.shuffle_data(return_permutation=True)
        return permutation

    def reset(self):
        self.buffer.clear()
def train_loop(agent, multi_env):
    """Collect random-policy experience and train the distance net online.

    Runs forever: steps all envs with uniformly random actions, accumulates
    complete trajectories, and converts them into distance-net training data
    in chunks of `num_to_process`. The first generated chunk is held out as a
    fixed test set for tracking test error; subsequent chunks are trained on.
    """
    params = agent.params

    observations = main_observation(multi_env.reset())
    infos = multi_env.info()

    trajectory_buffer = TrajectoryBuffer(multi_env.num_envs)

    step, env_steps = agent.session.run([agent.curiosity.distance.step, agent.total_env_steps])

    # sliding windows for FPS reporting
    loop_time = deque([], maxlen=2500)
    advanced_steps = deque([], maxlen=2500)

    t = Timing()

    complete_trajectories = []
    num_to_process = 20

    test_buffer = Buffer()
    num_test_data = 5000

    while True:
        with t.timeit('loop'):
            with t.timeit('step'):
                # pure random exploration policy
                actions = np.random.randint(0, agent.actor_critic.num_actions, params.num_envs)
                new_obs, rewards, dones, new_infos = multi_env.step(actions)

            with t.timeit('misc'):
                trajectory_buffer.add(observations, actions, infos, dones)

                observations = main_observation(new_obs)
                infos = new_infos

                num_steps_delta = num_env_steps(infos)
                env_steps += num_steps_delta

                complete_trajectories.extend(trajectory_buffer.complete_trajectories)
                trajectory_buffer.reset_trajectories()

            with t.timeit('train'):
                while len(complete_trajectories) > num_to_process:
                    buffer = generate_training_data(complete_trajectories[:num_to_process], params)
                    complete_trajectories = complete_trajectories[num_to_process:]

                    if len(test_buffer) <= 0:
                        # first chunk becomes the held-out test set
                        buffer.shuffle_data()
                        test_buffer = Buffer()
                        test_buffer.add_buff(buffer, max_to_add=num_test_data)
                    else:
                        step = agent.curiosity.distance.train(buffer, env_steps, agent)

                    agent.curiosity.distance.calc_test_error(test_buffer, env_steps, agent)

            if t.train > 1.0:
                log.debug('Training time: %s', t)

        loop_time.append(t.loop)
        advanced_steps.append(num_steps_delta)

        # NOTE(review): periodic logging assumes env_steps lands exactly on a
        # multiple of 100 — with multi-step deltas some reports may be skipped
        if env_steps % 100 == 0:
            avg_fps = sum(advanced_steps) / sum(loop_time)
            log.info('Step %d, avg. fps %.1f, training steps %d, timing: %s', env_steps, avg_fps, step, t)
def generate_training_data(trajectories, params):
    """Build a labeled buffer of close (0) and far (1) observation pairs.

    All trajectories are joined into one; close pairs must come from the same
    episode, while far pairs may occasionally (30% of the time) cross episode
    boundaries, which makes frames from different episodes "far" by definition.
    """
    timing = Timing()

    with timing.timeit('trajectories'):
        close, far = params.close_threshold, params.far_threshold

        # concatenate all trajectories, remembering which episode each frame is from
        joined = Trajectory(0)
        episode_of_frame, episode_ends = [], []
        for episode_i, trajectory in enumerate(trajectories):
            joined.add_trajectory(trajectory)
            episode_of_frame.extend([episode_i] * len(trajectory))
            episode_ends.append(len(joined))

        obs = joined.obs
        total_frames = len(joined)

        shuffled = list(range(total_frames))
        np.random.shuffle(shuffled)

        buffer = Buffer()
        num_close = num_far = 0

        for i in shuffled:
            # sample close observation pair (half-open range [i, i+close))
            close_i = min(i + close, total_frames)
            first_idx, second_idx = i, np.random.randint(i, close_i)

            # a close pair is only valid within a single episode
            if episode_of_frame[first_idx] == episode_of_frame[second_idx]:
                if params.distance_symmetric and random.random() < 0.5:
                    first_idx, second_idx = second_idx, first_idx

                buffer.add(obs_first=obs[first_idx], obs_second=obs[second_idx], labels=0)
                num_close += 1

            # sample far observation pair; find the end of the episode containing i
            next_episode_end = 0
            for next_episode_end in episode_ends:
                if next_episode_end > i:
                    break

            # 30% of the time allow the far partner to come from a later episode
            max_len = total_frames if random.random() < 0.3 else next_episode_end

            far_i = min(i + far, max_len)
            if far_i < max_len:
                first_idx, second_idx = i, np.random.randint(far_i, max_len)
                if params.distance_symmetric and random.random() < 0.5:
                    first_idx, second_idx = second_idx, first_idx

                buffer.add(obs_first=obs[first_idx], obs_second=obs[second_idx], labels=1)
                num_far += 1

    log.info(
        'Processed %d trajectories, total %d, close %d, far %d, timing: %s',
        len(trajectories), len(buffer), num_close, num_far, timing,
    )

    return buffer
def test_buffer_performance(self):
    """Time Buffer merge/clear/shuffle operations on large image arrays."""
    small_buffer = Buffer()
    small_buffer.add_many(obs=np.zeros([1000, 84, 84, 3], dtype=np.uint8))

    medium_buffer = Buffer()

    timing = Timing()

    with timing.timeit('add'):
        for _ in range(100):
            medium_buffer.add_buff(small_buffer)

    huge_buffer = Buffer()
    with timing.timeit('add_huge'):
        huge_buffer.add_buff(medium_buffer)
        huge_buffer.add_buff(medium_buffer)

    with timing.timeit('single_add_small'):
        huge_buffer.add_buff(small_buffer)

    with timing.timeit('clear_and_add'):
        huge_buffer.clear()
        huge_buffer.add_buff(medium_buffer)
        huge_buffer.add_buff(medium_buffer)

    with timing.timeit('shuffle_and_add'):
        huge_buffer.clear()
        huge_buffer.add_buff(medium_buffer)
        huge_buffer.add_buff(small_buffer)

        with timing.timeit('shuffle'):
            huge_buffer.shuffle_data()

    log.debug('Timing: %s', timing)
def test_buffer(self):
    """Exercise the core Buffer API: add, add_many, trim_at, shuffle, add_buff, clear."""
    b = Buffer()

    b.add(a=1, b='b', c=None, d=3.14)
    self.assertEqual(len(b), 1)
    self.assertGreaterEqual(b._capacity, 1)
    self.assertEqual(b.a[0], 1)
    self.assertEqual(b.b[0], 'b')

    b.add_many(a=[2, 3], b=['c', 'd'], c=[None, list()], d=[2.71, 1.62])
    self.assertEqual(len(b), 3)
    self.assertGreaterEqual(b._capacity, 3)
    self.assertTrue(np.array_equal(b.a, [1, 2, 3]))
    self.assertTrue(np.array_equal(b.b, ['b', 'c', 'd']))

    # trimming beyond current size is a no-op
    b.trim_at(5)
    self.assertTrue(np.array_equal(b.a, [1, 2, 3]))

    b.trim_at(2)
    self.assertTrue(np.array_equal(b.a, [1, 2]))

    b.add_many(a=[2, 3], b=['c', 'd'], c=[None, list()], d=[2.71, 1.62])

    for _ in range(3):
        b.shuffle_data()

    b.trim_at(1)
    # after shuffling, any of the stored values may end up first
    self.assertIn(b.a[0], [1, 2, 3])
    self.assertEqual(len(b), 1)
    self.assertGreaterEqual(b._capacity, 4)

    other = Buffer()
    other.add(a=10, b='e', c=dict(), d=9.81)
    b.add_buff(other)
    self.assertEqual(len(b), 2)

    b.clear()
    self.assertEqual(len(b), 0)