def reset(self, report_queue):
    """
    Do the very first reset for all environments in a vector. Populate shared memory with initial obs.
    Note that this is called only once, at the very beginning of training. After this the envs should auto-reset.

    :param report_queue: we use report queue to monitor reset progress (see appo.py). This can be a lengthy process.
    :return: first requests for policy workers (to generate actions for the very first env step)
    """
    for env_i, e in enumerate(self.envs):
        observations = e.reset()

        if self.cfg.decorrelate_envs_on_one_worker:
            decorrelate_steps = self.cfg.rollout * env_i + 1
            log.info('Decorrelating experience for %d frames...', decorrelate_steps)

            for decorrelate_step in range(decorrelate_steps):
                actions = [e.action_space.sample() for _ in range(self.num_agents)]
                observations, rew, dones, info = e.step(actions)

        for agent_i, obs in enumerate(observations):
            actor_state = self.actor_states[env_i][agent_i]
            actor_state.set_trajectory_data(dict(obs=obs), self.traj_buffer_idx, self.rollout_step)
            # rnn state is already initialized at zero

        # log.debug(
        #     'Reset progress w:%d-%d finished %d/%d, still initializing envs...',
        #     self.worker_idx, self.split_idx, env_i + 1, len(self.envs),
        # )
        report_queue.put(dict(initialized_env=(self.worker_idx, self.split_idx, env_i)))

    policy_request = self._format_policy_request()
    return policy_request
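# A minimal sketch (hypothetical helper, not from the original source) of the
# decorrelation schedule above: the env_i-th env in the vector takes
# `cfg.rollout * env_i + 1` random steps before training starts, so the envs'
# rollouts complete at staggered points in time instead of all at once.
def decorrelation_schedule(num_envs, rollout=32):
    return [rollout * env_i + 1 for env_i in range(num_envs)]

# e.g. with 4 envs and rollout=32 the envs take 1, 33, 65 and 97 random steps respectively
assert decorrelation_schedule(4) == [1, 33, 65, 97]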
def enjoy(params, env_id, max_num_episodes=1000000, max_num_frames=1e9, fps=20):
    def make_env_func():
        e = create_env(env_id, mode='test', skip_frames=True)
        e.seed(0)
        return e

    agent = AgentPPO(make_env_func, params.load())
    env = make_env_func()

    # this helps with screen recording
    pause_at_the_beginning = False
    if pause_at_the_beginning:
        env.render()
        log.info('Press any key to start...')
        cv2.waitKey()

    return run_policy_loop(agent, env, max_num_episodes, fps, max_num_frames=max_num_frames, deterministic=False)
def __init__(self, env, ph_obs, params=None):
    """
    :param env: environment instance
    :param ph_obs: placeholder for observations
    """
    with tf.variable_scope('rnd'):
        self.params = params
        self.ph_obs = ph_obs

        reg = None  # don't use regularization

        obs_space = main_observation_space(env)

        # target network: randomly initialized and never trained, hence the stop_gradient
        # (the original code had the target/predictor feature assignments crossed)
        target_enc_params = get_enc_params(params, 'rnd_target')
        target_encoder = make_encoder(ph_obs, obs_space, reg, target_enc_params, name='target_encoder')
        self.tgt_features = tf.stop_gradient(target_encoder.encoded_input)

        # predictor network: trained to match the frozen target network's output
        predictor_enc_params = get_enc_params(params, 'rnd_predictor')
        predictor_encoder = make_encoder(ph_obs, obs_space, reg, predictor_enc_params, name='predictor_encoder')
        self.predicted_features = predictor_encoder.encoded_input

        self.feature_vector_size = self.predicted_features.get_shape().as_list()[-1]
        log.info('Feature vector size in RND module: %d', self.feature_vector_size)

        self.objectives = self._objectives()

        self._add_summaries()
        self.summaries = merge_summaries(collections=['rnd'])

        self.step = tf.Variable(0, trainable=False, dtype=tf.int64, name='rnd_step')

        opt = tf.train.AdamOptimizer(learning_rate=self.params.learning_rate, name='rnd_opt')
        self.train_rnd = opt.minimize(self.objectives.loss, global_step=self.step)
def _save(self):
    checkpoint = self._get_checkpoint_dict()
    assert checkpoint is not None

    checkpoint_dir = self.checkpoint_dir(self.cfg, self.policy_id)
    tmp_filepath = join(checkpoint_dir, '.temp_checkpoint')
    checkpoint_name = f'checkpoint_{self.train_step:09d}_{self.env_steps}.pth'
    filepath = join(checkpoint_dir, checkpoint_name)

    log.info('Saving %s...', tmp_filepath)
    torch.save(checkpoint, tmp_filepath)
    log.info('Renaming %s to %s', tmp_filepath, filepath)
    os.rename(tmp_filepath, filepath)

    while len(self.get_checkpoints(checkpoint_dir)) > self.cfg.keep_checkpoints:
        oldest_checkpoint = self.get_checkpoints(checkpoint_dir)[0]
        if os.path.isfile(oldest_checkpoint):
            log.debug('Removing %s', oldest_checkpoint)
            os.remove(oldest_checkpoint)

    if self.cfg.save_milestones_sec > 0:
        # milestones enabled
        if time.time() - self.last_milestone_time >= self.cfg.save_milestones_sec:
            milestones_dir = ensure_dir_exists(join(checkpoint_dir, 'milestones'))
            milestone_path = join(milestones_dir, f'{checkpoint_name}.milestone')
            log.debug('Saving a milestone %s', milestone_path)
            shutil.copy(filepath, milestone_path)
            self.last_milestone_time = time.time()
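# A minimal standalone sketch (simplified, not the actual checkpointing code) of the
# write-to-temp-then-rename pattern used above. os.rename() is atomic on POSIX when
# source and destination are on the same filesystem, so a concurrent reader either
# sees the old checkpoint or the complete new one, never a half-written file.
import os
from os.path import join

def atomic_write(checkpoint_dir, name, data: bytes):
    tmp_path = join(checkpoint_dir, '.tmp_' + name)
    final_path = join(checkpoint_dir, name)
    with open(tmp_path, 'wb') as f:
        f.write(data)
        f.flush()
        os.fsync(f.fileno())  # make sure the bytes hit the disk before the rename
    os.rename(tmp_path, final_path)  # atomic replace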
def test_multi_env_performance(test, env_type, num_envs, num_workers):
    t = Timing()

    with t.timeit('init'):
        multi_env = MultiEnv(num_envs, num_workers, test.make_env, stats_episodes=100)
        total_num_frames, frames = 20000, 0

    with t.timeit('first_reset'):
        multi_env.reset()

    next_print = print_step = 10000
    with t.timeit('experience'):
        while frames < total_num_frames:
            _, _, done, info = multi_env.step([0] * num_envs)
            frames += num_env_steps(info)
            if frames > next_print:
                log.info('Collected %d frames of experience...', frames)
                next_print += print_step

    fps = total_num_frames / t.experience
    log.debug('%s performance:', env_type)
    log.debug('Took %.3f sec to collect %d frames in parallel, %.1f FPS', t.experience, total_num_frames, fps)
    log.debug('Timing: %s', t)

    multi_env.close()
def calc_distance_to_memory(agent, sparse_map, obs):
    distance_net = agent.curiosity.distance
    num_landmarks = sparse_map.num_landmarks()

    curr_obs = [obs] * num_landmarks
    map_obs = [sparse_map.get_observation(node) for node in sparse_map.graph.nodes]

    distances = distance_net.distances_from_obs(
        agent.session,
        obs_first=map_obs,
        obs_second=curr_obs,
    )
    min_d, min_d_idx = min_with_idx(distances)

    global last_distances
    last_distances.append(min_d)
    # log.info('Avg. distance: %.3f', np.mean(last_distances))
    log.info('Curr. distance: %.3f', min_d)

    import cv2
    closest_node = list(sparse_map.graph.nodes)[min_d_idx]
    closest_obs = sparse_map.get_observation(closest_node)
    cv2.imshow('closest_obs', cv2.resize(cv2.cvtColor(closest_obs, cv2.COLOR_RGB2BGR), (420, 420)))
    cv2.waitKey(1)
def set_gpus_for_process(process_idx, num_gpus_per_process, process_type, gpu_mask=None):
    available_gpus = get_available_gpus()
    if gpu_mask is not None:
        assert len(available_gpus) >= len(gpu_mask)  # original compared available_gpus to itself, which is always true
        available_gpus = [available_gpus[g] for g in gpu_mask]
    num_gpus = len(available_gpus)

    gpus_to_use = []
    if num_gpus == 0:
        os.environ[CUDA_ENVVAR] = ''
        log.debug('Not using GPUs for %s process %d', process_type, process_idx)
    else:
        first_gpu_idx = process_idx * num_gpus_per_process
        for i in range(num_gpus_per_process):
            index_mod_num_gpus = (first_gpu_idx + i) % num_gpus
            gpus_to_use.append(available_gpus[index_mod_num_gpus])

        os.environ[CUDA_ENVVAR] = ','.join([str(g) for g in gpus_to_use])
        log.info(
            'Set environment var %s to %r for %s process %d',
            CUDA_ENVVAR, os.environ[CUDA_ENVVAR], process_type, process_idx,
        )
        log.debug('Visible devices: %r', torch.cuda.device_count())

    return gpus_to_use
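# A minimal sketch (hypothetical helper, not in the original source) of the round-robin
# GPU assignment above: process_idx * num_gpus_per_process selects the first GPU, and
# indices wrap modulo the number of visible GPUs, so having more processes than GPUs
# still produces a valid assignment.
def round_robin_gpus(process_idx, num_gpus_per_process, num_gpus):
    first = process_idx * num_gpus_per_process
    return [(first + i) % num_gpus for i in range(num_gpus_per_process)]

# e.g. 4 GPUs, 1 GPU per process: processes 0..5 get GPUs 0, 1, 2, 3, 0, 1
assert [round_robin_gpus(p, 1, 4) for p in range(6)] == [[0], [1], [2], [3], [0], [1]]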
def _advance_rollouts(self, data, timing):
    split_idx = data['split_idx']

    runner = self.env_runners[split_idx]
    policy_request, complete_rollouts, episodic_stats = runner.advance_rollouts(data, timing)

    with timing.add_time('complete_rollouts'):
        if complete_rollouts:
            self._enqueue_complete_rollouts(split_idx, complete_rollouts)

            if self.num_complete_rollouts == 0 and not self.cfg.benchmark:
                # we just finished our first complete rollouts, perfect time to wait for experience decorrelation
                # this guarantees that there won't be any "old" trajectories when we awaken
                delay = (float(self.worker_idx) / self.cfg.num_workers) * self.cfg.decorrelate_experience_max_seconds
                log.info(
                    'Worker %d, sleep for %.3f sec to decorrelate experience collection',
                    self.worker_idx, delay,
                )
                time.sleep(delay)
                log.info('Worker %d awakens!', self.worker_idx)

            self.num_complete_rollouts += len(complete_rollouts['rollouts'])

    with timing.add_time('enqueue_policy_requests'):
        if policy_request is not None:
            self._enqueue_policy_request(split_idx, policy_request)

    if episodic_stats:
        self._report_stats(episodic_stats)
def _handle_reset(self):
    for split_idx, env_runner in enumerate(self.env_runners):
        policy_inputs = env_runner.reset(self.report_queue)
        self._enqueue_policy_request(split_idx, policy_inputs)

    log.info('Finished reset for worker %d', self.worker_idx)
    self.report_queue.put(dict(finished_reset=self.worker_idx))
def run_policy_loop(agent, env, max_num_episodes, fps=7, deterministic=False):
    """Execute the policy and render onto the screen, using the standard agent interface."""
    agent.initialize()

    episode_rewards = []
    for _ in range(max_num_episodes):
        obs, done = env.reset(), False
        episode_reward = 0

        while not done:
            start = time.time()
            env.render()
            if fps < 1000:
                time.sleep(1.0 / fps)

            action = agent.best_action(obs, deterministic=deterministic)
            obs, rew, done, _ = env.step(action)
            episode_reward += rew

            log.info('Actual fps: %.1f', 1.0 / (time.time() - start))

        env.render()
        time.sleep(0.2)

        episode_rewards.append(episode_reward)
        last_episodes = episode_rewards[-100:]
        avg_reward = sum(last_episodes) / len(last_episodes)
        log.info(
            'Episode reward: %f, avg reward for %d episodes: %f',
            episode_reward, len(last_episodes), avg_reward,
        )

    agent.finalize()
    env.close()
    return 0
def __init__(self, cfg):
    self.cfg = cfg

    if self.cfg.seed is not None:
        log.info('Setting fixed seed %d', self.cfg.seed)
        torch.manual_seed(self.cfg.seed)
        np.random.seed(self.cfg.seed)

    self.device = torch.device('cuda')

    self.train_step = self.env_steps = 0
    self.total_train_seconds = 0
    self.last_training_step = time.time()

    self.best_avg_reward = math.nan

    self.summary_rate_decay = LinearDecay([(0, 100), (1000000, 2000), (10000000, 10000)])
    self.last_summary_written = -1e9

    self.save_rate_decay = LinearDecay([(0, self.cfg.initial_save_rate), (1000000, 5000)], staircase=100)

    summary_dir = summaries_dir(experiment_dir(cfg=self.cfg))
    self.writer = SummaryWriter(summary_dir, flush_secs=10)
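# A minimal sketch of what a schedule like LinearDecay above might compute (an
# assumption: the actual class is defined elsewhere in the codebase). Values are
# interpolated linearly between (step, value) breakpoints and clamped at the ends.
def linear_decay_at(breakpoints, step):
    s0, v0 = breakpoints[0]
    if step <= s0:
        return v0
    for s1, v1 in breakpoints[1:]:
        if step <= s1:
            frac = (step - s0) / (s1 - s0)
            return v0 + frac * (v1 - v0)
        s0, v0 = s1, v1
    return v0  # past the last breakpoint

# e.g. the summary interval grows from 100 to 2000 steps over the first 1M env steps
assert linear_decay_at([(0, 100), (1000000, 2000)], 500000) == 1050.0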
def run(args, config):
    local_mode = False
    if args.dbg:
        local_mode = True
    ray.init(local_mode=local_mode)

    cls = get_agent_class(args._run)
    agent = cls(env=args.env, config=config)
    agent.restore(args.checkpoint)
    num_steps = int(1e9)

    render_frameskip = args.render_action_repeat
    if render_frameskip == -1:
        # default: read from config, fall back to the default value if env config does not have it
        render_frameskip = cfg_param('skip_frames', config.get('env_config', None))

    log.info('Using render frameskip %d!', render_frameskip)

    rollout_loop(
        agent, args.env, num_steps,
        num_episodes=args.num_episodes, no_render=args.no_render, fps=args.fps, frameskip=render_frameskip,
    )
def _init(self):
    """
    Initialize env runners, which actually do all the work. Also does some utility stuff here,
    e.g. setting process affinity (this is a performance optimization).
    """
    log.info('Initializing envs for env runner %d...', self.worker_idx)

    if self.cfg.force_envs_single_thread:
        from threadpoolctl import threadpool_limits
        threadpool_limits(limits=1, user_api=None)

    if self.cfg.set_workers_cpu_affinity:
        set_process_cpu_affinity(self.worker_idx, self.cfg.num_workers)
    psutil.Process().nice(min(self.cfg.default_niceness + 10, 20))

    self.env_runners = []
    for split_idx in range(self.num_splits):
        env_runner = VectorEnvRunner(
            self.cfg, self.vector_size // self.num_splits, self.worker_idx, split_idx,
            self.num_agents, self.shared_buffers, self.reward_shaping,
        )
        env_runner.init()
        self.env_runners.append(env_runner)
def cat(self, dict_of_tensor_arrays, macro_batch_size, use_pinned_memory, timing):
    """
    Here 'macro_batch' is the overall size of experience per iteration.
    Macro-batch = mini-batch * num_batches_per_iteration
    """
    tensor_batch = self.batch_pool.get()

    if tensor_batch is not None:
        old_batch_size = tensor_batch_size(tensor_batch)
        if old_batch_size != macro_batch_size:
            # this can happen due to PBT changing batch size during the experiment
            log.warning('Tensor macro-batch size changed from %d to %d!', old_batch_size, macro_batch_size)
            log.warning('Discarding the cached tensor batch!')
            del tensor_batch
            tensor_batch = None

    if tensor_batch is None:
        tensor_batch = copy_dict_structure(dict_of_tensor_arrays)
        log.info('Allocating new CPU tensor batch (could not get from the pool)')

        for d1, cache_d, key, tensor_arr, _ in iter_dicts_recursively(dict_of_tensor_arrays, tensor_batch):
            cache_d[key] = torch.cat(tensor_arr, dim=0)
            if use_pinned_memory:
                cache_d[key] = cache_d[key].pin_memory()
    else:
        with timing.add_time('batcher_mem'):
            for d1, cache_d, key, tensor_arr, cache_t in iter_dicts_recursively(dict_of_tensor_arrays, tensor_batch):
                offset = 0
                for t in tensor_arr:
                    first_dim = t.shape[0]
                    cache_t[offset:offset + first_dim].copy_(t)
                    offset += first_dim

    return tensor_batch
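# A minimal standalone sketch (simplified, not the actual batcher code) of why the
# cached batch above is reused: torch.cat allocates a brand new tensor on every call,
# while copying into a preallocated tensor reuses memory; pinned (page-locked) memory
# additionally speeds up subsequent CPU->GPU transfers.
import torch

chunks = [torch.randn(4, 8) for _ in range(3)]

# first iteration: allocate the cache once
cache = torch.cat(chunks, dim=0)
if torch.cuda.is_available():
    cache = cache.pin_memory()  # page-locked memory enables fast, async H2D copies

# later iterations: copy in place instead of reallocating
offset = 0
for t in chunks:
    n = t.shape[0]
    cache[offset:offset + n].copy_(t)
    offset += n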
def _init(self, init_info):
    log.info('Initializing env for player %d, init_info: %r...', self.player_id, init_info)
    env = init_multiplayer_env(self.make_env_func, self.player_id, self.env_config, init_info)
    env.reset()
    return env
def main():
    experiments_dir = '/home/alex/all/projects/sample-factory/train_dir'
    all_experiment_dirs_list = [join(experiments_dir, v['dir']) for k, v in EXPERIMENTS.items()]

    for experiment_dir in all_experiment_dirs_list:
        log.debug('Experiment dir: %s', experiment_dir)
    log.debug('Total: %d', len(all_experiment_dirs_list))

    for env, details in EXPERIMENTS.items():
        env_dir = join(experiments_dir, details['dir'])
        event_files = list(Path(env_dir).rglob('*.tfevents.*'))
        log.info('Event files: %r', event_files)

        env_dirs = set()
        for event_file in event_files:
            env_dirs.add(os.path.dirname(event_file))

        EXPERIMENTS[env]['dirs'] = sorted(list(env_dirs))
        log.info('Env dirs for env %s: %r', env, env_dirs)

    EXPERIMENT_GROUPS = (('dmlab30',),)

    for group_i, exp_group in enumerate(EXPERIMENT_GROUPS):
        fig, ax = plt.subplots(1, 1)
        ax = [ax]

        count = 0
        for env in exp_group:
            experiments = EXPERIMENTS[env]['dirs']
            aggregate(env, experiments, count, ax[count])
            count += 1

        # handles, labels = ax[-1].get_legend_handles_labels()
        # lgd = fig.legend(handles, labels, bbox_to_anchor=(0.1, 0.88, 0.8, 0.2), loc='lower left', ncol=4, mode="expand", prop={'size': 6})
        # lgd.set_in_layout(True)
        # plt.show()

        plt.tight_layout(rect=(0, 0, 1.0, 0.9))
        plt.margins(0, 0)

        plot_name = 'dmlab30'
        plt.savefig(
            os.path.join(os.getcwd(), f'../final_plots/reward_{plot_name}.pdf'),
            format='pdf', bbox_inches='tight', pad_inches=0,
        )
        # plt.savefig(os.path.join(os.getcwd(), f'../final_plots/reward_{plot_name}.pdf'), format='pdf', bbox_extra_artists=(lgd,))

    return 0
def parse_args(default_env, default_experiment_name, params_cls):
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # common args
    parser.add_argument('--experiment', type=str, default=None)
    parser.add_argument('--env', type=str, default=default_env)

    # params object args
    params_cls.add_cli_args(parser)

    args = parser.parse_args()

    experiment = args.experiment
    if experiment is None:
        experiment = get_experiment_name(args.env, default_experiment_name)

    params = params_cls(experiment)
    params.set_command_line(sys.argv)
    params.update(args)

    log.info('Config:')
    for arg in vars(args):
        log.info('%s %r', arg, getattr(args, arg))

    return args, params
def _train_critic(self, buffer, env_steps):
    # train critic
    summary = None
    critic_step = self.critic_step.eval(session=self.session)

    prev_loss = 1e10
    for epoch in range(self.params.ppo_epochs):
        losses = []
        buffer.shuffle()

        for i in range(0, len(buffer), self.params.batch_size):
            with_summaries = self._should_write_summaries(critic_step) and summary is None
            summaries = [self.critic_summaries] if with_summaries else []

            start, end = i, i + self.params.batch_size
            feed_dict = self.input_dict(buffer, start, end)

            result = self.session.run(
                [self.objectives.critic_loss, self.train_critic] + summaries,
                feed_dict=feed_dict,
            )

            critic_step += 1
            losses.append(result[0])

            if with_summaries:
                summary = result[-1]
                self.summary_writer.add_summary(summary, global_step=env_steps)

        # check loss improvement at the end of each epoch, early stop if necessary
        avg_loss = np.mean(losses)
        if avg_loss >= prev_loss:
            log.info('Early stopping after %d epochs because critic did not improve', epoch)
            log.info('Was %.4f now %.4f, ratio %.3f', prev_loss, avg_loss, avg_loss / prev_loss)
            break

        prev_loss = avg_loss
def train_distance(params, env_id):
    def make_env_func():
        e = create_env(env_id)
        return e

    agent = AgentTMAX(make_env_func, params)
    agent.initialize()

    multi_env = None
    try:
        multi_env = MultiEnv(
            params.num_envs,
            params.num_workers,
            make_env_func=agent.make_env_func,
            stats_episodes=params.stats_episodes,
        )

        train_loop(agent, multi_env)
    except (Exception, KeyboardInterrupt, SystemExit):
        log.exception('Interrupt...')
    finally:
        log.info('Closing env...')
        if multi_env is not None:
            multi_env.close()

        agent.finalize()

    return 0
def reset(self):
    if self._episode_recording_dir is not None and self._record_id > 0:
        # save actions to a JSON file
        with open(join(self._episode_recording_dir, 'actions.json'), 'w') as actions_file:
            json.dump(self._recorded_actions, actions_file)

        # rename previous episode dir
        reward = self._recorded_episode_reward + self._recorded_episode_shaping_reward
        new_dir_name = self._episode_recording_dir + f'_r{reward:.2f}'
        os.rename(self._episode_recording_dir, new_dir_name)
        log.info(
            'Finished recording %s (rew %.3f, shaping %.3f)',
            new_dir_name, reward, self._recorded_episode_shaping_reward,
        )

    dir_name = f'ep_{self._record_id:03d}_p{self._player_id}'
    self._episode_recording_dir = join(self._record_to, dir_name)
    ensure_dir_exists(self._episode_recording_dir)

    self._record_id += 1
    self._frame_id = 0
    self._recorded_episode_reward = 0
    self._recorded_episode_shaping_reward = 0
    self._recorded_actions = []

    return self.env.reset()
def test_running_mean_std(self):
    running_mean_std = RunningMeanStd(max_past_samples=100000)

    true_mu, true_sigma, batch_size = -1, 3, 256

    x = np.random.normal(true_mu, true_sigma, batch_size)
    running_mean_std.update(x)

    # after one batch the running estimates should match the batch statistics almost exactly
    batch_mean = np.mean(x, axis=0)
    batch_var = np.var(x, axis=0)
    self.assertAlmostEqual(running_mean_std.mean, batch_mean, places=5)
    self.assertAlmostEqual(running_mean_std.var, batch_var, places=5)
    self.assertAlmostEqual(running_mean_std.count, batch_size, places=3)

    # after many batches we should have an accurate estimate
    for _ in range(1000):
        x = np.random.normal(true_mu, true_sigma, batch_size)
        running_mean_std.update(x)

    log.info('estimated mean %.2f variance %.2f', running_mean_std.mean, running_mean_std.var)
    self.assertAlmostEqual(running_mean_std.mean, true_mu, places=0)
    self.assertAlmostEqual(running_mean_std.var, true_sigma ** 2, places=0)
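# A minimal sketch of the streaming update a class like RunningMeanStd typically performs
# (an assumption: the tested class is defined elsewhere, and this ignores the
# max_past_samples clamping). This is the standard parallel-variance merge of the
# running (mean, var, count) with a new batch's statistics.
import numpy as np

def update_mean_var_count(mean, var, count, batch):
    batch_mean, batch_var, batch_count = np.mean(batch), np.var(batch), len(batch)
    delta = batch_mean - mean
    total = count + batch_count

    new_mean = mean + delta * batch_count / total
    m_a = var * count
    m_b = batch_var * batch_count
    m2 = m_a + m_b + delta ** 2 * count * batch_count / total
    return new_mean, m2 / total, total

# usage: start from near-zero statistics and feed batches
mean, var, count = 0.0, 1.0, 1e-4
for _ in range(100):
    mean, var, count = update_mean_var_count(mean, var, count, np.random.normal(-1, 3, 256))
print(mean, var)  # approaches mu = -1, sigma^2 = 9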
def safe_get(q, timeout=1e6, msg='Queue timeout'):
    """Using queue.get() with a timeout is necessary, otherwise KeyboardInterrupt is not handled."""
    while True:
        try:
            return q.get(timeout=timeout)
        except Empty:
            log.info('Queue timed out (%s), timeout %.3f', msg, timeout)
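# A minimal usage sketch (hypothetical worker loop, not from the original source).
# A bare q.get() with no timeout can block inside C code and delay KeyboardInterrupt
# indefinitely on some platforms; looping over a finite timeout returns control to the
# interpreter periodically so Ctrl+C is handled promptly.
from multiprocessing import JoinableQueue

q = JoinableQueue()
q.put('task')

item = safe_get(q, timeout=0.1, msg='worker idle')
print(item)  # 'task'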
def _load_state(self, checkpoint_dict, load_progress=True):
    if load_progress:
        self.train_step = checkpoint_dict['train_step']
        self.env_steps = checkpoint_dict['env_steps']
    self.actor_critic.load_state_dict(checkpoint_dict['model'])
    self.optimizer.load_state_dict(checkpoint_dict['optimizer'])
    log.info('Loaded experiment state at training iteration %d, env step %d', self.train_step, self.env_steps)
def _ensure_initialized(self):
    if self.initialized:
        return

    num_attempts = 25
    attempt = 0
    for attempt in range(num_attempts):
        self.workers = [
            MultiAgentEnvWorker(i, self.make_env_func, self.env_config) for i in range(self.num_agents)
        ]

        try:
            port_to_use = udp_port_num(self.env_config)
            port = find_available_port(port_to_use, increment=1000)
            log.debug('Using port %d', port)
            init_info = dict(port=port)

            for i, worker in enumerate(self.workers):
                worker.task_queue.put((init_info, TaskType.INIT))
                if self.safe_init:
                    time.sleep(1.0)  # just in case
                else:
                    time.sleep(0.01)

            for i, worker in enumerate(self.workers):
                worker.result_queue.get(timeout=5)
                worker.result_queue.task_done()
                worker.task_queue.join()
        except Exception as exc:
            for worker in self.workers:
                if isinstance(worker.process, threading.Thread):
                    log.info('We cannot really kill a thread, so let the whole process die')
                    raise RuntimeError('Critical error: worker stuck on initialization. Abort!')
                else:
                    log.info('Killing process %r', worker.process.pid)
                    kill(worker.process.pid)

            del self.workers
            log.warning('Could not initialize env, try again! Error: %r', exc)
            time.sleep(1)
        else:
            break
    else:
        # the loop completed without break, i.e. all attempts failed
        # (the original checked `attempt >= num_attempts` after the loop, which could never be true)
        log.error('Could not initialize env even after %d attempts. Fail!', num_attempts)
        raise RuntimeError('Critical error: worker stuck on initialization, num attempts exceeded. Abort!')

    log.debug('%d agent workers initialized for env %d!', len(self.workers), self.env_config.worker_index)
    log.debug('Took %d attempts!\n', attempt + 1)
    self.initialized = True
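# A minimal sketch (hypothetical example, not from the original source) of the retry
# idiom used above: Python's for/else runs the else clause only when the loop finishes
# without break, which cleanly distinguishes "succeeded on some attempt" from
# "exhausted all attempts".
def retry(fn, num_attempts=3):
    for attempt in range(num_attempts):
        try:
            result = fn()
        except Exception as exc:
            print(f'attempt {attempt + 1} failed: {exc!r}')
        else:
            break  # success: the else clause below is skipped
    else:
        raise RuntimeError(f'all {num_attempts} attempts failed')
    return result

# usage: the flaky function fails once, then succeeds on the second attempt
attempts = []
def flaky():
    attempts.append(1)
    if len(attempts) < 2:
        raise RuntimeError('transient failure')
    return 'ok'

print(retry(flaky))  # one failure logged, then 'ok'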
def _load_state(self, checkpoint_dict):
    self.train_step = checkpoint_dict['train_step']
    self.env_steps = checkpoint_dict['env_steps']
    self.best_avg_reward = checkpoint_dict['best_avg_reward']
    self.total_train_seconds = checkpoint_dict['total_train_seconds']
    log.info('Loaded experiment state at training iteration %d, env step %d', self.train_step, self.env_steps)
def has_enough_data(self):
    len_data, min_data = len(self.buffer), self.params.distance_target_buffer_size // 3
    if len_data < min_data:
        log.info('Need to gather more data to train distance net, %d/%d', len_data, min_data)
        return False
    return True
def has_enough_data(self):
    len_data, min_data = len(self.buffer), self.params.locomotion_experience_replay_buffer // 3
    if len_data < min_data:
        log.info('Need to gather more data to train locomotion net, %d/%d', len_data, min_data)
        return False
    return True
def _maybe_save(self, step, env_steps):
    self.params.ensure_serialized()
    save_every = self.save_rate_decay.at(step)
    if (step + 1) % save_every == 0:
        log.info('Training step #%d, env steps: %d, saving...', step, env_steps)
        saver_path = model_dir(self.params.experiment_dir()) + '/' + self.__class__.__name__
        self.session.run(self.update_env_steps, feed_dict={self.total_env_steps_placeholder: env_steps})
        self.saver.save(self.session, saver_path, global_step=step)
def close(self):
    log.info('Stopping multi env wrapper...')

    for worker in self.workers:
        worker.task_queue.put((None, MsgType.TERMINATE))
        time.sleep(0.1)

    for worker in self.workers:
        worker.process.join()
def _init(self, envs):
    log.info('Initializing envs %s...', list_to_string(self.env_indices))
    for i in self.env_indices:
        env = self.make_env_func()
        env.seed(i)
        env.reset()
        envs.append(env)
        time.sleep(0.01)