def forward_pass(device_type):
    env_name = 'atari_breakout'
    cfg = default_cfg(algo='APPO', env=env_name)
    cfg.actor_critic_share_weights = True
    cfg.hidden_size = 128
    cfg.use_rnn = True
    cfg.env_framestack = 4

    env = create_env(env_name, cfg=cfg)

    torch.set_num_threads(1)
    torch.backends.cudnn.benchmark = True

    actor_critic = create_actor_critic(cfg, env.observation_space, env.action_space)
    device = torch.device(device_type)
    actor_critic.to(device)

    timing = Timing()
    with timing.timeit('all'):
        batch = 128
        with timing.add_time('input'):
            # better avoid hardcoding here...
            observations = dict(obs=torch.rand([batch, 4, 84, 84]).to(device))
            rnn_states = torch.rand([batch, get_hidden_size(cfg)]).to(device)

        n = 200
        for i in range(n):
            with timing.add_time('forward'):
                output = actor_critic(observations, rnn_states)

            log.debug('Progress %d/%d', i, n)

    log.debug('Timing: %s', timing)

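# A minimal usage sketch (not part of the original excerpt): run the forward-pass benchmark
# on CPU and, if a GPU is visible, on CUDA as well. It assumes forward_pass() above and its
# dependencies (torch, default_cfg, create_env, create_actor_critic, get_hidden_size,
# Timing, log) are already imported in this module.
if __name__ == '__main__':
    forward_pass('cpu')
    if torch.cuda.is_available():
        forward_pass('cuda')
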
def enjoy(cfg, max_num_frames=1e9):
    cfg = load_from_checkpoint(cfg)

    render_action_repeat = cfg.render_action_repeat if cfg.render_action_repeat is not None else cfg.env_frameskip
    if render_action_repeat is None:
        log.warning('Not using action repeat!')
        render_action_repeat = 1
    log.debug('Using action repeat %d during evaluation', render_action_repeat)

    cfg.env_frameskip = 1  # for evaluation
    cfg.num_envs = 1

    if cfg.record_to:
        tstamp = datetime.datetime.now().strftime('%Y_%m_%d__%H_%M_%S')
        cfg.record_to = join(cfg.record_to, f'{cfg.experiment}', tstamp)
        if not os.path.isdir(cfg.record_to):
            os.makedirs(cfg.record_to)
    else:
        cfg.record_to = None

    def make_env_func(env_config):
        return create_env(cfg.env, cfg=cfg, env_config=env_config)

    env = make_env_func(AttrDict({'worker_index': 0, 'vector_index': 0}))
    # env.seed(0)

    is_multiagent = is_multiagent_env(env)
    if not is_multiagent:
        env = MultiAgentWrapper(env)

    if hasattr(env.unwrapped, 'reset_on_init'):
        # reset call ruins the demo recording for VizDoom
        env.unwrapped.reset_on_init = False

    actor_critic = create_actor_critic(cfg, env.observation_space, env.action_space)

    device = torch.device('cpu' if cfg.device == 'cpu' else 'cuda')
    actor_critic.model_to_device(device)

    policy_id = cfg.policy_index
    checkpoints = LearnerWorker.get_checkpoints(LearnerWorker.checkpoint_dir(cfg, policy_id))
    checkpoint_dict = LearnerWorker.load_checkpoint(checkpoints, device)
    actor_critic.load_state_dict(checkpoint_dict['model'])

    episode_rewards = [deque([], maxlen=100) for _ in range(env.num_agents)]
    true_rewards = [deque([], maxlen=100) for _ in range(env.num_agents)]
    num_frames = 0

    last_render_start = time.time()

    def max_frames_reached(frames):
        return max_num_frames is not None and frames > max_num_frames

    obs = env.reset()
    rnn_states = torch.zeros([env.num_agents, get_hidden_size(cfg)], dtype=torch.float32, device=device)
    episode_reward = np.zeros(env.num_agents)
    finished_episode = [False] * env.num_agents

    with torch.no_grad():
        while not max_frames_reached(num_frames):
            obs_torch = AttrDict(transform_dict_observations(obs))
            for key, x in obs_torch.items():
                obs_torch[key] = torch.from_numpy(x).to(device).float()

            policy_outputs = actor_critic(obs_torch, rnn_states, with_action_distribution=True)

            # sample actions from the distribution by default
            actions = policy_outputs.actions

            action_distribution = policy_outputs.action_distribution
            if isinstance(action_distribution, ContinuousActionDistribution):
                if not cfg.continuous_actions_sample:  # TODO: add similar option for discrete actions
                    actions = action_distribution.means

            actions = actions.cpu().numpy()

            rnn_states = policy_outputs.rnn_states

            for _ in range(render_action_repeat):
                if not cfg.no_render:
                    target_delay = 1.0 / cfg.fps if cfg.fps > 0 else 0
                    current_delay = time.time() - last_render_start
                    time_wait = target_delay - current_delay

                    if time_wait > 0:
                        # log.info('Wait time %.3f', time_wait)
                        time.sleep(time_wait)

                    last_render_start = time.time()
                    env.render()

                obs, rew, done, infos = env.step(actions)

                episode_reward += rew
                num_frames += 1

                for agent_i, done_flag in enumerate(done):
                    if done_flag:
                        finished_episode[agent_i] = True
                        episode_rewards[agent_i].append(episode_reward[agent_i])
                        true_rewards[agent_i].append(infos[agent_i].get('true_reward', math.nan))
                        log.info(
                            'Episode finished for agent %d at %d frames. Reward: %.3f, true_reward: %.3f',
                            agent_i, num_frames, episode_reward[agent_i], true_rewards[agent_i][-1])
                        rnn_states[agent_i] = torch.zeros([get_hidden_size(cfg)], dtype=torch.float32, device=device)
                        episode_reward[agent_i] = 0

                # if episode terminated synchronously for all agents, pause a bit before starting a new one
                if all(done):
                    if not cfg.no_render:
                        env.render()
                    time.sleep(0.05)

                if all(finished_episode):
                    finished_episode = [False] * env.num_agents
                    avg_episode_rewards_str, avg_true_reward_str = '', ''
                    for agent_i in range(env.num_agents):
                        avg_rew = np.mean(episode_rewards[agent_i])
                        avg_true_rew = np.mean(true_rewards[agent_i])
                        if not np.isnan(avg_rew):
                            if avg_episode_rewards_str:
                                avg_episode_rewards_str += ', '
                            avg_episode_rewards_str += f'#{agent_i}: {avg_rew:.3f}'
                        if not np.isnan(avg_true_rew):
                            if avg_true_reward_str:
                                avg_true_reward_str += ', '
                            avg_true_reward_str += f'#{agent_i}: {avg_true_rew:.3f}'
                    log.info('Avg episode rewards: %s, true rewards: %s', avg_episode_rewards_str, avg_true_reward_str)
                    log.info(
                        'Avg episode reward: %.3f, avg true_reward: %.3f',
                        np.mean([np.mean(episode_rewards[i]) for i in range(env.num_agents)]),
                        np.mean([np.mean(true_rewards[i]) for i in range(env.num_agents)]))

    # VizDoom multiplayer stuff
    # for player in [1, 2, 3, 4, 5, 6, 7, 8]:
    #     key = f'PLAYER{player}_FRAGCOUNT'
    #     if key in infos[0]:
    #         log.debug('Score for player %d: %r', player, infos[0][key])

    env.close()

    return ExperimentStatus.SUCCESS, np.mean(episode_rewards)

def multi_agent_match(policy_indices, max_num_episodes=int(1e9), max_num_frames=1e10):
    log.debug('Starting eval process with policies %r', policy_indices)
    for i, rival in enumerate(RIVALS):
        rival.policy_index = policy_indices[i]

    curr_dir = os.path.dirname(os.path.abspath(__file__))
    evaluation_filename = join(curr_dir, f'eval_{"vs".join([str(pi) for pi in policy_indices])}.txt')
    with open(evaluation_filename, 'w') as fobj:
        fobj.write('start\n')

    common_config = RIVALS[0].cfg

    render_action_repeat = common_config.render_action_repeat if common_config.render_action_repeat is not None else common_config.env_frameskip
    if render_action_repeat is None:
        log.warning('Not using action repeat!')
        render_action_repeat = 1
    log.debug('Using action repeat %d during evaluation', render_action_repeat)

    common_config.env_frameskip = 1  # for evaluation
    common_config.num_envs = 1
    common_config.timelimit = 4.0  # for faster evaluation

    def make_env_func(env_config):
        return create_env(ENV_NAME, cfg=common_config, env_config=env_config)

    env = make_env_func(AttrDict({'worker_index': 0, 'vector_index': 0}))
    env.seed(0)

    is_multiagent = is_multiagent_env(env)
    if not is_multiagent:
        env = MultiAgentWrapper(env)
    else:
        assert env.num_agents == len(RIVALS)

    device = torch.device('cuda')
    for rival in RIVALS:
        rival.actor_critic = create_actor_critic(rival.cfg, env.observation_space, env.action_space)
        rival.actor_critic.model_to_device(device)

        policy_id = rival.policy_index
        checkpoints = LearnerWorker.get_checkpoints(LearnerWorker.checkpoint_dir(rival.cfg, policy_id))
        checkpoint_dict = LearnerWorker.load_checkpoint(checkpoints, device)
        rival.actor_critic.load_state_dict(checkpoint_dict['model'])

    episode_rewards = []
    num_frames = 0

    last_render_start = time.time()

    def max_frames_reached(frames):
        return max_num_frames is not None and frames > max_num_frames

    wins = [0 for _ in RIVALS]
    ties = 0
    frag_differences = []

    with torch.no_grad():
        for _ in range(max_num_episodes):
            obs = env.reset()
            obs_dict_torch = dict()

            done = [False] * len(obs)
            for rival in RIVALS:
                rival.rnn_states = torch.zeros([1, rival.cfg.hidden_size], dtype=torch.float32, device=device)

            episode_reward = 0
            prev_frame = time.time()

            while True:
                actions = []
                for i, obs_dict in enumerate(obs):
                    for key, x in obs_dict.items():
                        obs_dict_torch[key] = torch.from_numpy(x).to(device).float().view(1, *x.shape)

                    rival = RIVALS[i]
                    policy_outputs = rival.actor_critic(obs_dict_torch, rival.rnn_states)
                    rival.rnn_states = policy_outputs.rnn_states
                    actions.append(policy_outputs.actions[0].cpu().numpy())

                for _ in range(render_action_repeat):
                    if not NO_RENDER:
                        target_delay = 1.0 / FPS if FPS > 0 else 0
                        current_delay = time.time() - last_render_start
                        time_wait = target_delay - current_delay

                        if time_wait > 0:
                            # log.info('Wait time %.3f', time_wait)
                            time.sleep(time_wait)

                        last_render_start = time.time()
                        env.render()

                    obs, rew, done, infos = env.step(actions)

                    if all(done):
                        log.debug('Finished episode!')

                        frag_diff = infos[0]['PLAYER1_FRAGCOUNT'] - infos[0]['PLAYER2_FRAGCOUNT']
                        if frag_diff > 0:
                            wins[0] += 1
                        elif frag_diff < 0:
                            wins[1] += 1
                        else:
                            ties += 1
                        frag_differences.append(frag_diff)
                        avg_frag_diff = np.mean(frag_differences)

                        report = f'wins: {wins}, ties: {ties}, avg_frag_diff: {avg_frag_diff}'
                        with open(evaluation_filename, 'a') as fobj:
                            fobj.write(report + '\n')

                    # log.info('%d:%d', infos[0]['PLAYER1_FRAGCOUNT'], infos[0]['PLAYER2_FRAGCOUNT'])

                    episode_reward += np.mean(rew)
                    num_frames += 1

                    if num_frames % 100 == 0:
                        log.debug('%.1f', render_action_repeat / (time.time() - prev_frame))
                    prev_frame = time.time()

                    if all(done):
                        log.info('Episode finished at %d frames', num_frames)
                        break

                if all(done) or max_frames_reached(num_frames):
                    break

                if not NO_RENDER:
                    env.render()
                time.sleep(0.01)

            episode_rewards.append(episode_reward)
            last_episodes = episode_rewards[-100:]
            avg_reward = sum(last_episodes) / len(last_episodes)
            log.info(
                'Episode reward: %f, avg reward for %d episodes: %f',
                episode_reward, len(last_episodes), avg_reward,
            )

            if max_frames_reached(num_frames):
                break

    env.close()

def init_model(self, timing):
    self.actor_critic = create_actor_critic(self.cfg, self.obs_space, self.action_space, timing)
    self.actor_critic.model_to_device(self.device)
    self.actor_critic.share_memory()

def _run(self):
    # workers should ignore Ctrl+C because the termination is handled in the event loop by a special msg
    signal.signal(signal.SIGINT, signal.SIG_IGN)

    psutil.Process().nice(min(self.cfg.default_niceness + 2, 20))

    cuda_envvars(self.policy_id)
    torch.multiprocessing.set_sharing_strategy('file_system')

    timing = Timing()

    with timing.timeit('init'):
        # initialize the Torch modules
        log.info('Initializing model on the policy worker %d-%d...', self.policy_id, self.worker_idx)

        torch.set_num_threads(1)

        if self.cfg.device == 'gpu':
            # we should already see only one CUDA device, because of env vars
            assert torch.cuda.device_count() == 1
            self.device = torch.device('cuda', index=0)
        else:
            self.device = torch.device('cpu')

        self.actor_critic = create_actor_critic(self.cfg, self.obs_space, self.action_space, timing)
        self.actor_critic.model_to_device(self.device)
        for p in self.actor_critic.parameters():
            p.requires_grad = False  # we don't train anything here

        log.info('Initialized model on the policy worker %d-%d!', self.policy_id, self.worker_idx)

    last_report = last_cache_cleanup = time.time()
    last_report_samples = 0
    request_count = deque(maxlen=50)

    # very conservative limit on the minimum number of requests to wait for
    # this will almost guarantee that the system will continue collecting experience
    # at max rate even when 2/3 of workers are stuck for some reason (e.g. doing a long env reset)
    # Although if your workflow involves very lengthy operations that often freeze workers, it can be beneficial
    # to set min_num_requests to 1 (at a cost of potential inefficiency, i.e. policy worker will use very small
    # batches)
    min_num_requests = self.cfg.num_workers // (self.cfg.num_policies * self.cfg.policy_workers_per_policy)
    min_num_requests //= 3
    min_num_requests = max(1, min_num_requests)

    # Again, very conservative timer. Only wait a little bit, then continue operation.
    wait_for_min_requests = 0.025

    while not self.terminate:
        try:
            while self.stop_experience_collection[self.policy_id]:
                with self.resume_experience_collection_cv:
                    self.resume_experience_collection_cv.wait(timeout=0.05)

            waiting_started = time.time()
            while len(self.requests) < min_num_requests and time.time() - waiting_started < wait_for_min_requests:
                try:
                    with timing.timeit('wait_policy'), timing.add_time('wait_policy_total'):
                        policy_requests = self.policy_queue.get_many(timeout=0.005)
                    self.requests.extend(policy_requests)
                except Empty:
                    pass

            self._update_weights(timing)

            with timing.timeit('one_step'), timing.add_time('handle_policy_step'):
                if self.initialized:
                    if len(self.requests) > 0:
                        request_count.append(len(self.requests))
                        self._handle_policy_steps(timing)

            try:
                task_type, data = self.task_queue.get_nowait()

                # task from the task_queue
                if task_type == TaskType.INIT:
                    self._init()
                elif task_type == TaskType.TERMINATE:
                    self.terminate = True
                    break
                elif task_type == TaskType.INIT_MODEL:
                    self._init_model(data)

                self.task_queue.task_done()
            except Empty:
                pass

            if time.time() - last_report > 3.0 and 'one_step' in timing:
                timing_stats = dict(wait_policy=timing.wait_policy, step_policy=timing.one_step)
                samples_since_last_report = self.total_num_samples - last_report_samples

                stats = memory_stats('policy_worker', self.device)
                if len(request_count) > 0:
                    stats['avg_request_count'] = np.mean(request_count)

                self.report_queue.put(dict(
                    timing=timing_stats,
                    samples=samples_since_last_report,
                    policy_id=self.policy_id,
                    stats=stats,
                ))
                last_report = time.time()
                last_report_samples = self.total_num_samples

            if time.time() - last_cache_cleanup > 300.0 or (not self.cfg.benchmark and self.total_num_samples < 1000):
                if self.cfg.device == 'gpu':
                    torch.cuda.empty_cache()
                last_cache_cleanup = time.time()

        except KeyboardInterrupt:
            log.warning('Keyboard interrupt detected on worker %d-%d', self.policy_id, self.worker_idx)
            self.terminate = True
        except Exception:
            log.exception('Unknown exception on policy worker')
            self.terminate = True

    time.sleep(0.2)
    log.info('Policy worker avg. requests %.2f, timing: %s', np.mean(request_count), timing)

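# Worked example of the min_num_requests heuristic in _run() above (illustrative numbers,
# not taken from the original config): with num_workers=72, num_policies=1 and
# policy_workers_per_policy=2, each policy worker serves 72 // (1 * 2) = 36 actor workers,
# and waiting for roughly a third of them (36 // 3 = 12) keeps inference batches reasonably
# large without stalling when a few workers are stuck in a long env reset.
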
def enjoy(cfg, max_num_episodes=1000000, max_num_frames=1e9):
    cfg = load_from_checkpoint(cfg)

    render_action_repeat = cfg.render_action_repeat if cfg.render_action_repeat is not None else cfg.env_frameskip
    if render_action_repeat is None:
        log.warning('Not using action repeat!')
        render_action_repeat = 1
    log.debug('Using action repeat %d during evaluation', render_action_repeat)

    cfg.env_frameskip = 1  # for evaluation
    cfg.num_envs = 1

    if cfg.record_to:
        tstamp = datetime.datetime.now().strftime('%Y_%m_%d__%H_%M_%S')
        cfg.record_to = join(cfg.record_to, f'{cfg.experiment}', tstamp)
        if not os.path.isdir(cfg.record_to):
            os.makedirs(cfg.record_to)
    else:
        cfg.record_to = None

    def make_env_func(env_config):
        return create_env(cfg.env, cfg=cfg, env_config=env_config)

    env = make_env_func(AttrDict({'worker_index': 0, 'vector_index': 0}))
    env.seed(0)

    is_multiagent = hasattr(env, 'num_agents') and env.num_agents > 1
    if not is_multiagent:
        env = MultiAgentWrapper(env)

    if hasattr(env.unwrapped, 'reset_on_init'):
        # reset call ruins the demo recording for VizDoom
        env.unwrapped.reset_on_init = False

    actor_critic = create_actor_critic(cfg, env.observation_space, env.action_space)

    device = torch.device('cpu' if cfg.device == 'cpu' else 'cuda')
    actor_critic.model_to_device(device)

    policy_id = cfg.policy_index
    checkpoints = LearnerWorker.get_checkpoints(LearnerWorker.checkpoint_dir(cfg, policy_id))
    checkpoint_dict = LearnerWorker.load_checkpoint(checkpoints, device)
    actor_critic.load_state_dict(checkpoint_dict['model'])

    episode_rewards = []
    true_rewards = deque([], maxlen=100)
    num_frames = 0

    last_render_start = time.time()

    def max_frames_reached(frames):
        return max_num_frames is not None and frames > max_num_frames

    obs = env.reset()

    with torch.no_grad():
        for _ in range(max_num_episodes):
            done = [False] * len(obs)
            rnn_states = torch.zeros([env.num_agents, get_hidden_size(cfg)], dtype=torch.float32, device=device)
            episode_reward = 0

            while True:
                obs_torch = AttrDict(transform_dict_observations(obs))
                for key, x in obs_torch.items():
                    obs_torch[key] = torch.from_numpy(x).to(device).float()

                policy_outputs = actor_critic(obs_torch, rnn_states, with_action_distribution=True)

                action_distribution = policy_outputs.action_distribution

                # sample actions from the distribution by default
                actions = policy_outputs.actions
                if isinstance(action_distribution, ContinuousActionDistribution):
                    if not cfg.continuous_actions_sample:  # TODO: add similar option for discrete actions
                        actions = action_distribution.means

                actions = actions.cpu().numpy()

                rnn_states = policy_outputs.rnn_states

                for _ in range(render_action_repeat):
                    if not cfg.no_render:
                        target_delay = 1.0 / cfg.fps if cfg.fps > 0 else 0
                        current_delay = time.time() - last_render_start
                        time_wait = target_delay - current_delay

                        if time_wait > 0:
                            # log.info('Wait time %.3f', time_wait)
                            time.sleep(time_wait)

                        last_render_start = time.time()
                        env.render()

                    obs, rew, done, infos = env.step(actions)

                    episode_reward += np.mean(rew)
                    num_frames += 1

                    if all(done):
                        true_rewards.append(infos[0].get('true_reward', math.nan))
                        log.info('Episode finished at %d frames', num_frames)
                        if not math.isnan(np.mean(true_rewards)):
                            log.info('true rew %.3f avg true rew %.3f', true_rewards[-1], np.mean(true_rewards))

                        # VizDoom multiplayer stuff
                        # for player in [1, 2, 3, 4, 5, 6, 7, 8]:
                        #     key = f'PLAYER{player}_FRAGCOUNT'
                        #     if key in infos[0]:
                        #         log.debug('Score for player %d: %r', player, infos[0][key])
                        break

                if all(done) or max_frames_reached(num_frames):
                    break

                if not cfg.no_render:
                    env.render()
                time.sleep(0.01)

            episode_rewards.append(episode_reward)
            last_episodes = episode_rewards[-100:]
            avg_reward = sum(last_episodes) / len(last_episodes)
            log.info(
                'Episode reward: %f, avg reward for %d episodes: %f',
                episode_reward, len(last_episodes), avg_reward,
            )

            if max_frames_reached(num_frames):
                break

    env.close()

    return ExperimentStatus.SUCCESS, np.mean(episode_rewards)

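# Hypothetical command-line entry point for enjoy() above, shown only as a sketch. The
# parse_args(evaluation=True) helper is an assumption standing in for whatever config parser
# the surrounding codebase provides; enjoy() itself only needs a fully populated cfg object.
def main():
    cfg = parse_args(evaluation=True)  # assumed helper that builds cfg from CLI arguments
    status, avg_reward = enjoy(cfg)
    log.info('Evaluation finished with status %r, avg reward %.3f', status, avg_reward)
    return status


if __name__ == '__main__':
    main()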