def _handle_policy_steps(self, timing):
    with torch.no_grad():
        with timing.add_time('deserialize'):
            observations = AttrDict()
            rnn_states = []

            traj_tensors = self.shared_buffers.tensors_individual_transitions
            for request in self.requests:
                actor_idx, split_idx, request_data = request

                for env_idx, agent_idx, traj_buffer_idx, rollout_step in request_data:
                    index = actor_idx, split_idx, env_idx, agent_idx, traj_buffer_idx, rollout_step
                    dict_of_lists_append(observations, traj_tensors['obs'], index)
                    rnn_states.append(traj_tensors['rnn_states'][index])
                    self.total_num_samples += 1

        with timing.add_time('stack'):
            for key, x in observations.items():
                observations[key] = torch.stack(x)
            rnn_states = torch.stack(rnn_states)
            num_samples = rnn_states.shape[0]

        with timing.add_time('obs_to_device'):
            for key, x in observations.items():
                device, dtype = self.actor_critic.device_and_type_for_input_tensor(key)
                observations[key] = x.to(device).type(dtype)
            rnn_states = rnn_states.to(self.device).float()

        with timing.add_time('forward'):
            policy_outputs = self.actor_critic(observations, rnn_states)

        with timing.add_time('to_cpu'):
            for key, output_value in policy_outputs.items():
                policy_outputs[key] = output_value.cpu()

        with timing.add_time('format_outputs'):
            policy_outputs.policy_version = torch.empty([num_samples]).fill_(self.latest_policy_version)

            # concat all tensors into a single tensor for performance
            output_tensors = []
            for policy_output in self.shared_buffers.policy_outputs:
                tensor_name = policy_output.name
                output_value = policy_outputs[tensor_name].float()
                if len(output_value.shape) == 1:
                    output_value.unsqueeze_(dim=1)
                output_tensors.append(output_value)

            output_tensors = torch.cat(output_tensors, dim=1)

        with timing.add_time('postprocess'):
            self._enqueue_policy_outputs(self.requests, output_tensors)

    self.requests = []
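
# A minimal sketch of the dict_of_lists_append() helper used above, assuming it appends
# the indexed slice of every tensor in new_data to the matching list in dict_of_lists
# (illustrative; the actual helper in the codebase may differ):

def dict_of_lists_append_sketch(dict_of_lists, new_data, index):
    for key, x in new_data.items():
        if key in dict_of_lists:
            dict_of_lists[key].append(x[index])
        else:
            dict_of_lists[key] = [x[index]]
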
def _prepare_train_buffer(self, rollouts, macro_batch_size, timing):
    trajectories = [AttrDict(r['t']) for r in rollouts]

    with timing.add_time('buffers'):
        buffer = AttrDict()

        # by the end of this loop the buffer is a dictionary containing lists of numpy arrays
        for i, t in enumerate(trajectories):
            for key, x in t.items():
                if key not in buffer:
                    buffer[key] = []
                buffer[key].append(x)

        # convert lists of dict observations to a single dictionary of lists
        for key, x in buffer.items():
            if isinstance(x[0], (dict, OrderedDict)):
                buffer[key] = list_of_dicts_to_dict_of_lists(x)

    if not self.cfg.with_vtrace:
        with timing.add_time('calc_gae'):
            buffer = self._calculate_gae(buffer)

    with timing.add_time('batching'):
        # concatenate rollouts from different workers into a single batch efficiently:
        # if we already have memory allocated for the buffers, we can just copy the data into
        # existing cached tensors instead of creating new ones. This is a performance optimization.
        use_pinned_memory = self.cfg.device == 'gpu'
        buffer = self.tensor_batcher.cat(buffer, macro_batch_size, use_pinned_memory, timing)

    with timing.add_time('buff_ready'):
        for r in rollouts:
            self._mark_rollout_buffer_free(r)

    with timing.add_time('tensors_gpu_float'):
        device_buffer = self._copy_train_data_to_device(buffer)

    with timing.add_time('squeeze'):
        # will squeeze actions only in the simple categorical case
        tensors_to_squeeze = [
            'actions', 'log_prob_actions', 'policy_version', 'values', 'rewards', 'dones',
        ]
        for tensor_name in tensors_to_squeeze:
            device_buffer[tensor_name].squeeze_()

    # we no longer need the cached buffer, and can put it back into the pool
    self.tensor_batch_pool.put(buffer)
    return device_buffer
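
# A minimal sketch of the list_of_dicts_to_dict_of_lists() conversion used above, assuming
# all dicts in the list share the same keys (illustrative; the actual helper may differ):

def list_of_dicts_to_dict_of_lists_sketch(list_of_dicts):
    dict_of_lists = dict()
    for d in list_of_dicts:
        for key, x in d.items():
            if key not in dict_of_lists:
                dict_of_lists[key] = []
            dict_of_lists[key].append(x)
    return dict_of_lists
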
def enjoy(cfg, max_num_frames=1e9):
    cfg = load_from_checkpoint(cfg)

    render_action_repeat = cfg.render_action_repeat if cfg.render_action_repeat is not None else cfg.env_frameskip
    if render_action_repeat is None:
        log.warning('Not using action repeat!')
        render_action_repeat = 1
    log.debug('Using action repeat %d during evaluation', render_action_repeat)

    cfg.env_frameskip = 1  # for evaluation
    cfg.num_envs = 1

    if cfg.record_to:
        tstamp = datetime.datetime.now().strftime('%Y_%m_%d__%H_%M_%S')
        cfg.record_to = join(cfg.record_to, f'{cfg.experiment}', tstamp)
        if not os.path.isdir(cfg.record_to):
            os.makedirs(cfg.record_to)
    else:
        cfg.record_to = None

    def make_env_func(env_config):
        return create_env(cfg.env, cfg=cfg, env_config=env_config)

    env = make_env_func(AttrDict({'worker_index': 0, 'vector_index': 0}))
    # env.seed(0)

    is_multiagent = is_multiagent_env(env)
    if not is_multiagent:
        env = MultiAgentWrapper(env)

    if hasattr(env.unwrapped, 'reset_on_init'):
        # reset call ruins the demo recording for VizDoom
        env.unwrapped.reset_on_init = False

    actor_critic = create_actor_critic(cfg, env.observation_space, env.action_space)

    device = torch.device('cpu' if cfg.device == 'cpu' else 'cuda')
    actor_critic.model_to_device(device)

    policy_id = cfg.policy_index
    checkpoints = LearnerWorker.get_checkpoints(LearnerWorker.checkpoint_dir(cfg, policy_id))
    checkpoint_dict = LearnerWorker.load_checkpoint(checkpoints, device)
    actor_critic.load_state_dict(checkpoint_dict['model'])

    episode_rewards = [deque([], maxlen=100) for _ in range(env.num_agents)]
    true_rewards = [deque([], maxlen=100) for _ in range(env.num_agents)]
    num_frames = 0

    last_render_start = time.time()

    def max_frames_reached(frames):
        return max_num_frames is not None and frames > max_num_frames

    obs = env.reset()
    rnn_states = torch.zeros([env.num_agents, get_hidden_size(cfg)], dtype=torch.float32, device=device)
    episode_reward = np.zeros(env.num_agents)
    finished_episode = [False] * env.num_agents

    with torch.no_grad():
        while not max_frames_reached(num_frames):
            obs_torch = AttrDict(transform_dict_observations(obs))
            for key, x in obs_torch.items():
                obs_torch[key] = torch.from_numpy(x).to(device).float()

            policy_outputs = actor_critic(obs_torch, rnn_states, with_action_distribution=True)

            # sample actions from the distribution by default
            actions = policy_outputs.actions

            action_distribution = policy_outputs.action_distribution
            if isinstance(action_distribution, ContinuousActionDistribution):
                if not cfg.continuous_actions_sample:  # TODO: add similar option for discrete actions
                    actions = action_distribution.means

            actions = actions.cpu().numpy()

            rnn_states = policy_outputs.rnn_states

            for _ in range(render_action_repeat):
                if not cfg.no_render:
                    target_delay = 1.0 / cfg.fps if cfg.fps > 0 else 0
                    current_delay = time.time() - last_render_start
                    time_wait = target_delay - current_delay

                    if time_wait > 0:
                        # log.info('Wait time %.3f', time_wait)
                        time.sleep(time_wait)

                    last_render_start = time.time()
                    env.render()

                obs, rew, done, infos = env.step(actions)
                episode_reward += rew
                num_frames += 1

                for agent_i, done_flag in enumerate(done):
                    if done_flag:
                        finished_episode[agent_i] = True
                        episode_rewards[agent_i].append(episode_reward[agent_i])
                        true_rewards[agent_i].append(infos[agent_i].get('true_reward', math.nan))
                        log.info(
                            'Episode finished for agent %d at %d frames. Reward: %.3f, true_reward: %.3f',
                            agent_i, num_frames, episode_reward[agent_i], true_rewards[agent_i][-1],
                        )
                        rnn_states[agent_i] = torch.zeros([get_hidden_size(cfg)], dtype=torch.float32, device=device)
                        episode_reward[agent_i] = 0

                # if episode terminated synchronously for all agents, pause a bit before starting a new one
                if all(done):
                    if not cfg.no_render:
                        env.render()
                    time.sleep(0.05)

                if all(finished_episode):
                    finished_episode = [False] * env.num_agents
                    avg_episode_rewards_str, avg_true_reward_str = '', ''
                    for agent_i in range(env.num_agents):
                        avg_rew = np.mean(episode_rewards[agent_i])
                        avg_true_rew = np.mean(true_rewards[agent_i])
                        if not np.isnan(avg_rew):
                            if avg_episode_rewards_str:
                                avg_episode_rewards_str += ', '
                            avg_episode_rewards_str += f'#{agent_i}: {avg_rew:.3f}'
                        if not np.isnan(avg_true_rew):
                            if avg_true_reward_str:
                                avg_true_reward_str += ', '
                            avg_true_reward_str += f'#{agent_i}: {avg_true_rew:.3f}'
                    log.info('Avg episode rewards: %s, true rewards: %s', avg_episode_rewards_str, avg_true_reward_str)
                    log.info(
                        'Avg episode reward: %.3f, avg true_reward: %.3f',
                        np.mean([np.mean(episode_rewards[i]) for i in range(env.num_agents)]),
                        np.mean([np.mean(true_rewards[i]) for i in range(env.num_agents)]),
                    )

    # VizDoom multiplayer stuff
    # for player in [1, 2, 3, 4, 5, 6, 7, 8]:
    #     key = f'PLAYER{player}_FRAGCOUNT'
    #     if key in infos[0]:
    #         log.debug('Score for player %d: %r', player, infos[0][key])

    env.close()

    return ExperimentStatus.SUCCESS, np.mean(episode_rewards)
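
# A usage sketch: how enjoy() is typically wired into a command-line entry point.
# parse_args(evaluation=True) is assumed to be the framework's config-parsing helper;
# treat the exact name and signature as assumptions that may differ in this codebase:

import sys

def main():
    cfg = parse_args(evaluation=True)  # assumed helper that builds the cfg object
    status, avg_reward = enjoy(cfg)
    return status

if __name__ == '__main__':
    sys.exit(main())
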
def _record_summaries(self, train_loop_vars):
    var = train_loop_vars
    self.last_summary_time = time.time()

    stats = AttrDict()
    grad_norm = sum(
        p.grad.data.norm(2).item() ** 2
        for p in self.actor_critic.parameters()
        if p.grad is not None
    ) ** 0.5
    stats.grad_norm = grad_norm
    stats.loss = var.loss
    stats.value = var.result.values.mean()
    stats.entropy = var.action_distribution.entropy().mean()
    stats.policy_loss = var.policy_loss
    stats.value_loss = var.value_loss
    stats.entropy_loss = var.entropy_loss
    stats.adv_min = var.adv.min()
    stats.adv_max = var.adv.max()
    stats.adv_std = var.adv_std
    stats.max_abs_logprob = torch.abs(var.mb.action_logits).max()

    if hasattr(var.action_distribution, 'summaries'):
        stats.update(var.action_distribution.summaries())

    if var.epoch == self.cfg.ppo_epochs - 1 and var.batch_num == len(var.minibatches) - 1:
        # we collect these stats only for the last PPO batch, or every time if we're only doing
        # one batch, IMPALA-style
        ratio_mean = torch.abs(1.0 - var.ratio).mean().detach()
        ratio_min = var.ratio.min().detach()
        ratio_max = var.ratio.max().detach()
        # log.debug('Learner %d ratio mean min max %.4f %.4f %.4f', self.policy_id, ratio_mean.cpu().item(), ratio_min.cpu().item(), ratio_max.cpu().item())

        value_delta = torch.abs(var.values - var.old_values)
        value_delta_avg, value_delta_max = value_delta.mean(), value_delta.max()

        # calculate KL-divergence with the behaviour policy action distribution
        old_action_distribution = get_action_distribution(
            self.actor_critic.action_space, var.mb.action_logits,
        )
        kl_old = var.action_distribution.kl_divergence(old_action_distribution)
        kl_old_mean = kl_old.mean()

        stats.kl_divergence = kl_old_mean
        stats.value_delta = value_delta_avg
        stats.value_delta_max = value_delta_max
        stats.fraction_clipped = (
            (var.ratio < var.clip_ratio_low).float() + (var.ratio > var.clip_ratio_high).float()
        ).mean()
        stats.ratio_mean = ratio_mean
        stats.ratio_min = ratio_min
        stats.ratio_max = ratio_max
        stats.num_sgd_steps = var.num_sgd_steps

    # this caused numerical issues on some versions of PyTorch with second moment reaching infinity
    adam_max_second_moment = 0.0
    for key, tensor_state in self.optimizer.state.items():
        adam_max_second_moment = max(tensor_state['exp_avg_sq'].max().item(), adam_max_second_moment)
    stats.adam_max_second_moment = adam_max_second_moment

    version_diff = var.curr_policy_version - var.mb.policy_version
    stats.version_diff_avg = version_diff.mean()
    stats.version_diff_min = version_diff.min()
    stats.version_diff_max = version_diff.max()

    for key, value in stats.items():
        stats[key] = to_scalar(value)

    return stats
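
# A minimal sketch of the to_scalar() conversion applied to the stats above, assuming it
# collapses tensors to plain Python numbers for the summary writer (illustrative only;
# the real helper may handle more cases, e.g. numpy values):

import torch

def to_scalar_sketch(value):
    if isinstance(value, torch.Tensor):
        return value.item()
    return value
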
def enjoy(cfg, max_num_episodes=1000000, max_num_frames=1e9):
    cfg = load_from_checkpoint(cfg)

    render_action_repeat = cfg.render_action_repeat if cfg.render_action_repeat is not None else cfg.env_frameskip
    if render_action_repeat is None:
        log.warning('Not using action repeat!')
        render_action_repeat = 1
    log.debug('Using action repeat %d during evaluation', render_action_repeat)

    cfg.env_frameskip = 1  # for evaluation
    cfg.num_envs = 1

    if cfg.record_to:
        tstamp = datetime.datetime.now().strftime('%Y_%m_%d__%H_%M_%S')
        cfg.record_to = join(cfg.record_to, f'{cfg.experiment}', tstamp)
        if not os.path.isdir(cfg.record_to):
            os.makedirs(cfg.record_to)
    else:
        cfg.record_to = None

    def make_env_func(env_config):
        return create_env(cfg.env, cfg=cfg, env_config=env_config)

    env = make_env_func(AttrDict({'worker_index': 0, 'vector_index': 0}))
    env.seed(0)

    is_multiagent = hasattr(env, 'num_agents') and env.num_agents > 1
    if not is_multiagent:
        env = MultiAgentWrapper(env)

    if hasattr(env.unwrapped, 'reset_on_init'):
        # reset call ruins the demo recording for VizDoom
        env.unwrapped.reset_on_init = False

    actor_critic = create_actor_critic(cfg, env.observation_space, env.action_space)

    device = torch.device('cpu' if cfg.device == 'cpu' else 'cuda')
    actor_critic.model_to_device(device)

    policy_id = cfg.policy_index
    checkpoints = LearnerWorker.get_checkpoints(LearnerWorker.checkpoint_dir(cfg, policy_id))
    checkpoint_dict = LearnerWorker.load_checkpoint(checkpoints, device)
    actor_critic.load_state_dict(checkpoint_dict['model'])

    episode_rewards = []
    true_rewards = deque([], maxlen=100)
    num_frames = 0

    last_render_start = time.time()

    def max_frames_reached(frames):
        return max_num_frames is not None and frames > max_num_frames

    obs = env.reset()

    with torch.no_grad():
        for _ in range(max_num_episodes):
            done = [False] * len(obs)
            rnn_states = torch.zeros([env.num_agents, get_hidden_size(cfg)], dtype=torch.float32, device=device)
            episode_reward = 0

            while True:
                obs_torch = AttrDict(transform_dict_observations(obs))
                for key, x in obs_torch.items():
                    obs_torch[key] = torch.from_numpy(x).to(device).float()

                policy_outputs = actor_critic(obs_torch, rnn_states, with_action_distribution=True)
                action_distribution = policy_outputs.action_distribution

                # sample actions from the distribution by default
                actions = policy_outputs.actions
                if isinstance(action_distribution, ContinuousActionDistribution):
                    if not cfg.continuous_actions_sample:  # TODO: add similar option for discrete actions
                        actions = action_distribution.means

                actions = actions.cpu().numpy()

                rnn_states = policy_outputs.rnn_states

                for _ in range(render_action_repeat):
                    if not cfg.no_render:
                        target_delay = 1.0 / cfg.fps if cfg.fps > 0 else 0
                        current_delay = time.time() - last_render_start
                        time_wait = target_delay - current_delay

                        if time_wait > 0:
                            # log.info('Wait time %.3f', time_wait)
                            time.sleep(time_wait)

                        last_render_start = time.time()
                        env.render()

                    obs, rew, done, infos = env.step(actions)
                    episode_reward += np.mean(rew)
                    num_frames += 1

                    if all(done):
                        true_rewards.append(infos[0].get('true_reward', math.nan))
                        log.info('Episode finished at %d frames', num_frames)
                        if not math.isnan(np.mean(true_rewards)):
                            log.info('true rew %.3f avg true rew %.3f', true_rewards[-1], np.mean(true_rewards))

                        # VizDoom multiplayer stuff
                        # for player in [1, 2, 3, 4, 5, 6, 7, 8]:
                        #     key = f'PLAYER{player}_FRAGCOUNT'
                        #     if key in infos[0]:
                        #         log.debug('Score for player %d: %r', player, infos[0][key])
                        break

                if all(done) or max_frames_reached(num_frames):
                    break

            if not cfg.no_render:
                env.render()
            time.sleep(0.01)

            episode_rewards.append(episode_reward)
            last_episodes = episode_rewards[-100:]
            avg_reward = sum(last_episodes) / len(last_episodes)
            log.info(
                'Episode reward: %f, avg reward for %d episodes: %f',
                episode_reward, len(last_episodes), avg_reward,
            )

            if max_frames_reached(num_frames):
                break

    env.close()

    return ExperimentStatus.SUCCESS, np.mean(episode_rewards)
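
# A minimal sketch of transform_dict_observations() as used in both enjoy() variants,
# assuming it turns a list of per-agent observations (dicts or raw arrays) into a dict
# of stacked numpy arrays (illustrative; the actual helper may handle more cases):

import numpy as np

def transform_dict_observations_sketch(observations):
    obs_dict = {}
    if isinstance(observations[0], dict):
        for key in observations[0].keys():
            obs_dict[key] = [o[key] for o in observations]
    else:
        obs_dict['obs'] = observations
    for key, x in obs_dict.items():
        obs_dict[key] = np.stack(x)
    return obs_dict
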
def _handle_policy_steps(self, timing):
    with torch.no_grad():
        with timing.add_time('deserialize'):
            observations = AttrDict()
            rnn_states = []
            option_idxs = []

            traj_tensors = self.shared_buffers.tensors_individual_transitions
            for request in self.requests:
                actor_idx, split_idx, request_data = request

                for env_idx, agent_idx, traj_buffer_idx, rollout_step in request_data:
                    index = actor_idx, split_idx, env_idx, agent_idx, traj_buffer_idx, rollout_step
                    dict_of_lists_append(observations, traj_tensors['obs'], index)
                    rnn_states.append(traj_tensors['rnn_states'][index])
                    option_idxs.append(traj_tensors['option_idx'][index])
                    self.total_num_samples += 1

        with timing.add_time('stack'):
            for key, x in observations.items():
                observations[key] = torch.stack(x)
            rnn_states = torch.stack(rnn_states)
            option_idxs = torch.stack(option_idxs)
            num_samples = rnn_states.shape[0]
            self.samples_per_step = num_samples

        with timing.add_time('obs_to_device'):
            for key, x in observations.items():
                device, dtype = self.actor_critic.device_and_type_for_input_tensor(key)
                observations[key] = x.to(device).type(dtype)
            rnn_states = rnn_states.to(self.device).float()
            option_idxs = option_idxs.to(self.device).float()

        with timing.add_time('forward'):
            policy_outputs = self.actor_critic(observations, rnn_states, option_idxs=option_idxs, acting=True)

        with timing.add_time('to_cpu'):
            for key, output_value in policy_outputs.items():
                policy_outputs[key] = output_value.cpu()

        with timing.add_time('format_outputs'):
            policy_outputs.policy_version = torch.empty([num_samples]).fill_(self.latest_policy_version)

            # concat all tensors into a single tensor for performance
            output_tensors = []
            for policy_output in self.shared_buffers.policy_outputs:
                tensor_name = policy_output.name
                output_value = policy_outputs[tensor_name].float()

                # per-option outputs are flattened so each sample occupies a single row:
                # actions: (B * O) x num_actions -> B x (num_actions * O)
                # action_logits: (B * O) x num_action_logits -> B x (num_action_logits * O)
                # log_prob_actions: (B * O) x 1 -> B x (1 * O)
                if tensor_name in ['actions', 'action_logits', 'log_prob_actions']:
                    output_value = output_value.reshape(-1, policy_output.size)

                if len(output_value.shape) == 1:
                    output_value.unsqueeze_(dim=1)
                output_tensors.append(output_value)

            output_tensors = torch.cat(output_tensors, dim=1)

        with timing.add_time('postprocess'):
            self._enqueue_policy_outputs(self.requests, output_tensors)

    self.requests = []
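
# Toy illustration of the per-option reshape above: with O options and B samples, the
# model produces (B * O) x num_actions outputs, which are flattened to one row per
# sample, B x (num_actions * O). Assumes policy_output.size == num_actions * O; all
# numbers below are made up for the demonstration:

import torch

B, O, num_actions = 4, 2, 3
per_option = torch.arange(B * O * num_actions, dtype=torch.float32).reshape(B * O, num_actions)
flattened = per_option.reshape(-1, num_actions * O)
assert flattened.shape == (B, num_actions * O)
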