    def _handle_policy_steps(self, timing):
        # batch all queued inference requests: gather observations and RNN states
        # from the shared trajectory buffers, run one forward pass, and enqueue
        # the outputs for the requesting actor workers
        with torch.no_grad():
            with timing.add_time('deserialize'):
                observations = AttrDict()
                rnn_states = []

                traj_tensors = self.shared_buffers.tensors_individual_transitions
                for request in self.requests:
                    actor_idx, split_idx, request_data = request

                    for env_idx, agent_idx, traj_buffer_idx, rollout_step in request_data:
                        index = actor_idx, split_idx, env_idx, agent_idx, traj_buffer_idx, rollout_step
                        dict_of_lists_append(observations, traj_tensors['obs'],
                                             index)
                        rnn_states.append(traj_tensors['rnn_states'][index])
                        self.total_num_samples += 1

            with timing.add_time('stack'):
                for key, x in observations.items():
                    observations[key] = torch.stack(x)
                rnn_states = torch.stack(rnn_states)
                num_samples = rnn_states.shape[0]

            with timing.add_time('obs_to_device'):
                for key, x in observations.items():
                    device, dtype = self.actor_critic.device_and_type_for_input_tensor(
                        key)
                    observations[key] = x.to(device).type(dtype)
                rnn_states = rnn_states.to(self.device).float()

            with timing.add_time('forward'):
                policy_outputs = self.actor_critic(observations, rnn_states)

            with timing.add_time('to_cpu'):
                for key, output_value in policy_outputs.items():
                    policy_outputs[key] = output_value.cpu()

            with timing.add_time('format_outputs'):
                policy_outputs.policy_version = torch.empty(
                    [num_samples]).fill_(self.latest_policy_version)

                # concat all tensors into a single tensor for performance
                output_tensors = []
                for policy_output in self.shared_buffers.policy_outputs:
                    tensor_name = policy_output.name
                    output_value = policy_outputs[tensor_name].float()
                    if len(output_value.shape) == 1:
                        output_value.unsqueeze_(dim=1)
                    output_tensors.append(output_value)

                output_tensors = torch.cat(output_tensors, dim=1)

            with timing.add_time('postprocess'):
                self._enqueue_policy_outputs(self.requests, output_tensors)

        self.requests = []
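
The handler above relies on a timing.add_time(...) context manager and a dict_of_lists_append helper that are not shown in this snippet. Below is a minimal sketch of both, assuming add_time simply accumulates wall-clock time per section name and dict_of_lists_append appends new_data[key][index] for every key; these are simplified stand-ins, not the library's actual implementations.

import time
from contextlib import contextmanager


class Timing(dict):
    """Accumulates wall-clock time per named code section."""

    @contextmanager
    def add_time(self, name):
        started = time.time()
        try:
            yield
        finally:
            self[name] = self.get(name, 0.0) + (time.time() - started)


def dict_of_lists_append(dict_of_lists, new_data, index):
    """For every key in new_data, append new_data[key][index] to dict_of_lists[key]."""
    for key, value in new_data.items():
        dict_of_lists.setdefault(key, []).append(value[index])
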
Example #2
    def _prepare_train_buffer(self, rollouts, macro_batch_size, timing):
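        # assemble rollouts from multiple actor workers into a single (macro) batch:
        # collect trajectory data into lists, compute GAE unless V-trace is used,
        # concatenate into (cached) batch tensors, and move the batch to the training device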
        trajectories = [AttrDict(r['t']) for r in rollouts]

        with timing.add_time('buffers'):
            buffer = AttrDict()

            # by the end of this loop the buffer is a dictionary containing lists of numpy arrays
            for i, t in enumerate(trajectories):
                for key, x in t.items():
                    if key not in buffer:
                        buffer[key] = []
                    buffer[key].append(x)

            # convert lists of dict observations to a single dictionary of lists
            for key, x in buffer.items():
                if isinstance(x[0], (dict, OrderedDict)):
                    buffer[key] = list_of_dicts_to_dict_of_lists(x)

        if not self.cfg.with_vtrace:
            with timing.add_time('calc_gae'):
                buffer = self._calculate_gae(buffer)

        with timing.add_time('batching'):
            # concatenate rollouts from different workers into a single batch efficiently
            # that is, if we already have memory for the buffers allocated, we can just copy the data into
            # existing cached tensors instead of creating new ones. This is a performance optimization.
            use_pinned_memory = self.cfg.device == 'gpu'
            buffer = self.tensor_batcher.cat(buffer, macro_batch_size,
                                             use_pinned_memory, timing)

        with timing.add_time('buff_ready'):
            for r in rollouts:
                self._mark_rollout_buffer_free(r)

        with timing.add_time('tensors_gpu_float'):
            device_buffer = self._copy_train_data_to_device(buffer)

        with timing.add_time('squeeze'):
            # squeeze trailing unit dimensions; for actions this only applies in the simple categorical case
            tensors_to_squeeze = [
                'actions', 'log_prob_actions', 'policy_version', 'values',
                'rewards', 'dones'
            ]
            for tensor_name in tensors_to_squeeze:
                device_buffer[tensor_name].squeeze_()

        # we no longer need the cached buffer, and can put it back into the pool
        self.tensor_batch_pool.put(buffer)
        return device_buffer
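
_calculate_gae above is referenced but not shown. Below is a minimal generalized advantage estimation sketch over plain numpy arrays for a single trajectory; the function name and the gamma / gae_lambda defaults are illustrative stand-ins for the corresponding config values, and the real implementation works on the batched buffer rather than one trajectory at a time.

import numpy as np


def calculate_gae(rewards, dones, values, gamma=0.99, gae_lambda=0.95):
    """GAE for one trajectory: rewards/dones have shape [T], values has shape [T + 1]
    (the extra entry is the value estimate of the state after the last step)."""
    num_steps = len(rewards)
    advantages = np.zeros(num_steps, dtype=np.float32)
    last_gae = 0.0
    for t in reversed(range(num_steps)):
        not_done = 1.0 - float(dones[t])
        delta = rewards[t] + gamma * values[t + 1] * not_done - values[t]
        last_gae = delta + gamma * gae_lambda * not_done * last_gae
        advantages[t] = last_gae
    returns = advantages + values[:-1]
    return advantages, returns
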
def enjoy(cfg, max_num_frames=1e9):
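    # evaluation entry point: restore the policy from the latest checkpoint and run it
    # in a single (possibly multi-agent) environment, optionally rendering / recording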
    cfg = load_from_checkpoint(cfg)

    render_action_repeat = cfg.render_action_repeat if cfg.render_action_repeat is not None else cfg.env_frameskip
    if render_action_repeat is None:
        log.warning('Not using action repeat!')
        render_action_repeat = 1
    log.debug('Using action repeat %d during evaluation', render_action_repeat)

    cfg.env_frameskip = 1  # for evaluation
    cfg.num_envs = 1

    if cfg.record_to:
        tstamp = datetime.datetime.now().strftime('%Y_%m_%d__%H_%M_%S')
        cfg.record_to = join(cfg.record_to, f'{cfg.experiment}', tstamp)
        if not os.path.isdir(cfg.record_to):
            os.makedirs(cfg.record_to)
    else:
        cfg.record_to = None

    def make_env_func(env_config):
        return create_env(cfg.env, cfg=cfg, env_config=env_config)

    env = make_env_func(AttrDict({'worker_index': 0, 'vector_index': 0}))
    # env.seed(0)

    is_multiagent = is_multiagent_env(env)
    if not is_multiagent:
        env = MultiAgentWrapper(env)

    if hasattr(env.unwrapped, 'reset_on_init'):
        # reset call ruins the demo recording for VizDoom
        env.unwrapped.reset_on_init = False

    actor_critic = create_actor_critic(cfg, env.observation_space,
                                       env.action_space)

    device = torch.device('cpu' if cfg.device == 'cpu' else 'cuda')
    actor_critic.model_to_device(device)

    policy_id = cfg.policy_index
    checkpoints = LearnerWorker.get_checkpoints(
        LearnerWorker.checkpoint_dir(cfg, policy_id))
    checkpoint_dict = LearnerWorker.load_checkpoint(checkpoints, device)
    actor_critic.load_state_dict(checkpoint_dict['model'])

    episode_rewards = [deque([], maxlen=100) for _ in range(env.num_agents)]
    true_rewards = [deque([], maxlen=100) for _ in range(env.num_agents)]
    num_frames = 0

    last_render_start = time.time()

    def max_frames_reached(frames):
        return max_num_frames is not None and frames > max_num_frames

    obs = env.reset()
    rnn_states = torch.zeros(
        [env.num_agents, get_hidden_size(cfg)],
        dtype=torch.float32,
        device=device)
    episode_reward = np.zeros(env.num_agents)
    finished_episode = [False] * env.num_agents

    with torch.no_grad():
        while not max_frames_reached(num_frames):
            obs_torch = AttrDict(transform_dict_observations(obs))
            for key, x in obs_torch.items():
                obs_torch[key] = torch.from_numpy(x).to(device).float()

            policy_outputs = actor_critic(obs_torch,
                                          rnn_states,
                                          with_action_distribution=True)

            # sample actions from the distribution by default
            actions = policy_outputs.actions

            action_distribution = policy_outputs.action_distribution
            if isinstance(action_distribution, ContinuousActionDistribution):
                if not cfg.continuous_actions_sample:  # TODO: add similar option for discrete actions
                    actions = action_distribution.means

            actions = actions.cpu().numpy()

            rnn_states = policy_outputs.rnn_states

            for _ in range(render_action_repeat):
                if not cfg.no_render:
                    target_delay = 1.0 / cfg.fps if cfg.fps > 0 else 0
                    current_delay = time.time() - last_render_start
                    time_wait = target_delay - current_delay

                    if time_wait > 0:
                        # log.info('Wait time %.3f', time_wait)
                        time.sleep(time_wait)

                    last_render_start = time.time()
                    env.render()

                obs, rew, done, infos = env.step(actions)

                episode_reward += rew
                num_frames += 1

                for agent_i, done_flag in enumerate(done):
                    if done_flag:
                        finished_episode[agent_i] = True
                        episode_rewards[agent_i].append(
                            episode_reward[agent_i])
                        true_rewards[agent_i].append(infos[agent_i].get(
                            'true_reward', math.nan))
                        log.info(
                            'Episode finished for agent %d at %d frames. Reward: %.3f, true_reward: %.3f',
                            agent_i, num_frames, episode_reward[agent_i],
                            true_rewards[agent_i][-1])
                        rnn_states[agent_i] = torch.zeros(
                            [get_hidden_size(cfg)],
                            dtype=torch.float32,
                            device=device)
                        episode_reward[agent_i] = 0

                # if episode terminated synchronously for all agents, pause a bit before starting a new one
                if all(done):
                    if not cfg.no_render:
                        env.render()
                    time.sleep(0.05)

                if all(finished_episode):
                    finished_episode = [False] * env.num_agents
                    avg_episode_rewards_str, avg_true_reward_str = '', ''
                    for agent_i in range(env.num_agents):
                        avg_rew = np.mean(episode_rewards[agent_i])
                        avg_true_rew = np.mean(true_rewards[agent_i])
                        if not np.isnan(avg_rew):
                            if avg_episode_rewards_str:
                                avg_episode_rewards_str += ', '
                            avg_episode_rewards_str += f'#{agent_i}: {avg_rew:.3f}'
                        if not np.isnan(avg_true_rew):
                            if avg_true_reward_str:
                                avg_true_reward_str += ', '
                            avg_true_reward_str += f'#{agent_i}: {avg_true_rew:.3f}'

                    log.info('Avg episode rewards: %s, true rewards: %s',
                             avg_episode_rewards_str, avg_true_reward_str)
                    log.info(
                        'Avg episode reward: %.3f, avg true_reward: %.3f',
                        np.mean([
                            np.mean(episode_rewards[i])
                            for i in range(env.num_agents)
                        ]),
                        np.mean([
                            np.mean(true_rewards[i])
                            for i in range(env.num_agents)
                        ]))

                # VizDoom multiplayer stuff
                # for player in [1, 2, 3, 4, 5, 6, 7, 8]:
                #     key = f'PLAYER{player}_FRAGCOUNT'
                #     if key in infos[0]:
                #         log.debug('Score for player %d: %r', player, infos[0][key])

    env.close()

    # episode_rewards is a list of per-agent deques (possibly of different lengths),
    # so average the per-agent means rather than the ragged structure directly
    return ExperimentStatus.SUCCESS, np.mean(
        [np.mean(episode_rewards[i]) for i in range(env.num_agents)])
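
The evaluation loop assumes transform_dict_observations turns the list of per-agent observations into a dict of numpy arrays ready for torch.from_numpy. A minimal sketch of that assumed behavior (not the exact library implementation):

import numpy as np


def transform_dict_observations(observations):
    """Convert a list of per-agent observations into a dict of stacked numpy arrays.
    Non-dict observations are wrapped under a single 'obs' key."""
    obs_dict = {}
    if isinstance(observations[0], dict):
        for key in observations[0].keys():
            obs_dict[key] = [o[key] for o in observations]
    else:
        obs_dict['obs'] = list(observations)

    for key, x in obs_dict.items():
        obs_dict[key] = np.stack(x)
    return obs_dict
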
Example #4
    def _record_summaries(self, train_loop_vars):
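        # gather per-training-step statistics (losses, gradient norm, advantage stats,
        # PPO ratio / KL diagnostics, policy-version lag) into a flat dict of scalars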
        var = train_loop_vars

        self.last_summary_time = time.time()
        stats = AttrDict()

        # global gradient norm: square root of the sum of squared per-parameter L2 norms
        grad_norm = sum(
            p.grad.data.norm(2).item()**2
            for p in self.actor_critic.parameters() if p.grad is not None)**0.5
        stats.grad_norm = grad_norm
        stats.loss = var.loss
        stats.value = var.result.values.mean()
        stats.entropy = var.action_distribution.entropy().mean()
        stats.policy_loss = var.policy_loss
        stats.value_loss = var.value_loss
        stats.entropy_loss = var.entropy_loss
        stats.adv_min = var.adv.min()
        stats.adv_max = var.adv.max()
        stats.adv_std = var.adv_std
        stats.max_abs_logprob = torch.abs(var.mb.action_logits).max()

        if hasattr(var.action_distribution, 'summaries'):
            stats.update(var.action_distribution.summaries())

        if var.epoch == self.cfg.ppo_epochs - 1 and var.batch_num == len(
                var.minibatches) - 1:
            # we collect these stats only for the last PPO batch, or every time if we're only doing one batch, IMPALA-style
            ratio_mean = torch.abs(1.0 - var.ratio).mean().detach()
            ratio_min = var.ratio.min().detach()
            ratio_max = var.ratio.max().detach()
            # log.debug('Learner %d ratio mean min max %.4f %.4f %.4f', self.policy_id, ratio_mean.cpu().item(), ratio_min.cpu().item(), ratio_max.cpu().item())

            value_delta = torch.abs(var.values - var.old_values)
            value_delta_avg, value_delta_max = value_delta.mean(), value_delta.max()

            # calculate KL-divergence with the behaviour policy action distribution
            old_action_distribution = get_action_distribution(
                self.actor_critic.action_space,
                var.mb.action_logits,
            )
            kl_old = var.action_distribution.kl_divergence(
                old_action_distribution)
            kl_old_mean = kl_old.mean()

            stats.kl_divergence = kl_old_mean
            stats.value_delta = value_delta_avg
            stats.value_delta_max = value_delta_max
            stats.fraction_clipped = (
                (var.ratio < var.clip_ratio_low).float() +
                (var.ratio > var.clip_ratio_high).float()).mean()
            stats.ratio_mean = ratio_mean
            stats.ratio_min = ratio_min
            stats.ratio_max = ratio_max
            stats.num_sgd_steps = var.num_sgd_steps

        # track Adam's max second moment; on some PyTorch versions it has reached infinity and caused numerical issues
        adam_max_second_moment = 0.0
        for key, tensor_state in self.optimizer.state.items():
            adam_max_second_moment = max(
                tensor_state['exp_avg_sq'].max().item(),
                adam_max_second_moment)
        stats.adam_max_second_moment = adam_max_second_moment

        version_diff = var.curr_policy_version - var.mb.policy_version
        stats.version_diff_avg = version_diff.mean()
        stats.version_diff_min = version_diff.min()
        stats.version_diff_max = version_diff.max()

        for key, value in stats.items():
            stats[key] = to_scalar(value)

        return stats
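
to_scalar above flattens every stat into a plain Python number before logging. A minimal sketch of the assumed behavior:

import torch


def to_scalar(value):
    """Best-effort conversion of a single-element tensor or numpy value to a Python number."""
    if isinstance(value, torch.Tensor):
        return value.detach().cpu().item()
    if hasattr(value, 'item'):  # numpy scalars and 0-d arrays
        return value.item()
    return value
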
Example #5
def enjoy(cfg, max_num_episodes=1000000, max_num_frames=1e9):
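    # evaluation entry point: restore the policy from the latest checkpoint and roll it
    # out for up to max_num_episodes episodes, logging per-episode and average rewards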
    cfg = load_from_checkpoint(cfg)

    render_action_repeat = cfg.render_action_repeat if cfg.render_action_repeat is not None else cfg.env_frameskip
    if render_action_repeat is None:
        log.warning('Not using action repeat!')
        render_action_repeat = 1
    log.debug('Using action repeat %d during evaluation', render_action_repeat)

    cfg.env_frameskip = 1  # for evaluation
    cfg.num_envs = 1

    if cfg.record_to:
        tstamp = datetime.datetime.now().strftime('%Y_%m_%d__%H_%M_%S')
        cfg.record_to = join(cfg.record_to, f'{cfg.experiment}', tstamp)
        if not os.path.isdir(cfg.record_to):
            os.makedirs(cfg.record_to)
    else:
        cfg.record_to = None

    def make_env_func(env_config):
        return create_env(cfg.env, cfg=cfg, env_config=env_config)

    env = make_env_func(AttrDict({'worker_index': 0, 'vector_index': 0}))
    env.seed(0)

    is_multiagent = hasattr(env, 'num_agents') and env.num_agents > 1
    if not is_multiagent:
        env = MultiAgentWrapper(env)

    if hasattr(env.unwrapped, 'reset_on_init'):
        # reset call ruins the demo recording for VizDoom
        env.unwrapped.reset_on_init = False

    actor_critic = create_actor_critic(cfg, env.observation_space, env.action_space)

    device = torch.device('cpu' if cfg.device == 'cpu' else 'cuda')
    actor_critic.model_to_device(device)

    policy_id = cfg.policy_index
    checkpoints = LearnerWorker.get_checkpoints(LearnerWorker.checkpoint_dir(cfg, policy_id))
    checkpoint_dict = LearnerWorker.load_checkpoint(checkpoints, device)
    actor_critic.load_state_dict(checkpoint_dict['model'])

    episode_rewards = []
    true_rewards = deque([], maxlen=100)
    num_frames = 0

    last_render_start = time.time()

    def max_frames_reached(frames):
        return max_num_frames is not None and frames > max_num_frames

    obs = env.reset()

    with torch.no_grad():
        for _ in range(max_num_episodes):
            done = [False] * len(obs)
            rnn_states = torch.zeros([env.num_agents, get_hidden_size(cfg)], dtype=torch.float32, device=device)

            episode_reward = 0

            while True:
                obs_torch = AttrDict(transform_dict_observations(obs))
                for key, x in obs_torch.items():
                    obs_torch[key] = torch.from_numpy(x).to(device).float()

                policy_outputs = actor_critic(obs_torch, rnn_states, with_action_distribution=True)

                action_distribution = policy_outputs.action_distribution

                # sample actions from the distribution by default
                actions = policy_outputs.actions

                if isinstance(action_distribution, ContinuousActionDistribution):
                    if not cfg.continuous_actions_sample:  # TODO: add similar option for discrete actions
                        actions = action_distribution.means

                actions = actions.cpu().numpy()

                rnn_states = policy_outputs.rnn_states

                for _ in range(render_action_repeat):
                    if not cfg.no_render:
                        target_delay = 1.0 / cfg.fps if cfg.fps > 0 else 0
                        current_delay = time.time() - last_render_start
                        time_wait = target_delay - current_delay

                        if time_wait > 0:
                            # log.info('Wait time %.3f', time_wait)
                            time.sleep(time_wait)

                        last_render_start = time.time()
                        env.render()

                    obs, rew, done, infos = env.step(actions)

                    episode_reward += np.mean(rew)
                    num_frames += 1

                    if all(done):
                        true_rewards.append(infos[0].get('true_reward', math.nan))
                        log.info('Episode finished at %d frames', num_frames)
                        if not math.isnan(np.mean(true_rewards)):
                            log.info('true rew %.3f avg true rew %.3f', true_rewards[-1], np.mean(true_rewards))

                        # VizDoom multiplayer stuff
                        # for player in [1, 2, 3, 4, 5, 6, 7, 8]:
                        #     key = f'PLAYER{player}_FRAGCOUNT'
                        #     if key in infos[0]:
                        #         log.debug('Score for player %d: %r', player, infos[0][key])
                        break

                if all(done) or max_frames_reached(num_frames):
                    break

            if not cfg.no_render:
                env.render()
            time.sleep(0.01)

            episode_rewards.append(episode_reward)
            last_episodes = episode_rewards[-100:]
            avg_reward = sum(last_episodes) / len(last_episodes)
            log.info(
                'Episode reward: %f, avg reward for %d episodes: %f', episode_reward, len(last_episodes), avg_reward,
            )

            if max_frames_reached(num_frames):
                break

    env.close()

    return ExperimentStatus.SUCCESS, np.mean(episode_rewards)
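
Both evaluation loops wrap single-agent environments in MultiAgentWrapper so that observations, rewards, dones and infos can be treated as per-agent lists. Below is a minimal sketch of the assumed contract (a simplified stand-in, not the library's actual class; the auto-reset on done is an assumption based on the loops above never calling reset between episodes).

class MultiAgentWrapper:
    """Expose a single-agent env through the list-based multi-agent interface."""

    def __init__(self, env):
        self.env = env
        self.num_agents = 1
        self.observation_space = env.observation_space
        self.action_space = env.action_space

    @property
    def unwrapped(self):
        return getattr(self.env, 'unwrapped', self.env)

    def seed(self, seed=None):
        return self.env.seed(seed)

    def reset(self, **kwargs):
        return [self.env.reset(**kwargs)]

    def step(self, actions):
        obs, rew, done, info = self.env.step(actions[0])
        if done:
            # assumed auto-reset so the evaluation loops can keep stepping
            obs = self.env.reset()
        return [obs], [rew], [done], [info]

    def render(self, *args, **kwargs):
        return self.env.render(*args, **kwargs)

    def close(self):
        self.env.close()
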
Example #6
    def _handle_policy_steps(self, timing):
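        # same batched-inference path as the first example, extended to also gather
        # per-sample option indices and pass them to the actor-critic forward pass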
        with torch.no_grad():
            with timing.add_time('deserialize'):
                observations = AttrDict()
                rnn_states = []
                option_idxs = []

                traj_tensors = self.shared_buffers.tensors_individual_transitions
                for request in self.requests:
                    actor_idx, split_idx, request_data = request

                    for env_idx, agent_idx, traj_buffer_idx, rollout_step in request_data:
                        index = actor_idx, split_idx, env_idx, agent_idx, traj_buffer_idx, rollout_step
                        dict_of_lists_append(observations, traj_tensors['obs'],
                                             index)
                        rnn_states.append(traj_tensors['rnn_states'][index])
                        option_idxs.append(traj_tensors['option_idx'][index])
                        self.total_num_samples += 1

            with timing.add_time('stack'):
                for key, x in observations.items():
                    observations[key] = torch.stack(x)
                rnn_states = torch.stack(rnn_states)
                option_idxs = torch.stack(option_idxs)
                num_samples = rnn_states.shape[0]
                self.samples_per_step = num_samples

            with timing.add_time('obs_to_device'):
                for key, x in observations.items():
                    device, dtype = self.actor_critic.device_and_type_for_input_tensor(
                        key)
                    observations[key] = x.to(device).type(dtype)
                rnn_states = rnn_states.to(self.device).float()
                option_idxs = option_idxs.to(self.device).float()

            with timing.add_time('forward'):
                policy_outputs = self.actor_critic(observations,
                                                   rnn_states,
                                                   option_idxs=option_idxs,
                                                   acting=True)

            with timing.add_time('to_cpu'):
                for key, output_value in policy_outputs.items():
                    policy_outputs[key] = output_value.cpu()

            with timing.add_time('format_outputs'):
                policy_outputs.policy_version = torch.empty(
                    [num_samples]).fill_(self.latest_policy_version)

                # concat all tensors into a single tensor for performance
                output_tensors = []
                for policy_output in self.shared_buffers.policy_outputs:
                    tensor_name = policy_output.name
                    output_value = policy_outputs[tensor_name].float()
                    # if tensor_name == 'actions':
                    #     # actions: (B * O) x num_actions -> B x (num_actions * O)
                    #     num_actions = self.shared_buffers.num_actions
                    #     output_value = output_value.reshape(-1, self.cfg.num_options, num_actions)
                    # elif tensor_name == 'action_logits':
                    #     num_action_logits = self.shared_buffers.num_action_logits
                    #     # action_logits: (B * O) x num_action_logits -> B x (num_action_logits * O)
                    #     output_value = output_value.reshape(-1, self.cfg.num_options, num_action_logits)
                    # elif tensor_name == 'log_prob_actions':
                    #     # log_prob_actions: (B * O) x 1 -> B x (1 * O)
                    #     output_value = output_value.reshape(-1, self.cfg.num_options, 1)
                    if tensor_name in [
                            'actions', 'action_logits', 'log_prob_actions'
                    ]:
                        output_value = output_value.reshape(
                            -1, policy_output.size)
                    if len(output_value.shape) == 1:
                        output_value.unsqueeze_(dim=1)
                    output_tensors.append(output_value)

                output_tensors = torch.cat(output_tensors, dim=1)

            with timing.add_time('postprocess'):
                self._enqueue_policy_outputs(self.requests, output_tensors)

        self.requests = []
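
On the receiving side, the flat output_tensors produced in format_outputs has to be sliced back into named outputs. Below is a small hypothetical helper illustrating that inverse step, assuming each entry of shared_buffers.policy_outputs exposes name and size (here passed as plain (name, size) tuples):

def split_policy_outputs(output_tensors, policy_output_specs):
    """Slice the flat [num_samples, total_size] tensor back into named outputs.

    policy_output_specs is a list of (name, size) tuples in the same order the
    outputs were concatenated above."""
    outputs = {}
    offset = 0
    for name, size in policy_output_specs:
        outputs[name] = output_tensors[:, offset:offset + size]
        offset += size
    assert offset == output_tensors.shape[1]
    return outputs


# hypothetical usage, mirroring the concatenation loop above:
# specs = [(po.name, po.size) for po in self.shared_buffers.policy_outputs]
# named_outputs = split_policy_outputs(output_tensors, specs)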