def log_diagnostics(self, paths):
        BatchPolopt.log_diagnostics(self, paths)
        self.sampler.log_diagnostics(paths)

        if self.policy.latent_dim:

            if self.log_individual_latents and not self.policy.resample:  # only valid for finite, discrete latents
                all_latent_avg_returns = []
                clustered_by_latents = collections.OrderedDict()  # clustering could live inside the distribution for generality, but it would be uglier
                for lat_key in range(self.policy.latent_dim):
                    clustered_by_latents[lat_key] = []
                for path in paths:
                    lat = path['agent_infos']['latents'][0]
                    lat_key = int(from_onehot(lat))  # from_onehot returns an array with one axis fewer than its input
                    clustered_by_latents[lat_key].append(path)

                for latent_key, latent_paths in clustered_by_latents.items():  # renamed to avoid shadowing the outer `paths`
                    with logger.tabular_prefix(str(latent_key)), logger.prefix(str(latent_key)):
                        if latent_paths:
                            undiscounted_rewards = [sum(path["true_rewards"]) for path in latent_paths]
                        else:
                            undiscounted_rewards = [0]  # default when no sampled path used this latent
                        all_latent_avg_returns.append(np.mean(undiscounted_rewards))
                        logger.record_tabular('Avg_TrueReturn', np.mean(undiscounted_rewards))
                        logger.record_tabular('Std_TrueReturn', np.std(undiscounted_rewards))
                        logger.record_tabular('Max_TrueReturn', np.max(undiscounted_rewards))
                        if self.log_deterministic:
                            lat = from_index(latent_key, self.policy.latent_dim)
                            with self.policy.fix_latent(lat), self.policy.set_std_to_0():
                                path_det = rollout(self.env, self.policy, self.max_path_length)
                                logger.record_tabular('Deterministic_TrueReturn', np.sum(path_det["rewards"]))

                with logger.tabular_prefix('all_lat_'), logger.prefix('all_lat_'):
                    logger.record_tabular('MaxAvgReturn', np.max(all_latent_avg_returns))
                    logger.record_tabular('MinAvgReturn', np.min(all_latent_avg_returns))
                    logger.record_tabular('StdAvgReturn', np.std(all_latent_avg_returns))

                if self.log_hierarchy:
                    max_in_path_length = 10
                    completed_in_paths = 0
                    path = rollout(self.env, self.policy, max_path_length=max_in_path_length, animated=False)
                    if len(path['rewards']) == max_in_path_length:
                        completed_in_paths += 1
                        for t in range(1, 50):
                            path = rollout(self.env, self.policy, max_path_length=max_in_path_length, animated=False,
                                           reset_start_rollout=False)
                            if len(path['rewards']) < max_in_path_length:
                                break
                            completed_in_paths += 1
                    logger.record_tabular('Hierarchy', completed_in_paths)

        else:
            if self.log_deterministic:
                with self.policy.set_std_to_0():
                    path = rollout(self.env, self.policy, self.max_path_length)
                logger.record_tabular('Deterministic_TrueReturn', np.sum(path["rewards"]))
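
This method relies on two small conversion helpers, from_onehot and from_index, imported from the project's utility modules. A minimal sketch of what they are assumed to do (illustrative only, not the project's actual implementation):

import numpy as np

def from_onehot(v):
    # Assumed behavior: map a one-hot vector such as [0, 0, 1, 0] to its index (here 2).
    # Returns a scalar, i.e. one axis fewer than the input, as the comment above notes.
    return np.nonzero(v)[0][0]

def from_index(idx, dim):
    # Assumed behavior: inverse of from_onehot, producing a one-hot vector of length `dim`.
    v = np.zeros(dim)
    v[idx] = 1
    return v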
Example #2
    def step(self, action):
        action = self.action_space.flatten(action)
        with self.low_policy.fix_latent(action):
            # run the low-level policy with this latent fixed for up to time_steps_agg steps
            if isinstance(self.wrapped_env, FastMazeEnv):
                with self.wrapped_env.blank_maze():
                    frac_path = rollout(self.wrapped_env, self.low_policy, max_path_length=self.time_steps_agg,
                                        reset_start_rollout=False, keep_rendered_rgbs=self.keep_rendered_rgb,
                                        animated=self.animate, speedup=1000)
                next_obs = self.wrapped_env.get_current_obs()
            elif isinstance(self.wrapped_env, NormalizedEnv) and isinstance(self.wrapped_env.wrapped_env, FastMazeEnv):
                with self.wrapped_env.wrapped_env.blank_maze():
                    frac_path = rollout(self.wrapped_env, self.low_policy, max_path_length=self.time_steps_agg,
                                        reset_start_rollout=False, keep_rendered_rgbs=self.keep_rendered_rgb,
                                        animated=self.animate, speedup=1000)
                next_obs = self.wrapped_env.wrapped_env.get_current_obs()
            else:
                frac_path = rollout(self.wrapped_env, self.low_policy, max_path_length=self.time_steps_agg,
                                    reset_start_rollout=False, keep_rendered_rgbs=self.keep_rendered_rgb,
                                    animated=self.animate, speedup=1000)
                next_obs = frac_path['observations'][-1]

            reward = np.sum(frac_path['rewards'])
            terminated = frac_path['terminated'][-1]
            done = self.time_steps_agg > len(
                frac_path['observations']) or terminated  # a shorter-than-maximal rollout means the episode is "done"
            # ideally the rollout would return an explicit flag saying whether it terminated on its last step
            last_agent_info = {k: val[-1] for k, val in frac_path['agent_infos'].items()}
            last_env_info = {k: val[-1] for k, val in frac_path['env_infos'].items()}
        # print("finished step of {}, with cummulated reward of: {}".format(len(frac_path['observations']), reward))
        if done:
            # pad env_infos so tensor shapes match across aggregated steps; env_infos do not
            # affect training, so the padding is harmless (the rewards themselves are not padded)
            frac_path['env_infos'] = tensor_utils.pad_tensor_dict(frac_path['env_infos'], self.time_steps_agg)
        full_path = frac_path
        # print("last_env_info", last_env_info)
        # print("last_agent_info", last_agent_info)
        # print("full_path", full_path)
        return Step(next_obs, reward, done,
                    last_env_info=last_env_info, last_agent_info=last_agent_info, full_path=full_path)
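
For orientation, here is a hedged sketch of how a higher-level policy might drive this aggregated environment. hier_env, high_policy, and horizon are placeholder names, and the field access assumes an rllab-style Step with observation/reward/done attributes:

# Hypothetical driver loop: one high-level action fixes a latent, and the
# environment executes up to time_steps_agg low-level steps under it.
obs = hier_env.reset()
total_reward = 0.0
for _ in range(horizon):                       # horizon: placeholder episode length
    action, _ = high_policy.get_action(obs)    # high-level action = latent to fix
    step = hier_env.step(action)               # runs the low-level rollout shown above
    obs, reward, done = step.observation, step.reward, step.done
    total_reward += reward
    if done:                                   # low-level rollout ended early or terminated
        break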
Example #3
    def step(self, action):
        action = self.action_space.flatten(action)
        with self.low_policy.fix_selector(action):
            # run the low-level policy with this selector fixed for up to time_steps_agg steps
            if isinstance(self.wrapped_env, FastMazeEnv):
                with self.wrapped_env.blank_maze():
                    frac_path = rollout(
                        self.wrapped_env,
                        self.low_policy,
                        max_path_length=self.time_steps_agg,
                        reset_start_rollout=False,
                        keep_rendered_rgbs=self.keep_rendered_rgb,
                        animated=self.animate,
                        speedup=1000)
                next_obs = self.wrapped_env.get_current_obs()
            elif isinstance(self.wrapped_env, NormalizedEnv) and isinstance(
                    self.wrapped_env.wrapped_env, FastMazeEnv):
                with self.wrapped_env.wrapped_env.blank_maze():
                    frac_path = rollout(
                        self.wrapped_env,
                        self.low_policy,
                        max_path_length=self.time_steps_agg,
                        reset_start_rollout=False,
                        keep_rendered_rgbs=self.keep_rendered_rgb,
                        animated=self.animate,
                        speedup=1000)
                next_obs = self.wrapped_env.wrapped_env.get_current_obs()
            else:
                frac_path = rollout(self.wrapped_env,
                                    self.low_policy,
                                    max_path_length=self.time_steps_agg,
                                    reset_start_rollout=False,
                                    keep_rendered_rgbs=self.keep_rendered_rgb,
                                    animated=self.animate,
                                    speedup=1000)
                next_obs = frac_path['observations'][-1]
            reward = np.sum(frac_path['rewards'])
            terminated = frac_path['terminated'][-1]
            done = self.time_steps_agg > len(
                frac_path['observations']
            ) or terminated  # a shorter-than-maximal rollout means the episode is "done"
            # ideally the rollout would return an explicit flag saying whether it terminated on its last step
            last_agent_info = {
                k: val[-1] for k, val in frac_path['agent_infos'].items()}
            last_env_info = {
                k: val[-1] for k, val in frac_path['env_infos'].items()}
        # print("finished step of {}, with cummulated reward of: {}".format(len(frac_path['observations']), reward))
        if done:
            # pad the whole path with its last element so tensor shapes match across aggregated steps
            full_path = tensor_utils.pad_tensor_dict(frac_path,
                                                     self.time_steps_agg,
                                                     mode='last')
        else:
            full_path = frac_path

        return Step(next_obs,
                    reward,
                    done,
                    last_env_info=last_env_info,
                    last_agent_info=last_agent_info,
                    full_path=full_path)
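
The done branch above assumes tensor_utils.pad_tensor_dict supports a mode='last' option that repeats the final element up to the target length. A rough, self-contained illustration of that padding behavior, using a hypothetical helper:

import numpy as np

def pad_with_last(arr, max_len):
    # Hypothetical illustration of 'last'-mode padding: repeat the final element
    # until the array reaches max_len, so every aggregated step has the same shape.
    arr = np.asarray(arr)
    if len(arr) >= max_len:
        return arr
    pad = np.repeat(arr[-1:], max_len - len(arr), axis=0)
    return np.concatenate([arr, pad], axis=0)

# e.g. pad_with_last([1, 2, 3], 5) -> array([1, 2, 3, 3, 3])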