Example #1
    def _log_infos(self, traj_infos=None):
        if traj_infos is None:
            traj_infos = self._traj_infos
        if traj_infos:
            for k in traj_infos[0]:
                if not k.startswith("_"):
                    logger.record_tabular_misc_stat(
                        k, [info[k] for info in traj_infos])

        if self._opt_infos:
            for k, v in self._opt_infos.items():
                logger.record_tabular_misc_stat(k, v)
        self._opt_infos = {k: list() for k in self._opt_infos}  # (reset)

        if self._layerwise_stats:
            for name, param, init_val in zip(self._param_names, self._params,
                                             self._init_params_values):
                new_val = param.get_value()
                diff = new_val - init_val
                logger.record_tabular(name + "_Norm",
                                      np.sqrt(np.sum(new_val**2)))
                logger.record_tabular(name + "_NormFromInit",
                                      np.sqrt(np.sum(diff**2)))
        new_param_vector = self.policy.get_param_values()
        logger.record_tabular("ParamsNorm",
                              np.sqrt(np.sum(new_param_vector**2)))
        params_diff = new_param_vector - self._initial_param_vector
        logger.record_tabular("NormFromInit", np.sqrt(np.sum(params_diff**2)))
Example #2
 def log_diagnostics(self, paths, log_prefix='Gather', *args, **kwargs):
     # Log anything related to the gather task here, strip the maze observations,
     # and call log_diagnostics on the stripped paths; log the pure gather reward.
     with logger.tabular_prefix(log_prefix + '_'):
         gather_undiscounted_returns = [
             sum(path['env_infos']['outer_rew']) for path in paths
         ]
         logger.record_tabular_misc_stat('Return',
                                         gather_undiscounted_returns,
                                         placement='front')
     stripped_paths = []
     for path in paths:
         stripped_path = {}
         for k, v in path.items():
             stripped_path[k] = v
         stripped_path['observations'] = \
             stripped_path['observations'][:, :self.wrapped_env.observation_space.flat_dim]
         # NOTE: this breaks if the robot observations are more than 1-D (not a flat vector)
         stripped_paths.append(stripped_path)
     with logger.tabular_prefix('wrapped_'):
         if ('env_infos' in paths[0].keys()
                 and 'inner_rew' in paths[0]['env_infos'].keys()):
             wrapped_undiscounted_return = np.mean(
                 [np.sum(path['env_infos']['inner_rew']) for path in paths])
             logger.record_tabular('AverageReturn',
                                   wrapped_undiscounted_return)
         self.wrapped_env.log_diagnostics(
             stripped_paths
         )  # see swimmer_env.py for a sketch of the maze plotting!
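logger.tabular_prefix(...) is used above as a context manager; it presumably prepends its argument to every key recorded inside the with block, so the 'Return' statistics end up as 'Gather_...' columns while the inner environment's diagnostics get a 'wrapped_' prefix. Below is a minimal sketch of that key-prefixing behavior, assuming a simple prefix stack; it is an illustration, not the rllab logger's actual code.

import contextlib

_prefixes = []   # stack of active tabular prefixes (illustrative only)
_tabular = {}    # collected key -> value pairs

@contextlib.contextmanager
def tabular_prefix_sketch(prefix):
    # Keys recorded inside the block get this prefix prepended.
    _prefixes.append(prefix)
    try:
        yield
    finally:
        _prefixes.pop()

def record_tabular_sketch(key, value):
    _tabular["".join(_prefixes) + key] = value

with tabular_prefix_sketch('Gather_'):
    record_tabular_sketch('AverageReturn', 3.2)   # stored as 'Gather_AverageReturn'
with tabular_prefix_sketch('wrapped_'):
    record_tabular_sketch('AverageReturn', 1.1)   # stored as 'wrapped_AverageReturn'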
Example #3
 def log_diagnostics(self, paths, prefix=''):
     progs = [
         np.linalg.norm(path["env_infos"]["com"][-1] -
                        path["env_infos"]["com"][0]) for path in paths
     ]
     logger.record_tabular_misc_stat('Progress', progs)
     self.plot_visitations(paths, visit_prefix=prefix)
Example #4
    def log_diagnostics(self, paths):
        progs = [
            path["observations"][-1][-3] - path["observations"][0][-3]
            for path in paths
        ]
        logger.record_tabular_misc_stat('Progress', progs, 'front')

        largest_positive_prog = max(0, np.max(progs))
        largest_negative_prog = min(0, np.min(progs))
        if (abs(largest_negative_prog) > 10e-8
                and abs(largest_positive_prog) > 10e-8):
            bimod_ratio = min(
                abs(largest_negative_prog / largest_positive_prog),
                abs(largest_positive_prog / largest_negative_prog))
        else:
            bimod_ratio = 0
        logger.record_tabular('BimodalityProgress', bimod_ratio)
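The 'BimodalityProgress' number above is the ratio between the largest backward and largest forward progress in the batch, taken whichever way round keeps it in [0, 1]: it approaches 1 when trajectories travel roughly equally far in both directions and is 0 when all movement is one-sided. A quick worked sketch with made-up progress values:

import numpy as np

progs = [3.0, 0.5, -1.5]                   # hypothetical per-path progress values
largest_positive = max(0, np.max(progs))   # 3.0
largest_negative = min(0, np.min(progs))   # -1.5
if abs(largest_negative) > 10e-8 and abs(largest_positive) > 10e-8:
    bimod_ratio = min(abs(largest_negative / largest_positive),
                      abs(largest_positive / largest_negative))
else:
    bimod_ratio = 0
# bimod_ratio == 0.5: backward motion reaches half as far as forward motion.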
Example #5
 def log_diagnostics(self, paths, log_prefix='Gather', *args, **kwargs):
     # Log anything related to the gather task here, strip the maze observations,
     # and call log_diagnostics on the stripped paths; log the pure gather reward.
     with logger.tabular_prefix(log_prefix + '_'):
         gather_undiscounted_returns = [sum(path['env_infos']['outer_rew']) for path in paths]
         logger.record_tabular_misc_stat('Return', gather_undiscounted_returns, placement='front')
     stripped_paths = []
     for path in paths:
         stripped_path = {}
         for k, v in path.items():
             stripped_path[k] = v
         stripped_path['observations'] = \
             stripped_path['observations'][:, :self.wrapped_env.observation_space.flat_dim]
         # NOTE: this breaks if the robot observations are more than 1-D (not a flat vector)
         stripped_paths.append(stripped_path)
     with logger.tabular_prefix('wrapped_'):
         if 'env_infos' in paths[0].keys() and 'inner_rew' in paths[0]['env_infos'].keys():
             wrapped_undiscounted_return = np.mean([np.sum(path['env_infos']['inner_rew']) for path in paths])
             logger.record_tabular('AverageReturn', wrapped_undiscounted_return)
         self.wrapped_env.log_diagnostics(stripped_paths)  # see swimmer_env.py for a sketch of the maze plotting!
Example #6
    def log_diagnostics(self, paths, *args, **kwargs):
        # Log anything related to the maze task here, strip the maze observations,
        # and call log_diagnostics on the stripped paths; log the pure gather reward.
        with logger.tabular_prefix('Maze_'):
            gather_undiscounted_returns = [
                sum(path['env_infos']['outer_rew']) for path in paths
            ]
            logger.record_tabular_misc_stat('Return',
                                            gather_undiscounted_returns,
                                            placement='front')
        stripped_paths = []
        for path in paths:
            stripped_path = {}
            for k, v in path.items():
                stripped_path[k] = v

            # If the observations came in as a 1-D (object) array, concatenate
            # them into a regular array before slicing off the maze part below.
            if len(stripped_path['observations'].shape) == 1:
                stripped_path['observations'] = np.concatenate(
                    stripped_path['observations'])

            stripped_path['observations'] = \
                stripped_path['observations'][:, :self.wrapped_env.observation_space.flat_dim]
            # NOTE: this breaks if the robot observations are more than 1-D (not a flat vector)
            stripped_paths.append(stripped_path)
        with logger.tabular_prefix('wrapped_'):
            wrapped_undiscounted_return = np.mean(
                [np.sum(path['env_infos']['inner_rew']) for path in paths])
            logger.record_tabular('SuccessRate', wrapped_undiscounted_return)
            self.wrapped_env.log_diagnostics(stripped_paths, *args, **kwargs)
Example #7
    def process_samples(self, itr, paths):
        baselines = []
        returns = []

        if len(paths) > 0 and "vf" in paths[0]["agent_infos"]:
            all_path_baselines = [
                p["agent_infos"]["vf"].flatten() for p in paths
            ]
        else:
            if hasattr(self.algo.baseline, "predict_n"):
                all_path_baselines = self.algo.baseline.predict_n(paths)
            else:
                all_path_baselines = [
                    self.algo.baseline.predict(path) for path in paths
                ]

        for idx, path in enumerate(paths):
            path_baselines = np.append(all_path_baselines[idx], 0)
            deltas = path["rewards"] + \
                     self.algo.discount * path_baselines[1:] - \
                     path_baselines[:-1]
            path["advantages"] = special.discount_cumsum(
                deltas, self.algo.discount * self.algo.gae_lambda)
            path["returns"] = special.discount_cumsum(path["rewards"],
                                                      self.algo.discount)
            baselines.append(path_baselines[:-1])
            returns.append(path["returns"])

        ev = special.explained_variance_1d(np.concatenate(baselines),
                                           np.concatenate(returns))

        if not self.algo.policy.recurrent:
            observations = tensor_utils.concat_tensor_list(
                [path["observations"] for path in paths])
            actions = tensor_utils.concat_tensor_list(
                [path["actions"] for path in paths])
            rewards = tensor_utils.concat_tensor_list(
                [path["rewards"] for path in paths])
            returns = tensor_utils.concat_tensor_list(
                [path["returns"] for path in paths])
            advantages = tensor_utils.concat_tensor_list(
                [path["advantages"] for path in paths])
            env_infos = tensor_utils.concat_tensor_dict_list(
                [path["env_infos"] for path in paths])
            agent_infos = tensor_utils.concat_tensor_dict_list(
                [path["agent_infos"] for path in paths])

            if self.algo.center_adv:
                advantages = util.center_advantages(advantages)

            if self.algo.positive_adv:
                advantages = util.shift_advantages_to_positive(advantages)

            average_discounted_return = \
                np.mean([path["returns"][0] for path in paths])

            undiscounted_returns = [sum(path["rewards"]) for path in paths]

            ent = np.mean(self.algo.policy.distribution.entropy(agent_infos))

            samples_data = dict(
                observations=observations,
                actions=actions,
                rewards=rewards,
                returns=returns,
                advantages=advantages,
                env_infos=env_infos,
                agent_infos=agent_infos,
                paths=paths,
            )
        else:
            max_path_length = max([len(path["advantages"]) for path in paths])

            # make all paths the same length (pad extra advantages with 0)
            obs = [path["observations"] for path in paths]
            obs = tensor_utils.pad_tensor_n(obs, max_path_length)

            if self.algo.center_adv:
                raw_adv = np.concatenate(
                    [path["advantages"] for path in paths])
                adv_mean = np.mean(raw_adv)
                adv_std = np.std(raw_adv) + 1e-8
                adv = [(path["advantages"] - adv_mean) / adv_std
                       for path in paths]
            else:
                adv = [path["advantages"] for path in paths]

            adv = np.asarray(
                [tensor_utils.pad_tensor(a, max_path_length) for a in adv])

            actions = [path["actions"] for path in paths]
            actions = tensor_utils.pad_tensor_n(actions, max_path_length)

            rewards = [path["rewards"] for path in paths]
            rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

            returns = [path["returns"] for path in paths]
            returns = tensor_utils.pad_tensor_n(returns, max_path_length)

            agent_infos = [path["agent_infos"] for path in paths]
            agent_infos = tensor_utils.stack_tensor_dict_list([
                tensor_utils.pad_tensor_dict(p, max_path_length)
                for p in agent_infos
            ])

            env_infos = [path["env_infos"] for path in paths]
            env_infos = tensor_utils.stack_tensor_dict_list([
                tensor_utils.pad_tensor_dict(p, max_path_length)
                for p in env_infos
            ])

            valids = [np.ones_like(path["returns"]) for path in paths]
            valids = tensor_utils.pad_tensor_n(valids, max_path_length)

            average_discounted_return = \
                np.mean([path["returns"][0] for path in paths])

            undiscounted_returns = [sum(path["rewards"]) for path in paths]

            ent = np.sum(
                self.algo.policy.distribution.entropy(agent_infos) *
                valids) / np.sum(valids)

            samples_data = dict(
                observations=obs,
                actions=actions,
                advantages=adv,
                rewards=rewards,
                returns=returns,
                valids=valids,
                agent_infos=agent_infos,
                env_infos=env_infos,
                paths=paths,
            )

        logger.log("fitting baseline...")
        if hasattr(self.algo.baseline, 'fit_with_samples'):
            self.algo.baseline.fit_with_samples(paths, samples_data)
        else:
            self.algo.baseline.fit(paths)
        logger.log("fitted")

        logger.record_tabular('Iteration', itr)
        logger.record_tabular('AverageDiscountedReturn',
                              average_discounted_return)
        logger.record_tabular('ExplainedVariance', ev)
        logger.record_tabular('NumTrajs', len(paths))
        logger.record_tabular_misc_stat('TrajLen',
                                        [len(p["rewards"]) for p in paths],
                                        placement='front')
        logger.record_tabular('Entropy', ent)
        logger.record_tabular('Perplexity', np.exp(ent))
        logger.record_tabular_misc_stat('Return',
                                        undiscounted_returns,
                                        placement='front')

        return samples_data
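The advantage computation in process_samples is generalized advantage estimation: a zero bootstrap value is appended to the baseline, one-step TD residuals (deltas) are formed from the rewards and the padded baseline, and these are discounted-cumulatively summed with factor discount * gae_lambda, while plain discounting of the rewards gives the returns. The snippet below is a self-contained numpy sketch of that step for a single hypothetical path, with the cumulative sum written out explicitly to mirror what special.discount_cumsum presumably computes; it is an illustration, not the rllab code.

import numpy as np

def discount_cumsum_sketch(x, discount):
    # y[t] = sum_{k >= t} discount**(k - t) * x[k]
    y = np.zeros(len(x))
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + discount * running
        y[t] = running
    return y

rewards = np.array([1.0, 0.0, 2.0])        # hypothetical per-step rewards
baselines = np.array([0.5, 0.4, 0.9])      # hypothetical per-step value predictions
discount, gae_lambda = 0.99, 0.95

path_baselines = np.append(baselines, 0)   # bootstrap with 0 at the terminal step
deltas = rewards + discount * path_baselines[1:] - path_baselines[:-1]
advantages = discount_cumsum_sketch(deltas, discount * gae_lambda)
returns = discount_cumsum_sketch(rewards, discount)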
Example #8
    def process_samples(self, itr, paths):
        baselines = []
        returns = []
        n = len(paths[0]["rewards"])
        for i in range(n):
            baselines.append([])
            returns.append([])

        if len(paths) > 0 and "vf" in paths[0]["agent_infos"]:
            all_path_baselines = [
                p["agent_infos"]["vf"].flatten() for p in paths
            ]
        else:
            if hasattr(self.algo.baseline, "predict_n"):
                raise NotImplementedError
            else:
                all_path_baselines = [[
                    self.algo.NPOs[i].baseline.predict(path, idx=i)
                    for i in range(n)
                ] for path in paths]

        for idx, path in enumerate(paths):
            path["advantages"] = []
            path["returns"] = []
            for i in range(n):
                path_baselines = np.append(all_path_baselines[idx][i], 0)
                deltas = path["rewards"][i] + \
                         self.algo.discount * path_baselines[1:] - \
                         path_baselines[:-1]
                path["advantages"].append(
                    special.discount_cumsum(
                        deltas, self.algo.discount * self.algo.gae_lambda))
                path["returns"].append(
                    special.discount_cumsum(path["rewards"][i],
                                            self.algo.discount))
                baselines[i].append(path_baselines[:-1])
                returns[i].append(path["returns"][i])

        ev = [
            special.explained_variance_1d(np.concatenate(baselines[i]),
                                          np.concatenate(returns[i]))
            for i in range(n)
        ]

        if not self.algo.policy.recurrent:
            tensor_concat = lambda key: [
                tensor_utils.concat_tensor_list(x)
                for x in regroup([path[key] for path in paths])
            ]
            tensor_concat_d = lambda key: [
                tensor_utils.concat_tensor_dict_list(x)
                for x in regroup([path[key] for path in paths])
            ]

            observations_n = tensor_concat("observations")
            actions_n = tensor_concat("actions")
            rewards_n = tensor_concat("rewards")
            returns_n = tensor_concat("returns")
            advantages_n = tensor_concat("advantages")
            # env_infos_n = tensor_concat_d("env_infos")
            agent_infos_n = tensor_concat_d("agent_infos")

            # TODO(cathywu) make consistent with the rest (above)?
            # env_infos = tensor_utils.concat_tensor_dict_list([path["env_infos"] for path in paths])

            if self.algo.center_adv:
                advantages_n = [
                    util.center_advantages(advantages)
                    for advantages in advantages_n
                ]

            if self.algo.positive_adv:
                advantages_n = [
                    util.shift_advantages_to_positive(advantages)
                    for advantages in advantages_n
                ]

            average_discounted_return = \
                np.mean([sum(path["returns"][i][0] for i in range(n)) for path in paths])

            undiscounted_returns = [
                sum(sum(path["rewards"])) for path in paths
            ]

            ent = np.mean(
                self.algo.policy.get_distribution(idx=0).entropy(
                    agent_infos_n[0]))

            samples_data_n = [
                dict(
                    observations=observations_n[i],
                    actions=actions_n[i],
                    rewards=rewards_n[i],
                    returns=returns_n[i],
                    advantages=advantages_n[i],
                    # env_infos=env_infos,
                    agent_infos=agent_infos_n[i],
                    # paths=paths,
                ) for i in range(n)
            ]
        else:
            raise NotImplementedError

        logger.log("fitting baseline...")
        if hasattr(self.algo.baseline, 'fit_with_samples'):
            raise NotImplementedError
        else:
            for idx in range(len(self.algo.NPOs)):
                self.algo.NPOs[idx].baseline.fit(paths, idx=idx)
        logger.log("fitted")

        logger.record_tabular('Iteration', itr)
        logger.record_tabular('AverageDiscountedReturn',
                              average_discounted_return)
        for i in range(len(ev)):
            logger.record_tabular('ExplainedVariance-k%d' % i, ev[i])
        logger.record_tabular('NumTrajs', len(paths))
        logger.record_tabular_misc_stat('TrajLen',
                                        [len(p["rewards"][0]) for p in paths],
                                        placement='front')
        logger.record_tabular('Entropy', ent)
        logger.record_tabular('Perplexity', np.exp(ent))
        logger.record_tabular_misc_stat('Return',
                                        undiscounted_returns,
                                        placement='front')

        return samples_data_n