Example #1
0
    def create_samples_dict(self, paths):
        """Assemble the batch dict used for policy optimization.

        For feed-forward policies, per-path fields are concatenated into
        flat batch tensors; for recurrent policies they are zero-padded to
        the longest path and stacked (with a ``valids`` mask). When a
        safety constraint is active, the (optionally centered) safety
        values, the linearization constant ``safety_eval`` and a
        ``safety_rescale`` factor are added as well.

        :param paths: list of path dicts with keys "observations",
            "actions", "rewards", "returns", "advantages", "weights",
            "env_infos", "agent_infos", plus the safety field named by
            ``self.algo.safety_key`` when the safety constraint is on.
        :return: dict of batch tensors keyed for the optimizer.
        """
        if self.algo.safety_constraint:
            # The robust variant swaps the 'safety' prefix for
            # 'safety_robust' (e.g. 'safety_rewards' -> 'safety_robust_rewards').
            if self.use_safety_bonus:
                safety_key = 'safety_robust' + self.algo.safety_key[6:]
            else:
                safety_key = self.algo.safety_key

            logger.log("Policy optimization is using safety_key=%s." % safety_key)

        if not self.algo.policy.recurrent:
            observations = tensor_utils.concat_tensor_list([path["observations"] for path in paths])
            actions = tensor_utils.concat_tensor_list([path["actions"] for path in paths])
            rewards = tensor_utils.concat_tensor_list([path["rewards"] for path in paths])
            returns = tensor_utils.concat_tensor_list([path["returns"] for path in paths])
            advantages = tensor_utils.concat_tensor_list([path["advantages"] for path in paths])
            env_infos = tensor_utils.concat_tensor_dict_list([path["env_infos"] for path in paths])
            agent_infos = tensor_utils.concat_tensor_dict_list([path["agent_infos"] for path in paths])
            weights = tensor_utils.concat_tensor_list([path["weights"] for path in paths])

            if self.algo.center_adv:
                advantages = util.center_advantages(advantages)

            if self.algo.positive_adv:
                advantages = util.shift_advantages_to_positive(advantages)

            samples_data = dict(
                observations=observations,
                actions=actions,
                rewards=rewards,
                returns=returns,
                advantages=advantages,
                env_infos=env_infos,
                agent_infos=agent_infos,
                weights=weights,
                paths=paths,
            )

            if self.algo.safety_constraint:
                safety_vals = tensor_utils.concat_tensor_list([path[safety_key] for path in paths])
                samples_data['safety_values'] = safety_vals     # for gradient calculation
                if self.algo.center_safety_vals:
                    samples_data['safety_offset'] = np.mean(safety_vals)
                    samples_data['safety_values'] = samples_data['safety_values'] - samples_data['safety_offset']

        else:
            max_path_length = max([len(path["advantages"]) for path in paths])

            # make all paths the same length (pad extra advantages with 0)
            obs = [path["observations"] for path in paths]
            obs = tensor_utils.pad_tensor_n(obs, max_path_length)

            if self.algo.center_adv:
                raw_adv = np.concatenate([path["advantages"] for path in paths])
                adv_mean = np.mean(raw_adv)
                adv_std = np.std(raw_adv) + 1e-8
                adv = [(path["advantages"] - adv_mean) / adv_std for path in paths]
            else:
                adv = [path["advantages"] for path in paths]

            adv = np.asarray([tensor_utils.pad_tensor(a, max_path_length) for a in adv])

            actions = [path["actions"] for path in paths]
            actions = tensor_utils.pad_tensor_n(actions, max_path_length)

            rewards = [path["rewards"] for path in paths]
            rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

            returns = [path["returns"] for path in paths]
            returns = tensor_utils.pad_tensor_n(returns, max_path_length)

            agent_infos = [path["agent_infos"] for path in paths]
            agent_infos = tensor_utils.stack_tensor_dict_list(
                [tensor_utils.pad_tensor_dict(p, max_path_length) for p in agent_infos]
            )

            env_infos = [path["env_infos"] for path in paths]
            env_infos = tensor_utils.stack_tensor_dict_list(
                [tensor_utils.pad_tensor_dict(p, max_path_length) for p in env_infos]
            )

            weights = [path["weights"] for path in paths]
            weights = tensor_utils.pad_tensor_n(weights, max_path_length)

            valids = [np.ones_like(path["returns"]) for path in paths]
            valids = tensor_utils.pad_tensor_n(valids, max_path_length)

            samples_data = dict(
                observations=obs,
                actions=actions,
                advantages=adv,
                rewards=rewards,
                returns=returns,
                valids=valids,
                agent_infos=agent_infos,
                env_infos=env_infos,
                weights=weights,
                paths=paths,
            )

            if self.algo.safety_constraint:
                safety_vals = [path[safety_key] for path in paths]
                if self.algo.center_safety_vals:
                    # BUG FIX: `safety_vals` is a ragged Python list of
                    # per-path arrays here, so the previous
                    # `np.mean(safety_vals)` and `safety_vals - offset`
                    # raised for unequal path lengths (and a bare
                    # `list - scalar` is always a TypeError). Take the mean
                    # over the concatenated timesteps and subtract the
                    # offset path by path instead.
                    samples_data['safety_offset'] = np.mean(np.concatenate(safety_vals))
                    safety_vals = [sv - samples_data['safety_offset'] for sv in safety_vals]
                safety_vals = tensor_utils.pad_tensor_n(safety_vals, max_path_length)
                samples_data['safety_values'] = safety_vals

        if self.algo.safety_constraint:
            # logic currently only supports linearization constant calculated on most recent batch of data
            # because importance sampling is complicated
            if self.algo.safety_key == 'safety_rewards':
                if self.use_safety_bonus:
                    key = 'safety_robust_rewards'
                else:
                    key = 'safety_rewards'
                safety_eval = np.mean(tensor_utils.concat_tensor_list(
                                    [path[key] for path in self.experience_replay[-1]]
                                ))
            else:
                if self.use_safety_bonus:
                    key = 'safety_robust_returns'
                else:
                    key = 'safety_returns'
                safety_eval = np.mean(
                                    [path[key][0] for path in self.experience_replay[-1]]
                                )
            samples_data['safety_eval'] = safety_eval       # linearization constant
            # Renamed the loop variable (was `paths`) to avoid shadowing
            # the method argument of the same name.
            samples_data['safety_rescale'] = len(samples_data['safety_values']) / sum([len(ps) for ps in self.experience_replay])

        return samples_data
Example #2
0
    def process_samples(self, itr, paths):
        """Turn sampled trajectories into a training batch.

        Computes GAE advantages and discounted returns for every path,
        flattens (feed-forward policy) or zero-pads (recurrent policy) the
        per-path fields into batch tensors, refits the baseline on the raw
        paths, and records per-iteration diagnostics.

        :param itr: iteration index (for logging only).
        :param paths: list of path dicts produced by the sampler.
        :return: dict of batch tensors for the optimizer.
        """
        all_baselines = []
        all_returns = []
        gamma = self.algo.discount
        for path in paths:
            # Append a terminal value of 0 so the last TD error bootstraps
            # against zero.
            base = np.append(self.algo.baseline.predict(path), 0)
            td_errors = path["rewards"] + gamma * base[1:] - base[:-1]
            path["advantages"] = special.discount_cumsum(
                td_errors, gamma * self.algo.gae_lambda)
            path["returns"] = special.discount_cumsum(path["rewards"], gamma)
            all_baselines.append(base[:-1])
            all_returns.append(path["returns"])

        if self.algo.policy.recurrent:
            # Pad every per-path field up to the longest trajectory.
            horizon = max(len(p["advantages"]) for p in paths)

            def padded(key):
                # Zero-pad one field of every path to `horizon` steps.
                return np.array(
                    [tensor_utils.pad_tensor(p[key], horizon) for p in paths])

            if self.algo.center_adv:
                flat_adv = np.concatenate([p["advantages"] for p in paths])
                mu = np.mean(flat_adv)
                sigma = np.std(flat_adv) + 1e-8
                adv = np.array([
                    tensor_utils.pad_tensor((p["advantages"] - mu) / sigma,
                                            horizon) for p in paths
                ])
            else:
                adv = padded("advantages")

            agent_infos = tensor_utils.stack_tensor_dict_list(
                [tensor_utils.pad_tensor_dict(p["agent_infos"], horizon)
                 for p in paths])
            env_infos = tensor_utils.stack_tensor_dict_list(
                [tensor_utils.pad_tensor_dict(p["env_infos"], horizon)
                 for p in paths])

            # 1 on real timesteps, 0 on padding.
            valids = np.array([
                tensor_utils.pad_tensor(np.ones_like(p["returns"]), horizon)
                for p in paths
            ])

            average_discounted_return = np.mean(
                [p["returns"][0] for p in paths])
            undiscounted_returns = [sum(p["rewards"]) for p in paths]
            # Mask the per-step entropies so padding does not dilute the mean.
            ent = np.sum(
                self.algo.policy.distribution.entropy(agent_infos) * valids
            ) / np.sum(valids)
            ev = special.explained_variance_1d(
                np.concatenate(all_baselines), np.concatenate(all_returns))

            samples_data = dict(
                observations=padded("observations"),
                actions=padded("actions"),
                advantages=adv,
                rewards=padded("rewards"),
                valids=valids,
                agent_infos=agent_infos,
                env_infos=env_infos,
                paths=paths,
            )
        else:
            # Feed-forward policy: concatenate the paths end to end.
            def flat(key):
                # Concatenate one field across all trajectories.
                return tensor_utils.concat_tensor_list(
                    [p[key] for p in paths])

            advantages = flat("advantages")
            if self.algo.center_adv:
                advantages = util.center_advantages(advantages)
            if self.algo.positive_adv:
                advantages = util.shift_advantages_to_positive(advantages)

            env_infos = tensor_utils.concat_tensor_dict_list(
                [p["env_infos"] for p in paths])
            agent_infos = tensor_utils.concat_tensor_dict_list(
                [p["agent_infos"] for p in paths])

            average_discounted_return = np.mean(
                [p["returns"][0] for p in paths])
            undiscounted_returns = [sum(p["rewards"]) for p in paths]
            ent = np.mean(self.algo.policy.distribution.entropy(agent_infos))
            ev = special.explained_variance_1d(
                np.concatenate(all_baselines), np.concatenate(all_returns))

            samples_data = dict(
                observations=flat("observations"),
                actions=flat("actions"),
                rewards=flat("rewards"),
                advantages=advantages,
                env_infos=env_infos,
                agent_infos=agent_infos,
                paths=paths,
            )

        logger.log("fitting baseline...")
        self.algo.baseline.fit(paths)
        logger.log("fitted")

        logger.record_tabular('Iteration', itr)
        logger.record_tabular('AverageDiscountedReturn',
                              average_discounted_return)
        logger.record_tabular('AverageReturn', np.mean(undiscounted_returns))
        logger.record_tabular('ExplainedVariance', ev)
        logger.record_tabular('NumTrajs', len(paths))
        logger.record_tabular('Entropy', ent)
        logger.record_tabular('Perplexity', np.exp(ent))
        logger.record_tabular('StdReturn', np.std(undiscounted_returns))
        logger.record_tabular('MaxReturn', np.max(undiscounted_returns))
        logger.record_tabular('MinReturn', np.min(undiscounted_returns))

        return samples_data
Example #3
0
    def process_samples(self,
                        itr,
                        paths,
                        prefix='',
                        log=True,
                        fast_process=False,
                        testitr=False,
                        metalearn_baseline=False,
                        comet_logger=None):
        """Process sampled paths into a training batch, with several modes.

        Behavior is gated by the flags:
        - ``fast_process``: skip returns/advantages/baseline entirely; the
          batch only carries observations/actions/agent_infos/expert_actions.
        - ``metalearn_baseline``: skip the baseline fit and advantage
          computation (forced off when ``testitr`` is True).
        - ``log`` / ``comet_logger``: control tabular and Comet logging.

        :param itr: iteration index (also checked against
            BASELINE_TRAINING_ITRS, a module-level constant — defined
            outside this view).
        :param paths: list of path dicts from the sampler.
        :param prefix: string prepended to every tabular key.
        :return: samples_data dict; exact keys depend on the flags above.
        """
        baselines = []
        returns = []
        if testitr:
            # Test iterations never use the metalearned baseline path.
            metalearn_baseline = False
        train_baseline = (itr in BASELINE_TRAINING_ITRS)
        if not fast_process:
            for idx, path in enumerate(paths):
                path["returns"] = special.discount_cumsum(
                    path["rewards"], self.algo.discount)
        if not fast_process and not metalearn_baseline:
            if log:
                logger.log("fitting baseline...")
            if hasattr(self.algo.baseline, 'fit_with_samples'):
                # NOTE(review): `samples_data` is not defined yet at this
                # point, so this branch would raise NameError if a baseline
                # ever exposed fit_with_samples — consistent with the TODO
                # below saying it appears unused. Confirm before relying on it.
                self.algo.baseline.fit_with_samples(
                    paths,
                    samples_data)  # TODO: doesn't seem like this is ever used
            else:
                # print("debug21 baseline before fitting",self.algo.baseline.predict(paths[0])[0:2], "...",self.algo.baseline.predict(paths[0])[-3:-1])
                # print("debug23 predloss before fitting",np.mean([np.mean(np.square(p['returns']-self.algo.baseline.predict(p))) for p in paths]))

                self.algo.baseline.fit(paths, log=log)
                # print("debug25 predloss AFTER  fitting",np.mean([np.mean(np.square(p['returns']-self.algo.baseline.predict(p))) for p in paths]))
                # print("debug22 returns                ",paths[0]['returns'][0:2], "...",paths[0]['returns'][-3:-1])
                # print("debug24 baseline after  fitting",self.algo.baseline.predict(paths[0])[0:2], "...", self.algo.baseline.predict(paths[0])[-3:-1])
            if log:
                logger.log("fitted")

            if 'switch_to_init_dist' in dir(self.algo.baseline):
                self.algo.baseline.switch_to_init_dist()

            if train_baseline:
                self.algo.baseline.fit_train_baseline(paths)

            # predict_n (batched) is preferred when the baseline supports it.
            if hasattr(self.algo.baseline, "predict_n"):
                all_path_baselines = self.algo.baseline.predict_n(paths)
            else:
                all_path_baselines = [
                    self.algo.baseline.predict(path) for path in paths
                ]

        for idx, path in enumerate(paths):
            if not fast_process and not metalearn_baseline:
                # if idx==0:
                # print("debug22", all_path_baselines[idx])
                # print("debug23", path['returns'])

                # GAE: TD errors with a zero terminal value, then a
                # discounted cumulative sum with gamma * lambda.
                path_baselines = np.append(all_path_baselines[idx], 0)
                deltas = path["rewards"] + \
                         self.algo.discount * path_baselines[1:] - \
                         path_baselines[:-1]
                path["advantages"] = special.discount_cumsum(
                    deltas, self.algo.discount * self.algo.gae_lambda)
                baselines.append(path_baselines[:-1])
            if not fast_process:
                returns.append(path["returns"])
            # Ensure every path carries expert_actions, falling back to a
            # None-filled placeholder of matching shape when the env did
            # not provide them.
            if "expert_actions" not in path.keys():
                if "expert_actions" in path["env_infos"].keys():
                    path["expert_actions"] = path["env_infos"][
                        "expert_actions"]
                else:
                    # assert False, "you shouldn't need expert_actions"
                    path["expert_actions"] = np.array(
                        [[None] * len(path['actions'][0])] *
                        len(path['actions']))

        if not fast_process and not metalearn_baseline:  # TODO: we want the ev eventually
            ev = special.explained_variance_1d(np.concatenate(baselines),
                                               np.concatenate(returns))
            # NOTE(review): np.array over ragged per-path lists only forms a
            # numeric array when all paths share one length — presumably
            # fixed-horizon rollouts here; confirm against the sampler.
            l2 = np.linalg.norm(np.array(baselines) - np.array(returns))

        if not self.algo.policy.recurrent:
            # Feed-forward policy: concatenate per-path fields end to end.
            observations = tensor_utils.concat_tensor_list(
                [path["observations"] for path in paths])
            actions = tensor_utils.concat_tensor_list(
                [path["actions"] for path in paths])

            if not fast_process:
                rewards = tensor_utils.concat_tensor_list(
                    [path["rewards"] for path in paths])
                returns = tensor_utils.concat_tensor_list(
                    [path["returns"] for path in paths])

            if "env_infos" in paths[0].keys():
                env_infos = tensor_utils.concat_tensor_dict_list(
                    [path["env_infos"] for path in paths])

            if not fast_process and not metalearn_baseline:
                advantages = tensor_utils.concat_tensor_list(
                    [path["advantages"] for path in paths])
                # print("debug, advantages are", advantages,)
                # print("debug, shape of advantages is", type(advantages), np.shape(advantages))

            expert_actions = tensor_utils.concat_tensor_list(
                [path["expert_actions"] for path in paths])
            agent_infos = tensor_utils.concat_tensor_dict_list(
                [path["agent_infos"] for path in paths])

            if not fast_process and not metalearn_baseline:
                if self.algo.center_adv:
                    advantages = util.center_advantages(advantages)
                if self.algo.positive_adv:
                    advantages = util.shift_advantages_to_positive(advantages)
                # Baselines that implement meta_predict contribute an extra
                # per-observation offset to the advantages.
                if "meta_predict" in dir(self.algo.baseline):
                    # print("debug, advantages are", advantages, )
                    advantages = advantages + self.algo.baseline.meta_predict(
                        observations)
                    print("debug, metalearned baseline constant is",
                          self.algo.baseline.meta_predict(observations)[0:2],
                          "...",
                          self.algo.baseline.meta_predict(observations)[-3:-1])
                    # print("debug, metalearned baseline constant shape is", np.shape(self.algo.baseline.meta_predict(observations)))
                # print("debug, advantages are", advantages[0:2],"...", advantages[-3:-1])
                # print("debug, advantages shape is", np.shape(advantages))

            # average_discounted_return = \
            #     np.mean([path["returns"][0] for path in paths])

            # .get with a [0] default tolerates reward-free paths (e.g.
            # fast_process batches).
            undiscounted_returns = [
                sum(path.get("rewards", [0])) for path in paths
            ]

            # ent = np.mean(self.algo.policy.distribution.entropy(agent_infos))
            if fast_process:
                samples_data = dict(
                    observations=observations,
                    actions=actions,
                    agent_infos=agent_infos,
                    paths=paths,
                    expert_actions=expert_actions,
                )
            elif metalearn_baseline:
                samples_data = dict(
                    observations=observations,
                    actions=actions,
                    rewards=rewards,
                    returns=returns,
                    agent_infos=agent_infos,
                    paths=paths,
                    expert_actions=expert_actions,
                )
                if 'agent_infos_orig' in paths[0].keys():
                    agent_infos_orig = tensor_utils.concat_tensor_dict_list(
                        [path["agent_infos_orig"] for path in paths])
                    samples_data["agent_infos_orig"] = agent_infos_orig
            else:
                samples_data = dict(
                    observations=observations,
                    actions=actions,
                    rewards=rewards,
                    returns=returns,
                    advantages=advantages,
                    env_infos=env_infos,
                    agent_infos=agent_infos,
                    paths=paths,
                    expert_actions=expert_actions,
                )
                if 'agent_infos_orig' in paths[0].keys():
                    agent_infos_orig = tensor_utils.concat_tensor_dict_list(
                        [path["agent_infos_orig"] for path in paths])
                    samples_data["agent_infos_orig"] = agent_infos_orig

        else:
            # Recurrent policy: pad every path to the longest trajectory
            # and carry a `valids` mask.
            max_path_length = max([len(path["advantages"]) for path in paths])

            # make all paths the same length (pad extra advantages with 0)
            obs = [path["observations"] for path in paths]
            obs = tensor_utils.pad_tensor_n(obs, max_path_length)

            if self.algo.center_adv:
                raw_adv = np.concatenate(
                    [path["advantages"] for path in paths])
                adv_mean = np.mean(raw_adv)
                adv_std = np.std(raw_adv) + 1e-8
                adv = [(path["advantages"] - adv_mean) / adv_std
                       for path in paths]
            else:
                adv = [path["advantages"] for path in paths]

            adv = np.asarray(
                [tensor_utils.pad_tensor(a, max_path_length) for a in adv])

            actions = [path["actions"] for path in paths]
            actions = tensor_utils.pad_tensor_n(actions, max_path_length)

            rewards = [path["rewards"] for path in paths]
            rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

            returns = [path["returns"] for path in paths]
            returns = tensor_utils.pad_tensor_n(returns, max_path_length)

            agent_infos = [path["agent_infos"] for path in paths]
            agent_infos = tensor_utils.stack_tensor_dict_list([
                tensor_utils.pad_tensor_dict(p, max_path_length)
                for p in agent_infos
            ])

            env_infos = [path["env_infos"] for path in paths]
            env_infos = tensor_utils.stack_tensor_dict_list([
                tensor_utils.pad_tensor_dict(p, max_path_length)
                for p in env_infos
            ])

            # 1 on real timesteps, 0 on padding.
            valids = [np.ones_like(path["returns"]) for path in paths]
            valids = tensor_utils.pad_tensor_n(valids, max_path_length)

            average_discounted_return = \
                np.mean([path["returns"][0] for path in paths])

            undiscounted_returns = [
                sum(path.get("rewards", [0])) for path in paths
            ]

            # ent = np.sum(self.algo.policy.distribution.entropy(agent_infos) * valids) / np.sum(valids)

            samples_data = dict(
                observations=obs,
                actions=actions,
                advantages=adv,
                rewards=rewards,
                returns=returns,
                valids=valids,
                agent_infos=agent_infos,
                env_infos=env_infos,
                paths=paths,
            )

        if log and comet_logger:
            comet_logger.log_metric('StdReturn', np.std(undiscounted_returns))
            comet_logger.log_metric('MaxReturn', np.max(undiscounted_returns))
            comet_logger.log_metric('MinReturn', np.min(undiscounted_returns))
            comet_logger.log_metric('AverageReturn',
                                    np.mean(undiscounted_returns))
        if log:
            # logger.record_tabular('Iteration', itr)
            # logger.record_tabular('AverageDiscountedReturn',
            #                      average_discounted_return)
            logger.record_tabular(prefix + 'AverageReturn',
                                  np.mean(undiscounted_returns))
            if testitr and prefix == "1":  # TODO make this functional for more than 1 iteration
                # Track last/best test returns in self.memory; a stored best
                # of 0.0 is treated as "unset" and overwritten with the
                # latest value.
                self.memory["AverageReturnLastTest"] = np.mean(
                    undiscounted_returns)
                self.memory["AverageReturnBestTest"] = max(
                    self.memory["AverageReturnLastTest"],
                    self.memory["AverageReturnBestTest"])
                if self.memory["AverageReturnBestTest"] == 0.0:
                    self.memory["AverageReturnBestTest"] = self.memory[
                        "AverageReturnLastTest"]
            if not fast_process and not metalearn_baseline:
                logger.record_tabular(prefix + 'ExplainedVariance', ev)
                logger.record_tabular(prefix + 'BaselinePredLoss', l2)
                if comet_logger:
                    comet_logger.log_metric('ExplainedVariance', ev)
                    comet_logger.log_metric('BaselinePredLoss', l2)
                # if comet_logger:
                #     comet_logger.log_metric('ExplainedVariance', ev)
                #     comet_logger.log_metric('BaselinePredLoss', l2)

            logger.record_tabular(prefix + 'NumTrajs', len(paths))
            # logger.record_tabular(prefix + 'Entropy', ent)
            # logger.record_tabular(prefix + 'Perplexity', np.exp(ent))
            logger.record_tabular(prefix + 'StdReturn',
                                  np.std(undiscounted_returns))
            logger.record_tabular(prefix + 'MaxReturn',
                                  np.max(undiscounted_returns))
            logger.record_tabular(prefix + 'MinReturn',
                                  np.min(undiscounted_returns))
            if "env_infos" in paths[0].keys(
            ) and "success_left" in paths[0]["env_infos"].keys():
                logger.record_tabular(prefix + 'success_left',
                                      eval_success_left(paths))
                logger.record_tabular(prefix + 'success_right',
                                      eval_success_right(paths))
                if comet_logger:
                    comet_logger.log_metric('success_left',
                                            eval_success_left(paths))
                    comet_logger.log_metric('success_right',
                                            eval_success_right(paths))
            # else:
            # logger.record_tabular(prefix + 'success_left', -1.0)
            # logger.record_tabular(prefix + 'success_right', -1.0)
        # if metalearn_baseline:
        #     if hasattr(self.algo.baseline, "revert"):
        #         self.algo.baseline.revert()

        return samples_data
Example #4
0
    def optimize_policy(self, itr, samples_data):
        """Optimize the policy for one iteration via a dual/primal scheme.

        First minimizes the dual objective over (eta, v) with a bounded
        quasi-Newton optimizer (eta constrained positive), then minimizes
        the policy loss at the optimal dual parameters, and finally logs
        the before/after losses, dual values, and mean KL.

        :param itr: iteration number (kept for interface compatibility;
            not used in the body).
        :param samples_data: batch dict with 'rewards', 'actions',
            'observations', 'agent_infos', 'paths' (plus 'valids' for
            recurrent policies).
        """
        # Init vars
        rewards = samples_data['rewards']
        actions = samples_data['actions']
        observations = samples_data['observations']

        agent_infos = samples_data["agent_infos"]
        state_info_list = [agent_infos[k] for k in self.policy.state_info_keys]
        dist_info_list = [agent_infos[k] for k in self.policy.distribution.dist_info_keys]
        if self.policy.recurrent:
            recurrent_vals = [samples_data["valids"]]
        else:
            recurrent_vals = []
        # Compute sample Bellman error.
        feat_diff = []
        for path in samples_data['paths']:
            feats = self._features(path)
            # Append a zero row so the final step differences against a
            # terminal (all-zero) feature vector.
            feats = np.vstack([feats, np.zeros(feats.shape[1])])
            feat_diff.append(feats[1:] - feats[:-1])
        if self.policy.recurrent:
            max_path_length = max([len(path["advantages"]) for path in samples_data["paths"]])
            # pad feature diffs
            feat_diff = np.array([tensor_utils.pad_tensor(fd, max_path_length) for fd in feat_diff])
        else:
            feat_diff = np.vstack(feat_diff)

        #################
        # Optimize dual #
        #################

        # Here we need to optimize dual through BFGS in order to obtain \eta
        # value. Initialize dual function g(\theta, v). \eta > 0
        # First eval delta_v
        f_dual = self.opt_info['f_dual']
        f_dual_grad = self.opt_info['f_dual_grad']

        # Set BFGS eval function.  The flat vector packs [eta, v...];
        # parameters renamed from `input` to avoid shadowing the builtin.
        def eval_dual(x):
            param_eta = x[0]
            param_v = x[1:]
            val = f_dual(*([rewards, feat_diff] + state_info_list + recurrent_vals + [param_eta, param_v]))
            return val.astype(np.float64)

        # Set BFGS gradient eval function
        def eval_dual_grad(x):
            param_eta = x[0]
            param_v = x[1:]
            grad = f_dual_grad(*([rewards, feat_diff] + state_info_list + recurrent_vals + [param_eta, param_v]))
            # BUG FIX: np.float was deprecated in NumPy 1.20 and removed in
            # 1.24; the builtin float extracts the scalar eta gradient.
            eta_grad = float(grad[0])
            v_grad = grad[1]
            return np.hstack([eta_grad, v_grad])

        # Initial BFGS parameter values.
        x0 = np.hstack([self.param_eta, self.param_v])

        # Set parameter boundaries: \eta>0, v unrestricted.
        bounds = [(-np.inf, np.inf) for _ in x0]
        bounds[0] = (0., np.inf)

        # Optimize through BFGS
        logger.log('optimizing dual')
        eta_before = x0[0]
        dual_before = eval_dual(x0)
        params_ast, _, _ = self.optimizer(
            func=eval_dual, x0=x0,
            fprime=eval_dual_grad,
            bounds=bounds,
            maxiter=self.max_opt_itr,
            disp=0
        )
        dual_after = eval_dual(params_ast)

        # Optimal values have been obtained
        self.param_eta = params_ast[0]
        self.param_v = params_ast[1:]

        ###################
        # Optimize policy #
        ###################
        cur_params = self.policy.get_param_values(trainable=True)
        f_loss = self.opt_info["f_loss"]
        f_loss_grad = self.opt_info['f_loss_grad']
        # Renamed from `input` (shadowed the builtin).
        opt_inputs = [rewards, observations, feat_diff,
                      actions] + state_info_list + recurrent_vals + [self.param_eta, self.param_v]

        # Set loss eval function
        def eval_loss(params):
            self.policy.set_param_values(params, trainable=True)
            val = f_loss(*opt_inputs)
            return val.astype(np.float64)

        # Set loss gradient eval function
        def eval_loss_grad(params):
            self.policy.set_param_values(params, trainable=True)
            grad = f_loss_grad(*opt_inputs)
            flattened_grad = tensor_utils.flatten_tensors(list(map(np.asarray, grad)))
            return flattened_grad.astype(np.float64)

        loss_before = eval_loss(cur_params)
        logger.log('optimizing policy')
        params_ast, _, _ = self.optimizer(
            func=eval_loss, x0=cur_params,
            fprime=eval_loss_grad,
            disp=0,
            maxiter=self.max_opt_itr
        )
        loss_after = eval_loss(params_ast)

        f_kl = self.opt_info['f_kl']

        mean_kl = f_kl(*([observations, actions] + state_info_list + dist_info_list + recurrent_vals)).astype(
            np.float64)

        logger.log('eta %f -> %f' % (eta_before, self.param_eta))

        logger.record_tabular("LossBefore", loss_before)
        logger.record_tabular("LossAfter", loss_after)
        logger.record_tabular('DualBefore', dual_before)
        logger.record_tabular('DualAfter', dual_after)
        logger.record_tabular('MeanKL', mean_kl)
Example #5
0
    def process_samples(self, itr, paths):
        """Convert sampled trajectories into training data per multi-agent mode.

        Dispatches on ``self.algo.ma_mode``:

        * ``'centralized'``: delegate to the parent sampler on the paths as-is.
        * ``'decentralized'``: flatten the per-agent path lists into one list
          and delegate to the parent sampler.
        * ``'concurrent'``: process each agent's paths independently against
          its own (policy, baseline) pair and return one samples dict per
          agent.

        Args:
            itr: Current iteration number (logged to the tabular output).
            paths: Sampled trajectories.  In the concurrent mode this is a
                sequence of per-agent path lists aligned index-wise with
                ``self.algo.policies`` and ``self.algo.baselines``.

        Returns:
            A single samples dict (centralized/decentralized) or a list of
            samples dicts, one per agent (concurrent).

        Raises:
            ValueError: if ``self.algo.ma_mode`` is not one of the supported
                modes (previously this fell through and silently returned
                ``None``).
        """
        if self.algo.ma_mode == 'centralized':
            return super().process_samples(itr, paths)
        elif self.algo.ma_mode == 'decentralized':
            return super().process_samples(
                itr, list(itertools.chain.from_iterable(paths)))
        elif self.algo.ma_mode == 'concurrent':
            processed_samples = []
            for ps, policy, baseline in zip(paths, self.algo.policies,
                                            self.algo.baselines):

                baselines = []
                returns = []

                # Batched baseline prediction when supported, per-path
                # otherwise.
                if hasattr(baseline, "predict_n"):
                    all_path_baselines = baseline.predict_n(ps)
                else:
                    all_path_baselines = [
                        baseline.predict(path) for path in ps
                    ]

                for idx, path in enumerate(ps):
                    # Append a terminal value of 0 so path_baselines[1:] is
                    # the bootstrapped next-state value at every timestep.
                    path_baselines = np.append(all_path_baselines[idx], 0)
                    deltas = path["rewards"] + \
                             self.algo.discount * path_baselines[1:] - \
                             path_baselines[:-1]
                    # Generalized Advantage Estimation: discounted cumulative
                    # sum of the TD residuals.
                    path["advantages"] = special.discount_cumsum(
                        deltas, self.algo.discount * self.algo.gae_lambda)
                    path["returns"] = special.discount_cumsum(
                        path["rewards"], self.algo.discount)
                    baselines.append(path_baselines[:-1])
                    returns.append(path["returns"])

                ev = special.explained_variance_1d(np.concatenate(baselines),
                                                   np.concatenate(returns))

                if not policy.recurrent:
                    # Feed-forward policy: flatten all paths into one batch.
                    observations = tensor_utils.concat_tensor_list(
                        [path["observations"] for path in ps])
                    actions = tensor_utils.concat_tensor_list(
                        [path["actions"] for path in ps])
                    rewards = tensor_utils.concat_tensor_list(
                        [path["rewards"] for path in ps])
                    returns = tensor_utils.concat_tensor_list(
                        [path["returns"] for path in ps])
                    advantages = tensor_utils.concat_tensor_list(
                        [path["advantages"] for path in ps])
                    env_infos = tensor_utils.concat_tensor_dict_list(
                        [path["env_infos"] for path in ps])
                    agent_infos = tensor_utils.concat_tensor_dict_list(
                        [path["agent_infos"] for path in ps])

                    if self.algo.center_adv:
                        advantages = util.center_advantages(advantages)

                    if self.algo.positive_adv:
                        advantages = util.shift_advantages_to_positive(
                            advantages)

                    average_discounted_return = \
                        np.mean([path["returns"][0] for path in ps])

                    undiscounted_returns = [
                        sum(path["rewards"]) for path in ps
                    ]

                    ent = np.mean(policy.distribution.entropy(agent_infos))

                    samples_data = dict(
                        observations=observations,
                        actions=actions,
                        rewards=rewards,
                        returns=returns,
                        advantages=advantages,
                        env_infos=env_infos,
                        agent_infos=agent_infos,
                        # NOTE(review): key is 'ps' (not 'paths' as in the
                        # other samplers in this file) — downstream consumers
                        # may rely on this name; verify before renaming.
                        ps=ps,
                    )
                else:
                    # Recurrent policy: pad every path to the same length and
                    # carry a `valids` mask marking the real (unpadded) steps.
                    max_path_length = max(
                        [len(path["advantages"]) for path in ps])

                    # make all ps the same length (pad extra advantages with 0)
                    obs = [path["observations"] for path in ps]
                    obs = tensor_utils.pad_tensor_n(obs, max_path_length)

                    if self.algo.center_adv:
                        # Center/scale using statistics of the *unpadded*
                        # advantages, then pad.
                        raw_adv = np.concatenate(
                            [path["advantages"] for path in ps])
                        adv_mean = np.mean(raw_adv)
                        adv_std = np.std(raw_adv) + 1e-8
                        adv = [(path["advantages"] - adv_mean) / adv_std
                               for path in ps]
                    else:
                        adv = [path["advantages"] for path in ps]

                    adv = np.asarray([
                        tensor_utils.pad_tensor(a, max_path_length)
                        for a in adv
                    ])

                    actions = [path["actions"] for path in ps]
                    actions = tensor_utils.pad_tensor_n(
                        actions, max_path_length)

                    rewards = [path["rewards"] for path in ps]
                    rewards = tensor_utils.pad_tensor_n(
                        rewards, max_path_length)

                    returns = [path["returns"] for path in ps]
                    returns = tensor_utils.pad_tensor_n(
                        returns, max_path_length)

                    agent_infos = [path["agent_infos"] for path in ps]
                    agent_infos = tensor_utils.stack_tensor_dict_list([
                        tensor_utils.pad_tensor_dict(p, max_path_length)
                        for p in agent_infos
                    ])

                    env_infos = [path["env_infos"] for path in ps]
                    env_infos = tensor_utils.stack_tensor_dict_list([
                        tensor_utils.pad_tensor_dict(p, max_path_length)
                        for p in env_infos
                    ])

                    valids = [np.ones_like(path["returns"]) for path in ps]
                    valids = tensor_utils.pad_tensor_n(valids, max_path_length)

                    average_discounted_return = \
                        np.mean([path["returns"][0] for path in ps])

                    undiscounted_returns = [
                        sum(path["rewards"]) for path in ps
                    ]

                    # Mask out padded timesteps when averaging the entropy.
                    ent = np.sum(
                        policy.distribution.entropy(agent_infos) *
                        valids) / np.sum(valids)

                    samples_data = dict(
                        observations=obs,
                        actions=actions,
                        advantages=adv,
                        rewards=rewards,
                        returns=returns,
                        valids=valids,
                        agent_infos=agent_infos,
                        env_infos=env_infos,
                        # NOTE(review): key is 'ps' (not 'paths') — verify
                        # downstream usage before renaming.
                        ps=ps,
                    )

                logger.log("fitting baseline...")
                if hasattr(baseline, 'fit_with_samples'):
                    baseline.fit_with_samples(ps, samples_data)
                else:
                    baseline.fit(ps)
                # Log completion for both fitting branches (previously only
                # the plain fit() branch logged "fitted").
                logger.log("fitted")

                logger.record_tabular('Iteration', itr)
                logger.record_tabular('AverageDiscountedReturn',
                                      average_discounted_return)
                logger.record_tabular('AverageReturn',
                                      np.mean(undiscounted_returns))
                logger.record_tabular('ExplainedVariance', ev)
                logger.record_tabular('NumTrajs', len(ps))
                logger.record_tabular('Entropy', ent)
                logger.record_tabular('Perplexity', np.exp(ent))
                logger.record_tabular('StdReturn',
                                      np.std(undiscounted_returns))
                logger.record_tabular('MaxReturn',
                                      np.max(undiscounted_returns))
                logger.record_tabular('MinReturn',
                                      np.min(undiscounted_returns))

                processed_samples.append(samples_data)

            return processed_samples
        else:
            # Fail loudly instead of silently returning None on a typo'd mode.
            raise ValueError("Unknown ma_mode: %s" % self.algo.ma_mode)
Exemple #6
0
    def process_samples(self, paths):
        """Convert sampled paths into flat training arrays plus diagnostics.

        Computes per-path advantages and discounted returns, then packs the
        paths into a single batch: concatenated arrays for a feed-forward
        policy, or padded ``(n_paths, max_path_length)`` arrays plus a
        ``valids`` mask for a recurrent one.

        When ``self.algo.tmax < 0``, advantages come from Generalized
        Advantage Estimation over the TD residuals; otherwise a truncated
        ``tmax``-step return with a bootstrapped baseline value is used,
        which requires ``gae_lambda == 1.0``.

        Returns:
            tuple: ``(samples_data, dgnstc_data)`` where ``dgnstc_data``
            carries per-path baselines and returns for diagnostics.  Baseline
            fitting and diagnostics logging were moved to ``batch_polopt``
            (see the NOTE comments below).
        """
        baselines = []
        returns = []
        discount = self.algo.discount
        tmax = self.algo.tmax
        for path in paths:
            # Terminal value 0 appended so path_baselines[1:] is the
            # next-state value estimate at every timestep.
            path_baselines = np.append(self.algo.baseline.predict(path), 0)
            path_len = len(path["rewards"])
            if tmax < 0:
                # GAE: discounted cumulative sum of TD residuals.
                deltas = path["rewards"] + \
                         self.algo.discount * path_baselines[1:] - \
                         path_baselines[:-1]
                path["advantages"] = special.discount_cumsum(
                    deltas, discount * self.algo.gae_lambda)
            else:
                assert self.algo.gae_lambda == 1.0
                path["advantages"] = np.zeros(path_len)
                for t1 in range(path_len):
                    t2 = t1 + tmax
                    if t2 < path_len:
                        # Truncated return: discounted rewards t1..t2 plus
                        # bootstrapped baseline at t2, minus baseline at t1.
                        # NOTE(review): the reward at t2 is included AND the
                        # bootstrap is discount**tmax * V[t2]; a textbook
                        # n-step return would bootstrap with
                        # discount**(tmax+1) * V[t2+1] — confirm intended.
                        path["advantages"][t1] = np.sum(
                            path["rewards"][t1:t2 + 1] *
                            (discount**np.asarray(range(tmax + 1)))
                        ) - path_baselines[t1] + path_baselines[t2] * (discount
                                                                       **tmax)
                    else:
                        # Tail shorter than tmax: plain Monte-Carlo return.
                        path["advantages"][t1] = np.sum(
                            path["rewards"][t1:] * (discount**np.asarray(
                                range(path_len - t1)))) - path_baselines[t1]

            path["returns"] = special.discount_cumsum(path["rewards"],
                                                      self.algo.discount)
            baselines.append(path_baselines[:-1])
            returns.append(path["returns"])

        # Raw per-path baselines/returns, e.g. for explained-variance
        # diagnostics computed by the caller.
        dgnstc_data = dict(baselines=baselines, returns=returns)

        if not self.algo.policy.recurrent:
            if self.avoid_duplicate_paths:
                # Memory-saving variant (load paths into samples_data while
                # deleting them) was sketched but never finished.
                raise NotImplementedError
                # tensor_list = ["observations","actions","raw_rewards","advantages","returns"]
                # dict_list = ["env_infos", "agent_infos"]
                # retain_list = ["returns","rewards","raw_rewards","bonus_rewards"]
                # samples_data = dict()
                # samples_data["paths"] = []
                #
                # # reversely load path data into samples_data and delete the loaded path
                # for i in range(len(paths)-1, -1, -1):
                #     path = paths[i]
                #     if i == len(paths) - 1:
                #         for data in tensor_list + dict_list:
                #             samples_data[data] = copy.deepcopy(path[data])
                #     else:
                #         for data in tensor_list:
                #             samples_data[data] = tensor_utils.concat_tensor_list([
                #                 copy.deepcopy(path[data]),
                #                 samples_data[data],
                #             ])
                #         for data in dict_list:
                #             samples_data[data] = tensor_utils.concat_tensor_dict_list([
                #                 copy.deepcopy(path[data]),
                #                 samples_data[data],
                #             ])
                #     new_path = {
                #         data: copy.deepcopy(path[data])
                #         for data in retain_list
                #     }
                #     samples_data["paths"] = [new_path] + samples_data["paths"]
                #
                #     # del paths[i]
                # if self.algo.center_adv:
                #     samples_data["advantages"] = util.center_advantages(samples_data["advantages"])
                #
                # if self.algo.positive_adv:
                #     samples_data["advantages"] = util.shift_advantages_to_positive(samples_data["advantages"])
            else:
                # Feed-forward policy: flatten all paths into one batch.
                observations = tensor_utils.concat_tensor_list(
                    [path["observations"] for path in paths])
                actions = tensor_utils.concat_tensor_list(
                    [path["actions"] for path in paths])
                rewards = tensor_utils.concat_tensor_list(
                    [path["rewards"] for path in paths])
                raw_rewards = tensor_utils.concat_tensor_list(
                    [path["raw_rewards"] for path in paths])
                returns = tensor_utils.concat_tensor_list(
                    [path["returns"] for path in paths])
                advantages = tensor_utils.concat_tensor_list(
                    [path["advantages"] for path in paths])
                env_infos = tensor_utils.concat_tensor_dict_list(
                    [path["env_infos"] for path in paths])
                agent_infos = tensor_utils.concat_tensor_dict_list(
                    [path["agent_infos"] for path in paths])

                if self.algo.center_adv:
                    advantages = util.center_advantages(advantages)

                if self.algo.positive_adv:
                    advantages = util.shift_advantages_to_positive(advantages)

                # NOTE: Removed diagnostics calculation to batch_polopt.

                samples_data = dict(
                    observations=observations,
                    actions=actions,
                    rewards=rewards,
                    raw_rewards=raw_rewards,
                    returns=returns,
                    advantages=advantages,
                    env_infos=env_infos,
                    agent_infos=agent_infos,
                    paths=paths,
                )
        else:
            if self.avoid_duplicate_paths:
                raise NotImplementedError

            # Recurrent policy: pad every path to the same length and carry
            # a `valids` mask marking the real (unpadded) steps.
            max_path_length = max([len(path["advantages"]) for path in paths])

            # make all paths the same length (pad extra advantages with 0)
            obs = [path["observations"] for path in paths]
            obs = np.array(
                [tensor_utils.pad_tensor(ob, max_path_length) for ob in obs])

            if self.algo.center_adv:
                # Center/scale using statistics of the *unpadded*
                # advantages, then pad.
                raw_adv = np.concatenate(
                    [path["advantages"] for path in paths])
                adv_mean = np.mean(raw_adv)
                adv_std = np.std(raw_adv) + 1e-8
                adv = [(path["advantages"] - adv_mean) / adv_std
                       for path in paths]
            else:
                adv = [path["advantages"] for path in paths]

            adv = np.array(
                [tensor_utils.pad_tensor(a, max_path_length) for a in adv])

            actions = [path["actions"] for path in paths]
            actions = np.array(
                [tensor_utils.pad_tensor(a, max_path_length) for a in actions])

            rewards = [path["rewards"] for path in paths]
            rewards = np.array(
                [tensor_utils.pad_tensor(r, max_path_length) for r in rewards])
            raw_rewards = [path["raw_rewards"] for path in paths]
            raw_rewards = np.array([
                tensor_utils.pad_tensor(r, max_path_length)
                for r in raw_rewards
            ])

            agent_infos = [path["agent_infos"] for path in paths]
            agent_infos = tensor_utils.stack_tensor_dict_list([
                tensor_utils.pad_tensor_dict(p, max_path_length)
                for p in agent_infos
            ])

            env_infos = [path["env_infos"] for path in paths]
            env_infos = tensor_utils.stack_tensor_dict_list([
                tensor_utils.pad_tensor_dict(p, max_path_length)
                for p in env_infos
            ])

            valids = [np.ones_like(path["returns"]) for path in paths]
            valids = np.array(
                [tensor_utils.pad_tensor(v, max_path_length) for v in valids])

            # NOTE: Removed diagnostics calculation to batch_polopt.

            samples_data = dict(
                observations=obs,
                actions=actions,
                advantages=adv,
                rewards=rewards,
                raw_rewards=raw_rewards,
                valids=valids,
                agent_infos=agent_infos,
                env_infos=env_infos,
                paths=paths,
            )

        # NOTE: Removed baseline fitting to batch_polopt.
        return samples_data, dgnstc_data
Exemple #7
0
    def process_samples(self, itr, paths):
        """Process paths for a baseline-free, Monte-Carlo return update.

        Each path's discounted cumulative return is computed and used
        directly as its advantage (REINFORCE-style: no baseline is
        predicted, subtracted, or fitted here).  Paths are then packed into
        a flat batch for feed-forward policies, or padded arrays plus a
        ``valids`` mask for recurrent ones, and per-iteration statistics
        are logged.

        Args:
            itr: Current iteration number (logged).
            paths: Sampled trajectories carrying ``rewards``,
                ``observations``, ``actions``, ``env_infos`` and
                ``agent_infos``.

        Returns:
            dict: samples data for the policy optimizer.
        """
        # (Dead locals `baselines`/`returns` and the unused enumerate index
        # from the original implementation removed.)
        for path in paths:
            path["returns"] = special.discount_cumsum(path["rewards"],
                                                      self.algo.discount)
            # The raw discounted return doubles as the advantage signal.
            path["advantages"] = path['returns']

        if not self.algo.policy.recurrent:
            # Feed-forward policy: flatten all paths into one batch.
            observations = tensor_utils.concat_tensor_list(
                [path["observations"] for path in paths])
            actions = tensor_utils.concat_tensor_list(
                [path["actions"] for path in paths])
            rewards = tensor_utils.concat_tensor_list(
                [path["rewards"] for path in paths])
            returns = tensor_utils.concat_tensor_list(
                [path["returns"] for path in paths])
            advantages = tensor_utils.concat_tensor_list(
                [path["advantages"] for path in paths])
            env_infos = tensor_utils.concat_tensor_dict_list(
                [path["env_infos"] for path in paths])
            agent_infos = tensor_utils.concat_tensor_dict_list(
                [path["agent_infos"] for path in paths])

            if self.algo.center_adv:
                advantages = util.center_advantages(advantages)

            if self.algo.positive_adv:
                advantages = util.shift_advantages_to_positive(advantages)

            average_discounted_return = \
                np.mean([path["returns"][0] for path in paths])

            undiscounted_returns = [sum(path["rewards"]) for path in paths]

            ent = np.mean(self.algo.policy.distribution.entropy(agent_infos))

            samples_data = dict(
                observations=observations,
                actions=actions,
                rewards=rewards,
                returns=returns,
                advantages=advantages,
                env_infos=env_infos,
                agent_infos=agent_infos,
                paths=paths,
            )
        else:
            # Recurrent policy: pad every path to the same length and carry
            # a `valids` mask marking the real (unpadded) steps.
            max_path_length = max([len(path["advantages"]) for path in paths])

            # make all paths the same length (pad extra advantages with 0)
            obs = [path["observations"] for path in paths]
            obs = tensor_utils.pad_tensor_n(obs, max_path_length)

            if self.algo.center_adv:
                # Center/scale using statistics of the *unpadded*
                # advantages, then pad.
                raw_adv = np.concatenate(
                    [path["advantages"] for path in paths])
                adv_mean = np.mean(raw_adv)
                adv_std = np.std(raw_adv) + 1e-8
                adv = [(path["advantages"] - adv_mean) / adv_std
                       for path in paths]
            else:
                adv = [path["advantages"] for path in paths]

            adv = np.asarray(
                [tensor_utils.pad_tensor(a, max_path_length) for a in adv])

            actions = [path["actions"] for path in paths]
            actions = tensor_utils.pad_tensor_n(actions, max_path_length)

            rewards = [path["rewards"] for path in paths]
            rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

            returns = [path["returns"] for path in paths]
            returns = tensor_utils.pad_tensor_n(returns, max_path_length)

            agent_infos = [path["agent_infos"] for path in paths]
            agent_infos = tensor_utils.stack_tensor_dict_list([
                tensor_utils.pad_tensor_dict(p, max_path_length)
                for p in agent_infos
            ])

            env_infos = [path["env_infos"] for path in paths]
            env_infos = tensor_utils.stack_tensor_dict_list([
                tensor_utils.pad_tensor_dict(p, max_path_length)
                for p in env_infos
            ])

            valids = [np.ones_like(path["returns"]) for path in paths]
            valids = tensor_utils.pad_tensor_n(valids, max_path_length)

            average_discounted_return = \
                np.mean([path["returns"][0] for path in paths])

            undiscounted_returns = [sum(path["rewards"]) for path in paths]

            # Mask out padded timesteps when averaging the entropy.
            ent = np.sum(
                self.algo.policy.distribution.entropy(agent_infos) *
                valids) / np.sum(valids)

            samples_data = dict(
                observations=obs,
                actions=actions,
                advantages=adv,
                rewards=rewards,
                returns=returns,
                valids=valids,
                agent_infos=agent_infos,
                env_infos=env_infos,
                paths=paths,
            )

        logger.record_tabular('Iteration', itr)
        logger.record_tabular('AverageDiscountedReturn',
                              average_discounted_return)
        logger.record_tabular('AverageReturn', np.mean(undiscounted_returns))
        logger.record_tabular('NumTrajs', len(paths))
        logger.record_tabular('Entropy', ent)
        logger.record_tabular('Perplexity', np.exp(ent))
        logger.record_tabular('StdReturn', np.std(undiscounted_returns))
        logger.record_tabular('MaxReturn', np.max(undiscounted_returns))
        logger.record_tabular('MinReturn', np.min(undiscounted_returns))

        return samples_data
    def process_samples(self, itr, paths):
        """Process paths with a KL-based intrinsic (exploration) reward.

        Pipeline:
          1. Optionally normalize extrinsic rewards with running mean/std
             queues (``self._reward_mean`` / ``self._reward_std``).
          2. From the second iteration on, add ``self.eta`` times the
             per-step KL (``path['KL']``) as an intrinsic bonus to the
             rewards — optionally divided by a running median KL — then
             decay ``eta`` by ``self.eta_discount``.
          3. Compute GAE advantages from the (augmented) rewards, but
             discounted returns from the untouched ``rewards_orig``.
          4. Pack into a samples dict (flat for feed-forward policies,
             padded plus ``valids`` mask for recurrent ones), refit the
             baseline and log diagnostics.

        Args:
            itr: Current iteration (iteration 0 skips the KL bonus).
            paths: Sampled trajectories; each must carry ``rewards``,
                ``rewards_orig`` and (for ``itr > 0``) ``KL``.

        Returns:
            dict: samples data for the policy optimizer.
        """
        if self.normalize_reward:
            # Update reward mean/std Q.
            rewards = []
            for i in range(len(paths)):
                rewards.append(paths[i]['rewards'])
            rewards_flat = np.hstack(rewards)
            self._reward_mean.append(np.mean(rewards_flat))
            self._reward_std.append(np.std(rewards_flat))

            # Normalize rewards.
            reward_mean = np.mean(np.asarray(self._reward_mean))
            reward_std = np.mean(np.asarray(self._reward_std))
            for i in range(len(paths)):
                paths[i]['rewards'] = (paths[i]['rewards'] -
                                       reward_mean) / (reward_std + 1e-8)

        if itr > 0:
            kls = []
            for i in range(len(paths)):
                kls.append(paths[i]['KL'])

            kls_flat = np.hstack(kls)

            logger.record_tabular('Expl_MeanKL', np.mean(kls_flat))
            logger.record_tabular('Expl_StdKL', np.std(kls_flat))
            logger.record_tabular('Expl_MinKL', np.min(kls_flat))
            logger.record_tabular('Expl_MaxKL', np.max(kls_flat))

            # Perform normalization of the intrinsic rewards.
            if self.use_kl_ratio:
                if self.use_kl_ratio_q:
                    # Update kl Q
                    self.kl_previous.append(np.median(np.hstack(kls)))
                    previous_mean_kl = np.mean(np.asarray(self.kl_previous))
                    for i in range(len(kls)):
                        kls[i] = kls[i] / previous_mean_kl

            # Add KL as intrinsic reward to external reward
            for i in range(len(paths)):
                paths[i]['rewards'] = paths[i]['rewards'] + self.eta * kls[i]

            # Discount eta
            self.eta *= self.eta_discount

        else:
            # No KL available on the first iteration — log zeros.
            logger.record_tabular('Expl_MeanKL', 0.)
            logger.record_tabular('Expl_StdKL', 0.)
            logger.record_tabular('Expl_MinKL', 0.)
            logger.record_tabular('Expl_MaxKL', 0.)

        baselines = []
        returns = []
        for path in paths:
            # Terminal value 0 appended so path_baselines[1:] is the
            # next-state value estimate at every timestep.
            path_baselines = np.append(self.baseline.predict(path), 0)
            deltas = path["rewards"] + \
                self.discount * path_baselines[1:] - \
                path_baselines[:-1]
            # GAE advantages from the KL-augmented rewards; returns from the
            # unmodified original rewards.
            path["advantages"] = special.discount_cumsum(
                deltas, self.discount * self.gae_lambda)
            path["returns"] = special.discount_cumsum(path["rewards_orig"],
                                                      self.discount)
            baselines.append(path_baselines[:-1])
            returns.append(path["returns"])

        if not self.policy.recurrent:
            # Feed-forward policy: flatten all paths into one batch.
            observations = tensor_utils.concat_tensor_list(
                [path["observations"] for path in paths])
            actions = tensor_utils.concat_tensor_list(
                [path["actions"] for path in paths])
            rewards = tensor_utils.concat_tensor_list(
                [path["rewards"] for path in paths])
            advantages = tensor_utils.concat_tensor_list(
                [path["advantages"] for path in paths])
            env_infos = tensor_utils.concat_tensor_dict_list(
                [path["env_infos"] for path in paths])
            agent_infos = tensor_utils.concat_tensor_dict_list(
                [path["agent_infos"] for path in paths])

            if self.center_adv:
                advantages = util.center_advantages(advantages)

            if self.positive_adv:
                advantages = util.shift_advantages_to_positive(advantages)

            average_discounted_return = \
                np.mean([path["returns"][0] for path in paths])

            undiscounted_returns = [
                sum(path["rewards_orig"]) for path in paths
            ]

            ent = np.mean(self.policy.distribution.entropy(agent_infos))

            ev = special.explained_variance_1d(np.concatenate(baselines),
                                               np.concatenate(returns))

            samples_data = dict(
                observations=observations,
                actions=actions,
                rewards=rewards,
                advantages=advantages,
                env_infos=env_infos,
                agent_infos=agent_infos,
                paths=paths,
            )
        else:
            # Recurrent policy: pad every path to the same length and carry
            # a `valids` mask marking the real (unpadded) steps.
            max_path_length = max([len(path["advantages"]) for path in paths])

            # make all paths the same length (pad extra advantages with 0)
            obs = [path["observations"] for path in paths]
            obs = np.array(
                [tensor_utils.pad_tensor(ob, max_path_length) for ob in obs])

            if self.center_adv:
                raw_adv = np.concatenate(
                    [path["advantages"] for path in paths])
                adv_mean = np.mean(raw_adv)
                adv_std = np.std(raw_adv) + 1e-8
                adv = [(path["advantages"] - adv_mean) / adv_std
                       for path in paths]
            else:
                adv = [path["advantages"] for path in paths]

            adv = np.array(
                [tensor_utils.pad_tensor(a, max_path_length) for a in adv])

            actions = [path["actions"] for path in paths]
            actions = np.array(
                [tensor_utils.pad_tensor(a, max_path_length) for a in actions])

            rewards = [path["rewards"] for path in paths]
            rewards = np.array(
                [tensor_utils.pad_tensor(r, max_path_length) for r in rewards])

            agent_infos = [path["agent_infos"] for path in paths]
            agent_infos = tensor_utils.stack_tensor_dict_list([
                tensor_utils.pad_tensor_dict(p, max_path_length)
                for p in agent_infos
            ])

            env_infos = [path["env_infos"] for path in paths]
            env_infos = tensor_utils.stack_tensor_dict_list([
                tensor_utils.pad_tensor_dict(p, max_path_length)
                for p in env_infos
            ])

            valids = [np.ones_like(path["returns"]) for path in paths]
            valids = np.array(
                [tensor_utils.pad_tensor(v, max_path_length) for v in valids])

            average_discounted_return = \
                np.mean([path["returns"][0] for path in paths])

            # Report the true environment return, not the normalized /
            # KL-augmented one (fix: previously summed path["rewards"],
            # inconsistent with the non-recurrent branch above).
            undiscounted_returns = [
                sum(path["rewards_orig"]) for path in paths
            ]

            ent = np.mean(self.policy.distribution.entropy(agent_infos))

            ev = special.explained_variance_1d(np.concatenate(baselines),
                                               np.concatenate(returns))

            samples_data = dict(
                observations=obs,
                actions=actions,
                advantages=adv,
                rewards=rewards,
                valids=valids,
                agent_infos=agent_infos,
                env_infos=env_infos,
                paths=paths,
            )

        logger.log("fitting baseline...")
        self.baseline.fit(paths)
        logger.log("fitted")

        logger.record_tabular('Iteration', itr)
        logger.record_tabular('AverageDiscountedReturn',
                              average_discounted_return)
        logger.record_tabular('AverageReturn', np.mean(undiscounted_returns))
        logger.record_tabular('ExplainedVariance', ev)
        logger.record_tabular('NumTrajs', len(paths))
        logger.record_tabular('Entropy', ent)
        logger.record_tabular('Perplexity', np.exp(ent))
        logger.record_tabular('StdReturn', np.std(undiscounted_returns))
        logger.record_tabular('MaxReturn', np.max(undiscounted_returns))
        logger.record_tabular('MinReturn', np.min(undiscounted_returns))

        return samples_data
Exemple #9
0
    def process_samples(self, itr, paths):
        baselines = []
        returns = []
        #---------------Updated by myself---------------
        violation_cost = []
        boundary_violation_cost = []
        succ_rate = 0
        succ_return = []
        # log_liklihood=[]

        if hasattr(self.algo.baseline, "predict_n"):
            all_path_baselines = self.algo.baseline.predict_n(paths)
        else:
            all_path_baselines = [
                self.algo.baseline.predict(path) for path in paths
            ]

        for idx, path in enumerate(paths):
            path_baselines = np.append(all_path_baselines[idx], 0)
            deltas = path["rewards"] + \
                     self.algo.discount * path_baselines[1:] - \
                     path_baselines[:-1]
            path["advantages"] = special.discount_cumsum(
                deltas, self.algo.discount * self.algo.gae_lambda)
            path["returns"] = special.discount_cumsum(path["rewards"],
                                                      self.algo.discount)
            baselines.append(path_baselines[:-1])
            returns.append(path["returns"])
            #-------- Updated by myself----------------
            violation_cost.append(path["violation_cost"])
            boundary_violation_cost.append(path["boundary_violation_cost"])
            # log_liklihood.append(path["log_likelihood"])
            succ_rate += path["succ_rate"]
            if not (path["succ_return"] == 0):
                succ_return.append(path["succ_return"])

        succ_rate = succ_rate / len(paths)

        ev = special.explained_variance_1d(np.concatenate(baselines),
                                           np.concatenate(returns))

        if not self.algo.policy.recurrent:
            observations = tensor_utils.concat_tensor_list(
                [path["observations"] for path in paths])
            actions = tensor_utils.concat_tensor_list(
                [path["actions"] for path in paths])
            rewards = tensor_utils.concat_tensor_list(
                [path["rewards"] for path in paths])
            returns = tensor_utils.concat_tensor_list(
                [path["returns"] for path in paths])
            advantages = tensor_utils.concat_tensor_list(
                [path["advantages"] for path in paths])
            env_infos = tensor_utils.concat_tensor_dict_list(
                [path["env_infos"] for path in paths])
            agent_infos = tensor_utils.concat_tensor_dict_list(
                [path["agent_infos"] for path in paths])

            if self.algo.center_adv:
                advantages = util.center_advantages(advantages)

            if self.algo.positive_adv:
                advantages = util.shift_advantages_to_positive(advantages)

            average_discounted_return = \
                np.mean([path["returns"][0] for path in paths])

            undiscounted_returns = [sum(path["rewards"]) for path in paths]

            ent = np.mean(self.algo.policy.distribution.entropy(agent_infos))

            samples_data = dict(
                observations=observations,
                actions=actions,
                rewards=rewards,
                returns=returns,
                advantages=advantages,
                env_infos=env_infos,
                agent_infos=agent_infos,
                paths=paths,
                violation_cost=np.array(violation_cost),
                boundary_violation_cost=np.array(boundary_violation_cost),
                success_rate=succ_rate,
                successful_AverageReturn=np.array(succ_return),
            )
        else:
            max_path_length = max([len(path["advantages"]) for path in paths])

            # make all paths the same length (pad extra advantages with 0)
            obs = [path["observations"] for path in paths]
            obs = tensor_utils.pad_tensor_n(obs, max_path_length)

            if self.algo.center_adv:
                raw_adv = np.concatenate(
                    [path["advantages"] for path in paths])
                adv_mean = np.mean(raw_adv)
                adv_std = np.std(raw_adv) + 1e-8
                adv = [(path["advantages"] - adv_mean) / adv_std
                       for path in paths]
            else:
                adv = [path["advantages"] for path in paths]

            adv = np.asarray(
                [tensor_utils.pad_tensor(a, max_path_length) for a in adv])

            actions = [path["actions"] for path in paths]
            actions = tensor_utils.pad_tensor_n(actions, max_path_length)

            rewards = [path["rewards"] for path in paths]
            rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

            returns = [path["returns"] for path in paths]
            returns = tensor_utils.pad_tensor_n(returns, max_path_length)

            agent_infos = [path["agent_infos"] for path in paths]
            agent_infos = tensor_utils.stack_tensor_dict_list([
                tensor_utils.pad_tensor_dict(p, max_path_length)
                for p in agent_infos
            ])

            env_infos = [path["env_infos"] for path in paths]
            env_infos = tensor_utils.stack_tensor_dict_list([
                tensor_utils.pad_tensor_dict(p, max_path_length)
                for p in env_infos
            ])

            valids = [np.ones_like(path["returns"]) for path in paths]
            valids = tensor_utils.pad_tensor_n(valids, max_path_length)

            average_discounted_return = \
                np.mean([path["returns"][0] for path in paths])

            undiscounted_returns = [sum(path["rewards"]) for path in paths]

            ent = np.sum(
                self.algo.policy.distribution.entropy(agent_infos) *
                valids) / np.sum(valids)

            samples_data = dict(
                observations=obs,
                actions=actions,
                advantages=adv,
                rewards=rewards,
                returns=returns,
                valids=valids,
                agent_infos=agent_infos,
                env_infos=env_infos,
                paths=paths,
                violation_cost=np.array(violation_cost),
                boundary_violation_cost=np.array(boundary_violation_cost),
                success_rate=succ_rate,
                successful_AverageReturn=np.array(succ_return),
            )

        logger.log("fitting baseline...")
        if hasattr(self.algo.baseline, 'fit_with_samples'):
            self.algo.baseline.fit_with_samples(paths, samples_data)
        else:
            self.algo.baseline.fit(paths)
        logger.log("fitted")

        logger.record_tabular('Iteration', itr)
        logger.record_tabular('AverageDiscountedReturn',
                              average_discounted_return)
        logger.record_tabular('AverageReturn', np.mean(undiscounted_returns))
        logger.record_tabular('ExplainedVariance', ev)
        logger.record_tabular('NumTrajs', len(paths))
        logger.record_tabular('Entropy', ent)
        logger.record_tabular('Perplexity', np.exp(ent))
        logger.record_tabular('StdReturn', np.std(undiscounted_returns))
        logger.record_tabular('MaxReturn', np.max(undiscounted_returns))
        logger.record_tabular('MinReturn', np.min(undiscounted_returns))
        analysis_data = dict(Iteration=itr,
                             AverageDiscountedReturn=average_discounted_return,
                             AverageReturn=np.mean(undiscounted_returns),
                             ExplainedVariance=ev,
                             NumTrajs=len(paths),
                             Entropy=ent,
                             Perplexity=np.exp(ent),
                             StdReturn=np.std(undiscounted_returns),
                             MaxReturn=np.max(undiscounted_returns),
                             MinReturn=np.min(undiscounted_returns))

        return samples_data, analysis_data
Exemple #10
0
    def optimize_policy(self, itr, samples_data):
        """Optimize the REPS dual and then the policy parameters via BFGS.

        Parameters
        ----------
        itr : int
            Current iteration number (unused here; kept for interface
            compatibility with other algorithms).
        samples_data : dict
            Processed batch with at least 'rewards', 'actions',
            'observations', 'agent_infos' and 'paths' (plus 'valids'
            for recurrent policies).
        """
        # Init vars
        rewards = samples_data['rewards']
        actions = samples_data['actions']
        observations = samples_data['observations']

        agent_infos = samples_data["agent_infos"]
        state_info_list = [agent_infos[k] for k in self.policy.state_info_keys]
        dist_info_list = [
            agent_infos[k] for k in self.policy.distribution.dist_info_keys
        ]
        if self.policy.recurrent:
            recurrent_vals = [samples_data["valids"]]
        else:
            recurrent_vals = []
        # Compute sample Bellman error.
        feat_diff = []
        for path in samples_data['paths']:
            feats = self._features(path)
            # Append an all-zero row so the terminal transition's "next
            # features" are zero.
            feats = np.vstack([feats, np.zeros(feats.shape[1])])
            feat_diff.append(feats[1:] - feats[:-1])
        if self.policy.recurrent:
            max_path_length = max(
                [len(path["advantages"]) for path in samples_data["paths"]])
            # pad feature diffs so all paths share one tensor shape
            feat_diff = np.array([
                tensor_utils.pad_tensor(fd, max_path_length)
                for fd in feat_diff
            ])
        else:
            feat_diff = np.vstack(feat_diff)

        #################
        # Optimize dual #
        #################

        # Here we need to optimize dual through BFGS in order to obtain \eta
        # value. Initialize dual function g(\theta, v). \eta > 0
        # First eval delta_v
        f_dual = self.opt_info['f_dual']
        f_dual_grad = self.opt_info['f_dual_grad']

        # Set BFGS eval function. Parameter vector layout:
        # params[0] = eta, params[1:] = v.
        def eval_dual(params):
            param_eta = params[0]
            param_v = params[1:]
            val = f_dual(*([rewards, feat_diff] + state_info_list +
                           recurrent_vals + [param_eta, param_v]))
            return val.astype(np.float64)

        # Set BFGS gradient eval function
        def eval_dual_grad(params):
            param_eta = params[0]
            param_v = params[1:]
            grad = f_dual_grad(*([rewards, feat_diff] + state_info_list +
                                 recurrent_vals + [param_eta, param_v]))
            # Builtin float() instead of np.float(): the np.float alias
            # was deprecated in NumPy 1.20 and removed in 1.24.
            eta_grad = float(grad[0])
            v_grad = grad[1]
            return np.hstack([eta_grad, v_grad])

        # Initial BFGS parameter values.
        x0 = np.hstack([self.param_eta, self.param_v])

        # Set parameter boundaries: \eta>0, v unrestricted.
        bounds = [(-np.inf, np.inf) for _ in x0]
        bounds[0] = (0., np.inf)

        # Optimize through BFGS
        logger.log('optimizing dual')
        eta_before = x0[0]
        dual_before = eval_dual(x0)
        params_ast, _, _ = self.optimizer(func=eval_dual,
                                          x0=x0,
                                          fprime=eval_dual_grad,
                                          bounds=bounds,
                                          maxiter=self.max_opt_itr,
                                          disp=0)
        dual_after = eval_dual(params_ast)

        # Optimal values have been obtained
        self.param_eta = params_ast[0]
        self.param_v = params_ast[1:]

        ###################
        # Optimize policy #
        ###################
        cur_params = self.policy.get_param_values(trainable=True)
        f_loss = self.opt_info["f_loss"]
        f_loss_grad = self.opt_info['f_loss_grad']
        # Renamed from `input` to avoid shadowing the builtin.
        loss_inputs = [
            rewards, observations, feat_diff, actions
        ] + state_info_list + recurrent_vals + [self.param_eta, self.param_v]

        # Set loss eval function
        def eval_loss(params):
            self.policy.set_param_values(params, trainable=True)
            val = f_loss(*loss_inputs)
            return val.astype(np.float64)

        # Set loss gradient eval function
        def eval_loss_grad(params):
            self.policy.set_param_values(params, trainable=True)
            grad = f_loss_grad(*loss_inputs)
            flattened_grad = tensor_utils.flatten_tensors(
                list(map(np.asarray, grad)))
            return flattened_grad.astype(np.float64)

        loss_before = eval_loss(cur_params)
        logger.log('optimizing policy')
        params_ast, _, _ = self.optimizer(func=eval_loss,
                                          x0=cur_params,
                                          fprime=eval_loss_grad,
                                          disp=0,
                                          maxiter=self.max_opt_itr)
        loss_after = eval_loss(params_ast)

        f_kl = self.opt_info['f_kl']

        mean_kl = f_kl(*([observations, actions] + state_info_list +
                         dist_info_list + recurrent_vals)).astype(np.float64)

        logger.log('eta %f -> %f' % (eta_before, self.param_eta))

        logger.record_tabular("LossBefore", loss_before)
        logger.record_tabular("LossAfter", loss_after)
        logger.record_tabular('DualBefore', dual_before)
        logger.record_tabular('DualAfter', dual_after)
        logger.record_tabular('MeanKL', mean_kl)
Exemple #11
0
    def create_samples_dict(self, paths):
        """Assemble a training batch from sampled trajectories.

        For non-recurrent policies every per-path array is concatenated
        into one flat batch. For recurrent policies each array is padded
        to the longest path and stacked, and a 'valids' mask marks the
        real (unpadded) timesteps.
        """
        if not self.algo.policy.recurrent:
            # Flatten each per-path array into one long batch tensor.
            flat_keys = ("observations", "actions", "rewards", "returns",
                         "advantages", "weights")
            samples_data = {
                key: tensor_utils.concat_tensor_list(
                    [path[key] for path in paths])
                for key in flat_keys
            }
            for key in ("env_infos", "agent_infos"):
                samples_data[key] = tensor_utils.concat_tensor_dict_list(
                    [path[key] for path in paths])

            if self.algo.center_adv:
                samples_data["advantages"] = util.center_advantages(
                    samples_data["advantages"])

            if self.algo.positive_adv:
                samples_data["advantages"] = \
                    util.shift_advantages_to_positive(
                        samples_data["advantages"])

            samples_data["paths"] = paths
        else:
            horizon = max(len(path["advantages"]) for path in paths)

            # Optionally whiten advantages before padding (pad values of
            # zero must not be included in the statistics).
            if self.algo.center_adv:
                flat_adv = np.concatenate(
                    [path["advantages"] for path in paths])
                adv_mean = np.mean(flat_adv)
                adv_std = np.std(flat_adv) + 1e-8
                adv_list = [(path["advantages"] - adv_mean) / adv_std
                            for path in paths]
            else:
                adv_list = [path["advantages"] for path in paths]

            adv = np.asarray(
                [tensor_utils.pad_tensor(a, horizon) for a in adv_list])

            def pad_key(key):
                # Pad each path's array for `key` out to the horizon.
                return tensor_utils.pad_tensor_n(
                    [path[key] for path in paths], horizon)

            def pad_info_dicts(key):
                # Pad, then stack, the nested info dicts under `key`.
                return tensor_utils.stack_tensor_dict_list([
                    tensor_utils.pad_tensor_dict(path[key], horizon)
                    for path in paths
                ])

            # Mask of real timesteps: ones up to each path's true length.
            valids = tensor_utils.pad_tensor_n(
                [np.ones_like(path["returns"]) for path in paths], horizon)

            samples_data = dict(
                observations=pad_key("observations"),
                actions=pad_key("actions"),
                advantages=adv,
                rewards=pad_key("rewards"),
                returns=pad_key("returns"),
                valids=valids,
                agent_infos=pad_info_dicts("agent_infos"),
                env_infos=pad_info_dicts("env_infos"),
                weights=pad_key("weights"),
                paths=paths,
            )

        return samples_data
Exemple #12
0
    def process_samples(self, itr, paths):
        """Process variable-sojourn-time paths into a training batch.

        IMPORTANT: rewards accrued from a_t to a_{t+1} are expected to be
        discounted by the environment to values at time t, so per-step
        discounting here uses exp(-gamma * sojourn_time) instead of a
        fixed power of gamma.

        Parameters
        ----------
        itr : int
            Iteration number, used only for logging.
        paths : list of dict
            Trajectories; each must provide "rewards", "observations",
            "actions", "agent_infos", "env_infos" and
            "offset_t_sojourn" (per-step sojourn times).

        Returns
        -------
        dict
            samples_data batch for the policy optimizer.

        Raises
        ------
        NotImplementedError
            If a path's rewards and sojourn times differ in length
            (infinite-horizon case, not yet supported).
        """
        baselines = []
        returns = []

        if hasattr(self.algo.baseline, "predict_n"):
            all_path_baselines = self.algo.baseline.predict_n(paths)
        else:
            all_path_baselines = [
                self.algo.baseline.predict(path) for path in paths
            ]

        for idx, path in enumerate(paths):
            t_sojourn = path["offset_t_sojourn"]
            gamma = self.algo.discount
            lam = self.algo.gae_lambda
            # Continuous-time discounting: each step is discounted by the
            # time actually spent in its state.
            discount_gamma = np.exp(-gamma * t_sojourn)
            discount_gamma_lambda = np.exp(-gamma * lam * t_sojourn)
            path_baselines = np.append(all_path_baselines[idx], 0)
            if len(path["rewards"]) != len(t_sojourn):
                # TODO: handle infinite-horizon games. This used to drop
                # into pdb.set_trace(); fail loudly instead so library
                # code never blocks on an interactive debugger.
                raise NotImplementedError(
                    "path has %d rewards but %d sojourn times; "
                    "infinite-horizon paths are not supported yet"
                    % (len(path["rewards"]), len(t_sojourn)))
            # One-step TD residuals with sojourn-time-aware discounting.
            deltas = path["rewards"] + \
               discount_gamma * path_baselines[1:] - \
               path_baselines[:-1]
            path["advantages"] = variable_discount_cumsum(
                deltas, discount_gamma_lambda)
            path["returns"] = variable_discount_cumsum(path["rewards"],
                                                       discount_gamma)
            baselines.append(path_baselines[:-1])
            returns.append(path["returns"])

        ev = special.explained_variance_1d(np.concatenate(baselines),
                                           np.concatenate(returns))

        if not self.algo.policy.recurrent:
            # Flatten all paths into one batch.
            observations = tensor_utils.concat_tensor_list(
                [path["observations"] for path in paths])
            actions = tensor_utils.concat_tensor_list(
                [path["actions"] for path in paths])
            rewards = tensor_utils.concat_tensor_list(
                [path["rewards"] for path in paths])
            returns = tensor_utils.concat_tensor_list(
                [path["returns"] for path in paths])
            advantages = tensor_utils.concat_tensor_list(
                [path["advantages"] for path in paths])
            env_infos = tensor_utils.concat_tensor_dict_list(
                [path["env_infos"] for path in paths])
            agent_infos = tensor_utils.concat_tensor_dict_list(
                [path["agent_infos"] for path in paths])

            if self.algo.center_adv:
                advantages = util.center_advantages(advantages)

            if self.algo.positive_adv:
                advantages = util.shift_advantages_to_positive(advantages)

            average_discounted_return = \
                np.mean([path["returns"][0] for path in paths])

            undiscounted_returns = [sum(path["rewards"]) for path in paths]

            ent = np.mean(self.algo.policy.distribution.entropy(agent_infos))

            samples_data = dict(
                observations=observations,
                actions=actions,
                rewards=rewards,
                returns=returns,
                advantages=advantages,
                env_infos=env_infos,
                agent_infos=agent_infos,
                paths=paths,
            )

        else:
            max_path_length = max([len(path["advantages"]) for path in paths])

            # make all paths the same length (pad extra advantages with 0)
            obs = [path["observations"] for path in paths]
            obs = tensor_utils.pad_tensor_n(obs, max_path_length)

            if self.algo.center_adv:
                # Whiten using only real (unpadded) advantage values.
                raw_adv = np.concatenate(
                    [path["advantages"] for path in paths])
                adv_mean = np.mean(raw_adv)
                adv_std = np.std(raw_adv) + 1e-8
                adv = [(path["advantages"] - adv_mean) / adv_std
                       for path in paths]
            else:
                adv = [path["advantages"] for path in paths]

            adv = np.asarray(
                [tensor_utils.pad_tensor(a, max_path_length) for a in adv])

            actions = [path["actions"] for path in paths]
            actions = tensor_utils.pad_tensor_n(actions, max_path_length)

            rewards = [path["rewards"] for path in paths]
            rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

            returns = [path["returns"] for path in paths]
            returns = tensor_utils.pad_tensor_n(returns, max_path_length)

            agent_infos = [path["agent_infos"] for path in paths]
            agent_infos = tensor_utils.stack_tensor_dict_list([
                tensor_utils.pad_tensor_dict(p, max_path_length)
                for p in agent_infos
            ])

            env_infos = [path["env_infos"] for path in paths]
            env_infos = tensor_utils.stack_tensor_dict_list([
                tensor_utils.pad_tensor_dict(p, max_path_length)
                for p in env_infos
            ])

            # Mask of real timesteps: ones up to each path's true length.
            valids = [np.ones_like(path["returns"]) for path in paths]
            valids = tensor_utils.pad_tensor_n(valids, max_path_length)

            average_discounted_return = \
                np.mean([path["returns"][0] for path in paths])

            undiscounted_returns = [sum(path["rewards"]) for path in paths]

            # Masked mean entropy over valid timesteps only.
            ent = np.sum(
                self.algo.policy.distribution.entropy(agent_infos) *
                valids) / np.sum(valids)

            samples_data = dict(
                observations=obs,
                actions=actions,
                advantages=adv,
                rewards=rewards,
                returns=returns,
                valids=valids,
                agent_infos=agent_infos,
                env_infos=env_infos,
                paths=paths,
            )

        logger.log("fitting baseline...")
        if hasattr(self.algo.baseline, 'fit_with_samples'):
            self.algo.baseline.fit_with_samples(paths, samples_data)
        else:
            self.algo.baseline.fit(paths)
        logger.log("fitted")

        logger.record_tabular('Iteration', itr)
        logger.record_tabular('AverageDiscountedReturn',
                              average_discounted_return)
        logger.record_tabular('AverageReturn', np.mean(undiscounted_returns))
        logger.record_tabular('ExplainedVariance', ev)
        logger.record_tabular('NumTrajs', len(paths))
        logger.record_tabular('Entropy', ent)
        logger.record_tabular('Perplexity', np.exp(ent))
        logger.record_tabular('StdReturn', np.std(undiscounted_returns))
        logger.record_tabular('MaxReturn', np.max(undiscounted_returns))
        logger.record_tabular('MinReturn', np.min(undiscounted_returns))

        return samples_data
Exemple #13
0
    def process_samples(self, itr, paths, update_baseline=True):
        """Compute GAE advantages and discounted returns, optionally prune
        low-return paths (EPOpt-style), build the training batch, refit
        the baseline, and log diagnostics.

        Parameters
        ----------
        itr : int
            Current iteration number; also gates the EPOpt pruning.
        paths : list of dict
            Sampled trajectories with at least "rewards", "observations",
            "actions", "agent_infos" and "env_infos". Mutated in place:
            "advantages" and "returns" are added, and paths may be
            removed by the pruning step.
        update_baseline : bool
            When True, refit the baseline on the processed paths.

        Returns
        -------
        dict
            samples_data batch suitable for policy optimization.
        """
        baselines = []
        returns = []

        if hasattr(self.algo.baseline, "predict_n"):
            all_path_baselines = self.algo.baseline.predict_n(paths)
        else:
            all_path_baselines = [
                self.algo.baseline.predict(path) for path in paths
            ]

        for idx, path in enumerate(paths):
            # Append a 0 baseline for the terminal state.
            path_baselines = np.append(all_path_baselines[idx], 0)
            # One-step TD residuals, then GAE(lambda) advantages.
            deltas = path["rewards"] + \
                     self.algo.discount * path_baselines[1:] - \
                     path_baselines[:-1]
            path["advantages"] = special.discount_cumsum(
                deltas, self.algo.discount * self.algo.gae_lambda)
            path["returns"] = special.discount_cumsum(path["rewards"],
                                                      self.algo.discount)
            baselines.append(path_baselines[:-1])
            returns.append(path["returns"])

        if hasattr(self.algo, 'epopt_epsilon'):
            if self.algo.epopt_epsilon < 1.0 and self.algo.epopt_after_iter <= itr:
                # prune the paths
                # NOTE(review): `sorted_indices[si_idx]` is an original
                # path *index* (from argsort over initial returns) being
                # compared against `target_path_size` (a count). The
                # apparent intent is to keep only the epsilon-fraction of
                # worst-return paths (EPOpt's CVaR step), but the
                # index-vs-size comparison and the in-place pop/index
                # juggling look fragile — confirm against the EPOpt
                # implementation this was derived from before touching.
                target_path_size = len(paths) * self.algo.epopt_epsilon
                sorted_indices = np.argsort(
                    [path["returns"][0] for path in paths])
                idx = 0
                si_idx = 0
                while True:
                    if sorted_indices[si_idx] > target_path_size:
                        paths.pop(idx)
                        idx -= 1
                    idx += 1
                    si_idx += 1
                    if idx >= len(paths):
                        break

        # Baseline quality: fraction of return variance it explains.
        ev = special.explained_variance_1d(np.concatenate(baselines),
                                           np.concatenate(returns))

        if not self.algo.policy.recurrent:
            # Flatten every per-path array into one long batch.
            observations = tensor_utils.concat_tensor_list(
                [path["observations"] for path in paths])
            actions = tensor_utils.concat_tensor_list(
                [path["actions"] for path in paths])
            rewards = tensor_utils.concat_tensor_list(
                [path["rewards"] for path in paths])
            returns = tensor_utils.concat_tensor_list(
                [path["returns"] for path in paths])
            advantages = tensor_utils.concat_tensor_list(
                [path["advantages"] for path in paths])
            env_infos = tensor_utils.concat_tensor_dict_list(
                [path["env_infos"] for path in paths])
            agent_infos = tensor_utils.concat_tensor_dict_list(
                [path["agent_infos"] for path in paths])

            if self.algo.center_adv:
                advantages = util.center_advantages(advantages)

            if self.algo.positive_adv:
                advantages = util.shift_advantages_to_positive(advantages)

            average_discounted_return = \
                np.mean([path["returns"][0] for path in paths])

            # Only count returns from paths that ended under dynamics
            # model 0 — presumably the "real" dynamics, with id 1 being a
            # learned/fake model (TODO confirm against the env).
            undiscounted_returns = []
            ct = 0
            for path in paths:
                if path['env_infos']['dyn_model_id'][-1] == 0:
                    undiscounted_returns.append(sum(path["rewards"]))
                if path['env_infos']['dyn_model_id'][-1] == 1:
                    ct += 1
            # NOTE(review): if every path ended under model 1,
            # `undiscounted_returns` is empty and the np.mean/max/min
            # logging below degenerates (nan / ValueError) — verify this
            # cannot happen in practice.
            print('path count with fake dynamics: ', ct,
                  len(undiscounted_returns), len(paths))

            ent = np.mean(self.algo.policy.distribution.entropy(agent_infos))

            samples_data = dict(
                observations=observations,
                actions=actions,
                rewards=rewards,
                returns=returns,
                advantages=advantages,
                env_infos=env_infos,
                agent_infos=agent_infos,
                paths=paths,
            )
        else:
            max_path_length = max([len(path["advantages"]) for path in paths])

            # make all paths the same length (pad extra advantages with 0)
            obs = [path["observations"] for path in paths]
            obs = tensor_utils.pad_tensor_n(obs, max_path_length)

            if self.algo.center_adv:
                # Whiten using only real (unpadded) advantage values.
                raw_adv = np.concatenate(
                    [path["advantages"] for path in paths])
                adv_mean = np.mean(raw_adv)
                adv_std = np.std(raw_adv) + 1e-8
                adv = [(path["advantages"] - adv_mean) / adv_std
                       for path in paths]
            else:
                adv = [path["advantages"] for path in paths]

            adv = np.asarray(
                [tensor_utils.pad_tensor(a, max_path_length) for a in adv])

            actions = [path["actions"] for path in paths]
            actions = tensor_utils.pad_tensor_n(actions, max_path_length)

            rewards = [path["rewards"] for path in paths]
            rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

            returns = [path["returns"] for path in paths]
            returns = tensor_utils.pad_tensor_n(returns, max_path_length)

            agent_infos = [path["agent_infos"] for path in paths]
            agent_infos = tensor_utils.stack_tensor_dict_list([
                tensor_utils.pad_tensor_dict(p, max_path_length)
                for p in agent_infos
            ])

            env_infos = [path["env_infos"] for path in paths]
            env_infos = tensor_utils.stack_tensor_dict_list([
                tensor_utils.pad_tensor_dict(p, max_path_length)
                for p in env_infos
            ])

            # Mask of real timesteps: ones up to each path's true length.
            valids = [np.ones_like(path["returns"]) for path in paths]
            valids = tensor_utils.pad_tensor_n(valids, max_path_length)

            average_discounted_return = \
                np.mean([path["returns"][0] for path in paths])

            undiscounted_returns = [sum(path["rewards"]) for path in paths]

            # Masked mean entropy over valid timesteps only.
            ent = np.sum(
                self.algo.policy.distribution.entropy(agent_infos) *
                valids) / np.sum(valids)

            samples_data = dict(
                observations=obs,
                actions=actions,
                advantages=adv,
                rewards=rewards,
                returns=returns,
                valids=valids,
                agent_infos=agent_infos,
                env_infos=env_infos,
                paths=paths,
            )

        if update_baseline:
            logger.log("fitting baseline...")
            if hasattr(self.algo.baseline, 'fit_with_samples'):
                self.algo.baseline.fit_with_samples(paths, samples_data)
            else:
                self.algo.baseline.fit(paths)
            logger.log("fitted")

        logger.record_tabular('Iteration', itr)
        logger.record_tabular('AverageDiscountedReturn',
                              average_discounted_return)
        logger.record_tabular('AverageReturn', np.mean(undiscounted_returns))
        logger.record_tabular('ExplainedVariance', ev)
        logger.record_tabular('NumTrajs', len(paths))
        logger.record_tabular('Entropy', ent)
        logger.record_tabular('Perplexity', np.exp(ent))
        logger.record_tabular('StdReturn', np.std(undiscounted_returns))
        logger.record_tabular('MaxReturn', np.max(undiscounted_returns))
        logger.record_tabular('MinReturn', np.min(undiscounted_returns))

        return samples_data
Exemple #14
0
    def process_samples_skill_dependent(self, itr, paths):
        """Fit the skill-dependent baseline on skill-conditioned
        observations and return the (advantage-only) samples dict.

        Builds new observations as the outer product of the
        time-extended observation with the skill latent, concatenated
        with the raw pieces, then computes GAE advantages against the
        skill-dependent baseline.
        """
        # need to generate the correct observations using the outer product
        transformed = []
        for path in paths:
            latents = path['agent_infos']['latents']
            raw_obs = path['observations']
            # insert the time_remaining
            time_remaining = path['agent_infos']['time_remaining'].reshape(
                len(raw_obs), 1)
            extended_obs = np.concatenate([raw_obs, time_remaining], axis=1)
            outer = np.matmul(
                extended_obs[:, :, np.newaxis],
                latents[:, np.newaxis, :]).reshape(extended_obs.shape[0], -1)
            combined = np.concatenate([outer, extended_obs, latents], axis=1)
            transformed.append(
                dict(observations=combined,
                     rewards=path['rewards'],
                     returns=path['returns']))
        paths = transformed

        baselines = []
        returns = []

        baseline = self.algo.skill_dependent_baseline
        if hasattr(baseline, "predict_n"):
            all_path_baselines = baseline.predict_n(paths)
        else:
            all_path_baselines = [baseline.predict(p) for p in paths]

        for idx, path in enumerate(paths):
            # Append a 0 baseline for the terminal state, then compute
            # one-step TD residuals and GAE(lambda) advantages.
            path_baselines = np.append(all_path_baselines[idx], 0)
            deltas = (path["rewards"]
                      + self.algo.discount * path_baselines[1:]
                      - path_baselines[:-1])
            path["advantages"] = special.discount_cumsum(
                deltas, self.algo.discount * self.algo.gae_lambda)
            path["returns"] = special.discount_cumsum(
                path["rewards"], self.algo.discount)
            baselines.append(path_baselines[:-1])
            returns.append(path["returns"])

        ev = special.explained_variance_1d(
            np.concatenate(baselines), np.concatenate(returns))

        if not self.algo.policy.recurrent:
            advantages = tensor_utils.concat_tensor_list(
                [p["advantages"] for p in paths])

            if self.algo.center_adv:
                advantages = util.center_advantages(advantages)

            if self.algo.positive_adv:
                advantages = util.shift_advantages_to_positive(advantages)

            samples_data = dict(advantages=advantages, )
        else:
            horizon = max(len(p["advantages"]) for p in paths)

            if self.algo.center_adv:
                flat_adv = np.concatenate([p["advantages"] for p in paths])
                adv_mean = np.mean(flat_adv)
                adv_std = np.std(flat_adv) + 1e-8
                adv_list = [(p["advantages"] - adv_mean) / adv_std
                            for p in paths]
            else:
                adv_list = [p["advantages"] for p in paths]

            adv = np.asarray(
                [tensor_utils.pad_tensor(a, horizon) for a in adv_list])

            samples_data = dict(advantages=adv, )

        logger.log("fitting skill-depdendent baseline...")
        if hasattr(baseline, 'fit_with_samples'):
            baseline.fit_with_samples(paths, samples_data)
        else:
            baseline.fit(paths)
        logger.log("fitted skill-dependent baseline")

        logger.record_tabular('SkillBaselineExplainedVariance', ev)
        return samples_data
Exemple #15
0
    def cgmprocess_samples(self, itr, paths):
        """Process sampled paths using count-based accumulated-reward
        estimates (instead of discounted-cumsum GAE), fit the baseline,
        and log per-iteration diagnostics.

        Args:
            itr: iteration index (used only for logging).
            paths: list of path dicts; each must provide "observations",
                "actions", "rewards", "agent_infos" and "env_infos" with a
                "count" array of visit counts.

        Returns:
            dict of batched sample tensors for the optimizer (flat layout
            when the policy is non-recurrent, padded layout otherwise).
        """
        baselines = []
        returns = []
        for path in paths:
            # Baseline prediction with time moved to the leading axis.
            path_baselines = np.swapaxes(self.algo.baseline.predict(path), 0,
                                         1)
            env_info = path["env_infos"]
            # Visit counts; assumes a 4-D (H, ?, ?, ?) layout -- TODO confirm
            # the meaning of the trailing axes against the environment.
            sampleA = env_info["count"].astype(float)
            actionsTaken = sampleA.sum(axis=3)
            sampleS = sampleA.sum(axis=(2, 3))  #+ TINY
            H = sampleA.shape[0]
            immediateReward = path["rewards"]
            # Temporarily patch zero state-counts to 1 to avoid division by
            # zero; restored after the backward pass below.
            zeroIndices = (sampleS == 0)
            sampleS[zeroIndices] = 1
            accumulatedReward = immediateReward * actionsTaken / sampleS[:, :,
                                                                         np.
                                                                         newaxis]
            # Backward pass: propagate accumulated reward from t+1 into t,
            # weighted by the transition counts.
            temp = accumulatedReward[H - 1].sum(axis=1)
            # range (not Py2-only xrange) so the method works on Python 3.
            for t in range(H - 2, -1, -1):
                accumulatedReward[t] += (
                    temp[np.newaxis, np.newaxis, :] *
                    sampleA[t]).sum(axis=2) / sampleS[t, :, np.newaxis]
                temp = accumulatedReward[t].sum(axis=1)
            sampleS[zeroIndices] = 0
            path["returns"] = accumulatedReward.sum(
                axis=2
            )  #special.discount_cumsum(path["rewards"], self.algo.discount)
            path["advantages"] = accumulatedReward - path[
                "returns"][:, :, np.newaxis]  #path_baselines
            baselines.append(path_baselines)
            returns.append(path["returns"])

        observations = tensor_utils.concat_tensor_list(
            [path["observations"] for path in paths])
        actions = tensor_utils.concat_tensor_list(
            [path["actions"] for path in paths])
        rewards = tensor_utils.concat_tensor_list(
            [path["rewards"] for path in paths])
        advantages = tensor_utils.concat_tensor_list(
            [path["advantages"] for path in paths])
        agent_infos = tensor_utils.concat_tensor_dict_list(
            [path["agent_infos"] for path in paths])
        returns = np.concatenate(returns)
        baselines = np.concatenate(baselines)
        if not self.algo.policy.recurrent:
            # if self.algo.center_adv:
            #     advantages = util.center_advantages(advantages)
            #
            # if self.algo.positive_adv:
            #     advantages = util.shift_advantages_to_positive(advantages)

            average_discounted_return = \
                np.mean([path["returns"][0] for path in paths])

            undiscounted_returns = [np.sum(path["rewards"]) for path in paths]

            ent = np.mean(self.algo.policy.entropy(agent_infos))

            # Placeholder explained-variance value; the real computation is
            # commented out below. Kept as-is to preserve logged output.
            ev = 3
            #     special.explained_variance_2d(
            #     baselines,#np.array([[]]),#
            #     returns
            # )

            samples_data = dict(
                observations=observations,
                actions=actions,
                rewards=rewards,
                advantages=advantages,
                env_infos=[],  # env_infos deliberately omitted from the batch
                agent_infos=agent_infos,
                paths=paths,
            )
        else:
            max_path_length = max([len(path["advantages"]) for path in paths])

            # make all paths the same length (pad extra advantages with 0)
            obs = [path["observations"] for path in paths]
            obs = np.array(
                [tensor_utils.pad_tensor(ob, max_path_length) for ob in obs])

            if self.algo.center_adv:
                raw_adv = np.concatenate(
                    [path["advantages"] for path in paths])
                adv_mean = np.mean(raw_adv)
                adv_std = np.std(raw_adv) + 1e-8
                adv = [(path["advantages"] - adv_mean) / adv_std
                       for path in paths]
            else:
                adv = [path["advantages"] for path in paths]

            adv = np.array(
                [tensor_utils.pad_tensor(a, max_path_length) for a in adv])

            actions = [path["actions"] for path in paths]
            actions = np.array(
                [tensor_utils.pad_tensor(a, max_path_length) for a in actions])

            rewards = [path["rewards"] for path in paths]
            rewards = np.array(
                [tensor_utils.pad_tensor(r, max_path_length) for r in rewards])

            agent_infos = [path["agent_infos"] for path in paths]
            agent_infos = tensor_utils.stack_tensor_dict_list([
                tensor_utils.pad_tensor_dict(p, max_path_length)
                for p in agent_infos
            ])

            env_infos = [path["env_infos"] for path in paths]
            env_infos = tensor_utils.stack_tensor_dict_list([
                tensor_utils.pad_tensor_dict(p, max_path_length)
                for p in env_infos
            ])

            valids = [np.ones_like(path["returns"]) for path in paths]
            valids = np.array(
                [tensor_utils.pad_tensor(v, max_path_length) for v in valids])

            average_discounted_return = \
                np.mean([path["returns"][0] for path in paths])

            undiscounted_returns = [sum(path["rewards"]) for path in paths]

            ent = np.mean(self.algo.policy.distribution.entropy(agent_infos))

            # NOTE(review): `baselines`/`returns` were already concatenated
            # above; concatenating again here reduces one more axis --
            # confirm the intended shapes for explained_variance_1d.
            ev = special.explained_variance_1d(np.concatenate(baselines),
                                               np.concatenate(returns))

            samples_data = dict(
                observations=obs,
                actions=actions,
                advantages=adv,
                rewards=rewards,
                valids=valids,
                agent_infos=agent_infos,
                env_infos=env_infos,
                paths=paths,
            )

        logger.log("fitting baseline...")
        # Note: this baseline variant is fitted on raw tensors rather than
        # on `paths` as in the sibling process_samples implementations.
        self.algo.baseline.fit(observations, rewards, returns)
        logger.log("fitted")

        logger.record_tabular('Iteration', itr)
        logger.record_tabular('AverageDiscountedReturn',
                              average_discounted_return)
        AverageReturn = np.mean(undiscounted_returns)
        logger.record_tabular('AverageReturn', np.mean(undiscounted_returns))
        # Also append the average return to a plain-text results log.
        with open("result.txt", "a") as myfile:
            myfile.write("%f\n" % AverageReturn)
        logger.record_tabular('ExplainedVariance', ev)
        logger.record_tabular('NumTrajs', len(paths))
        logger.record_tabular('Entropy', ent)
        logger.record_tabular('Perplexity', np.exp(ent))
        logger.record_tabular('StdReturn', np.std(undiscounted_returns))
        logger.record_tabular('MaxReturn', np.max(undiscounted_returns))
        logger.record_tabular('MinReturn', np.min(undiscounted_returns))

        return samples_data
    def process_samples(self, itr, paths):
        """Apply the mode-dependent learned reward shaping, compute GAE
        advantages and discounted returns, assemble batched training data,
        fit the baseline, and log per-iteration diagnostics.

        Args:
            itr: iteration index (used only for logging).
            paths: list of path dicts with "observations", "actions",
                "rewards", "env_infos" and "agent_infos"; rewards are
                modified in place by the shaping below.

        Returns:
            dict of batched sample tensors (flat layout when the policy is
            non-recurrent, padded layout otherwise).
        """
        if not self.initialized:
            self.initialize()
        baselines = []
        returns = []

        # Prefer the vectorized baseline API when available.
        if hasattr(self.algo.baseline, "predict_n"):
            all_path_baselines = self.algo.baseline.predict_n(paths)
        else:
            all_path_baselines = [
                self.algo.baseline.predict(path) for path in paths
            ]

        for idx, path in enumerate(paths):
            # Reward shaping is only active when the algo carries _kwargs.
            if hasattr(self.algo, '_kwargs'):
                if self.mode.startswith('inception'):
                    if idx % 100 == 0:
                        print("paths", idx)
                    # First view of every recorded frame in the rollout.
                    imgs = [
                        img[0] for img in path['env_infos']['imgs']
                        if img is not None
                    ]
                    # Inception-layer features for the rollout frames.
                    feat = self.sess.run(self.model[1][self.layer],
                                         {self.image: imgs})
                    diff = self.means - feat
                    diff[self.std == 0] = 0
                    # Variance-normalized squared feature distance.
                    diff = diff**2 / (self.std + 1e-5)
                    means = np.mean(diff, axis=(1, 2, 3))
                    # Penalize every second timestep, weighted by j**2;
                    # assumes 25 recorded frames per path -- TODO confirm.
                    for j in range(25):
                        path["rewards"][j * 2 + 1] -= means[j] * (j**2)
                elif self.mode == 'oracle':
                    # Add the environment's ground-truth reward directly.
                    path["rewards"] += path["env_infos"]["reward_true"]
                elif self.mode.startswith('ours'):
                    imgs = [
                        img for img in path['env_infos']['imgs']
                        if img is not None
                    ]

                    # Lazily build per-viewpoint mean target features and
                    # mean translated images from the demo videos; computed
                    # once on the first shaped path and cached on self.
                    if not hasattr(self, 'means'):
                        self.means = []
                        self.imgs = []
                        validdata = np.load(self.algo._kwargs['modeldata'])
                        for vp in range(self.nvp):
                            context = imgs[0][vp]
                            timgs = []
                            tfeats = []
                            nvideos = validdata.shape[1]
                            if self.mode == 'oursinception':
                                nvideos = 50
                            for i in range(nvideos):
                                if i % 10 == 0:
                                    print("feats", i)
                                skip = 1
                                # These datasets appear to be recorded at
                                # double frame rate -- TODO confirm.
                                if self.name == 'real' or self.name == 'sweep':
                                    skip = 2
                                if self.mode == 'oursinception':
                                    input_img = validdata[::skip, i]
                                else:
                                    # Rescale from [-1, 1] to uint8 [0, 255].
                                    input_img = ((validdata[::skip, i] + 1) *
                                                 127.5).astype(np.uint8)
                                tfeat, timg = self.sess.run(
                                    [self.model.translated_z, self.model.out],
                                    {
                                        self.image: [
                                            input_img,
                                            [context] * self.batch_size,
                                            [context] * self.batch_size
                                        ]
                                    })
                                timgs.append(timg)
                                tfeats.append(tfeat)
                            self.means.append(np.mean(tfeats, axis=0))
                            meanimgs = np.mean(timgs, axis=0)
                            self.imgs.append(meanimgs)
                            # for j in range(25):
                            #     scipy.misc.imsave('test/%d_%d.png' %(vp, j), arm_shaping.inverse_transform(meanimgs[j]))

                    if idx % 10 == 0:
                        print("feats", idx)
                    # import IPython
                    # IPython.embed()
                    costs = 0
                    for vp in range(self.nvp):
                        curimgs = [img[vp] for img in imgs]
                        feats, image_trans = self.sess.run(
                            [self.model.input_z, self.image_trans], {
                                self.image: [
                                    curimgs, [curimgs[0]] * self.batch_size,
                                    curimgs
                                ]
                            })
                        # import IPython
                        # IPython.embed()
                        # for j in range(25):
                        #     scipy.misc.imsave('test/' + str(j) + "_recon.png", arm_shaping.inverse_transform(image_recon[j]))
                        # for j in range(25):
                        #     scipy.misc.imsave('test/' + str(j) + "_orig.png", arm_shaping.inverse_transform(image_trans[0][j]))

                        # NOTE(review): only the "None" ablation accumulates
                        # costs across viewpoints (+=); the other branches
                        # overwrite `costs`, so only the last viewpoint
                        # contributes -- confirm this is intended.
                        if self.ablation_type == "None":
                            costs += np.sum((self.means[vp] - feats)**2, axis = 1) + \
                                self.algo._kwargs['scale']*np.sum((self.imgs[vp] - image_trans[0])**2, axis = (1, 2, 3))
                        elif self.ablation_type == "nofeat":
                            costs = self.algo._kwargs['scale'] * np.sum(
                                (self.imgs - image_trans[0])**2,
                                axis=(1, 2, 3))
                        elif self.ablation_type == "noimage":
                            costs = np.sum((self.means - feats)**2, axis=1)
                        elif self.ablation_type == 'recon':
                            # NOTE(review): `image_recon` is not defined in
                            # this scope; reaching this branch would raise
                            # NameError -- confirm where the reconstruction
                            # tensor should come from.
                            costs = np.sum((self.means - feats)**2, axis = 1) + \
                                self.algo._kwargs['scale']*np.sum((image_recon - image_trans[0])**2, axis = (1, 2, 3))
                    # costs = np.sum((self.means - feats)**2, axis = 1) + \
                    #     self.algo._kwargs['scale']*np.sum((self.imgs - image_trans[0])**2, axis = (1, 2, 3))

                    # Penalize every second timestep, weighted by j**2;
                    # assumes 25 recorded frames per path -- TODO confirm.
                    for j in range(25):
                        path["rewards"][j * 2 + 1] -= costs[j] * (j**2)

            # Standard GAE: TD residuals discounted by gamma * lambda.
            path_baselines = np.append(all_path_baselines[idx], 0)
            deltas = path["rewards"] + \
                     self.algo.discount * path_baselines[1:] - \
                     path_baselines[:-1]
            path["advantages"] = special.discount_cumsum(
                deltas, self.algo.discount * self.algo.gae_lambda)
            path["returns"] = special.discount_cumsum(path["rewards"],
                                                      self.algo.discount)
            baselines.append(path_baselines[:-1])
            returns.append(path["returns"])

        # How well the baseline explains the empirical returns.
        ev = special.explained_variance_1d(np.concatenate(baselines),
                                           np.concatenate(returns))

        if not self.algo.policy.recurrent:
            # Flat layout: concatenate all timesteps across paths.
            observations = tensor_utils.concat_tensor_list(
                [path["observations"] for path in paths])
            actions = tensor_utils.concat_tensor_list(
                [path["actions"] for path in paths])
            rewards = tensor_utils.concat_tensor_list(
                [path["rewards"] for path in paths])
            returns = tensor_utils.concat_tensor_list(
                [path["returns"] for path in paths])
            advantages = tensor_utils.concat_tensor_list(
                [path["advantages"] for path in paths])
            env_infos = tensor_utils.concat_tensor_dict_list(
                [path["env_infos"] for path in paths])
            agent_infos = tensor_utils.concat_tensor_dict_list(
                [path["agent_infos"] for path in paths])

            if self.algo.center_adv:
                advantages = util.center_advantages(advantages)

            if self.algo.positive_adv:
                advantages = util.shift_advantages_to_positive(advantages)

            average_discounted_return = \
                np.mean([path["returns"][0] for path in paths])

            undiscounted_returns = [sum(path["rewards"]) for path in paths]

            ent = np.mean(self.algo.policy.distribution.entropy(agent_infos))

            samples_data = dict(
                observations=observations,
                actions=actions,
                rewards=rewards,
                returns=returns,
                advantages=advantages,
                env_infos=env_infos,
                agent_infos=agent_infos,
                paths=paths,
            )
        else:
            # Recurrent layout: pad every path to the same length and keep
            # a `valids` mask marking real (non-padding) timesteps.
            max_path_length = max([len(path["advantages"]) for path in paths])

            # make all paths the same length (pad extra advantages with 0)
            obs = [path["observations"] for path in paths]
            obs = tensor_utils.pad_tensor_n(obs, max_path_length)

            if self.algo.center_adv:
                raw_adv = np.concatenate(
                    [path["advantages"] for path in paths])
                adv_mean = np.mean(raw_adv)
                adv_std = np.std(raw_adv) + 1e-8
                adv = [(path["advantages"] - adv_mean) / adv_std
                       for path in paths]
            else:
                adv = [path["advantages"] for path in paths]

            adv = np.asarray(
                [tensor_utils.pad_tensor(a, max_path_length) for a in adv])

            actions = [path["actions"] for path in paths]
            actions = tensor_utils.pad_tensor_n(actions, max_path_length)

            rewards = [path["rewards"] for path in paths]
            rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

            returns = [path["returns"] for path in paths]
            returns = tensor_utils.pad_tensor_n(returns, max_path_length)

            agent_infos = [path["agent_infos"] for path in paths]
            agent_infos = tensor_utils.stack_tensor_dict_list([
                tensor_utils.pad_tensor_dict(p, max_path_length)
                for p in agent_infos
            ])

            env_infos = [path["env_infos"] for path in paths]
            env_infos = tensor_utils.stack_tensor_dict_list([
                tensor_utils.pad_tensor_dict(p, max_path_length)
                for p in env_infos
            ])

            valids = [np.ones_like(path["returns"]) for path in paths]
            valids = tensor_utils.pad_tensor_n(valids, max_path_length)

            average_discounted_return = \
                np.mean([path["returns"][0] for path in paths])

            undiscounted_returns = [sum(path["rewards"]) for path in paths]

            # Mask out padded timesteps when averaging the entropy.
            ent = np.sum(
                self.algo.policy.distribution.entropy(agent_infos) *
                valids) / np.sum(valids)

            samples_data = dict(
                observations=obs,
                actions=actions,
                advantages=adv,
                rewards=rewards,
                returns=returns,
                valids=valids,
                agent_infos=agent_infos,
                env_infos=env_infos,
                paths=paths,
            )

        logger.log("fitting baseline...")
        if hasattr(self.algo.baseline, 'fit_with_samples'):
            self.algo.baseline.fit_with_samples(paths, samples_data)
        else:
            self.algo.baseline.fit(paths)
        logger.log("fitted")

        logger.record_tabular('Iteration', itr)
        logger.record_tabular('AverageDiscountedReturn',
                              average_discounted_return)
        logger.record_tabular('AverageReturn', np.mean(undiscounted_returns))
        # Ground-truth reward diagnostics, when the env exposes them.
        if 'reward_true' in paths[0]['env_infos']:
            trues = [sum(path["env_infos"]["reward_true"]) for path in paths]
            logger.record_tabular('ReturnTrue', np.mean(trues))
            logger.record_tabular('MinTrue', np.min(trues))
            logger.record_tabular('MaxTrue', np.max(trues))
            logger.record_tabular('ArgmaxTrueReturn',
                                  trues[np.argmax(undiscounted_returns)])
        # logger.record_tabular('Shaping', np.mean([path["shaping_reward"] for path in paths]))
        logger.record_tabular('ExplainedVariance', ev)
        logger.record_tabular('NumTrajs', len(paths))
        logger.record_tabular('Entropy', ent)
        logger.record_tabular('Perplexity', np.exp(ent))
        logger.record_tabular('StdReturn', np.std(undiscounted_returns))
        logger.record_tabular('MaxReturn', np.max(undiscounted_returns))
        logger.record_tabular('MinReturn', np.min(undiscounted_returns))

        return samples_data
Exemple #17
0
    def process_samples(self, itr, paths):
        """Compute GAE advantages and discounted returns for each sampled
        path, assemble the batched training data, fit the baseline, and log
        per-iteration diagnostics.

        Each path dict is augmented in place with "advantages" and
        "returns". Returns the batched sample dict (flat layout for
        non-recurrent policies, padded layout otherwise).
        """
        gae_discount = self.algo.discount * self.algo.gae_lambda
        all_baselines = []
        all_returns = []
        for path in paths:
            # One-step TD residuals against the (shifted) baseline.
            preds = np.append(self.algo.baseline.predict(path), 0)
            td_errors = path["rewards"] + \
                self.algo.discount * preds[1:] - preds[:-1]
            path["advantages"] = special.discount_cumsum(
                td_errors, gae_discount)
            path["returns"] = special.discount_cumsum(
                path["rewards"], self.algo.discount)
            all_baselines.append(preds[:-1])
            all_returns.append(path["returns"])

        # How well the baseline explains the empirical returns (identical
        # computation in both layouts, so done once up front).
        ev = special.explained_variance_1d(
            np.concatenate(all_baselines), np.concatenate(all_returns))

        if not self.algo.policy.recurrent:
            # Flat layout: concatenate all timesteps across paths.
            def flat(key):
                return tensor_utils.concat_tensor_list(
                    [p[key] for p in paths])

            observations = flat("observations")
            actions = flat("actions")
            rewards = flat("rewards")
            advantages = flat("advantages")
            env_infos = tensor_utils.concat_tensor_dict_list(
                [p["env_infos"] for p in paths])
            agent_infos = tensor_utils.concat_tensor_dict_list(
                [p["agent_infos"] for p in paths])

            if self.algo.center_adv:
                advantages = util.center_advantages(advantages)
            if self.algo.positive_adv:
                advantages = util.shift_advantages_to_positive(advantages)

            average_discounted_return = np.mean(
                [p["returns"][0] for p in paths])
            undiscounted_returns = [sum(p["rewards"]) for p in paths]
            ent = np.mean(self.algo.policy.distribution.entropy(agent_infos))

            samples_data = dict(
                observations=observations,
                actions=actions,
                rewards=rewards,
                advantages=advantages,
                env_infos=env_infos,
                agent_infos=agent_infos,
                paths=paths,
            )
        else:
            # Recurrent layout: zero-pad every path to a common horizon and
            # keep a `valids` mask marking real (non-padding) timesteps.
            horizon = max(len(p["advantages"]) for p in paths)

            def pad(seq):
                return tensor_utils.pad_tensor(seq, horizon)

            obs = np.array([pad(p["observations"]) for p in paths])

            if self.algo.center_adv:
                # Normalize with batch-wide statistics before padding.
                flat_adv = np.concatenate([p["advantages"] for p in paths])
                adv_mean = np.mean(flat_adv)
                adv_std = np.std(flat_adv) + 1e-8
                per_path_adv = [(p["advantages"] - adv_mean) / adv_std
                                for p in paths]
            else:
                per_path_adv = [p["advantages"] for p in paths]
            adv = np.array([pad(a) for a in per_path_adv])

            actions = np.array([pad(p["actions"]) for p in paths])
            rewards = np.array([pad(p["rewards"]) for p in paths])

            agent_infos = tensor_utils.stack_tensor_dict_list([
                tensor_utils.pad_tensor_dict(p["agent_infos"], horizon)
                for p in paths
            ])
            env_infos = tensor_utils.stack_tensor_dict_list([
                tensor_utils.pad_tensor_dict(p["env_infos"], horizon)
                for p in paths
            ])

            valids = np.array(
                [pad(np.ones_like(p["returns"])) for p in paths])

            average_discounted_return = np.mean(
                [p["returns"][0] for p in paths])
            undiscounted_returns = [np.sum(p["rewards"]) for p in paths]
            ent = np.mean(self.algo.policy.distribution.entropy(agent_infos))

            samples_data = dict(
                observations=obs,
                actions=actions,
                advantages=adv,
                rewards=rewards,
                valids=valids,
                agent_infos=agent_infos,
                env_infos=env_infos,
                paths=paths,
            )

        logger.log("fitting baseline...")
        self.algo.baseline.fit(paths)
        logger.log("fitted")

        logger.record_tabular('Iteration', itr)
        logger.record_tabular('AverageDiscountedReturn',
                              average_discounted_return)
        logger.record_tabular('AverageReturn', np.mean(undiscounted_returns))
        logger.record_tabular('ExplainedVariance', ev)
        logger.record_tabular('NumTrajs', len(paths))
        logger.record_tabular('Entropy', ent)
        logger.record_tabular('Perplexity', np.exp(ent))
        logger.record_tabular('StdReturn', np.std(undiscounted_returns))
        logger.record_tabular('MaxReturn', np.max(undiscounted_returns))
        logger.record_tabular('MinReturn', np.min(undiscounted_returns))

        return samples_data
Exemple #18
0
    def process_samples(self, itr, paths):
        """Compute GAE advantages and discounted returns, optionally
        subtract the Q-Prop control variate from the advantages, assemble
        batched training data, fit the baseline, and log diagnostics.

        Args:
            itr: iteration index (used only for logging).
            paths: list of path dicts with "observations", "actions",
                "rewards", "env_infos" and "agent_infos".

        Returns:
            dict of batched sample tensors, including "baselines" and (for
            the flat layout) "etas" for the Q-Prop correction term.
        """
        baselines = []
        returns = []

        # Prefer the vectorized baseline API when available.
        if hasattr(self.algo.baseline, "predict_n"):
            all_path_baselines = self.algo.baseline.predict_n(paths)
        else:
            all_path_baselines = [
                self.algo.baseline.predict(path) for path in paths
            ]

        for idx, path in enumerate(paths):
            # Standard GAE: TD residuals discounted by gamma * lambda.
            path_baselines = np.append(all_path_baselines[idx], 0)
            deltas = path["rewards"] + \
                     self.algo.discount * path_baselines[1:] - \
                     path_baselines[:-1]
            path["advantages"] = special.discount_cumsum(
                deltas, self.algo.discount * self.algo.gae_lambda)
            path["returns"] = special.discount_cumsum(path["rewards"],
                                                      self.algo.discount)
            baselines.append(path_baselines[:-1])
            returns.append(path["returns"])

        # How well the baseline explains the empirical returns.
        ev = special.explained_variance_1d(np.concatenate(baselines),
                                           np.concatenate(returns))

        if not self.algo.policy.recurrent:
            # Flat layout: concatenate all timesteps across paths.
            observations = tensor_utils.concat_tensor_list(
                [path["observations"] for path in paths])
            actions = tensor_utils.concat_tensor_list(
                [path["actions"] for path in paths])
            rewards = tensor_utils.concat_tensor_list(
                [path["rewards"] for path in paths])
            returns = tensor_utils.concat_tensor_list(
                [path["returns"] for path in paths])
            advantages = tensor_utils.concat_tensor_list(
                [path["advantages"] for path in paths])
            baselines_tensor = tensor_utils.concat_tensor_list(baselines)
            env_infos = tensor_utils.concat_tensor_dict_list(
                [path["env_infos"] for path in paths])
            agent_infos = tensor_utils.concat_tensor_dict_list(
                [path["agent_infos"] for path in paths])
            etas = None

            # Q-Prop: subtract the off-policy critic-based control variate
            # from the advantages, scaled per-sample by eta.
            if hasattr(self.algo,
                       'qprop') and self.algo.qprop and self.algo.qprop_enable:
                # Log the learning-signal magnitude before the correction.
                old_advantages = np.copy(advantages)
                old_advantages = self.process_advantages(old_advantages)
                old_advantages_scale = np.abs(old_advantages).mean()
                logger.record_tabular("AbsLearnSignalOld",
                                      old_advantages_scale)
                logger.log("Qprop, subtracting control variate")
                advantages_bar = self.algo.get_control_variate(
                    observations=observations, actions=actions)
                if self.algo.qprop_eta_option == 'ones':
                    etas = np.ones_like(advantages)
                elif self.algo.qprop_eta_option == 'adapt1':  # conservative
                    # eta = 1 only where the control variate agrees in sign.
                    etas = (advantages * advantages_bar) > 0
                    etas = etas.astype(advantages.dtype)
                    logger.log("Qprop, etas: %d 1s, %d 0s" %
                               ((etas == 1).sum(), (etas == 0).sum()))
                elif self.algo.qprop_eta_option == 'adapt2':  # aggressive
                    # eta = +/-1 following the sign of the agreement.
                    etas = np.sign(advantages * advantages_bar)
                    etas = etas.astype(advantages.dtype)
                    logger.log("Qprop, etas: %d 1s, %d -1s" %
                               ((etas == 1).sum(), (etas == -1).sum()))
                else:
                    raise NotImplementedError(self.algo.qprop_eta_option)
                advantages -= etas * advantages_bar
                advantages = self.process_advantages(advantages)
                advantages_scale = np.abs(advantages).mean()
                logger.record_tabular("AbsLearnSignalNew", advantages_scale)
                logger.record_tabular("AbsLearnSignal", advantages_scale)
            else:
                advantages = self.process_advantages(advantages)
                advantages_scale = np.abs(advantages).mean()
                logger.record_tabular("AbsLearnSignal", advantages_scale)

            average_discounted_return = \
                np.mean([path["returns"][0] for path in paths])

            undiscounted_returns = [sum(path["rewards"]) for path in paths]

            ent = np.mean(self.algo.policy.distribution.entropy(agent_infos))

            samples_data = dict(
                observations=observations,
                actions=actions,
                rewards=rewards,
                returns=returns,
                advantages=advantages,
                env_infos=env_infos,
                agent_infos=agent_infos,
                paths=paths,
                baselines=baselines_tensor,
                etas=etas,
            )
        else:
            # Recurrent layout: pad every path to the same length and keep
            # a `valids` mask marking real (non-padding) timesteps.
            # NOTE(review): the Q-Prop correction above is not applied in
            # this branch -- confirm recurrent policies are out of scope.
            max_path_length = max([len(path["advantages"]) for path in paths])

            # make all paths the same length (pad extra advantages with 0)
            obs = [path["observations"] for path in paths]
            obs = tensor_utils.pad_tensor_n(obs, max_path_length)

            if self.algo.center_adv:
                raw_adv = np.concatenate(
                    [path["advantages"] for path in paths])
                adv_mean = np.mean(raw_adv)
                adv_std = np.std(raw_adv) + 1e-8
                adv = [(path["advantages"] - adv_mean) / adv_std
                       for path in paths]
            else:
                adv = [path["advantages"] for path in paths]

            adv = np.asarray(
                [tensor_utils.pad_tensor(a, max_path_length) for a in adv])

            actions = [path["actions"] for path in paths]
            actions = tensor_utils.pad_tensor_n(actions, max_path_length)

            rewards = [path["rewards"] for path in paths]
            rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

            returns = [path["returns"] for path in paths]
            returns = tensor_utils.pad_tensor_n(returns, max_path_length)

            agent_infos = [path["agent_infos"] for path in paths]
            agent_infos = tensor_utils.stack_tensor_dict_list([
                tensor_utils.pad_tensor_dict(p, max_path_length)
                for p in agent_infos
            ])

            env_infos = [path["env_infos"] for path in paths]
            env_infos = tensor_utils.stack_tensor_dict_list([
                tensor_utils.pad_tensor_dict(p, max_path_length)
                for p in env_infos
            ])

            valids = [np.ones_like(path["returns"]) for path in paths]
            valids = tensor_utils.pad_tensor_n(valids, max_path_length)

            baselines_tensor = tensor_utils.pad_tensor_n(
                baselines, max_path_length)

            average_discounted_return = \
                np.mean([path["returns"][0] for path in paths])

            undiscounted_returns = [sum(path["rewards"]) for path in paths]

            # Mask out padded timesteps when averaging the entropy.
            ent = np.sum(
                self.algo.policy.distribution.entropy(agent_infos) *
                valids) / np.sum(valids)

            samples_data = dict(
                observations=obs,
                actions=actions,
                advantages=adv,
                rewards=rewards,
                returns=returns,
                valids=valids,
                agent_infos=agent_infos,
                env_infos=env_infos,
                paths=paths,
                baselines=baselines_tensor,
            )

        logger.log("fitting baseline...")
        if hasattr(self.algo.baseline, 'fit_with_samples'):
            self.algo.baseline.fit_with_samples(paths, samples_data)
        else:
            self.algo.baseline.fit(paths)
        logger.log("fitted")

        logger.record_tabular('Iteration', itr)
        logger.record_tabular('AverageDiscountedReturn',
                              average_discounted_return)
        logger.record_tabular('AverageReturn', np.mean(undiscounted_returns))
        logger.record_tabular('ExplainedVariance', ev)
        logger.record_tabular('NumTrajs', len(paths))
        logger.record_tabular('Entropy', ent)
        logger.record_tabular('Perplexity', np.exp(ent))
        logger.record_tabular('StdReturn', np.std(undiscounted_returns))
        logger.record_tabular('MaxReturn', np.max(undiscounted_returns))
        logger.record_tabular('MinReturn', np.min(undiscounted_returns))

        return samples_data
    def process_samples(self, itr, paths, prefix='', log=True):
        """Turn raw rollout ``paths`` into a ``samples_data`` dict.

        Computes discounted returns and GAE advantages for every path,
        fits ``self.algo.baseline``, and packs the result either as flat
        concatenated tensors (feed-forward policies) or as padded
        per-path tensors plus a ``valids`` mask (recurrent policies).

        Args:
            itr: iteration index (not used inside this method).
            paths: list of rollout dicts, each with at least
                "observations", "actions", "rewards", "env_infos" and
                "agent_infos".
            prefix: string prepended to every tabular log key.
            log: when False, suppress all logging output.

        Returns:
            dict of processed sample tensors; also carries the original
            ``paths`` list under the "paths" key.
        """
        baselines = []
        returns = []

        # Discounted return-to-go; consumed by both baseline fitting and
        # the GAE computation below.
        for path in paths:
            path["returns"] = special.discount_cumsum(
                path["rewards"], self.algo.discount)

        if log:
            logger.log("fitting baseline...")
        # BUG FIX: the original code called
        #     self.algo.baseline.fit_with_samples(paths, samples_data)
        # at this point, but `samples_data` is only constructed further
        # down, so that branch always raised NameError.  Baselines that
        # need samples_data are now fitted after it has been built; plain
        # fit() baselines keep the original fit-before-predict ordering.
        defer_fit = hasattr(self.algo.baseline, 'fit_with_samples')
        if not defer_fit:
            self.algo.baseline.fit(paths, log=log)
            if log:
                logger.log("fitted")

        # Per-path state-value predictions from the baseline.
        if hasattr(self.algo.baseline, "predict_n"):
            all_path_baselines = self.algo.baseline.predict_n(paths)
        else:
            all_path_baselines = [self.algo.baseline.predict(path)
                                  for path in paths]

        # Generalized Advantage Estimation:
        #   delta_t = r_t + gamma * V(s_{t+1}) - V(s_t),
        # discounted by gamma * lambda.  The appended 0 is the value of
        # the terminal state.
        for idx, path in enumerate(paths):
            path_baselines = np.append(all_path_baselines[idx], 0)
            deltas = path["rewards"] + \
                     self.algo.discount * path_baselines[1:] - \
                     path_baselines[:-1]
            path["advantages"] = special.discount_cumsum(
                deltas, self.algo.discount * self.algo.gae_lambda)
            baselines.append(path_baselines[:-1])
            returns.append(path["returns"])

        #ev = special.explained_variance_1d(
        #    np.concatenate(baselines),
        #    np.concatenate(returns)
        #)

        if not self.algo.policy.recurrent:
            # Feed-forward policy: flatten all paths into one batch.
            observations = tensor_utils.concat_tensor_list([path["observations"] for path in paths])
            actions = tensor_utils.concat_tensor_list([path["actions"] for path in paths])
            rewards = tensor_utils.concat_tensor_list([path["rewards"] for path in paths])
            returns = tensor_utils.concat_tensor_list([path["returns"] for path in paths])
            advantages = tensor_utils.concat_tensor_list([path["advantages"] for path in paths])
            env_infos = tensor_utils.concat_tensor_dict_list([path["env_infos"] for path in paths])
            agent_infos = tensor_utils.concat_tensor_dict_list([path["agent_infos"] for path in paths])

            if self.algo.center_adv:
                advantages = util.center_advantages(advantages)

            if self.algo.positive_adv:
                advantages = util.shift_advantages_to_positive(advantages)

            average_discounted_return = \
                np.mean([path["returns"][0] for path in paths])

            undiscounted_returns = [sum(path["rewards"]) for path in paths]

            #ent = np.mean(self.algo.policy.distribution.entropy(agent_infos))

            samples_data = dict(
                observations=observations,
                actions=actions,
                rewards=rewards,
                returns=returns,
                advantages=advantages,
                env_infos=env_infos,
                agent_infos=agent_infos,
                paths=paths,
            )
        else:
            # Recurrent policy: pad every path to a common length and
            # record which timesteps are real via the `valids` mask.
            max_path_length = max([len(path["advantages"]) for path in paths])

            # make all paths the same length (pad extra advantages with 0)
            obs = [path["observations"] for path in paths]
            obs = tensor_utils.pad_tensor_n(obs, max_path_length)

            if self.algo.center_adv:
                # Normalize advantages BEFORE padding so the pad zeros do
                # not skew the statistics.
                raw_adv = np.concatenate([path["advantages"] for path in paths])
                adv_mean = np.mean(raw_adv)
                adv_std = np.std(raw_adv) + 1e-8
                adv = [(path["advantages"] - adv_mean) / adv_std for path in paths]
            else:
                adv = [path["advantages"] for path in paths]

            adv = np.asarray([tensor_utils.pad_tensor(a, max_path_length) for a in adv])

            actions = [path["actions"] for path in paths]
            actions = tensor_utils.pad_tensor_n(actions, max_path_length)

            rewards = [path["rewards"] for path in paths]
            rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

            returns = [path["returns"] for path in paths]
            returns = tensor_utils.pad_tensor_n(returns, max_path_length)

            agent_infos = [path["agent_infos"] for path in paths]
            agent_infos = tensor_utils.stack_tensor_dict_list(
                [tensor_utils.pad_tensor_dict(p, max_path_length) for p in agent_infos]
            )

            env_infos = [path["env_infos"] for path in paths]
            env_infos = tensor_utils.stack_tensor_dict_list(
                [tensor_utils.pad_tensor_dict(p, max_path_length) for p in env_infos]
            )

            valids = [np.ones_like(path["returns"]) for path in paths]
            valids = tensor_utils.pad_tensor_n(valids, max_path_length)

            average_discounted_return = \
                np.mean([path["returns"][0] for path in paths])

            undiscounted_returns = [sum(path["rewards"]) for path in paths]

            # Entropy averaged over valid (non-padded) timesteps only.
            # Currently consumed only by the disabled log lines below.
            ent = np.sum(self.algo.policy.distribution.entropy(agent_infos) * valids) / np.sum(valids)

            samples_data = dict(
                observations=obs,
                actions=actions,
                advantages=adv,
                rewards=rewards,
                returns=returns,
                valids=valids,
                agent_infos=agent_infos,
                env_infos=env_infos,
                paths=paths,
            )

        # Deferred baseline fit (see BUG FIX note above): samples_data
        # now exists, so sample-based baselines can be trained.
        if defer_fit:
            self.algo.baseline.fit_with_samples(paths, samples_data)
            if log:
                logger.log("fitted")

        if log:
            #logger.record_tabular('Iteration', itr)
            #logger.record_tabular('AverageDiscountedReturn',
            #                      average_discounted_return)
            logger.record_tabular(prefix+'AverageReturn', np.mean(undiscounted_returns))
            #logger.record_tabular(prefix+'ExplainedVariance', ev)
            logger.record_tabular(prefix+'NumTrajs', len(paths))
            #logger.record_tabular(prefix+'Entropy', ent)
            #logger.record_tabular(prefix+'Perplexity', np.exp(ent))
            logger.record_tabular(prefix+'StdReturn', np.std(undiscounted_returns))
            logger.record_tabular(prefix+'MaxReturn', np.max(undiscounted_returns))
            logger.record_tabular(prefix+'MinReturn', np.min(undiscounted_returns))

        return samples_data
Exemple #20
0
    def process_samples(self, itr, paths):
        """Process rollouts for KL-bonus (VIME-style) exploration.

        Optionally normalizes the extrinsic rewards with running
        mean/std queues, adds the per-step KL divergence (``path['KL']``)
        scaled by ``self.eta`` as an intrinsic reward, then computes GAE
        advantages and discounted returns, fits the baseline and logs
        statistics.

        Args:
            itr: iteration index; on iteration 0 no KL bonus is applied
                (the dynamics model has not produced KL values yet).
            paths: list of rollout dicts with "observations", "actions",
                "rewards", "rewards_orig" (pre-bonus rewards used for
                return reporting), "env_infos", "agent_infos", and — for
                itr > 0 — a per-step "KL" array.

        Returns:
            dict of processed sample tensors for the optimizer.
        """
        if self.normalize_reward:
            # Update running reward mean/std queues.
            rewards_flat = np.hstack([path['rewards'] for path in paths])
            self._reward_mean.append(np.mean(rewards_flat))
            self._reward_std.append(np.std(rewards_flat))

            # Normalize rewards with the averaged running statistics.
            reward_mean = np.mean(np.asarray(self._reward_mean))
            reward_std = np.mean(np.asarray(self._reward_std))
            for path in paths:
                path['rewards'] = (
                    path['rewards'] - reward_mean) / (reward_std + 1e-8)

        if itr > 0:
            kls = [path['KL'] for path in paths]

            kls_flat = np.hstack(kls)

            logger.record_tabular('Expl_MeanKL', np.mean(kls_flat))
            logger.record_tabular('Expl_StdKL', np.std(kls_flat))
            logger.record_tabular('Expl_MinKL', np.min(kls_flat))
            logger.record_tabular('Expl_MaxKL', np.max(kls_flat))

            # Perform normalization of the intrinsic rewards.
            if self.use_kl_ratio:
                if self.use_kl_ratio_q:
                    # Update running median-KL queue and rescale the KLs
                    # by its mean.
                    self.kl_previous.append(np.median(kls_flat))
                    previous_mean_kl = np.mean(np.asarray(self.kl_previous))
                    kls = [kl / previous_mean_kl for kl in kls]

            # Add KL as intrinsic reward to the external reward.
            for path, kl in zip(paths, kls):
                path['rewards'] = path['rewards'] + self.eta * kl

            # Decay the exploration bonus coefficient.
            self.eta *= self.eta_discount

        else:
            # First iteration: no KL values exist yet, log zeros.
            logger.record_tabular('Expl_MeanKL', 0.)
            logger.record_tabular('Expl_StdKL', 0.)
            logger.record_tabular('Expl_MinKL', 0.)
            logger.record_tabular('Expl_MaxKL', 0.)

        baselines = []
        returns = []
        # GAE advantages from the bonus-augmented rewards; discounted
        # returns from the ORIGINAL rewards so reporting is unaffected by
        # the intrinsic bonus.
        for path in paths:
            path_baselines = np.append(self.baseline.predict(path), 0)
            deltas = path["rewards"] + \
                self.discount * path_baselines[1:] - \
                path_baselines[:-1]
            path["advantages"] = special.discount_cumsum(
                deltas, self.discount * self.gae_lambda)
            path["returns"] = special.discount_cumsum(
                path["rewards_orig"], self.discount)
            baselines.append(path_baselines[:-1])
            returns.append(path["returns"])

        if not self.policy.recurrent:
            # Feed-forward policy: flatten all paths into one batch.
            observations = tensor_utils.concat_tensor_list(
                [path["observations"] for path in paths])
            actions = tensor_utils.concat_tensor_list(
                [path["actions"] for path in paths])
            rewards = tensor_utils.concat_tensor_list(
                [path["rewards"] for path in paths])
            advantages = tensor_utils.concat_tensor_list(
                [path["advantages"] for path in paths])
            env_infos = tensor_utils.concat_tensor_dict_list(
                [path["env_infos"] for path in paths])
            agent_infos = tensor_utils.concat_tensor_dict_list(
                [path["agent_infos"] for path in paths])

            if self.center_adv:
                advantages = util.center_advantages(advantages)

            if self.positive_adv:
                advantages = util.shift_advantages_to_positive(advantages)

            average_discounted_return = \
                np.mean([path["returns"][0] for path in paths])

            undiscounted_returns = [
                sum(path["rewards_orig"]) for path in paths]

            ent = np.mean(self.policy.distribution.entropy(agent_infos))

            ev = special.explained_variance_1d(
                np.concatenate(baselines),
                np.concatenate(returns)
            )

            samples_data = dict(
                observations=observations,
                actions=actions,
                rewards=rewards,
                advantages=advantages,
                env_infos=env_infos,
                agent_infos=agent_infos,
                paths=paths,
            )
        else:
            # Recurrent policy: pad every path to a common length.
            max_path_length = max([len(path["advantages"]) for path in paths])

            # make all paths the same length (pad extra advantages with 0)
            obs = [path["observations"] for path in paths]
            obs = np.array(
                [tensor_utils.pad_tensor(ob, max_path_length) for ob in obs])

            if self.center_adv:
                # Normalize BEFORE padding so pad zeros do not skew the
                # statistics.
                raw_adv = np.concatenate(
                    [path["advantages"] for path in paths])
                adv_mean = np.mean(raw_adv)
                adv_std = np.std(raw_adv) + 1e-8
                adv = [
                    (path["advantages"] - adv_mean) / adv_std for path in paths]
            else:
                adv = [path["advantages"] for path in paths]

            adv = np.array(
                [tensor_utils.pad_tensor(a, max_path_length) for a in adv])

            actions = [path["actions"] for path in paths]
            actions = np.array(
                [tensor_utils.pad_tensor(a, max_path_length) for a in actions])

            rewards = [path["rewards"] for path in paths]
            rewards = np.array(
                [tensor_utils.pad_tensor(r, max_path_length) for r in rewards])

            agent_infos = [path["agent_infos"] for path in paths]
            agent_infos = tensor_utils.stack_tensor_dict_list(
                [tensor_utils.pad_tensor_dict(
                    p, max_path_length) for p in agent_infos]
            )

            env_infos = [path["env_infos"] for path in paths]
            env_infos = tensor_utils.stack_tensor_dict_list(
                [tensor_utils.pad_tensor_dict(
                    p, max_path_length) for p in env_infos]
            )

            # Mask marking real (non-padded) timesteps.
            valids = [np.ones_like(path["returns"]) for path in paths]
            valids = np.array(
                [tensor_utils.pad_tensor(v, max_path_length) for v in valids])

            average_discounted_return = \
                np.mean([path["returns"][0] for path in paths])

            # NOTE(review): this branch sums the bonus-augmented
            # "rewards", while the feed-forward branch uses
            # "rewards_orig" — confirm the asymmetry is intended.
            undiscounted_returns = [sum(path["rewards"]) for path in paths]

            ent = np.mean(self.policy.distribution.entropy(agent_infos))

            ev = special.explained_variance_1d(
                np.concatenate(baselines),
                np.concatenate(returns)
            )

            samples_data = dict(
                observations=obs,
                actions=actions,
                advantages=adv,
                rewards=rewards,
                valids=valids,
                agent_infos=agent_infos,
                env_infos=env_infos,
                paths=paths,
            )

        logger.log("fitting baseline...")
        self.baseline.fit(paths)
        logger.log("fitted")

        logger.record_tabular('Iteration', itr)
        logger.record_tabular('AverageDiscountedReturn',
                              average_discounted_return)
        logger.record_tabular('AverageReturn', np.mean(undiscounted_returns))
        logger.record_tabular('ExplainedVariance', ev)
        logger.record_tabular('NumTrajs', len(paths))
        logger.record_tabular('Entropy', ent)
        logger.record_tabular('Perplexity', np.exp(ent))
        logger.record_tabular('StdReturn', np.std(undiscounted_returns))
        logger.record_tabular('MaxReturn', np.max(undiscounted_returns))
        logger.record_tabular('MinReturn', np.min(undiscounted_returns))

        return samples_data