def process_samples(self, itr, paths):
        print("process paths in npo_snn_rewards")
        # let each bonus evaluator update its statistics (e.g. visitation counts); this must not modify the paths
        for b_eval in self.bonus_evaluator:
            logger.log("fitting bonus evaluator before processing...")
            b_eval.fit_before_process_samples(paths)
            logger.log("fitted")
        # save real undiscounted reward before changing them
        undiscounted_returns = [sum(path["rewards"]) for path in paths]
        logger.record_tabular('TrueAverageReturn', np.mean(undiscounted_returns))
        # logger.record_tabular('Episodic_reward',
        #                       undiscounted_returns)
        # print("paths_len", len(paths))
        for path in paths:
            path['true_rewards'] = list(path['rewards'])

        # If using a latent regressor (and possibly adding MI to the reward):
        if isinstance(self.latent_regressor, Latent_regressor):
            with logger.prefix(' Latent_regressor '):
                self.latent_regressor.fit(paths)

                if self.reward_regressor_mi:
                    for i, path in enumerate(paths):
                        path['logli_latent_regressor'] = self.latent_regressor.predict_log_likelihood(
                            [path], [path['agent_infos']['latents']])[0]  # per-step log-likelihood of this path's latent

                        path['rewards'] += self.reward_regressor_mi * path[
                            'logli_latent_regressor']  # the log-likelihood of the latent is the surrogate for the mutual information

        # for the extra bonus
        for b, b_eval in enumerate(self.bonus_evaluator):
            for i, path in enumerate(paths):
                bonuses = b_eval.predict(path)
                # if i == 0:
                #     print("path", path['actions'])
                # print("bonus", bonuses.shape)
                # print("reward", path['rewards'].shape)
                path['rewards'] += self.reward_coef_bonus[b] * bonuses

        real_samples = ext.extract_dict(
            BatchSampler.process_samples(self, itr, paths),
            # no need to process the hallucinated samples: the returns, advantages, etc. are the same
            "observations", "actions", "advantages", "env_infos", "agent_infos"
        )
        real_samples["importance_weights"] = np.ones_like(real_samples["advantages"])
        # print("real_samples", real_samples)
        return real_samples
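A minimal sketch of the reward shaping performed in process_samples above, using toy numpy arrays in place of rllab paths and bonus evaluators; the coefficient values and array contents are invented for illustration, only the way the shaped reward is composed mirrors the code:

import numpy as np

T = 5                                    # steps in one toy path
true_rewards = np.zeros(T)               # environment reward (kept in path['true_rewards'])
logli_latent = np.full(T, -1.2)          # log-likelihood of the latent from the regressor
bonuses = [np.full(T, 0.1)]              # per-step bonus from one bonus evaluator

reward_regressor_mi = 0.01               # weight of the MI surrogate term
reward_coef_bonus = [0.001]              # one coefficient per bonus evaluator

# shaped reward fed to the high-level update, composed as in process_samples:
shaped = true_rewards + reward_regressor_mi * logli_latent
for coef, bonus in zip(reward_coef_bonus, bonuses):
    shaped = shaped + coef * bonus
print(shaped)  # [-0.0119 -0.0119 -0.0119 -0.0119 -0.0119]

Only path['rewards'] is shaped; the original rewards are saved in path['true_rewards'] beforehand so the true average return can still be logged.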
Example #2
def process_path(paths, itr, low_sampler):
    paths_low = []
    for idx, path in enumerate(paths):
        obs_shape = path['env_infos']["full_path"]["observations"].shape[2]
        # print("obs_shape", path['env_infos']["full_path"]["observations"].shape)
        act_shape = path['env_infos']["full_path"]["actions"].shape[2]
        path_low = dict(
            observations=path['env_infos']["full_path"]["observations"].reshape([-1, obs_shape]),
            actions=path['env_infos']["full_path"]["actions"].reshape([-1, act_shape]),
            rewards=path['env_infos']["full_path"]["rewards"].reshape([-1]),
        )
        agent_info_low = dict()
        # print("obs_shape", path_low["observations"].shape)
        # print("act_shape", path_low["actions"].shape)
        # print("reward_shape", path_low["rewards"].shape)
        for key in path['env_infos']["full_path"]['agent_infos']:
            # print(key)
            new_shape = path['env_infos']["full_path"]["agent_infos"][key].shape[2]
            agent_info_low[key] = path['env_infos']["full_path"]['agent_infos'][key].reshape([-1, new_shape])
            # print(key, agent_info_low[key].shape)
        path_low["agent_infos"] = agent_info_low
        env_info_low = dict()
        for key in path['env_infos']["full_path"]['env_infos']:
            # print(key, path)
            if key == 'com':
                new_shape = path['env_infos']["full_path"]["env_infos"][key].shape[2]
                env_info_low[key] = path['env_infos']["full_path"]['env_infos'][key].reshape(
                    [-1, new_shape])
            else:
                env_info_low[key] = path['env_infos']["full_path"]['env_infos'][key].reshape(
                    [-1])
        path_low["env_infos"] = env_info_low

        paths_low.append(path_low)
    real_samples = ext.extract_dict(
        low_sampler.process_samples(itr, paths_low),
        # no need to process the hallucinated samples: the returns, advantages, etc. are the same
        "observations", "actions", "advantages", "env_infos", "agent_infos"
    )
    real_samples["importance_weights"] = np.ones_like(real_samples["advantages"])

    return real_samples
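A minimal sketch of the reshape convention that process_path assumes for the full_path arrays; the shapes below are invented for illustration:

import numpy as np

n_high_steps, time_steps_agg, obs_dim = 4, 10, 27    # hypothetical sizes
full_obs = np.zeros((n_high_steps, time_steps_agg, obs_dim))

# flatten [n_high_steps, time_steps_agg, dim] into one low-level sample per row, as above:
low_obs = full_obs.reshape([-1, full_obs.shape[2]])
assert low_obs.shape == (n_high_steps * time_steps_agg, obs_dim)

Per-step scalars such as the rewards (and the env_infos entries other than 'com') are flattened with reshape([-1]) instead.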
Example #3
    def train(self):
        self.start_worker()
        self.init_opt()
        # init_opt for low policy
        if self.train_low:
            self.init_opt_low()
        high_times = 0
        obs_concat = adv_concat = lat_concat = pro_concat = act_concat = np.array([])
        start_i = 0
        #for itr in range(self.current_itr, self.n_itr):
        for itr in range(start_i, self.n_itr):
            gc.collect()  # force freeing memory
            if self.transfer and itr == start_i:
                self.warm_start()

            with logger.prefix('itr #%d | ' % itr):
                if self.step_anneal:
                    self.anneal_step_num(itr)  # update the step length
                paths = self.sampler.obtain_samples(itr)
                self.discount = self.discount_high  # change discount every time we train high-level policy!
                samples_data = self.sampler.process_samples(itr, paths)
                self.log_diagnostics(paths)

                if self.train_high:  # train the high-level policy
                    if self.train_high_every and self.train_high_every != 1:
                        # train_high_every = number of low-level training iterations per high-level training iteration
                        if high_times < self.train_high_every:
                            if high_times == 0:  # initialize concat vars
                                obs_concat, act_concat, adv_concat = ext.extract(
                                    samples_data, "observations", "actions",
                                    "advantages")
                                pro_concat = samples_data["agent_infos"][
                                    'prob']
                                lat_concat = samples_data["agent_infos"][
                                    'latents']
                            else:
                                ## concatenate this iteration's samples with those accumulated so far:
                                obs_tmp, act_tmp, adv_tmp = ext.extract(
                                    samples_data, "observations", "actions",
                                    "advantages")
                                pro_tmp = samples_data["agent_infos"]['prob']
                                lat_tmp = samples_data["agent_infos"][
                                    'latents']
                                obs_concat = np.concatenate(
                                    (obs_concat, obs_tmp), axis=0)
                                act_concat = np.concatenate(
                                    (act_concat, act_tmp), axis=0)
                                adv_concat = np.concatenate(
                                    (adv_concat, adv_tmp), axis=0)
                                pro_concat = np.concatenate(
                                    (pro_concat, pro_tmp), axis=0)
                                lat_concat = np.concatenate(
                                    (lat_concat, lat_tmp), axis=0)

                            ## end of sample concatenation
                        if high_times == self.train_high_every:
                            high_times = 0
                            samples_data_concatenated = {
                                'observations': obs_concat,
                                'actions': act_concat,
                                'advantages': adv_concat,
                                'agent_infos': {
                                    'prob': pro_concat,
                                    'latents': lat_concat
                                }
                            }
                            print("training high policy")
                            self.optimize_policy(itr,
                                                 samples_data_concatenated)
                        high_times += 1
                    else:
                        self.optimize_policy(itr, samples_data)

                if not self.train_low:
                    pass  # not training low policy

                elif self.train_low_with_external:
                    print("training low policy with external rewards only")
                    paths_low = []
                    for idx, path in enumerate(paths):
                        last_low_step_num = len(
                            path["env_infos"]["full_path"]["rewards"][-1])

                        path_low = dict(
                            observations=np.concatenate(
                                path['env_infos']["full_path"]
                                ["observations"]),
                            actions=np.concatenate(
                                path['env_infos']["full_path"]["actions"]),
                            rewards=np.concatenate(
                                path['env_infos']["full_path"]["rewards"]),
                        )

                        # WR: trim the observation to the robot's own dimensions
                        path_low['observations'] = path_low[
                            'observations'][:, :self.low_policy.obs_robot_dim]
                        agent_info_low = dict()
                        for key in path['env_infos']["full_path"][
                                'agent_infos']:
                            agent_info_low[key] = np.concatenate(
                                path['env_infos']["full_path"]['agent_infos']
                                [key])
                        path_low["agent_infos"] = agent_info_low
                        env_info_low = dict()
                        for key in path['env_infos']["full_path"]['env_infos']:
                            # print(key, path)
                            env_info_low[key] = np.concatenate(
                                path['env_infos']["full_path"]["env_infos"]
                                [key])
                        path_low["env_infos"] = env_info_low

                        paths_low.append(path_low)
                    real_samples = ext.extract_dict(
                        self.low_sampler.process_samples(itr, paths_low),
                        # no need to process the hallucinated samples: the returns, advantages, etc. are the same
                        "observations",
                        "actions",
                        "advantages",
                        "env_infos",
                        "agent_infos")
                    real_samples["importance_weights"] = np.ones_like(
                        real_samples["advantages"])
                    self.optimize_policy_low(itr, real_samples)

                elif self.train_low_with_v_split:
                    print("training low policy with HAAR")
                    # self.discount = self.discount_low
                    paths_low = []
                    for idx, path in enumerate(paths):
                        last_low_step_num = len(
                            path["env_infos"]["full_path"]["rewards"][-1])
                        V_high = self.baseline.predict(path)
                        # Neglect gamma in the definition of the advantage (gamma is close to 1),
                        # making the expression essentially the difference in V. Using the precise
                        # definition of A yields very similar learning curves and does not affect
                        # the outcome of the experiments.
                        diff_V = np.diff(V_high) / self.env.time_steps_agg

                        for i in range(len(diff_V)):
                            # path["env_infos"]["full_path"]["rewards"][i] \
                            #     += np.ones(len(path["env_infos"]["full_path"]["rewards"][i]))*diff_V[i]
                            path["env_infos"]["full_path"]["rewards"][i] \
                                = np.ones(len(path["env_infos"]["full_path"]["rewards"][i])) * diff_V[i]

                        path_low = dict(
                            observations=np.concatenate(
                                path['env_infos']["full_path"]
                                ["observations"]),
                            actions=np.concatenate(
                                path['env_infos']["full_path"]["actions"]),
                            rewards=np.concatenate(
                                path['env_infos']["full_path"]["rewards"]),
                        )

                        # cancel the goal-reaching reward for the low level!
                        if np.sum(
                                path['env_infos']["full_path"]['env_infos']
                            ['inner_rew']) == 1:  # the episode was successful
                            # subtract the reward for reaching the goal (outer reward) from the last step
                            path_low['rewards'][
                                -1] -= self.env.wrapped_env.wrapped_env.goal_rew

                        # WR: trim the observation to the robot's own dimensions
                        path_low['observations'] = path_low[
                            'observations'][:, :self.low_policy.obs_robot_dim]
                        agent_info_low = dict()
                        for key in path['env_infos']["full_path"][
                                'agent_infos']:
                            agent_info_low[key] = np.concatenate(
                                path['env_infos']["full_path"]['agent_infos']
                                [key])
                        path_low["agent_infos"] = agent_info_low
                        env_info_low = dict()
                        for key in path['env_infos']["full_path"]['env_infos']:
                            # print(key, path)
                            env_info_low[key] = np.concatenate(
                                path['env_infos']["full_path"]["env_infos"]
                                [key])
                        path_low["env_infos"] = env_info_low

                        paths_low.append(path_low)
                    real_samples = ext.extract_dict(
                        self.low_sampler.process_samples(itr, paths_low),
                        # no need to process the hallucinated samples: the returns, advantages, etc. are the same
                        "observations",
                        "actions",
                        "advantages",
                        "env_infos",
                        "agent_infos")
                    real_samples["importance_weights"] = np.ones_like(
                        real_samples["advantages"])
                    self.optimize_policy_low(itr, real_samples)

                else:
                    print(
                        'ERROR! Unknown training mode. See batch_polopt.py for details.'
                    )

                logger.log("saving snapshot...")
                params = self.get_itr_snapshot(itr, samples_data)
                self.current_itr = itr + 1
                params["algo"] = self
                try:
                    params["time_steps_agg"] = self.env.time_steps_agg
                except AttributeError:  # the env has no time_steps_agg attribute
                    pass
                if self.store_paths:
                    params["paths"] = samples_data["paths"]
                logger.save_itr_params(itr, params)
                logger.log("saved")
                logger.dump_tabular(with_prefix=False)
                # to prevent memory leakage
                # info = psutil.virtual_memory()
                # print ('memory percent', info.percent)
                # if info.percent > 95:
                #     break

                if self.plot:
                    self.update_plot()
                    if self.pause_for_plot:
                        input("Plotting evaluation run: Press Enter to "
                              "continue...")

        self.shutdown_worker()
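A minimal sketch of the low-level reward construction used in the train_low_with_v_split branch above; the baseline values and skill length below are invented for illustration:

import numpy as np

time_steps_agg = 10                        # low-level steps per high-level step
V_high = np.array([1.0, 1.5, 2.3])         # baseline value at consecutive high-level states
diff_V = np.diff(V_high) / time_steps_agg  # gamma neglected, as noted in the comment above

# every low-level step inside high-level step i receives the same reward diff_V[i]:
low_rewards = [np.ones(time_steps_agg) * dv for dv in diff_V]
print(low_rewards[0][:3])  # [0.05 0.05 0.05]

On successful episodes the goal reward is additionally subtracted from the last low-level step, as done in the code above.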