Example #1
    def __init__(
            self,
            manager_optimizer=None,
            optimizer=None,
            snn_optimizer=None,
            optimizer_args=None,
            step_size=1e-6,
            latents=None,  # some sort of iterable of the actual latent vectors
            period=10,  # how often I choose a latent
            truncate_local_is_ratio=None,
            **kwargs):
        if optimizer is None:
            if optimizer_args is None:
                optimizer_args = dict(batch_size=None)
            optimizer = FirstOrderOptimizer(
                learning_rate=step_size,
                **optimizer_args)
        self.optimizer = optimizer
        self.manager_optimizer = manager_optimizer
        self.snn_optimizer = snn_optimizer
        self.step_size = step_size
        self.truncate_local_is_ratio = truncate_local_is_ratio
        super(PG_concurrent,
              self).__init__(**kwargs)  # not sure if this line is correct
        self.latents = latents
        self.period = period

        # todo: fix this sampler stuff
        self.sampler = HierBatchSampler(self, self.period)

        # diagonal Gaussian over the low-level (skill) policy's action space
        self.diagonal = DiagonalGaussian(
            self.policy.low_policy.action_space.flat_dim)
        self.debug_fns = []
    def __init__(self,
                 optimizer=None,
                 optimizer_args=None,
                 step_size=0.003,
                 num_latents=6,
                 latents=None,  # some sort of iterable of the actual latent vectors
                 period=10,  # how often I choose a latent
                 truncate_local_is_ratio=None,
                 epsilon=0.1,
                 train_pi_iters=10,
                 use_skill_dependent_baseline=False,
                 mlp_skill_dependent_baseline=False,
                 freeze_manager=False,
                 freeze_skills=False,
                 **kwargs):
        if optimizer is None:
            if optimizer_args is None:
                optimizer_args = dict(batch_size=None)
            optimizer = FirstOrderOptimizer(learning_rate=step_size, max_epochs=train_pi_iters, **optimizer_args)
        self.optimizer = optimizer
        self.step_size = step_size
        self.truncate_local_is_ratio = truncate_local_is_ratio
        self.epsilon = epsilon

        super(Concurrent_PPO, self).__init__(**kwargs)  # not sure if this line is correct
        self.num_latents = kwargs['policy'].latent_dim
        self.latents = latents
        self.period = period
        self.freeze_manager = freeze_manager
        self.freeze_skills = freeze_skills
        assert (not freeze_manager) or (not freeze_skills)

        # todo: fix this sampler stuff
        # import pdb; pdb.set_trace()
        self.sampler = HierBatchSampler(self, self.period)
        # self.sampler = BatchSampler(self)
        # diagonal Gaussian over the low-level (skill) policy's action space
        self.diagonal = DiagonalGaussian(self.policy.low_policy.action_space.flat_dim)
        self.debug_fns = []

        assert isinstance(self.policy, HierarchicalPolicy)
        if self.policy is not None:
            self.period = self.policy.period
        assert self.policy.period == self.period
        # self.old_policy = copy.deepcopy(self.policy)

        # skill dependent baseline
        self.use_skill_dependent_baseline = use_skill_dependent_baseline
        self.mlp_skill_dependent_baseline = mlp_skill_dependent_baseline
        if use_skill_dependent_baseline:
            curr_env = kwargs['env']
            skill_dependent_action_space = curr_env.action_space
            new_obs_space_no_bi = curr_env.observation_space.shape[0] + 1  # 1 for the t_remaining
            skill_dependent_obs_space_dim = (new_obs_space_no_bi * (self.num_latents + 1) + self.num_latents,)
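            # Illustrative arithmetic (hypothetical numbers, not from the original code):
            # with a 20-dim raw observation and num_latents = 6, new_obs_space_no_bi = 21
            # and the skill-dependent observation dim is 21 * (6 + 1) + 6 = 153.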
            skill_dependent_obs_space = Box(-1.0, 1.0, shape=skill_dependent_obs_space_dim)
            skill_dependent_env_spec = EnvSpec(skill_dependent_obs_space, skill_dependent_action_space)
            if self.mlp_skill_dependent_baseline:
                self.skill_dependent_baseline = GaussianMLPBaseline(env_spec=skill_dependent_env_spec)
            else:
                self.skill_dependent_baseline = LinearFeatureBaseline(env_spec=skill_dependent_env_spec)
    def __init__(
            self,
            optimizer=None,
            optimizer_args=None,
            step_size=1e-2,
            num_latents=6,
            latents=None,  # some sort of iterable of the actual latent vectors
            period=10,  # how often I choose a latent
            truncate_local_is_ratio=None,
            use_skill_dependent_baseline=False,
            **kwargs):
        Serializable.quick_init(self, locals())
        if optimizer is None:
            default_args = dict(batch_size=None, max_epochs=1)
            if optimizer_args is None:
                optimizer_args = default_args
            else:
                optimizer_args = dict(default_args, **optimizer_args)
            optimizer = FirstOrderOptimizer(learning_rate=step_size,
                                            **optimizer_args)
        self.optimizer = optimizer
        self.step_size = step_size
        self.truncate_local_is_ratio = truncate_local_is_ratio
        super(PG_concurrent_approx,
              self).__init__(**kwargs)  # not sure if this line is correct
        self.num_latents = kwargs['policy'].latent_dim
        self.latents = latents
        self.period = period

        # todo: fix this sampler stuff
        self.sampler = HierBatchSampler(self, self.period)

        # diagonal Gaussian over the low-level (skill) policy's action space
        self.diagonal = DiagonalGaussian(
            self.policy.low_policy.action_space.flat_dim)
        self.debug_fns = []

        assert isinstance(self.policy, HierarchicalPolicy)
        if self.policy is not None:
            self.period = self.policy.period
        assert self.policy.period == self.period

        self.trainable_manager = self.policy.trainable_manager

        # skill dependent baseline
        self.use_skill_dependent_baseline = use_skill_dependent_baseline
        if use_skill_dependent_baseline:
            curr_env = kwargs['env']
            skill_dependent_action_space = curr_env.action_space
            skill_dependent_obs_space_dim = (
                (curr_env.observation_space.shape[0] + 1) * self.num_latents, )
            skill_dependent_obs_space = Box(
                -1.0, 1.0, shape=skill_dependent_obs_space_dim)
            skill_dependent_env_spec = EnvSpec(skill_dependent_obs_space,
                                               skill_dependent_action_space)
            self.skill_dependent_baseline = LinearFeatureBaseline(
                env_spec=skill_dependent_env_spec)
    def __init__(
            self,
            optimizer=None,
            optimizer_args=None,
            step_size=0.0003,
            latents=None,  # some sort of iterable of the actual latent vectors
            average_period=10,  # average over all the periods
            truncate_local_is_ratio=None,
            epsilon=0.1,
            train_pi_iters=80,
            use_skill_dependent_baseline=False,
            **kwargs):
        if optimizer is None:
            if optimizer_args is None:
                optimizer_args = dict(batch_size=None)
            optimizer = FirstOrderOptimizer(learning_rate=step_size,
                                            max_epochs=train_pi_iters,
                                            **optimizer_args)
        self.optimizer = optimizer
        self.step_size = step_size
        self.truncate_local_is_ratio = truncate_local_is_ratio
        self.epsilon = epsilon

        super(Hippo,
              self).__init__(**kwargs)  # not sure if this line is correct
        self.num_latents = kwargs['policy'].latent_dim
        self.latents = latents
        self.average_period = average_period

        # import pdb; pdb.set_trace()
        self.sampler = BatchSampler(self)

        # diagonal Gaussian over the low-level (skill) policy's action space
        self.diagonal = DiagonalGaussian(
            self.policy.low_policy.action_space.flat_dim)
        self.debug_fns = []
        self.use_skill_dependent_baseline = use_skill_dependent_baseline

        assert isinstance(self.policy, HierarchicalPolicy)
        self.old_policy = copy.deepcopy(self.policy)
class GaussianMLPPolicy_snn_hier(StochasticPolicy, LasagnePowered, Serializable):  # also inherits from Parametrized
    @autoargs.arg('hidden_sizes', type=int, nargs='*',
                  help='list of sizes for the fully-connected hidden layers')
    @autoargs.arg('std_sizes', type=int, nargs='*',
                  help='list of sizes for the fully-connected layers for std; note '
                       'the semantics differ from above: here an empty '
                       'list means that std is independent of the input and the last size is ignored')
    @autoargs.arg('initial_std', type=float,
                  help='Initial std')
    @autoargs.arg('std_trainable', type=bool,
                  help='Is std trainable')
    @autoargs.arg('output_nl', type=str,
                  help='nonlinearity for the output layer')
    @autoargs.arg('nonlinearity', type=str,
                  help='nonlinearity used for each hidden layer, can be one '
                       'of tanh, sigmoid')
    @autoargs.arg('bn', type=bool,
                  help='whether to apply batch normalization to hidden layers')
    def __init__(
            self,
            env_spec,
            env,
            pkl_path=None,
            json_path=None,
            npz_path=None,
            trainable_snn=True,
            ##CF - latents units at the input
            latent_dim=3,  # we keep all these as the dim of the output of the other MLP and others that we will need!
            latent_name='categorical',
            bilinear_integration=False,  # again, needs to match!
            resample=False,  # this can change: frequency of resampling the latent?
            hidden_sizes_snn=(32, 32),
            hidden_sizes_selector=(10, 10),
            external_latent=False,
            learn_std=True,
            init_std=1.0,
            adaptive_std=False,
            std_share_network=False,
            std_hidden_sizes=(32, 32),
            std_hidden_nonlinearity=NL.tanh,
            hidden_nonlinearity=NL.tanh,
            output_nonlinearity=None,
            min_std=1e-4,
    ):
        self.latent_dim = latent_dim  ## could I avoid needing this self for the get_action?
        self.latent_name = latent_name
        self.bilinear_integration = bilinear_integration
        self.resample = resample
        self.min_std = min_std
        self.hidden_sizes_snn = hidden_sizes_snn
        self.hidden_sizes_selector = hidden_sizes_selector

        self.pre_fix_latent = np.array([])  # if this is not empty when using reset() it will use this latent
        self.latent_fix = np.array([])  # this will hold the latents variable sampled in reset()
        self.shared_latent_var = theano.shared(self.latent_fix)  # this is for external lat! update that
        self._set_std_to_0 = False

        self.trainable_snn = trainable_snn
        self.external_latent = external_latent
        self.pkl_path = pkl_path
        self.json_path = json_path
        self.npz_path = npz_path
        self.old_policy = None

        if self.json_path:  # there is another one after defining all the NN to warm-start the params of the SNN
            data = json.load(
                open(os.path.join(config.PROJECT_PATH, self.json_path), 'r'))  # I should do this with the json file
            self.old_policy_json = data['json_args']["policy"]
            self.latent_dim = self.old_policy_json['latent_dim']
            self.latent_name = self.old_policy_json['latent_name']
            self.bilinear_integration = self.old_policy_json['bilinear_integration']
            self.resample = self.old_policy_json['resample']  # this might not be needed...
            self.min_std = self.old_policy_json['min_std']
            self.hidden_sizes_snn = self.old_policy_json['hidden_sizes']
        elif self.pkl_path:
            data = joblib.load(os.path.join(config.PROJECT_PATH, self.pkl_path))
            self.old_policy = data["policy"]
            self.latent_dim = self.old_policy.latent_dim
            self.latent_name = self.old_policy.latent_name
            self.bilinear_integration = self.old_policy.bilinear_integration
            self.resample = self.old_policy.resample  # this might not be needed...
            self.min_std = self.old_policy.min_std
            self.hidden_sizes_snn = self.old_policy.hidden_sizes

        if self.latent_name == 'normal':
            self.latent_dist = DiagonalGaussian(self.latent_dim)
            self.latent_dist_info = dict(mean=np.zeros(self.latent_dim), log_std=np.zeros(self.latent_dim))
        elif self.latent_name == 'bernoulli':
            self.latent_dist = Bernoulli(self.latent_dim)
            self.latent_dist_info = dict(p=0.5 * np.ones(self.latent_dim))
        elif self.latent_name == 'categorical':
            self.latent_dist = Categorical(self.latent_dim)
            if self.latent_dim > 0:
                self.latent_dist_info = dict(prob=1. / self.latent_dim * np.ones(self.latent_dim))
            else:
                self.latent_dist_info = dict(prob=np.ones(self.latent_dim))  # this is an empty array
        else:
            raise NotImplementedError
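        # Illustrative example (assumed values): with latent_dim = 3 and the default
        # latent_name = 'categorical', latent_dist_info becomes
        # dict(prob=np.array([1/3., 1/3., 1/3.])), i.e. a uniform prior over the latents.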

        Serializable.quick_init(self, locals())
        assert isinstance(env_spec.action_space, Box)

        # retrieve dimensions and check consistency
        if isinstance(env, MazeEnv) or isinstance(env, GatherEnv):
            self.obs_robot_dim = env.robot_observation_space.flat_dim
            self.obs_maze_dim = env.maze_observation_space.flat_dim
        elif isinstance(env, NormalizedEnv):
            if isinstance(env.wrapped_env, MazeEnv) or isinstance(env.wrapped_env, GatherEnv):
                self.obs_robot_dim = env.wrapped_env.robot_observation_space.flat_dim
                self.obs_maze_dim = env.wrapped_env.maze_observation_space.flat_dim
            else:
                self.obs_robot_dim = env.wrapped_env.observation_space.flat_dim
                self.obs_maze_dim = 0
        else:
            self.obs_robot_dim = env.observation_space.flat_dim
            self.obs_maze_dim = 0
        # print("the dims of the env are(rob/maze): ", self.obs_robot_dim, self.obs_maze_dim)
        all_obs_dim = env_spec.observation_space.flat_dim
        assert all_obs_dim == self.obs_robot_dim + self.obs_maze_dim

        if self.external_latent:  # in case we want to fix the latent externally
            l_all_obs_var = L.InputLayer(shape=(None,) + (self.obs_robot_dim + self.obs_maze_dim,))
            all_obs_var = l_all_obs_var.input_var
            # l_selection = ConstOutputLayer(incoming=l_all_obs_var, output_var=self.shared_latent_var)
            l_selection = ParamLayer(incoming=l_all_obs_var, num_units=self.latent_dim, param=self.shared_latent_var,
                                     trainable=False) # Rui: change False to True? this is a simple layer that directly outputs self.shared_latent_var
            selection_var = L.get_output(l_selection)

        else:
            # create network with softmax output: it will be the latent 'selector'!
            latent_selection_network = MLP(
                input_shape=(self.obs_robot_dim + self.obs_maze_dim,),
                output_dim=self.latent_dim,
                hidden_sizes=self.hidden_sizes_selector,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=NL.softmax,
            )
            l_all_obs_var = latent_selection_network.input_layer
            all_obs_var = latent_selection_network.input_layer.input_var

            # collect the output to select the behavior of the robot controller (equivalent to latents)
            l_selection = latent_selection_network.output_layer
            selection_var = L.get_output(l_selection)

        # split all_obs into the robot and the maze obs --> ROBOT goes first!!
        l_obs_robot = CropLayer(l_all_obs_var, start_index=None, end_index=self.obs_robot_dim)
        l_obs_maze = CropLayer(l_all_obs_var, start_index=self.obs_robot_dim, end_index=None)
        # for _ in range(10):
        #     print("OK!")
        # print(self.obs_robot_dim)
        # print(self.obs_maze_dim)

        obs_robot_var = all_obs_var[:, :self.obs_robot_dim]
        obs_maze_var = all_obs_var[:, self.obs_robot_dim:]

        # Enlarge obs with the selectors (or latents). Here just computing the final input dim
        if self.bilinear_integration:
            l_obs_snn = BilinearIntegrationLayer([l_obs_robot, l_selection])
        else:
            l_obs_snn = L.ConcatLayer([l_obs_robot, l_selection])
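        # Resulting SNN input width (derived from the branches above): plain concatenation
        # gives obs_robot_dim + latent_dim units, while the bilinear integration is assumed
        # (consistent with the obs_dim formula in the restorable policy below) to also append
        # the outer product, giving obs_robot_dim + latent_dim + obs_robot_dim * latent_dim units.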

        action_dim = env_spec.action_space.flat_dim

        # create the action network
        mean_network = MLP(
            input_layer=l_obs_snn,  # input the layer that handles the integration of the selector
            output_dim=action_dim,
            hidden_sizes=self.hidden_sizes_snn,
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=output_nonlinearity,
            name="meanMLP",
        )

        self._layers_mean = mean_network.layers
        l_mean = mean_network.output_layer

        if adaptive_std:
            log_std_network = MLP(
                input_layer=l_obs_snn,
                output_dim=action_dim,
                hidden_sizes=std_hidden_sizes,
                hidden_nonlinearity=std_hidden_nonlinearity,
                output_nonlinearity=None,
                name="log_stdMLP"
            )
            l_log_std = log_std_network.output_layer
            self._layers_log_std = log_std_network.layers
        else:
            l_log_std = ParamLayer(
                incoming=mean_network.input_layer,
                num_units=action_dim,
                param=lasagne.init.Constant(np.log(init_std)),
                name="output_log_std",
                trainable=learn_std,
            )
            self._layers_log_std = [l_log_std]

        self._layers_snn = self._layers_mean + self._layers_log_std  # this returns a list with the "snn" layers

        if not self.trainable_snn:
            for layer in self._layers_snn:
                for param, tags in layer.params.items():  # params of layer are OrDict: key=the shared var, val=tags
                    tags.remove("trainable")

        if self.json_path and self.npz_path:
            warm_params_dict = dict(np.load(os.path.join(config.PROJECT_PATH, self.npz_path)))
            # keys = list(param_dict.keys())
            self.set_params_snn(warm_params_dict)
        elif self.pkl_path:
            data = joblib.load(os.path.join(config.PROJECT_PATH, self.pkl_path))
            warm_params = data['policy'].get_params_internal()
            self.set_params_snn(warm_params)

        mean_var, log_std_var = L.get_output([l_mean, l_log_std])

        if self.min_std is not None:
            log_std_var = TT.maximum(log_std_var, np.log(self.min_std))

        self._l_mean = l_mean
        self._l_log_std = l_log_std

        self._dist = DiagonalGaussian(action_dim)

        LasagnePowered.__init__(self, [l_mean, l_log_std])
        super(GaussianMLPPolicy_snn_hier, self).__init__(env_spec)

        # debug
        obs_snn_var = L.get_output(l_obs_snn)
        self._l_obs_snn = ext.compile_function(
            inputs=[all_obs_var],
            outputs=obs_snn_var,
        )
        # self._log_std = ext.compile_function(
        #     inputs=[all_obs_var],
        #     outputs=log_std_var,
        # )
        self._mean = ext.compile_function(
            inputs=[all_obs_var],
            outputs=mean_var,
        )

        self._f_dist = ext.compile_function(
            inputs=[all_obs_var],
            outputs=[mean_var, log_std_var],
        )
        # if I want to monitor the selector output
        self._f_select = ext.compile_function(
            inputs=[all_obs_var],
            outputs=selection_var,
        )

    # # I shouldn't need the latent space anymore
    @property
    def latent_space(self):
        return Box(low=-np.inf, high=np.inf, shape=(1,))

    def get_params_snn(self):
        params = []
        for layer in self._layers_snn:
            params += layer.get_params()
        return params

    # another way will be to do as in parametrized.py and flatten_tensors (in numpy). But with this I check names
    def set_params_snn(self, snn_params):
        if isinstance(snn_params, dict):  # the snn_params are a dict with the param name as key and a numpy array as value
            params_value_by_name = snn_params
        elif isinstance(snn_params, list):  # the snn_params are a list of theano variables  **NOT CHECKING THIS!!**
            params_value_by_name = {}
            for param in snn_params:
                # print("old", param.name)
                params_value_by_name[param.name] = param.get_value()
        else:
            params_value_by_name = {}
            print("The snn_params was not understood!")

        local_params = self.get_params_snn()
        for param in local_params:
            # print("new", param.name)
            param.set_value(params_value_by_name[param.name])

    def dist_info_sym(self, obs_var, state_info_var=None):
        mean_var, log_std_var = L.get_output([self._l_mean, self._l_log_std], obs_var)
        if self.min_std is not None:
            log_std_var = TT.maximum(log_std_var, np.log(self.min_std))
        return dict(mean=mean_var, log_std=log_std_var)

    @overrides
    def get_action(self, observation):
        # print("obeservation", len(observation))
        # print("env", env)
        actions, outputs = self.get_actions([observation])
        return actions[0], {k: v[0] for k, v in outputs.items()}

    def get_actions(self, observations):

        selector_output = self._f_select(observations)
        # print("obeservation", len(observations))
        mean, log_std = self._f_dist(observations)

        if self._set_std_to_0:
            actions = mean
            log_std = -1e6 * np.ones_like(log_std)
        else:
            rnd = np.random.normal(size=mean.shape)
            actions = rnd * np.exp(log_std) + mean
        return actions, dict(mean=mean, log_std=log_std, latents=selector_output)

    def set_pre_fix_latent(self, latent):
        self.pre_fix_latent = np.array(latent)

    def unset_pre_fix_latent(self):
        self.pre_fix_latent = np.array([])

    @contextmanager
    def fix_latent(self, latent):
        self.pre_fix_latent = np.array(latent)
        yield
        self.pre_fix_latent = np.array([])

    @contextmanager
    def set_std_to_0(self):
        self._set_std_to_0 = True
        yield
        self._set_std_to_0 = False

    @overrides
    def reset(self):  # executed at the start of every rollout. Will fix the latent if needed.
        if not self.resample:
            if self.pre_fix_latent.size > 0:
                self.latent_fix = self.pre_fix_latent
            else:
                self.latent_fix = self.latent_dist.sample(self.latent_dist_info)
        else:
            pass
        # this is needed for the external latent!!
        self.shared_latent_var.set_value(np.array(self.latent_fix))

    def log_diagnostics(self, paths):
        log_stds = np.vstack([path["agent_infos"]["log_std"] for path in paths])
        logger.record_tabular('MaxPolicyStd', np.max(np.exp(log_stds)))
        logger.record_tabular('MinPolicyStd', np.min(np.exp(log_stds)))
        logger.record_tabular('AveragePolicyStd', np.mean(np.exp(log_stds)))

    @property
    def distribution(self):
        """
        The action distribution is a diagonal Gaussian over the low-level action space; the latents are treated
        as fixed once they have been sampled (see log_likelihood below).
        """
        return self._dist

    def log_likelihood(self, actions, agent_infos, action_only=True):
        # First compute the log-likelihood of the action. This assumes the latents are fixed to whatever was sampled,
        # and hence we only need the mean and log_std, not any information about the latents.
        logli = self._dist.log_likelihood(actions, agent_infos)
        if not action_only:
            raise NotImplementedError
            #   if not action_only:
            #       for idx, latent_name in enumerate(self._latent_distributions):
            #           latent_var = dist_info["latent_%d" % idx]
            #           prefix = "latent_%d_" % idx
            #           latent_dist_info = {k[len(prefix):]: v for k, v in dist_info.iteritems() if k.startswith(
            #               prefix)}
            #           logli += latent_name.log_likelihood(latent_var, latent_dist_info)
        return logli
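
A minimal usage sketch for the policy above (illustrative only: the environment objects, the constructor arguments, and the one-hot latent are assumptions, not part of the original example):

# Hypothetical usage; assumes `env` / `env_spec` already exist and that the latent is fixed externally.
policy = GaussianMLPPolicy_snn_hier(env_spec=env_spec, env=env,
                                    latent_dim=3, external_latent=True)

obs = env.reset()   # assumed to be a flat observation vector
policy.reset()      # samples (or fixes) the latent for this rollout
action, agent_info = policy.get_action(obs)  # agent_info contains 'mean', 'log_std', 'latents'

# Deterministic evaluation with a hand-picked latent; the fixed latent only reaches the
# network through shared_latent_var, i.e. when external_latent=True.
with policy.fix_latent([1, 0, 0]), policy.set_std_to_0():
    policy.reset()
    action, _ = policy.get_action(obs)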
Example #6
class PG_concurrent(BatchPolopt):
    """
    Designed to enable concurrent training of a SNN that parameterizes skills
    and also train the manager at the same time

    Note that, if I'm not trying to do the sample approximation of the weird log of sum term,
    I don't need to know which skill was picked, just need to know the action
    """

    # double check this constructor later
    def __init__(
            self,
            manager_optimizer=None,
            optimizer=None,
            snn_optimizer=None,
            optimizer_args=None,
            step_size=1e-6,
            latents=None,  # some sort of iterable of the actual latent vectors
            period=10,  # how often I choose a latent
            truncate_local_is_ratio=None,
            **kwargs):
        if optimizer is None:
            if optimizer_args is None:
                optimizer_args = dict(batch_size=None)
            optimizer = FirstOrderOptimizer(
                learning_rate=step_size,
                **optimizer_args)
        self.optimizer = optimizer
        self.manager_optimizer = manager_optimizer
        self.snn_optimizer = snn_optimizer
        self.step_size = step_size
        self.truncate_local_is_ratio = truncate_local_is_ratio
        super(PG_concurrent,
              self).__init__(**kwargs)  # not sure if this line is correct
        self.latents = latents
        self.period = period

        # todo: fix this sampler stuff
        self.sampler = HierBatchSampler(self, self.period)

        # diagonal Gaussian over the low-level (skill) policy's action space
        self.diagonal = DiagonalGaussian(
            self.policy.low_policy.action_space.flat_dim)
        self.debug_fns = []

    # initialize the computation graph
    # optimize is run on >= 1 trajectory at a time

    def init_opt(self):
        # obs_var_raw = self.env.observation_space.new_tensor_variable(
        #     'obs',
        #     extra_dims=1,
        # )

        obs_var_raw = ext.new_tensor(
            'obs', ndim=3, dtype=theano.config.floatX)  # todo: check the dtype

        action_var = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1,
        )

        # this will have to be the advantage every self.period timesteps
        advantage_var = ext.new_tensor('advantage',
                                       ndim=1,
                                       dtype=theano.config.floatX)

        obs_var_sparse = ext.new_tensor(
            'sparse_obs',
            ndim=2,
            dtype=theano.config.floatX
        )  # todo: check this with carlos, refer to discrete.py in rllab.spaces

        assert isinstance(self.policy, HierarchicalPolicy)

        # todo: assumptions: 1 trajectory, which is a multiple of p; that the obs_var_probs is valid

        # undoing the reshape, so that batch sampling is ok
        obs_var = TT.reshape(obs_var_raw, [
            obs_var_raw.shape[0] * obs_var_raw.shape[1], obs_var_raw.shape[2]
        ])
        # obs_var = obs_var_raw

        # i, j should contain the probability of latent j at time step self.period*i
        # should be a len(obs)//self.period by len(self.latent) tensor
        latent_probs = self.policy.manager.dist_info_sym(
            obs_var_sparse)['prob']

        # get the distribution parameters
        # dist_info_vars = []
        # for latent in self.latents:
        #     self.policy.low_policy.set_latent_train(latent)
        #     dist_info_vars.append(self.policy.low_policy.dist_info_sym(obs_var))
        # hopefully the above line takes multiple samples, and state_info_vars not needed as input

        dist_info_vars = self.policy.low_policy.dist_info_sym_all_latents(
            obs_var)
        probs = [
            TT.exp(self.diagonal.log_likelihood_sym(action_var, dist_info))
            for dist_info in dist_info_vars
        ]

        # need to reshape at the end
        reshaped_probs = [
            TT.reshape(prob, [obs_var.shape[0] // self.period, self.period])
            for prob in probs
        ]

        # now, multiply out each row and concatenate
        subtrajectory_probs = TT.stack([
            TT.prod(reshaped_prob, axis=1) for reshaped_prob in reshaped_probs
        ],
                                       axis=1)
        # shape error might come out of here

        # elementwise multiplication, then sum up each individual row and take log
        likelihood = TT.log(TT.sum(subtrajectory_probs * latent_probs, axis=1))

        surr_loss = -TT.mean(likelihood * advantage_var)
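        # In symbols, for each length-`period` sub-trajectory starting at s_0:
        #   L = log( sum_j  pi_manager(j | s_0) * prod_t pi_skill(a_t | s_t, j) )
        # and the surrogate loss is -mean(L * A), where A is the advantage assigned to the
        # whole sub-trajectory. subtrajectory_probs[i, j] holds the product term and
        # latent_probs[i, j] the manager probability of latent j for sub-trajectory i.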

        input_list = [obs_var_raw, obs_var_sparse, action_var, advantage_var]
        # npo has state_info_vars and old_dist_info_vars, I don't think I need them until I go for NPO/TRPO

        self.optimizer.update_opt(loss=surr_loss,
                                  target=self.policy,
                                  inputs=input_list)
        return dict()

    #do the optimization
    def optimize_policy(self, itr, samples_data):
        assert len(samples_data['observations']) % self.period == 0  # the batch must consist of whole periods

        # note that I have to do extra preprocessing to the advantages, and also create obs_var_sparse

        input_values = tuple(
            ext.extract(samples_data, "observations", "actions", "advantages"))
        # print(input_values[0].shape)

        obs_raw = input_values[0].reshape(
            input_values[0].shape[0] // self.period, self.period,
            input_values[0].shape[1])
        # obs_raw = input_values[0]

        obs_sparse = input_values[0].take(
            [i for i in range(0, input_values[0].shape[0], self.period)],
            axis=0)
        advantage_sparse = np.sum(input_values[2].reshape(
            [input_values[2].shape[0] // self.period, self.period]),
                                  axis=1)
        all_input_values = (obs_raw, obs_sparse, input_values[1],
                            advantage_sparse)
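        # Shape bookkeeping (illustrative numbers): with 20 collected steps and period = 10,
        # obs_raw has shape (2, 10, obs_dim), obs_sparse keeps the observations at steps
        # 0 and 10 (shape (2, obs_dim)), and advantage_sparse sums the per-step advantages
        # within each 10-step block (shape (2,)).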

        loss_before = self.optimizer.loss(all_input_values)
        self.optimizer.optimize(all_input_values)
        loss_after = self.optimizer.loss(all_input_values)
        logger.record_tabular('LossBefore', loss_before)
        logger.record_tabular('LossAfter', loss_after)
        logger.record_tabular('dLoss', loss_before - loss_after)
        return dict()

    def optimize_manager(self, itr, samples_data):
        pass

    def optimize_snn(self, itr, samples_data):
        pass

    def get_itr_snapshot(self, itr, samples_data):
        return dict(itr=itr,
                    policy=self.policy,
                    baseline=self.baseline,
                    env=self.env)

    def log_diagnostics(self, paths):
        #paths obtained by self.sampler.obtain_samples
        BatchPolopt.log_diagnostics(self, paths)
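
A rough sketch of how an algorithm class like this could be wired into an rllab-style experiment. The environment and hierarchical-policy constructors below are placeholders (their exact signatures are assumptions), and the batch settings are arbitrary:

# Hypothetical experiment wiring, assuming the usual rllab BatchPolopt interface.
env = normalize(SomeGatherOrMazeEnv())                    # placeholder environment
policy = HierarchicalPolicy(env_spec=env.spec, env=env)   # placeholder constructor args
baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = PG_concurrent(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=10000,
    max_path_length=500,
    n_itr=100,
    step_size=1e-6,
    period=10,
)
algo.train()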
Example #7
class Hippo(BatchPolopt):
    def __init__(
            self,
            optimizer=None,
            optimizer_args=None,
            step_size=0.0003,
            latents=None,  # some sort of iterable of the actual latent vectors
            average_period=10,  # average over all the periods
            truncate_local_is_ratio=None,
            epsilon=0.1,
            train_pi_iters=80,
            use_skill_dependent_baseline=False,
            mlp_skill_dependent_baseline=False,
            **kwargs):
        if optimizer is None:
            if optimizer_args is None:
                optimizer_args = dict(batch_size=None)
            optimizer = FirstOrderOptimizer(learning_rate=step_size,
                                            max_epochs=train_pi_iters,
                                            **optimizer_args)
        self.optimizer = optimizer
        self.step_size = step_size
        self.truncate_local_is_ratio = truncate_local_is_ratio
        self.epsilon = epsilon

        super(Hippo,
              self).__init__(**kwargs)  # not sure if this line is correct
        self.num_latents = kwargs['policy'].latent_dim
        self.latents = latents
        self.average_period = average_period

        # import pdb; pdb.set_trace()
        # self.sampler = BatchSampler(self)
        self.sampler = HierBatchSampler(self, period=None)

        # diagonal Gaussian over the low-level (skill) policy's action space
        self.diagonal = DiagonalGaussian(
            self.policy.low_policy.action_space.flat_dim)
        self.debug_fns = []

        assert isinstance(self.policy, HierarchicalPolicyRandomTime)
        # self.old_policy = copy.deepcopy(self.policy)

        # skill dependent baseline
        self.use_skill_dependent_baseline = use_skill_dependent_baseline
        self.mlp_skill_dependent_baseline = mlp_skill_dependent_baseline
        if use_skill_dependent_baseline:
            curr_env = kwargs['env']
            skill_dependent_action_space = curr_env.action_space
            new_obs_space_no_bi = curr_env.observation_space.shape[
                0] + 1  # 1 for the t_remaining
            skill_dependent_obs_space_dim = (new_obs_space_no_bi *
                                             (self.num_latents + 1) +
                                             self.num_latents, )
            skill_dependent_obs_space = Box(
                -1.0, 1.0, shape=skill_dependent_obs_space_dim)
            skill_dependent_env_spec = EnvSpec(skill_dependent_obs_space,
                                               skill_dependent_action_space)
            if self.mlp_skill_dependent_baseline:
                self.skill_dependent_baseline = GaussianMLPBaseline(
                    env_spec=skill_dependent_env_spec)
            else:
                self.skill_dependent_baseline = LinearFeatureBaseline(
                    env_spec=skill_dependent_env_spec)

    def init_opt(self):
        obs_var = ext.new_tensor(
            'obs', ndim=2, dtype=theano.config.floatX)  # todo: check the dtype

        manager_obs_var = ext.new_tensor('manager_obs',
                                         ndim=2,
                                         dtype=theano.config.floatX)

        action_var = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1,
        )

        # this will have to be the advantage every time the manager makes a decision
        manager_advantage_var = ext.new_tensor('manager_advantage',
                                               ndim=1,
                                               dtype=theano.config.floatX)

        skill_advantage_var = ext.new_tensor('skill_advantage',
                                             ndim=1,
                                             dtype=theano.config.floatX)

        latent_var_sparse = ext.new_tensor('sparse_latent',
                                           ndim=2,
                                           dtype=theano.config.floatX)

        latent_var = ext.new_tensor('latents',
                                    ndim=2,
                                    dtype=theano.config.floatX)

        mean_var = ext.new_tensor('mean', ndim=2, dtype=theano.config.floatX)

        log_std_var = ext.new_tensor('log_std',
                                     ndim=2,
                                     dtype=theano.config.floatX)

        manager_prob_var = ext.new_tensor('manager_prob',
                                          ndim=2,
                                          dtype=theano.config.floatX)

        assert isinstance(self.policy, HierarchicalPolicy)

        #############################################################
        ### calculating the manager portion of the surrogate loss ###
        #############################################################

        # i, j should contain the probability of latent j at time step self.period*i
        # should be a len(obs)//self.period by len(self.latent) tensor
        latent_probs = self.policy.manager.dist_info_sym(
            manager_obs_var)['prob']
        # old_latent_probs = self.old_policy.manager.dist_info_sym(manager_obs_var)['prob']

        actual_latent_probs = TT.sum(latent_probs * latent_var_sparse, axis=1)
        old_actual_latent_probs = TT.sum(manager_prob_var * latent_var_sparse,
                                         axis=1)
        lr = TT.exp(
            TT.log(actual_latent_probs) - TT.log(old_actual_latent_probs))
        manager_surr_loss_vector = TT.minimum(
            lr * manager_advantage_var,
            TT.clip(lr, 1 - self.epsilon, 1 + self.epsilon) *
            manager_advantage_var)
        manager_surr_loss = -TT.mean(manager_surr_loss_vector)

        ############################################################
        ### calculating the skills portion of the surrogate loss ###
        ############################################################

        dist_info_var = self.policy.low_policy.dist_info_sym(
            obs_var, state_info_var=latent_var)
        old_dist_info_var = dict(mean=mean_var, log_std=log_std_var)
        skill_lr = self.diagonal.likelihood_ratio_sym(action_var,
                                                      old_dist_info_var,
                                                      dist_info_var)

        skill_surr_loss_vector = TT.minimum(
            skill_lr * skill_advantage_var,
            TT.clip(skill_lr, 1 - self.epsilon, 1 + self.epsilon) *
            skill_advantage_var)
        skill_surr_loss = -TT.mean(skill_surr_loss_vector)

        surr_loss = manager_surr_loss / self.average_period + skill_surr_loss
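        # The combined objective is a PPO-style clipped surrogate at both levels:
        #   L = L_manager / average_period + L_skill,
        # where each term is mean(min(r * A, clip(r, 1 - eps, 1 + eps) * A)) with its own
        # importance ratio r and advantage A; the manager term is scaled down because the
        # manager only acts roughly once every `average_period` low-level steps.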

        input_list = [
            obs_var, manager_obs_var, action_var, manager_advantage_var,
            skill_advantage_var, latent_var, latent_var_sparse, mean_var,
            log_std_var, manager_prob_var
        ]

        self.optimizer.update_opt(loss=surr_loss,
                                  target=self.policy,
                                  inputs=input_list)
        return dict()

    # do the optimization
    def optimize_policy(self, itr, samples_data):
        # print(len(samples_data['observations']), self.period)
        # assert len(samples_data['observations']) % self.period == 0

        # note that I have to do extra preprocessing to the advantages, and also create obs_var_sparse
        if self.use_skill_dependent_baseline:
            input_values = tuple(
                ext.extract(samples_data, "observations", "actions",
                            "advantages", "agent_infos", "skill_advantages"))
        else:
            input_values = tuple(
                ext.extract(samples_data, "observations", "actions",
                            "advantages", "agent_infos"))

        time_remaining = input_values[3]['time_remaining']
        resampled_period = input_values[3]['resampled_period']
        obs_var = np.insert(input_values[0],
                            self.policy.obs_robot_dim,
                            time_remaining,
                            axis=1)
        manager_obs_var = obs_var[resampled_period]
        action_var = input_values[1]
        manager_adv_var = input_values[2][resampled_period]

        latent_var = input_values[3]['latents']
        latent_var_sparse = latent_var[resampled_period]
        mean = input_values[3]['mean']
        log_std = input_values[3]['log_std']
        prob = input_values[3]['prob'][resampled_period]
        if self.use_skill_dependent_baseline:
            skill_adv_var = input_values[4]
            all_input_values = (obs_var, manager_obs_var, action_var,
                                manager_adv_var, skill_adv_var, latent_var,
                                latent_var_sparse, mean, log_std, prob)
        else:
            skill_adv_var = input_values[2]
            all_input_values = (obs_var, manager_obs_var, action_var,
                                manager_adv_var, skill_adv_var, latent_var,
                                latent_var_sparse, mean, log_std, prob)

        # todo: assign current parameters to old policy; does this work?
        # old_param_values = self.policy.get_param_values()
        # self.old_policy.set_param_values(old_param_values)
        loss_before = self.optimizer.loss(all_input_values)
        self.optimizer.optimize(all_input_values)
        loss_after = self.optimizer.loss(all_input_values)
        logger.record_tabular('LossBefore', loss_before)
        logger.record_tabular('LossAfter', loss_after)
        logger.record_tabular('dLoss', loss_before - loss_after)
        return dict()

    def get_itr_snapshot(self, itr, samples_data):
        return dict(itr=itr,
                    policy=self.policy,
                    baseline=self.baseline,
                    env=self.env)

    def log_diagnostics(self, paths):
        # paths obtained by self.sampler.obtain_samples
        BatchPolopt.log_diagnostics(self, paths)
Example #8
    def __init__(
            self,
            env_spec,
            mean_hidden_nonlinearity=tf.nn.relu,
            mean_hidden_sizes=(32, 32),
            std_hidden_nonlinearity=tf.nn.relu,
            std_hidden_sizes=(32, 32),
            min_std=1e-6,
    ):
        """
        :param env_spec:
        :param mean_hidden_nonlinearity: nonlinearity used for the mean hidden
                                         layers
        :param mean_hidden_sizes: list of hidden_sizes for the fully-connected hidden layers
        :param std_hidden_nonlinearity: nonlinearity used for the std hidden
                                        layers
        :param std_hidden_sizes: list of hidden_sizes for the fully-connected hidden layers
        :param min_std: minimum value enforced for the std, to avoid numerical
                        issues
        :return:
        """
        Serializable.quick_init(self, locals())
        assert isinstance(env_spec.action_space, Box)
        super(GaussianMLPPolicy, self).__init__(env_spec)

        self.env_spec = env_spec
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.2)
        self.sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
        # Create network
        observation_dim = self.env_spec.observation_space.flat_dim
        self.observations_input = tf.placeholder(tf.float32,
                                                 shape=[None, observation_dim])
        action_dim = self.env_spec.action_space.flat_dim
        with tf.variable_scope('mean') as _:
            mlp_mean_output = tf_util.mlp(self.observations_input,
                                          observation_dim,
                                          mean_hidden_sizes,
                                          mean_hidden_nonlinearity)
            mlp_mean_output_size = mean_hidden_sizes[-1]
            self.mean = tf_util.linear(mlp_mean_output,
                                       mlp_mean_output_size,
                                       action_dim)

        with tf.variable_scope('log_std') as _:
            mlp_std_output = tf_util.mlp(self.observations_input,
                                         observation_dim,
                                         std_hidden_sizes,
                                         std_hidden_nonlinearity)
            mlp_std_output_size = std_hidden_sizes[-1]
            self.log_std = tf_util.linear(mlp_std_output,
                                          mlp_std_output_size,
                                          action_dim)
            self.std = tf.maximum(tf.exp(self.log_std), min_std)

        self._dist = DiagonalGaussian(action_dim)

        self.actions_output = tf.placeholder(tf.float32, shape=[None, action_dim])
        z = (self.actions_output - self.mean) / self.std
        # per-dimension log-density of a diagonal Gaussian: -log(std) - z^2/2 - log(2*pi)/2
        self.log_likelihood = (- tf.log(self.std)
                               - z**2 * 0.5
                               - tf.log(2 * np.pi) * 0.5)
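
A small sketch of how the tensors built above could be queried (illustrative; `policy` denotes an instance of this class, and the zero-filled batches are placeholders):

# Hypothetical query of the graph defined in __init__ (not part of the original example).
import numpy as np

obs_batch = np.zeros((5, policy.env_spec.observation_space.flat_dim), dtype=np.float32)
act_batch = np.zeros((5, policy.env_spec.action_space.flat_dim), dtype=np.float32)

mean, std = policy.sess.run(
    [policy.mean, policy.std],
    feed_dict={policy.observations_input: obs_batch})

# Per-dimension log-likelihood terms of the given actions; summing over the last axis
# gives the full diagonal-Gaussian log-likelihood of each action.
ll_terms = policy.sess.run(
    policy.log_likelihood,
    feed_dict={policy.observations_input: obs_batch,
               policy.actions_output: act_batch})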
Example #9
    def __init__(
        self,
        env_spec,
        env,
        latent_dim=2,
        latent_name='bernoulli',
        bilinear_integration=False,
        resample=False,
        hidden_sizes=(32, 32),
        learn_std=True,
        init_std=1.0,
        adaptive_std=False,
        std_share_network=False,
        std_hidden_sizes=(32, 32),
        std_hidden_nonlinearity=NL.tanh,
        hidden_nonlinearity=NL.tanh,
        output_nonlinearity=None,
        min_std=1e-4,
        pkl_path=None,
    ):
        """
        :param latent_dim: dimension of the latent variables
        :param latent_name: distribution of the latent variables
        :param bilinear_integration: Boolean indicator of bilinear integration or simple concatenation
        :param resample: Boolean indicator of resampling at every step or only at the start of the rollout (or whenever
        agent is reset, which can happen several times along the rollout with rollout in utils_snn)
        """
        self.latent_dim = latent_dim  ##could I avoid needing this self for the get_action?
        self.latent_name = latent_name
        self.bilinear_integration = bilinear_integration
        self.resample = resample
        self.min_std = min_std
        self.hidden_sizes = hidden_sizes

        self.pre_fix_latent = np.array([])  # if this is not empty when using reset() it will use this latent
        self.latent_fix = np.array([])  # this will hold the latent variable sampled in reset()
        self._set_std_to_0 = False

        self.pkl_path = pkl_path

        if self.pkl_path:
            data = joblib.load(os.path.join(config.PROJECT_PATH,
                                            self.pkl_path))
            self.old_policy = data["policy"]
            self.latent_dim = self.old_policy.latent_dim
            self.latent_name = self.old_policy.latent_name
            self.bilinear_integration = self.old_policy.bilinear_integration
            self.resample = self.old_policy.resample  # this might not be needed...
            self.min_std = self.old_policy.min_std
            self.hidden_sizes_snn = self.old_policy.hidden_sizes

        if latent_name == 'normal':
            self.latent_dist = DiagonalGaussian(self.latent_dim)
            self.latent_dist_info = dict(mean=np.zeros(self.latent_dim),
                                         log_std=np.zeros(self.latent_dim))
        elif latent_name == 'bernoulli':
            self.latent_dist = Bernoulli(self.latent_dim)
            self.latent_dist_info = dict(p=0.5 * np.ones(self.latent_dim))
        elif latent_name == 'categorical':
            self.latent_dist = Categorical(self.latent_dim)
            if self.latent_dim > 0:
                self.latent_dist_info = dict(prob=1. / self.latent_dim *
                                             np.ones(self.latent_dim))
            else:
                self.latent_dist_info = dict(prob=np.ones(self.latent_dim))
        else:
            raise NotImplementedError

        Serializable.quick_init(self, locals())
        assert isinstance(env_spec.action_space, Box)

        # retrieve dimensions from env!
        if isinstance(env, MazeEnv) or isinstance(env, GatherEnv):
            self.obs_robot_dim = env.robot_observation_space.flat_dim
            self.obs_maze_dim = env.maze_observation_space.flat_dim
        elif isinstance(env, NormalizedEnv):
            if isinstance(env.wrapped_env, MazeEnv) or isinstance(
                    env.wrapped_env, GatherEnv):
                self.obs_robot_dim = env.wrapped_env.robot_observation_space.flat_dim
                self.obs_maze_dim = env.wrapped_env.maze_observation_space.flat_dim
            else:
                self.obs_robot_dim = env.wrapped_env.observation_space.flat_dim
                self.obs_maze_dim = 0
        else:
            self.obs_robot_dim = env.observation_space.flat_dim
            self.obs_maze_dim = 0
        # print("the dims of the env are(rob/maze): ", self.obs_robot_dim, self.obs_maze_dim)
        all_obs_dim = env_spec.observation_space.flat_dim
        assert all_obs_dim == self.obs_robot_dim + self.obs_maze_dim

        if self.bilinear_integration:
            obs_dim = self.obs_robot_dim + self.latent_dim +\
                      self.obs_robot_dim * self.latent_dim
        else:
            obs_dim = self.obs_robot_dim + self.latent_dim  # here only if concat.
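        # Illustrative arithmetic (assumed numbers): with obs_robot_dim = 27 and latent_dim = 2,
        # concatenation gives obs_dim = 29, while bilinear integration gives
        # 27 + 2 + 27 * 2 = 83 input units.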

        action_dim = env_spec.action_space.flat_dim

        # for _ in range(10):
        #     print("OK!")
        # print(obs_dim)
        # print(env_spec.observation_space.flat_dim)
        # print(self.latent_dim)

        mean_network = MLP(
            input_shape=(obs_dim, ),
            output_dim=action_dim,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=output_nonlinearity,
            name="meanMLP",
        )

        self._layers_mean = mean_network.layers
        l_mean = mean_network.output_layer
        obs_var = mean_network.input_layer.input_var

        if adaptive_std:
            log_std_network = MLP(input_shape=(obs_dim, ),
                                  input_var=obs_var,
                                  output_dim=action_dim,
                                  hidden_sizes=std_hidden_sizes,
                                  hidden_nonlinearity=std_hidden_nonlinearity,
                                  output_nonlinearity=None,
                                  name="log_stdMLP")
            l_log_std = log_std_network.output_layer
            self._layers_log_std = log_std_network.layers
        else:
            l_log_std = ParamLayer(
                mean_network.input_layer,
                num_units=action_dim,
                param=lasagne.init.Constant(np.log(init_std)),
                name="output_log_std",
                trainable=learn_std,
            )
            self._layers_log_std = [l_log_std]

        self._layers_snn = self._layers_mean + self._layers_log_std  # this returns a list with the "snn" layers

        if self.pkl_path:  # restore from pkl file
            data = joblib.load(os.path.join(config.PROJECT_PATH,
                                            self.pkl_path))
            warm_params = data['policy'].get_params_internal()
            self.set_params_snn(warm_params)

        mean_var, log_std_var = L.get_output([l_mean, l_log_std])

        if self.min_std is not None:
            log_std_var = TT.maximum(log_std_var, np.log(self.min_std))

        self._l_mean = l_mean
        self._l_log_std = l_log_std

        self._dist = DiagonalGaussian(action_dim)

        LasagnePowered.__init__(self, [l_mean, l_log_std])
        super(GaussianMLPPolicy_snn_restorable, self).__init__(env_spec)

        self._f_dist = ext.compile_function(
            inputs=[obs_var],
            outputs=[mean_var, log_std_var],
        )
Example #10
class GaussianMLPPolicy_snn_restorable(StochasticPolicy, LasagnePowered,
                                       Serializable):
    """
    This stochastic policy allows picking the latent distribution (Categorical in the paper), its dimension, and
    how it is integrated with the observations.
    """
    @autoargs.arg('hidden_sizes',
                  type=int,
                  nargs='*',
                  help='list of sizes for the fully-connected hidden layers')
    @autoargs.arg(
        'std_sizes',
        type=int,
        nargs='*',
        help='list of sizes for the fully-connected layers for std; note '
        'the semantics differ from above: here an empty '
        'list means that std is independent of the input and the last size is ignored'
    )
    @autoargs.arg('initial_std', type=float, help='Initial std')
    @autoargs.arg('std_trainable', type=bool, help='Is std trainable')
    @autoargs.arg('output_nl',
                  type=str,
                  help='nonlinearity for the output layer')
    @autoargs.arg('nonlinearity',
                  type=str,
                  help='nonlinearity used for each hidden layer, can be one '
                  'of tanh, sigmoid')
    @autoargs.arg('bn',
                  type=bool,
                  help='whether to apply batch normalization to hidden layers')
    def __init__(
        self,
        env_spec,
        env,
        latent_dim=2,
        latent_name='bernoulli',
        bilinear_integration=False,
        resample=False,
        hidden_sizes=(32, 32),
        learn_std=True,
        init_std=1.0,
        adaptive_std=False,
        std_share_network=False,
        std_hidden_sizes=(32, 32),
        std_hidden_nonlinearity=NL.tanh,
        hidden_nonlinearity=NL.tanh,
        output_nonlinearity=None,
        min_std=1e-4,
        pkl_path=None,
    ):
        """
        :param latent_dim: dimension of the latent variables
        :param latent_name: distribution of the latent variables
        :param bilinear_integration: Boolean indicator of bilinear integration or simple concatenation
        :param resample: Boolean indicator of resampling at every step or only at the start of the rollout (or whenever
        agent is reset, which can happen several times along the rollout with rollout in utils_snn)
        """
        self.latent_dim = latent_dim  ##could I avoid needing this self for the get_action?
        self.latent_name = latent_name
        self.bilinear_integration = bilinear_integration
        self.resample = resample
        self.min_std = min_std
        self.hidden_sizes = hidden_sizes

        self.pre_fix_latent = np.array([])  # if this is not empty when using reset() it will use this latent
        self.latent_fix = np.array([])  # this will hold the latent variable sampled in reset()
        self._set_std_to_0 = False

        self.pkl_path = pkl_path

        if self.pkl_path:
            data = joblib.load(os.path.join(config.PROJECT_PATH,
                                            self.pkl_path))
            self.old_policy = data["policy"]
            self.latent_dim = self.old_policy.latent_dim
            self.latent_name = self.old_policy.latent_name
            self.bilinear_integration = self.old_policy.bilinear_integration
            self.resample = self.old_policy.resample  # this might not be needed...
            self.min_std = self.old_policy.min_std
            self.hidden_sizes_snn = self.old_policy.hidden_sizes

        if latent_name == 'normal':
            self.latent_dist = DiagonalGaussian(self.latent_dim)
            self.latent_dist_info = dict(mean=np.zeros(self.latent_dim),
                                         log_std=np.zeros(self.latent_dim))
        elif latent_name == 'bernoulli':
            self.latent_dist = Bernoulli(self.latent_dim)
            self.latent_dist_info = dict(p=0.5 * np.ones(self.latent_dim))
        elif latent_name == 'categorical':
            self.latent_dist = Categorical(self.latent_dim)
            if self.latent_dim > 0:
                self.latent_dist_info = dict(prob=1. / self.latent_dim *
                                             np.ones(self.latent_dim))
            else:
                self.latent_dist_info = dict(prob=np.ones(self.latent_dim))
        else:
            raise NotImplementedError

        Serializable.quick_init(self, locals())
        assert isinstance(env_spec.action_space, Box)

        # retrieve dimensions from env!
        if isinstance(env, MazeEnv) or isinstance(env, GatherEnv):
            self.obs_robot_dim = env.robot_observation_space.flat_dim
            self.obs_maze_dim = env.maze_observation_space.flat_dim
        elif isinstance(env, NormalizedEnv):
            if isinstance(env.wrapped_env, MazeEnv) or isinstance(
                    env.wrapped_env, GatherEnv):
                self.obs_robot_dim = env.wrapped_env.robot_observation_space.flat_dim
                self.obs_maze_dim = env.wrapped_env.maze_observation_space.flat_dim
            else:
                self.obs_robot_dim = env.wrapped_env.observation_space.flat_dim
                self.obs_maze_dim = 0
        else:
            self.obs_robot_dim = env.observation_space.flat_dim
            self.obs_maze_dim = 0
        # print("the dims of the env are(rob/maze): ", self.obs_robot_dim, self.obs_maze_dim)
        all_obs_dim = env_spec.observation_space.flat_dim
        assert all_obs_dim == self.obs_robot_dim + self.obs_maze_dim

        if self.bilinear_integration:
            obs_dim = self.obs_robot_dim + self.latent_dim +\
                      self.obs_robot_dim * self.latent_dim
        else:
            obs_dim = self.obs_robot_dim + self.latent_dim  # here only if concat.

        action_dim = env_spec.action_space.flat_dim

        # for _ in range(10):
        #     print("OK!")
        # print(obs_dim)
        # print(env_spec.observation_space.flat_dim)
        # print(self.latent_dim)

        mean_network = MLP(
            input_shape=(obs_dim, ),
            output_dim=action_dim,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=output_nonlinearity,
            name="meanMLP",
        )

        self._layers_mean = mean_network.layers
        l_mean = mean_network.output_layer
        obs_var = mean_network.input_layer.input_var

        if adaptive_std:
            log_std_network = MLP(input_shape=(obs_dim, ),
                                  input_var=obs_var,
                                  output_dim=action_dim,
                                  hidden_sizes=std_hidden_sizes,
                                  hidden_nonlinearity=std_hidden_nonlinearity,
                                  output_nonlinearity=None,
                                  name="log_stdMLP")
            l_log_std = log_std_network.output_layer
            self._layers_log_std = log_std_network.layers
        else:
            l_log_std = ParamLayer(
                mean_network.input_layer,
                num_units=action_dim,
                param=lasagne.init.Constant(np.log(init_std)),
                name="output_log_std",
                trainable=learn_std,
            )
            self._layers_log_std = [l_log_std]

        self._layers_snn = self._layers_mean + self._layers_log_std  # this returns a list with the "snn" layers

        if self.pkl_path:  # restore from pkl file
            data = joblib.load(os.path.join(config.PROJECT_PATH,
                                            self.pkl_path))
            warm_params = data['policy'].get_params_internal()
            self.set_params_snn(warm_params)

        mean_var, log_std_var = L.get_output([l_mean, l_log_std])

        if self.min_std is not None:
            log_std_var = TT.maximum(log_std_var, np.log(self.min_std))

        self._l_mean = l_mean
        self._l_log_std = l_log_std

        self._dist = DiagonalGaussian(action_dim)

        LasagnePowered.__init__(self, [l_mean, l_log_std])
        super(GaussianMLPPolicy_snn_restorable, self).__init__(env_spec)

        self._f_dist = ext.compile_function(
            inputs=[obs_var],
            outputs=[mean_var, log_std_var],
        )

# #  this is currently not used, although it could, in dist_info_sym and in get_actions. Also we could refactor all..
#         # this would actually be WRONG with the current obs_var definition
#         latent_var = Box(low=-np.inf, high=np.inf, shape=(1,)).new_tensor_variable('latents', extra_dims=1)
#
#         extended_obs_var = TT.concatenate([obs_var, latent_var,
#                                            TT.flatten(obs_var[:, :, np.newaxis] * latent_var[:, np.newaxis, :],
#                                                       outdim=2)]
#                                           , axis=1)
#         self._extended_obs_var = ext.compile_function(
#             inputs=[obs_var, latent_var],
#             outputs=[extended_obs_var]
#         )

    @property
    def latent_space(self):
        return Box(low=-np.inf, high=np.inf, shape=(1, ))

    # the mean and var now also depend on the particular latents sampled
    def dist_info_sym(self,
                      obs_var,
                      latent_var=None):  # this is meant to be for one path!
        # now this is not doing anything! And for computing the dist_info_vars of npo_snn_rewardMI it doesn't work
        # for _ in range(10):
        #     print("OK")
        # print(obs_var)
        # obs_var = [obs_var[i][:self.obs_robot_dim] for i in range(obs_var.shape[0])]  # trim the observations

        if latent_var is None:
            latent_var1 = theano.shared(
                np.expand_dims(self.latent_fix, axis=0)
            )  # new fix to avoid putting the latent as an input: just take the one fixed!
            latent_var = TT.tile(latent_var1, [obs_var.shape[0], 1])

        # generate the generalized input (append latents to obs.)
        if self.bilinear_integration:
            extended_obs_var = TT.concatenate([
                obs_var, latent_var,
                TT.flatten(
                    obs_var[:, :, np.newaxis] * latent_var[:, np.newaxis, :],
                    outdim=2)
            ],
                                              axis=1)
        else:
            extended_obs_var = TT.concatenate([obs_var, latent_var], axis=1)
        mean_var, log_std_var = L.get_output([self._l_mean, self._l_log_std],
                                             extended_obs_var)
        if self.min_std is not None:
            log_std_var = TT.maximum(log_std_var, np.log(self.min_std))
        return dict(mean=mean_var, log_std=log_std_var)

    @overrides
    def get_action(self, observation):
        actions, outputs = self.get_actions([observation])
        return actions[0], {k: v[0] for k, v in outputs.items()}

    def get_actions(self, observations):
        # observations: [ndarray]
        observations = [observations[0][:self.obs_robot_dim]]
        observations = np.array(
            observations)  # needed to do the outer product for the bilinear
        # print(observations)
        if self.latent_dim:
            if self.resample:
                latents = [
                    self.latent_dist.sample(self.latent_dist_info)
                    for _ in observations
                ]
                print('resampling the latents')
            else:
                if not np.size(self.latent_fix) == self.latent_dim:  # reset if nothing valid is stored in the fix
                    self.reset()
                if len(self.pre_fix_latent) == self.latent_dim:  # if we have a pre_fix, reset will set the latent to it
                    self.reset()  # this overwrites the latent sampled or stored in latent_fix
                latents = np.tile(self.latent_fix,
                                  [len(observations), 1])  # a broadcast operation might be better...
            if self.bilinear_integration:
                extended_obs = np.concatenate([
                    observations, latents,
                    np.reshape(
                        observations[:, :, np.newaxis] *
                        latents[:, np.newaxis, :], (observations.shape[0], -1))
                ],
                                              axis=1)
                # print("obs:", observations.shape) # 1*47
                # print("latents:", latents.shape) # 1*6
                # print("extended obs:", extended_obs.shape) # 1*335
            else:
                extended_obs = np.concatenate([observations, latents], axis=1)
        else:
            latents = np.array([[]] * len(observations))
            extended_obs = observations
        # make mean, log_std also depend on the latents (as observ.)
        mean, log_std = self._f_dist(extended_obs)
        # print("log_std", log_std)

        if self._set_std_to_0:
            actions = mean
            log_std = -1e6 * np.ones_like(log_std)
        else:
            rnd = np.random.normal(size=mean.shape)
            actions = rnd * np.exp(log_std) + mean
        return actions, dict(mean=mean, log_std=log_std, latents=latents)

    def get_params_snn(self):
        params = []
        for layer in self._layers_snn:
            params += layer.get_params()
        return params

    # another way would be to do as in parametrized.py and flatten_tensors (in numpy), but this way we check names
    def set_params_snn(self, snn_params):
        if type(
                snn_params
        ) is dict:  # if the snn_params are a dict with the param name as key and a numpy array as value
            params_value_by_name = snn_params
        elif type(
                snn_params
        ) is list:  # if the snn_params are a list of theano variables  **NOT CHECKING THIS!!**
            params_value_by_name = {}
            for param in snn_params:
                # print("old", param.name)
                params_value_by_name[param.name] = param.get_value()
        else:
            params_value_by_name = {}
            print("The snn_params were not understood!")

        local_params = self.get_params_snn()
        for param in local_params:
            # print("new", param.name)
            param.set_value(params_value_by_name[param.name])

    def set_pre_fix_latent(self, latent):
        self.pre_fix_latent = np.array(latent)

    def unset_pre_fix_latent(self):
        self.pre_fix_latent = np.array([])

    @contextmanager
    def fix_latent(self, latent):
        self.pre_fix_latent = np.array(latent)
        yield
        self.pre_fix_latent = np.array([])

    @contextmanager
    def set_std_to_0(self):
        self._set_std_to_0 = True
        yield
        self._set_std_to_0 = False

    @overrides
    def reset(
        self,
        force_resample_lat=False
    ):  # executed at the start of every rollout. Will fix the latent if needed.
        if not self.resample and self.latent_dim:
            if self.pre_fix_latent.size > 0 and not force_resample_lat:
                self.latent_fix = self.pre_fix_latent
            else:
                self.latent_fix = self.latent_dist.sample(
                    self.latent_dist_info)
        else:
            pass

    def log_diagnostics(self, paths):
        log_stds = np.vstack(
            [path["agent_infos"]["log_std"] for path in paths])
        logger.record_tabular('MaxPolicyStd', np.max(np.exp(log_stds)))
        logger.record_tabular('MinPolicyStd', np.min(np.exp(log_stds)))
        logger.record_tabular('AveragePolicyStd', np.mean(np.exp(log_stds)))

    @property
    def distribution(self):
        return self._dist

    def log_likelihood(self, actions, agent_infos, action_only=True):
        # First compute the log-likelihood of the action. This assumes the latents are FIXED to whatever was sampled,
        # so we only need the mean and log_std, and no information about the latents.
        logli = self._dist.log_likelihood(actions, agent_infos)
        if not action_only:
            raise NotImplementedError
            #   if not action_only:
            #       for idx, latent_name in enumerate(self._latent_distributions):
            #           latent_var = dist_info["latent_%d" % idx]
            #           prefix = "latent_%d_" % idx
            #           latent_dist_info = {k[len(prefix):]: v for k, v in dist_info.iteritems() if k.startswith(
            #               prefix)}
            #           logli += latent_name.log_likelihood(latent_var, latent_dist_info)
        return logli
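# NOTE (editorial sketch, not part of the original snippet): the policy above feeds its MLPs an
# "extended" observation: either the plain concatenation [obs, latent] or, with bilinear
# integration, [obs, latent, flatten(outer(obs, latent))], as in get_actions()/dist_info_sym().
# A minimal NumPy illustration, assuming hypothetical shapes (obs_dim=3, latent_dim=2):
import numpy as np

def extend_obs(obs, latent, bilinear=True):
    """Build the generalized input fed to the mean/log_std networks."""
    obs = np.atleast_2d(obs)        # (batch, obs_dim)
    latent = np.atleast_2d(latent)  # (batch, latent_dim)
    if bilinear:
        outer = obs[:, :, np.newaxis] * latent[:, np.newaxis, :]  # (batch, obs_dim, latent_dim)
        return np.concatenate([obs, latent, outer.reshape(obs.shape[0], -1)], axis=1)
    return np.concatenate([obs, latent], axis=1)

# extend_obs(np.ones(3), np.ones(2)).shape == (1, 3 + 2 + 3 * 2)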
Example #11
0
class ConcurrentContinuousPPO(BatchPolopt):
    """
    Designed to enable concurrent training of a SNN that parameterizes skills
    and also train the manager at the same time

    Note that, if I'm not trying to do the sample approximation of the weird log of sum term,
    I don't need to know which skill was picked, just need to know the action
    """

    # double check this constructor later
    def __init__(
            self,
            optimizer=None,
            optimizer_args=None,
            step_size=0.003,
            num_latents=6,
            latents=None,  # some sort of iterable of the actual latent vectors
            period=10,  # how often I choose a latent
            truncate_local_is_ratio=None,
            epsilon=0.1,
            train_pi_iters=10,
            use_skill_dependent_baseline=False,
            mlp_skill_dependent_baseline=False,
            freeze_manager=False,
            freeze_skills=False,
            **kwargs):
        if optimizer is None:
            if optimizer_args is None:
                # optimizer_args = dict()
                optimizer_args = dict(batch_size=None)
            self.optimizer = FirstOrderOptimizer(learning_rate=step_size,
                                                 max_epochs=train_pi_iters,
                                                 **optimizer_args)
        self.step_size = step_size
        self.truncate_local_is_ratio = truncate_local_is_ratio
        self.epsilon = epsilon

        super(ConcurrentContinuousPPO,
              self).__init__(**kwargs)  # not sure if this line is correct
        self.num_latents = kwargs['policy'].latent_dim
        self.latents = latents
        self.period = period
        self.freeze_manager = freeze_manager
        self.freeze_skills = freeze_skills
        assert (not freeze_manager) or (not freeze_skills)

        # todo: fix this sampler stuff
        # import pdb; pdb.set_trace()
        self.sampler = HierBatchSampler(self, self.period)
        # self.sampler = BatchSampler(self)
        # i hope this is right
        self.diagonal = DiagonalGaussian(
            self.policy.low_policy.action_space.flat_dim)
        self.debug_fns = []

        assert isinstance(self.policy, HierarchicalPolicy)
        self.period = self.policy.period
        assert self.policy.period == self.period
        self.continuous_latent = self.policy.continuous_latent
        assert self.continuous_latent
        # self.old_policy = copy.deepcopy(self.policy)

        # skill dependent baseline
        self.use_skill_dependent_baseline = use_skill_dependent_baseline
        self.mlp_skill_dependent_baseline = mlp_skill_dependent_baseline
        if use_skill_dependent_baseline:
            curr_env = kwargs['env']
            skill_dependent_action_space = curr_env.action_space
            new_obs_space_no_bi = curr_env.observation_space.shape[
                0] + 1  # 1 for the t_remaining
            skill_dependent_obs_space_dim = (new_obs_space_no_bi *
                                             (self.num_latents + 1) +
                                             self.num_latents, )
            skill_dependent_obs_space = Box(
                -1.0, 1.0, shape=skill_dependent_obs_space_dim)
            skill_dependent_env_spec = EnvSpec(skill_dependent_obs_space,
                                               skill_dependent_action_space)
            if self.mlp_skill_dependent_baseline:
                self.skill_dependent_baseline = GaussianMLPBaseline(
                    env_spec=skill_dependent_env_spec)
            else:
                self.skill_dependent_baseline = LinearFeatureBaseline(
                    env_spec=skill_dependent_env_spec)

    # initialize the computation graph
    # optimize is run on >= 1 trajectory at a time
    # assumptions: 1 trajectory whose length is a multiple of p; that the obs_var_probs is valid
    def init_opt(self):
        assert isinstance(self.policy, HierarchicalPolicy)
        assert not self.freeze_manager and not self.freeze_skills
        manager_surr_loss = 0
        # skill_surr_loss = 0

        obs_var_sparse = ext.new_tensor('sparse_obs',
                                        ndim=2,
                                        dtype=theano.config.floatX)
        obs_var_raw = ext.new_tensor(
            'obs', ndim=3, dtype=theano.config.floatX)  # todo: check the dtype
        action_var = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1,
        )
        advantage_var = ext.new_tensor('advantage',
                                       ndim=1,
                                       dtype=theano.config.floatX)
        # latent_var = ext.new_tensor('latents', ndim=2, dtype=theano.config.floatX)
        mean_var = ext.new_tensor('mean', ndim=2, dtype=theano.config.floatX)
        log_std_var = ext.new_tensor('log_std',
                                     ndim=2,
                                     dtype=theano.config.floatX)

        # undoing the reshape, so that batch sampling is ok
        obs_var = TT.reshape(obs_var_raw, [
            obs_var_raw.shape[0] * obs_var_raw.shape[1], obs_var_raw.shape[2]
        ])

        ############################################################
        ### calculating the skills portion of the surrogate loss ###
        ############################################################
        latent_var_sparse = self.policy.manager.dist_info_sym(
            obs_var_sparse)['mean']
        latent_var = TT.extra_ops.repeat(latent_var_sparse,
                                         self.period,
                                         axis=0)  #.dimshuffle(0, 'x')
        dist_info_var = self.policy.low_policy.dist_info_sym(
            obs_var, state_info_var=latent_var)
        old_dist_info_var = dict(mean=mean_var, log_std=log_std_var)
        skill_lr = self.diagonal.likelihood_ratio_sym(action_var,
                                                      old_dist_info_var,
                                                      dist_info_var)
        skill_surr_loss_vector = TT.minimum(
            skill_lr * advantage_var,
            TT.clip(skill_lr, 1 - self.epsilon, 1 + self.epsilon) *
            advantage_var)
        skill_surr_loss = -TT.mean(skill_surr_loss_vector)

        surr_loss = skill_surr_loss  # so that the relative magnitudes are correct

        if self.freeze_skills and not self.freeze_manager:
            raise NotImplementedError
        elif self.freeze_manager and not self.freeze_skills:
            raise NotImplementedError
        else:
            assert (not self.freeze_manager) or (not self.freeze_skills)
            input_list = [
                obs_var_raw, obs_var_sparse, action_var, advantage_var,
                mean_var, log_std_var
            ]

        self.optimizer.update_opt(loss=surr_loss,
                                  target=self.policy,
                                  inputs=input_list)
        return dict()

    # do the optimization
    def optimize_policy(self, itr, samples_data):
        print(len(samples_data['observations']), self.period)
        assert len(samples_data['observations']) % self.period == 0

        # note that I have to do extra preprocessing to the advantages, and also create obs_var_sparse

        if self.use_skill_dependent_baseline:
            input_values = tuple(
                ext.extract(samples_data, "observations", "actions",
                            "advantages", "agent_infos", "skill_advantages"))
        else:
            input_values = tuple(
                ext.extract(samples_data, "observations", "actions",
                            "advantages", "agent_infos"))

        obs_raw = input_values[0].reshape(
            input_values[0].shape[0] // self.period, self.period,
            input_values[0].shape[1])

        obs_sparse = input_values[0].take(
            [i for i in range(0, input_values[0].shape[0], self.period)],
            axis=0)
        if not self.continuous_latent:
            advantage_sparse = input_values[2].reshape(
                [input_values[2].shape[0] // self.period, self.period])[:, 0]
            latents = input_values[3]['latents']
            latents_sparse = latents.take(
                [i for i in range(0, latents.shape[0], self.period)], axis=0)
            prob = np.array(list(input_values[3]['prob'].take(
                [i for i in range(0, latents.shape[0], self.period)], axis=0)),
                            dtype=np.float32)
        mean = input_values[3]['mean']
        log_std = input_values[3]['log_std']

        if self.use_skill_dependent_baseline:
            advantage_var = input_values[4]
        else:
            advantage_var = input_values[2]
        # import ipdb; ipdb.set_trace()
        if self.freeze_skills and not self.freeze_manager:
            raise NotImplementedError
        elif self.freeze_manager and not self.freeze_skills:
            raise NotImplementedError
        else:
            assert (not self.freeze_manager) or (not self.freeze_skills)
            all_input_values = (obs_raw, obs_sparse, input_values[1],
                                advantage_var, mean, log_std)

        # todo: assign current parameters to old policy; does this work?
        # old_param_values = self.policy.get_param_values(trainable=True)
        # self.old_policy.set_param_values(old_param_values, trainable=True)
        # old_param_values = self.policy.get_param_values()
        # self.old_policy.set_param_values(old_param_values)
        loss_before = self.optimizer.loss(all_input_values)
        self.optimizer.optimize(all_input_values)
        loss_after = self.optimizer.loss(all_input_values)
        logger.record_tabular('LossBefore', loss_before)
        logger.record_tabular('LossAfter', loss_after)
        logger.record_tabular('dLoss', loss_before - loss_after)
        return dict()

    def get_itr_snapshot(self, itr, samples_data):
        return dict(itr=itr,
                    policy=self.policy,
                    baseline=self.baseline,
                    env=self.env)

    def log_diagnostics(self, paths):
        # paths obtained by self.sampler.obtain_samples
        BatchPolopt.log_diagnostics(self, paths)
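# NOTE (editorial sketch, not part of the original snippet): init_opt above builds the usual PPO
# clipped surrogate over the low-level (skill) policy: the negative mean of
# min(r * A, clip(r, 1 - epsilon, 1 + epsilon) * A), with r the likelihood ratio and A the
# advantage. A minimal NumPy version of that objective, assuming r and A are already computed:
import numpy as np

def ppo_clip_loss(likelihood_ratio, advantage, epsilon=0.1):
    clipped = np.clip(likelihood_ratio, 1.0 - epsilon, 1.0 + epsilon)
    return -np.mean(np.minimum(likelihood_ratio * advantage, clipped * advantage))

# ppo_clip_loss(np.array([0.8, 1.3]), np.array([1.0, -1.0]), epsilon=0.1)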
    def __init__(
        self,
        env_spec,
        hidden_sizes=(32, 32),
        learn_std=True,
        init_std=1.0,
        adaptive_std=False,
        std_share_network=False,
        std_hidden_sizes=(32, 32),
        min_std=1e-6,
        std_hidden_nonlinearity=NL.tanh,
        hidden_nonlinearity=NL.tanh,
        output_nonlinearity=None,
    ):
        """
        :param env_spec:
        :param hidden_sizes: list of sizes for the fully-connected hidden layers
        :param learn_std: Is std trainable
        :param init_std: Initial std
        :param adaptive_std:
        :param std_share_network:
        :param std_hidden_sizes: list of sizes for the fully-connected layers for std
        :param min_std: minimum value the std is clipped to, to avoid numerical issues
        :param std_hidden_nonlinearity:
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :param output_nonlinearity: nonlinearity for the output layer
        :return:
        """
        Serializable.quick_init(self, locals())
        assert isinstance(env_spec.action_space, Box)

        obs_dim = env_spec.observation_space.flat_dim
        action_dim = env_spec.action_space.flat_dim

        # create network
        mean_network = MLP(
            input_shape=(obs_dim, ),
            output_dim=action_dim,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=output_nonlinearity,
        )
        self._mean_network = mean_network

        l_mean = mean_network.output_layer
        obs_var = mean_network.input_var

        if adaptive_std:
            std_network = MLP(
                input_shape=(obs_dim, ),
                input_layer=mean_network.input_layer,
                output_dim=action_dim,
                hidden_sizes=std_hidden_sizes,
                hidden_nonlinearity=std_hidden_nonlinearity,
                output_nonlinearity=None,
            )
            l_log_std = std_network.output_layer
        else:
            l_log_std = ParamLayer(
                mean_network.input_layer,
                num_units=action_dim,
                param=lasagne.init.Constant(np.log(init_std)),
                name="output_log_std",
                trainable=learn_std,
            )

        self.min_std = min_std

        mean_var, log_std_var = L.get_output([l_mean, l_log_std])

        if self.min_std is not None:
            log_std_var = TT.maximum(log_std_var, np.log(min_std))

        self._mean_var, self._log_std_var = mean_var, log_std_var

        self._l_mean = l_mean
        self._l_log_std = l_log_std

        self._dist = DiagonalGaussian(action_dim)

        LasagnePowered.__init__(self, [l_mean, l_log_std])
        super(GaussianMLPPolicy, self).__init__(env_spec)

        self._f_dist = ext.compile_function(
            inputs=[obs_var],
            outputs=[mean_var, log_std_var],
        )
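# NOTE (editorial sketch, not part of the original snippet): the Gaussian MLP policies above output
# a mean and a log_std, clamp log_std from below via min_std (the TT.maximum(...) call), and sample
# actions as mean + exp(log_std) * noise. A small self-contained NumPy version of that step:
import numpy as np

def sample_action(mean, log_std, min_std=1e-6, rng=np.random):
    log_std = np.maximum(log_std, np.log(min_std))  # same clamp as TT.maximum(log_std_var, ...)
    return mean + np.exp(log_std) * rng.normal(size=mean.shape)

# sample_action(np.zeros(2), np.full(2, -20.0))  # a very small std gets clamped up to min_std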
class PG_concurrent_approx(BatchPolopt, Serializable
                           ):  # todo: should this implement serializable?
    """
    Designed to enable concurrent training of a SNN that parameterizes skills
    and also train the manager at the same time

    Note that, if I'm not trying to do the sample approximation of the weird log of sum term,
    I don't need to know which skill was picked, just need to know the action
    """

    # double check this constructor later
    def __init__(
            self,
            optimizer=None,
            optimizer_args=None,
            step_size=1e-2,
            num_latents=6,
            latents=None,  # some sort of iterable of the actual latent vectors
            period=10,  # how often I choose a latent
            truncate_local_is_ratio=None,
            use_skill_dependent_baseline=False,
            **kwargs):
        Serializable.quick_init(self, locals())
        if optimizer is None:
            default_args = dict(batch_size=None, max_epochs=1)
            if optimizer_args is None:
                optimizer_args = default_args
            else:
                optimizer_args = dict(default_args, **optimizer_args)
            optimizer = FirstOrderOptimizer(learning_rate=step_size,
                                            **optimizer_args)
        self.optimizer = optimizer
        self.step_size = step_size
        self.truncate_local_is_ratio = truncate_local_is_ratio
        super(PG_concurrent_approx,
              self).__init__(**kwargs)  # not sure if this line is correct
        self.num_latents = kwargs['policy'].latent_dim
        self.latents = latents
        self.period = period

        # todo: fix this sampler stuff
        self.sampler = HierBatchSampler(self, self.period)

        # i hope this is right
        self.diagonal = DiagonalGaussian(
            self.policy.low_policy.action_space.flat_dim)
        self.debug_fns = []

        assert isinstance(self.policy, HierarchicalPolicy)
        if self.policy is not None:
            self.period = self.policy.period
        assert self.policy.period == self.period

        self.trainable_manager = self.policy.trainable_manager

        # skill dependent baseline
        self.use_skill_dependent_baseline = use_skill_dependent_baseline
        if use_skill_dependent_baseline:
            curr_env = kwargs['env']
            skill_dependent_action_space = curr_env.action_space
            skill_dependent_obs_space_dim = (
                (curr_env.observation_space.shape[0] + 1) * self.num_latents, )
            skill_dependent_obs_space = Box(
                -1.0, 1.0, shape=skill_dependent_obs_space_dim)
            skill_depdendent_env_spec = EnvSpec(skill_dependent_obs_space,
                                                skill_dependent_action_space)
            self.skill_dependent_baseline = LinearFeatureBaseline(
                env_spec=skill_depdendent_env_spec)

    # initialize the computation graph
    # optimize is run on >= 1 trajectory at a time

    def init_opt(self):
        # obs_var_raw = self.env.observation_space.new_tensor_variable(
        #     'obs',
        #     extra_dims=1,
        # )

        obs_var_raw = ext.new_tensor(
            'obs', ndim=3, dtype=theano.config.floatX)  # todo: check the dtype

        action_var = self.env.action_space.new_tensor_variable(
            'action',
            extra_dims=1,
        )

        # this will have to be the advantage every self.period timesteps
        advantage_var_sparse = ext.new_tensor('sparse_advantage',
                                              ndim=1,
                                              dtype=theano.config.floatX)

        advantage_var = ext.new_tensor('advantage',
                                       ndim=1,
                                       dtype=theano.config.floatX)

        obs_var_sparse = ext.new_tensor(
            'sparse_obs',
            ndim=2,
            dtype=theano.config.
            floatX  # todo: check this with carlos, refer to discrete.py in rllab.spaces
        )

        latent_var_sparse = ext.new_tensor('sparse_latent',
                                           ndim=2,
                                           dtype=theano.config.floatX)

        latent_var = ext.new_tensor('latents',
                                    ndim=2,
                                    dtype=theano.config.floatX)

        assert isinstance(self.policy, HierarchicalPolicy)

        # todo: assumptions: 1 trajectory whose length is a multiple of p; that the obs_var_probs is valid

        # undoing the reshape, so that batch sampling is ok
        obs_var = TT.reshape(obs_var_raw, [
            obs_var_raw.shape[0] * obs_var_raw.shape[1], obs_var_raw.shape[2]
        ])
        # obs_var = obs_var_raw

        #############################################################
        ### calculating the manager portion of the surrogate loss ###
        #############################################################

        # entry [i, j] should contain the probability of latent j at time step self.period*i
        # should be a (len(obs)//self.period) x num_latents tensor
        latent_probs = self.policy.manager.dist_info_sym(
            obs_var_sparse)['prob']
        actual_latent_probs = TT.sum(latent_probs * latent_var_sparse, axis=1)
        if self.trainable_manager:
            manager_surr_loss = -TT.mean(
                TT.log(actual_latent_probs) * advantage_var_sparse)
        else:
            manager_surr_loss = 0

        ############################################################
        ### calculating the skills portion of the surrogate loss ###
        ############################################################

        # get the distribution parameters
        # dist_info_vars = []
        # for latent in self.latents:
        #     self.policy.low_policy.set_latent_train(latent)
        #     dist_info_vars.append(self.policy.low_policy.dist_info_sym(obs_var))
        # hopefully the above line takes multiple samples, and state_info_vars not needed as input

        dist_info_vars = self.policy.low_policy.dist_info_sym_all_latents(
            obs_var)
        probs = TT.stack([
            self.diagonal.log_likelihood_sym(action_var, dist_info)
            for dist_info in dist_info_vars
        ],
                         axis=1)
        # todo: verify that dist_info_vars is in order

        actual_action_log_probs = TT.sum(probs * latent_var, axis=1)
        skill_surr_loss = -TT.mean(actual_action_log_probs * advantage_var)

        surr_loss = manager_surr_loss / self.period + skill_surr_loss  # so that the relative magnitudes are correct

        input_list = [
            obs_var_raw, obs_var_sparse, action_var, advantage_var,
            advantage_var_sparse, latent_var, latent_var_sparse
        ]
        # input_list = [obs_var_raw, obs_var_sparse, action_var, advantage_var]
        # npo has state_info_vars and old_dist_info_vars, I don't think I need them until I go for NPO/TRPO

        self.optimizer.update_opt(loss=surr_loss,
                                  target=self.policy,
                                  inputs=input_list)
        return dict()

    # do the optimization
    def optimize_policy(self, itr, samples_data):
        # import IPython; IPython.embed()
        print(len(samples_data['observations']), self.period)
        assert len(samples_data['observations']) % self.period == 0

        # note that I have to do extra preprocessing to the advantages, and also create obs_var_sparse

        if self.use_skill_dependent_baseline:
            input_values = tuple(
                ext.extract(samples_data, "observations", "actions",
                            "advantages", "agent_infos", "skill_advantages"))
        else:
            input_values = tuple(
                ext.extract(samples_data, "observations", "actions",
                            "advantages", "agent_infos"))
        # print(input_values[0].shape)

        obs_raw = input_values[0].reshape(
            input_values[0].shape[0] // self.period, self.period,
            input_values[0].shape[1])
        # obs_raw = input_values[0]

        obs_sparse = input_values[0].take(
            [i for i in range(0, input_values[0].shape[0], self.period)],
            axis=0)
        advantage_sparse = input_values[2].reshape(
            [input_values[2].shape[0] // self.period, self.period])[:, 0]
        latents = input_values[3]['latents']
        latents_sparse = latents.take(
            [i for i in range(0, latents.shape[0], self.period)], axis=0)

        if self.use_skill_dependent_baseline:
            all_input_values = (obs_raw, obs_sparse, input_values[1],
                                input_values[4], advantage_sparse, latents,
                                latents_sparse)
        else:
            all_input_values = (obs_raw, obs_sparse, input_values[1],
                                input_values[2], advantage_sparse, latents,
                                latents_sparse)

        loss_before = self.optimizer.loss(all_input_values)
        self.optimizer.optimize(all_input_values)
        loss_after = self.optimizer.loss(all_input_values)
        logger.record_tabular('LossBefore', loss_before)
        logger.record_tabular('LossAfter', loss_after)
        logger.record_tabular('dLoss', loss_before - loss_after)
        return dict()

    def get_itr_snapshot(self, itr, samples_data):
        return dict(itr=itr,
                    policy=self.policy,
                    baseline=self.baseline,
                    env=self.env)

    def log_diagnostics(self, paths):
        # paths obtained by self.sampler.obtain_samples
        BatchPolopt.log_diagnostics(self, paths)
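# NOTE (editorial sketch, not part of the original snippet): optimize_policy above slices each
# batch into blocks of length self.period: one "sparse" observation/advantage per block for the
# manager, and every timestep for the skills. A minimal NumPy illustration of that preprocessing,
# assuming a hypothetical flat batch whose length is a multiple of the period:
import numpy as np

def split_hierarchical_batch(observations, advantages, period):
    n, obs_dim = observations.shape
    assert n % period == 0
    obs_raw = observations.reshape(n // period, period, obs_dim)  # per-block view for the skills
    obs_sparse = observations[::period]                           # one observation per manager decision
    advantage_sparse = advantages.reshape(n // period, period)[:, 0]
    return obs_raw, obs_sparse, advantage_sparse

# obs_raw, obs_sparse, adv_sparse = split_hierarchical_batch(np.zeros((20, 5)), np.zeros(20), period=10)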
class GaussianMLPPolicy_multi_hier(StochasticPolicy, LasagnePowered,
                                   Serializable
                                   ):  # also inherits form Parametrized
    """
    Policy that joins several pre-trained policies and performs a linear combination of their outputs.
    If a selector is provided, the coefficients of the linear combination are given externally. Otherwise it's an MLP,
    in which case it can be trained end-to-end.
    """
    def __init__(
        self,
        env_spec,
        env,
        pkl_paths=(),
        json_paths=(),
        npz_paths=(),
        trainable_old=True,
        external_selector=False,
        hidden_sizes_selector=(10, 10),
        learn_std=True,
        init_std=1.0,
        adaptive_std=False,
        std_share_network=False,
        std_hidden_sizes=(32, 32),
        std_hidden_nonlinearity=NL.tanh,
        hidden_nonlinearity=NL.tanh,
        output_nonlinearity=None,
        min_std=1e-4,
    ):
        """
        :param pkl_paths: tuple/list of pkl paths
        :param json_paths: tuple/list of json paths
        :param npz_paths: tuple/list of npz paths
        :param trainable_old: Are the old policies still trainable
        :param external_selector: whether the linear combination of the old policies' outputs is fixed externally
        :param hidden_sizes: list of sizes for the fully-connected hidden layers
        :param learn_std: Is std trainable
        :param init_std: Initial std
        :param adaptive_std:
        :param std_share_network:
        :param std_hidden_sizes: list of sizes for the fully-connected layers for std
        :param min_std: minimum value the std is clipped to, to avoid numerical issues
        :param std_hidden_nonlinearity:
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :param output_nonlinearity: nonlinearity for the output layer
        :param mean_network: custom network for the output mean
        :param std_network: custom network for the output log std
        """
        # define where the old policies to use are located and what to do with them:
        self.trainable_old = trainable_old  # whether to keep training the old policies loaded here
        self.pkl_paths = pkl_paths
        self.json_paths = json_paths
        self.npz_paths = npz_paths
        self.selector_dim = max(
            len(json_paths), len(pkl_paths))  # pkl could be zero if giving npz
        # if not using a selector NN here, use an externally fixed selector variable:
        self.external_selector = external_selector  # whether to use the selectorNN defined here or the pre_fix_selector
        self.pre_fix_selector = np.zeros(
            (self.selector_dim)
        )  # if this is not empty when using reset() it will use this selector
        self.selector_fix = np.zeros(
            (self.selector_dim
             ))  # this will hold the selectors variable sampled in reset()
        self.shared_selector_var = theano.shared(
            self.selector_fix)  # this is for the external selector; reset() updates it
        # else, describe the MLP used:
        self.hidden_sizes_selector = hidden_sizes_selector  # size of the selector NN defined here
        self.min_std = min_std
        self._set_std_to_0 = False

        self.action_dim = env_spec.action_space.flat_dim  # not checking that all the old policies have this act_dim

        self.old_hidden_sizes = []
        # assume json always given
        for json_path in self.json_paths:
            data = json.load(
                open(os.path.join(config.PROJECT_PATH, json_path), 'r'))
            old_json_policy = data['json_args']["policy"]
            self.old_hidden_sizes.append(old_json_policy['hidden_sizes'])

        # retrieve dimensions and check consistency
        if isinstance(env, MazeEnv) or isinstance(env, GatherEnv):
            self.obs_robot_dim = env.robot_observation_space.flat_dim
            self.obs_maze_dim = env.maze_observation_space.flat_dim
        elif isinstance(env, NormalizedEnv):
            if isinstance(env.wrapped_env, MazeEnv) or isinstance(
                    env.wrapped_env, GatherEnv):
                self.obs_robot_dim = env.wrapped_env.robot_observation_space.flat_dim
                self.obs_maze_dim = env.wrapped_env.maze_observation_space.flat_dim
            else:
                self.obs_robot_dim = env.wrapped_env.observation_space.flat_dim
                self.obs_maze_dim = 0
        else:
            self.obs_robot_dim = env.observation_space.flat_dim
            self.obs_maze_dim = 0
        # print("the dims of the env are(rob/maze): ", self.obs_robot_dim, self.obs_maze_dim)
        all_obs_dim = env_spec.observation_space.flat_dim
        assert all_obs_dim == self.obs_robot_dim + self.obs_maze_dim
        Serializable.quick_init(self, locals())
        assert isinstance(env_spec.action_space, Box)

        if self.external_selector:  # in case we want to fix the selector externally
            l_all_obs_var = L.InputLayer(
                shape=(None, ) + (self.obs_robot_dim + self.obs_maze_dim, ))
            all_obs_var = l_all_obs_var.input_var
            l_selection = ParamLayer(incoming=l_all_obs_var,
                                     num_units=self.selector_dim,
                                     param=self.shared_selector_var,
                                     trainable=False)
            selection_var = L.get_output(l_selection)
        else:
            # create network with softmax output: it will be the selector!
            selector_network = MLP(
                input_shape=(self.obs_robot_dim + self.obs_maze_dim, ),
                output_dim=self.selector_dim,
                hidden_sizes=self.hidden_sizes_selector,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=NL.softmax,
            )
            l_all_obs_var = selector_network.input_layer
            all_obs_var = selector_network.input_layer.input_var

            # collect the output to select the behavior of the robot controller (equivalent to selectors)
            l_selection = selector_network.output_layer
            selection_var = L.get_output(l_selection)

        # split all_obs into the robot and the maze obs --> ROBOT goes first!!
        l_obs_robot = CropLayer(l_all_obs_var,
                                start_index=None,
                                end_index=self.obs_robot_dim)
        l_obs_maze = CropLayer(l_all_obs_var,
                               start_index=self.obs_robot_dim,
                               end_index=None)

        obs_robot_var = all_obs_var[:, :self.obs_robot_dim]
        obs_maze_var = all_obs_var[:, self.obs_robot_dim:]

        # create the action networks
        self.old_l_means = []  # kept as an attribute in case we want to access it from reset
        self.old_l_log_stds = []
        self.old_layers = []
        for i in range(self.selector_dim):
            mean_network = MLP(
                input_layer=l_obs_robot,
                output_dim=self.action_dim,
                hidden_sizes=self.old_hidden_sizes[i],
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=output_nonlinearity,
                name="meanMLP{}".format(i),
            )
            self.old_l_means.append(mean_network.output_layer)
            self.old_layers += mean_network.layers

            l_log_std = ParamLayer(
                incoming=mean_network.input_layer,
                num_units=self.action_dim,
                param=lasagne.init.Constant(np.log(init_std)),
                name="output_log_std{}".format(i),
                trainable=learn_std,
            )
            self.old_l_log_stds.append(l_log_std)
            self.old_layers += [l_log_std]

        if not self.trainable_old:
            for layer in self.old_layers:
                for param, tags in layer.params.items():  # layer.params is an OrderedDict: key=shared var, value=tags
                    tags.remove("trainable")

        if self.json_paths and self.npz_paths:
            old_params_dict = {}
            for i, npz_path in enumerate(self.npz_paths):
                params_dict = dict(
                    np.load(os.path.join(config.PROJECT_PATH, npz_path)))
                renamed_warm_params_dict = {}
                for key in params_dict.keys():
                    if key == 'output_log_std.param':
                        old_params_dict['output_log_std{}.param'.format(
                            i)] = params_dict[key]
                    elif 'meanMLP_' == key[:8]:
                        old_params_dict['meanMLP{}_'.format(i) +
                                        key[8:]] = params_dict[key]
                    else:
                        old_params_dict['meanMLP{}_'.format(i) +
                                        key] = params_dict[key]
            self.set_old_params(old_params_dict)

        elif self.pkl_paths:
            old_params_dict = {}
            for i, pkl_path in enumerate(self.pkl_paths):
                data = joblib.load(os.path.join(config.PROJECT_PATH, pkl_path))
                params = data['policy'].get_params_internal()
                for param in params:
                    if param.name == 'output_log_std.param':
                        old_params_dict['output_log_std{}.param'.format(
                            i)] = param.get_value()
                    elif 'meanMLP_' == param.name[:8]:
                        old_params_dict['meanMLP{}_'.format(i) +
                                        param.name[8:]] = param.get_value()
                    else:
                        old_params_dict['meanMLP{}_'.format(i) +
                                        param.name] = param.get_value()
            self.set_old_params(old_params_dict)

        # new layers actually selecting the correct output
        l_mean = SumProdLayer(self.old_l_means + [l_selection])
        l_log_std = SumProdLayer(self.old_l_log_stds + [l_selection])
        mean_var, log_std_var = L.get_output([l_mean, l_log_std])

        if self.min_std is not None:
            log_std_var = TT.maximum(log_std_var, np.log(self.min_std))

        self._l_mean = l_mean
        self._l_log_std = l_log_std

        self._dist = DiagonalGaussian(self.action_dim)

        LasagnePowered.__init__(self, [l_mean, l_log_std])
        super(GaussianMLPPolicy_multi_hier, self).__init__(env_spec)

        self._f_old_means = ext.compile_function(
            inputs=[all_obs_var],
            outputs=[
                L.get_output(l_old_mean) for l_old_mean in self.old_l_means
            ])

        self._f_all_inputs = ext.compile_function(
            inputs=[all_obs_var],
            outputs=[
                L.get_output(l_old_mean) for l_old_mean in self.old_l_means
            ] + [selection_var])

        self._f_dist = ext.compile_function(
            inputs=[all_obs_var],
            outputs=[mean_var, log_std_var],
        )
        # if I want to monitor the selector output
        self._f_select = ext.compile_function(
            inputs=[all_obs_var],
            outputs=selection_var,
        )

    def get_old_params(self):
        params = []
        for layer in self.old_layers:
            params += layer.get_params()
        return params

    # An alternative would be to flatten the tensors as in parametrized.py (flatten_tensors in numpy), but matching by
    # name lets us check the parameter names.
    def set_old_params(self, old_params):
        if isinstance(old_params, dict):
            # old_params is a dict with the param name as key and a numpy array as value
            params_value_by_name = old_params
        elif isinstance(old_params, list):
            # old_params is a list of theano shared variables
            params_value_by_name = {}
            for param in old_params:
                params_value_by_name[param.name] = param.get_value()
        else:
            params_value_by_name = {}
            print("The old_params was not understood!")

        local_params = self.get_old_params()
        for param in local_params:
            param.set_value(params_value_by_name[param.name])

    def dist_info_sym(self, obs_var, state_info_var=None):
        mean_var, log_std_var = L.get_output([self._l_mean, self._l_log_std],
                                             obs_var)
        if self.min_std is not None:
            log_std_var = TT.maximum(log_std_var, np.log(self.min_std))
        return dict(mean=mean_var, log_std=log_std_var)

    @overrides
    def get_action(self, observation):
        actions, outputs = self.get_actions([observation])
        return actions[0], {k: v[0] for k, v in outputs.items()}

    def get_actions(self, observations):
        selector_output = self._f_select(observations)
        mean, log_std = self._f_dist(observations)

        if self._set_std_to_0:
            actions = mean
            log_std = -1e6 * np.ones_like(log_std)
        else:
            rnd = np.random.normal(size=mean.shape)
            actions = rnd * np.exp(log_std) + mean
        return actions, dict(mean=mean,
                             log_std=log_std,
                             selectors=selector_output)

    def set_pre_fix_selector(self, selector):
        self.pre_fix_selector = np.array(selector)

    def unset_pre_fix_selector(self):
        self.pre_fix_selector = np.array([])

    @contextmanager
    def fix_selector(self, selector):
        self.pre_fix_selector = np.array(selector)
        yield
        self.pre_fix_selector = np.array([])

    @contextmanager
    def set_std_to_0(self):
        self._set_std_to_0 = True
        yield
        self._set_std_to_0 = False
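
    # A hedged usage sketch for the two context managers above (hypothetical `policy`
    # and `obs` objects, not part of this snippet); both restore the previous
    # behaviour on exit:
    #
    #     with policy.fix_selector([1, 0, 0]), policy.set_std_to_0():
    #         policy.reset()                          # copies the fixed selector into place
    #         action, info = policy.get_action(obs)   # deterministic action for skill 0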

    @overrides
    def reset(self):
        # executed at the start of every rollout; fixes the selector if needed
        if self.pre_fix_selector.size > 0:
            self.selector_fix = self.pre_fix_selector
        # this is needed for the external selector
        self.shared_selector_var.set_value(np.array(self.selector_fix))

    def log_diagnostics(self, paths):
        log_stds = np.vstack(
            [path["agent_infos"]["log_std"] for path in paths])
        logger.record_tabular('MaxPolicyStd', np.max(np.exp(log_stds)))
        logger.record_tabular('MinPolicyStd', np.min(np.exp(log_stds)))
        logger.record_tabular('AveragePolicyStd', np.mean(np.exp(log_stds)))

    @property
    def distribution(self):
        return self._dist

    def log_likelihood(self, actions, agent_infos, action_only=True):
        # First compute the log-likelihood of the action. This assumes the selectors are FIXED to whatever was
        # sampled, and hence we only need the mean and log_std, but no information about the selectors
        logli = self._dist.log_likelihood(actions, agent_infos)
        if not action_only:
            raise NotImplementedError
            #   if not action_only:
            #       for idx, selector_name in enumerate(self._selector_distributions):
            #           selector_var = dist_info["selector_%d" % idx]
            #           prefix = "selector_%d_" % idx
            #           selector_dist_info = {k[len(prefix):]: v for k, v in dist_info.iteritems() if k.startswith(
            #               prefix)}
            #           logli += selector_name.log_likelihood(selector_var, selector_dist_info)
        return logli
Example #16
0
    def __init__(
        self,
        env_spec,
        latent_dim=2,
        latent_name='bernoulli',
        bilinear_integration=False,
        resample=False,
        hidden_sizes=(32, 32),
        learn_std=True,
        init_std=1.0,
        adaptive_std=False,
        std_share_network=False,
        std_hidden_sizes=(32, 32),
        std_hidden_nonlinearity=NL.tanh,
        hidden_nonlinearity=NL.tanh,
        output_nonlinearity=None,
        min_std=1e-4,
    ):
        """
        :param latent_dim: dimension of the latent variables
        :param latent_name: distribution of the latent variables
        :param bilinear_integration: Boolean indicator of bilinear integration (vs. simple concatenation)
        :param resample: Boolean indicator of resampling at every step vs. only at the start of the rollout (or
        whenever the agent is reset, which can happen several times along the rollout when using rollout in utils_snn)
        """
        self.latent_dim = latent_dim  ##could I avoid needing this self for the get_action?
        self.latent_name = latent_name
        self.bilinear_integration = bilinear_integration
        self.resample = resample
        self.min_std = min_std
        self.hidden_sizes = hidden_sizes

        # if this is not empty when using reset() it will use this latent
        self.pre_fix_latent = np.array([])
        # this will hold the latent variable sampled in reset()
        self.latent_fix = np.array([])
        self._set_std_to_0 = False

        if latent_name == 'normal':
            self.latent_dist = DiagonalGaussian(self.latent_dim)
            self.latent_dist_info = dict(mean=np.zeros(self.latent_dim),
                                         log_std=np.zeros(self.latent_dim))
        elif latent_name == 'bernoulli':
            self.latent_dist = Bernoulli(self.latent_dim)
            self.latent_dist_info = dict(p=0.5 * np.ones(self.latent_dim))
        elif latent_name == 'categorical':
            self.latent_dist = Categorical(self.latent_dim)
            if self.latent_dim > 0:
                self.latent_dist_info = dict(prob=1. / self.latent_dim *
                                             np.ones(self.latent_dim))
            else:
                self.latent_dist_info = dict(prob=np.ones(self.latent_dim))
        else:
            raise NotImplementedError

        Serializable.quick_init(self, locals())
        assert isinstance(env_spec.action_space, Box)

        if self.bilinear_integration:
            obs_dim = env_spec.observation_space.flat_dim + latent_dim +\
                      env_spec.observation_space.flat_dim * latent_dim
        else:
            obs_dim = env_spec.observation_space.flat_dim + latent_dim  # here only if concat.
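        # illustrative numbers (not from the original source): with observation flat_dim = 10
        # and latent_dim = 2, bilinear integration gives obs_dim = 10 + 2 + 10 * 2 = 32,
        # while plain concatenation gives obs_dim = 10 + 2 = 12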

        action_dim = env_spec.action_space.flat_dim

        mean_network = MLP(
            input_shape=(obs_dim, ),
            output_dim=action_dim,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=output_nonlinearity,
            name="meanMLP",
        )

        l_mean = mean_network.output_layer
        obs_var = mean_network.input_layer.input_var

        if adaptive_std:
            l_log_std = MLP(input_shape=(obs_dim, ),
                            input_var=obs_var,
                            output_dim=action_dim,
                            hidden_sizes=std_hidden_sizes,
                            hidden_nonlinearity=std_hidden_nonlinearity,
                            output_nonlinearity=None,
                            name="log_stdMLP").output_layer
        else:
            l_log_std = ParamLayer(
                mean_network.input_layer,
                num_units=action_dim,
                param=lasagne.init.Constant(np.log(init_std)),
                name="output_log_std",
                trainable=learn_std,
            )

        mean_var, log_std_var = L.get_output([l_mean, l_log_std])

        if self.min_std is not None:
            log_std_var = TT.maximum(log_std_var, np.log(self.min_std))

        self._l_mean = l_mean
        self._l_log_std = l_log_std

        self._dist = DiagonalGaussian(action_dim)

        LasagnePowered.__init__(self, [l_mean, l_log_std])
        super(GaussianMLPPolicy_snn, self).__init__(env_spec)

        self._f_dist = ext.compile_function(
            inputs=[obs_var],
            outputs=[mean_var, log_std_var],
        )
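
A minimal construction sketch for the policy above. This is an assumption-laden illustration, not part of the original source: `normalize` and `SwimmerEnv` are standard rllab components used here only as a stand-in environment, `GaussianMLPPolicy_snn` is assumed importable from wherever the snippet above lives, and the reset()/get_action() calls assume the rest of the class matches the other examples on this page.

from rllab.envs.normalized_env import normalize
from rllab.envs.mujoco.swimmer_env import SwimmerEnv

env = normalize(SwimmerEnv())          # any Box-action env works
policy = GaussianMLPPolicy_snn(
    env_spec=env.spec,
    latent_dim=2,
    latent_name='bernoulli',
    bilinear_integration=True,
    hidden_sizes=(64, 64),
)
obs = env.reset()
policy.reset()                          # samples the latent for this rollout
action, agent_info = policy.get_action(obs)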
Example #17
0
    def __init__(
        self,
        env_spec,
        input_latent_vars=None,
        hidden_sizes=(32, 32),
        hidden_latent_vars=None,
        learn_std=True,
        init_std=1.0,
        hidden_nonlinearity=NL.tanh,
        output_nonlinearity=None,
    ):
        Serializable.quick_init(self, locals())
        assert isinstance(env_spec.action_space, Box)

        obs_dim = env_spec.observation_space.flat_dim
        action_dim = env_spec.action_space.flat_dim

        # create network
        mean_network = StochasticMLP(
            input_shape=(obs_dim, ),
            input_latent_vars=input_latent_vars,
            output_dim=action_dim,
            hidden_sizes=hidden_sizes,
            hidden_latent_vars=hidden_latent_vars,
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=output_nonlinearity,
        )

        l_mean = mean_network.output_layer
        obs_var = mean_network.input_layer.input_var

        l_log_std = ParamLayer(
            mean_network.input_layer,
            num_units=action_dim,
            param=lasagne.init.Constant(np.log(init_std)),
            name="output_log_std",
            trainable=learn_std,
        )

        self._mean_network = mean_network
        self._n_latent_layers = len(mean_network.latent_layers)
        self._l_mean = l_mean
        self._l_log_std = l_log_std

        LasagnePowered.__init__(self, [l_mean, l_log_std])
        super(StochasticGaussianMLPPolicy, self).__init__(env_spec)

        outputs = self.dist_info_sym(mean_network.input_var)
        latent_keys = sorted(
            set(outputs.keys()).difference({"mean", "log_std"}))

        extras = get_full_output([self._l_mean, self._l_log_std] +
                                 self._mean_network.latent_layers, )[1]
        latent_distributions = [
            extras[layer]["distribution"]
            for layer in self._mean_network.latent_layers
        ]

        self._latent_keys = latent_keys
        self._latent_distributions = latent_distributions
        self._dist = DiagonalGaussian(action_dim)

        self._f_dist_info = ext.compile_function(
            inputs=[obs_var],
            outputs=outputs,
        )
        self._f_dist_info_givens = None
Example #18
0
class StochasticGaussianMLPPolicy(StochasticPolicy, LasagnePowered,
                                  Serializable):
    def __init__(
        self,
        env_spec,
        input_latent_vars=None,
        hidden_sizes=(32, 32),
        hidden_latent_vars=None,
        learn_std=True,
        init_std=1.0,
        hidden_nonlinearity=NL.tanh,
        output_nonlinearity=None,
    ):
        Serializable.quick_init(self, locals())
        assert isinstance(env_spec.action_space, Box)

        obs_dim = env_spec.observation_space.flat_dim
        action_dim = env_spec.action_space.flat_dim

        # create network
        mean_network = StochasticMLP(
            input_shape=(obs_dim, ),
            input_latent_vars=input_latent_vars,
            output_dim=action_dim,
            hidden_sizes=hidden_sizes,
            hidden_latent_vars=hidden_latent_vars,
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=output_nonlinearity,
        )

        l_mean = mean_network.output_layer
        obs_var = mean_network.input_layer.input_var

        l_log_std = ParamLayer(
            mean_network.input_layer,
            num_units=action_dim,
            param=lasagne.init.Constant(np.log(init_std)),
            name="output_log_std",
            trainable=learn_std,
        )

        self._mean_network = mean_network
        self._n_latent_layers = len(mean_network.latent_layers)
        self._l_mean = l_mean
        self._l_log_std = l_log_std

        LasagnePowered.__init__(self, [l_mean, l_log_std])
        super(StochasticGaussianMLPPolicy, self).__init__(env_spec)

        outputs = self.dist_info_sym(mean_network.input_var)
        latent_keys = sorted(
            set(outputs.keys()).difference({"mean", "log_std"}))

        extras = get_full_output([self._l_mean, self._l_log_std] +
                                 self._mean_network.latent_layers, )[1]
        latent_distributions = [
            extras[layer]["distribution"]
            for layer in self._mean_network.latent_layers
        ]

        self._latent_keys = latent_keys
        self._latent_distributions = latent_distributions
        self._dist = DiagonalGaussian(action_dim)

        self._f_dist_info = ext.compile_function(
            inputs=[obs_var],
            outputs=outputs,
        )
        self._f_dist_info_givens = None

    @property
    def latent_layers(self):
        return self._mean_network.latent_layers

    @property
    def latent_dims(self):
        return self._mean_network.latent_dims

    def dist_info(self, obs, state_infos=None):
        if state_infos is None or len(state_infos) == 0:
            return self._f_dist_info(obs)
        if self._f_dist_info_givens is None:
            # compile function
            obs_var = self._mean_network.input_var
            latent_keys = [
                "latent_%d" % idx for idx in range(self._n_latent_layers)
            ]
            latent_vars = [
                TT.matrix("latent_%d" % idx)
                for idx in range(self._n_latent_layers)
            ]
            latent_dict = dict(list(zip(latent_keys, latent_vars)))
            self._f_dist_info_givens = ext.compile_function(
                inputs=[obs_var] + latent_vars,
                outputs=self.dist_info_sym(obs_var, latent_dict),
            )
        latent_vals = []
        for idx in range(self._n_latent_layers):
            latent_vals.append(state_infos["latent_%d" % idx])
        return self._f_dist_info_givens(*[obs] + latent_vals)

    def reset(self):  # here I would sample a latent var
        # sample latents
        # store it in self.something that then goes to all the others
        pass

    def dist_info_sym(self, obs_var, state_info_vars=None):
        if state_info_vars is not None:
            latent_givens = {
                latent_layer: state_info_vars["latent_%d" % idx]
                for idx, latent_layer in enumerate(
                    self._mean_network.latent_layers)
            }
            latent_dist_infos = dict()
            for idx, latent_layer in enumerate(
                    self._mean_network.latent_layers):
                cur_dist_info = dict()
                prefix = "latent_%d_" % idx
                for k, v in state_info_vars.items():
                    if k.startswith(prefix):
                        cur_dist_info[k[len(prefix):]] = v
                latent_dist_infos[latent_layer] = cur_dist_info
        else:
            latent_givens = dict()
            latent_dist_infos = dict()
        all_outputs, extras = get_full_output(
            [self._l_mean, self._l_log_std] + self._mean_network.latent_layers,
            inputs={self._mean_network._l_in: obs_var},
            latent_givens=latent_givens,
            latent_dist_infos=latent_dist_infos,
        )

        mean_var = all_outputs[0]
        log_std_var = all_outputs[1]
        latent_vars = all_outputs[2:]
        latent_dist_infos = []
        for latent_layer in self._mean_network.latent_layers:
            latent_dist_infos.append(extras[latent_layer]["dist_info"])

        output_dict = dict(mean=mean_var, log_std=log_std_var)
        for idx, latent_var, latent_dist_info in zip(itertools.count(),
                                                     latent_vars,
                                                     latent_dist_infos):
            output_dict["latent_%d" % idx] = latent_var
            for k, v in latent_dist_info.items():
                output_dict["latent_%d_%s" % (idx, k)] = v

        return output_dict
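
    # The dist_info dictionaries built above therefore contain, per timestep:
    #   "mean", "log_std"       -> parameters of the action Gaussian
    #   "latent_<i>"            -> the value of latent layer i
    #   "latent_<i>_<key>"      -> the dist_info of latent layer i (e.g. "p" for a Bernoulli latent)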

    def kl_sym(self, old_dist_info_vars, new_dist_info_vars):
        """
        Compute the symbolic KL divergence of the distributions of both the actions and the latent variables
        """
        kl = self._dist.kl_sym(old_dist_info_vars, new_dist_info_vars)
        for idx, latent_dist in enumerate(self._latent_distributions):
            # collect dist info for each latent variable
            prefix = "latent_%d_" % idx
            old_latent_dist_info = {
                k[len(prefix):]: v
                for k, v in old_dist_info_vars.items() if k.startswith(prefix)
            }
            new_latent_dist_info = {
                k[len(prefix):]: v
                for k, v in new_dist_info_vars.items() if k.startswith(prefix)
            }
            kl += latent_dist.kl_sym(old_latent_dist_info,
                                     new_latent_dist_info)
        return kl
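
    # In other words, the trust-region KL treats the policy as a factorised distribution:
    #   KL_total = KL(action) + sum_i KL(latent_i)
    # and the likelihood ratio below multiplies the corresponding per-factor ratios.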

    def likelihood_ratio_sym(self, action_var, old_dist_info_vars,
                             new_dist_info_vars):
        """
        Compute the symbolic likelihood ratio of both the actions and the latent variables.
        """
        lr = self._dist.likelihood_ratio_sym(action_var, old_dist_info_vars,
                                             new_dist_info_vars)
        for idx, latent_dist in enumerate(self._latent_distributions):
            latent_var = old_dist_info_vars["latent_%d" % idx]
            prefix = "latent_%d_" % idx
            old_latent_dist_info = {
                k[len(prefix):]: v
                for k, v in old_dist_info_vars.items() if k.startswith(prefix)
            }
            new_latent_dist_info = {
                k[len(prefix):]: v
                for k, v in new_dist_info_vars.items() if k.startswith(prefix)
            }
            lr *= latent_dist.likelihood_ratio_sym(latent_var,
                                                   old_latent_dist_info,
                                                   new_latent_dist_info)
        return lr

    def log_likelihood(self, actions, dist_info, action_only=False):
        """
        Computes the log likelihood of both the actions and the latent variables, unless action_only is set to True,
        in which case it will only compute the log likelihood of the actions.
        """
        logli = self._dist.log_likelihood(actions, dist_info)
        if not action_only:
            for idx, latent_dist in enumerate(self._latent_distributions):
                latent_var = dist_info["latent_%d" % idx]
                prefix = "latent_%d_" % idx
                latent_dist_info = {
                    k[len(prefix):]: v
                    for k, v in dist_info.items() if k.startswith(prefix)
                }
                logli += latent_dist.log_likelihood(latent_var,
                                                    latent_dist_info)
        return logli

    def log_likelihood_sym(self, action_var, dist_info_vars):
        logli = self._dist.log_likelihood_sym(action_var, dist_info_vars)
        for idx, latent_dist in enumerate(self._latent_distributions):
            latent_var = dist_info_vars["latent_%d" % idx]
            prefix = "latent_%d_" % idx
            latent_dist_info = {
                k[len(prefix):]: v
                for k, v in dist_info_vars.items() if k.startswith(prefix)
            }
            logli += latent_dist.log_likelihood_sym(latent_var,
                                                    latent_dist_info)
        return logli

    def entropy(self, dist_info):
        ent = self._dist.entropy(dist_info)
        for idx, latent_dist in enumerate(self._latent_distributions):
            prefix = "latent_%d_" % idx
            latent_dist_info = {
                k[len(prefix):]: v
                for k, v in dist_info.items() if k.startswith(prefix)
            }
            ent += latent_dist.entropy(latent_dist_info)
        return ent

    @property
    def dist_info_keys(self):
        return ["mean", "log_std"] + self._latent_keys

    @overrides
    def get_action(self, observation):
        actions, outputs = self.get_actions([observation])
        return actions[0], {k: v[0] for k, v in outputs.items()}

    def get_actions(self, observations):
        outputs = self._f_dist_info(observations)
        mean = outputs["mean"]
        log_std = outputs["log_std"]
        rnd = np.random.normal(size=mean.shape)
        actions = rnd * np.exp(log_std) + mean
        return actions, outputs

    def log_diagnostics(self, paths):
        log_stds = np.vstack(
            [path["agent_infos"]["log_std"] for path in paths])
        logger.record_tabular('AveragePolicyStd', np.mean(np.exp(log_stds)))

    @property
    def distribution(self):
        """
        We set the distribution to the policy itself since we need some behavior different from a usual diagonal
        Gaussian distribution.
        """
        return self

    @property
    def state_info_keys(self):
        return self._latent_keys
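
Because `distribution` returns the policy itself, an algorithm can score sampled trajectories directly through it. A hedged sketch with placeholder data (`policy` is an instance of the class above; `obs_dim` stands in for the flat observation dimension):

import numpy as np

obs_batch = np.random.randn(5, obs_dim)            # 5 dummy observations
actions, dist_info = policy.get_actions(obs_batch)
# dist_info carries mean/log_std plus all latent_* entries, so this scores
# both the actions and the sampled latents (action_only defaults to False):
logli = policy.distribution.log_likelihood(actions, dist_info)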
Example #19
0
    def __init__(
        self,
        input_shape,
        output_dim,
        mean_network=None,
        hidden_sizes=(32, 32),
        hidden_nonlinearity=NL.rectify,
        optimizer=None,
        use_trust_region=True,
        step_size=0.01,
        learn_std=True,
        init_std=1.0,
        adaptive_std=False,
        std_share_network=False,
        std_hidden_sizes=(32, 32),
        std_nonlinearity=None,
        normalize_inputs=True,
        normalize_outputs=True,
        name=None,
    ):
        """
        :param input_shape: Shape of the input data.
        :param output_dim: Dimension of output.
        :param hidden_sizes: Number of hidden units of each layer of the mean network.
        :param hidden_nonlinearity: Non-linearity used for each layer of the mean network.
        :param optimizer: Optimizer for minimizing the negative log-likelihood.
        :param use_trust_region: Whether to use trust region constraint.
        :param step_size: KL divergence constraint for each iteration
        :param learn_std: Whether to learn the standard deviations. Only effective if adaptive_std is False. If
        adaptive_std is True, this parameter is ignored, and the weights for the std network are always learned.
        :param adaptive_std: Whether to make the std a function of the states.
        :param std_share_network: Whether to use the same network as the mean.
        :param std_hidden_sizes: Number of hidden units of each layer of the std network. Only used if
        `std_share_network` is False. It defaults to the same architecture as the mean.
        :param std_nonlinearity: Non-linearity used for each layer of the std network. Only used if `std_share_network`
        is False. It defaults to the same non-linearity as the mean.
        """
        Serializable.quick_init(self, locals())

        if optimizer is None:
            if use_trust_region:
                optimizer = PenaltyLbfgsOptimizer()
            else:
                optimizer = LbfgsOptimizer()

        self._optimizer = optimizer

        if mean_network is None:
            mean_network = MLP(
                input_shape=input_shape,
                output_dim=output_dim,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=None,
            )

        l_mean = mean_network.output_layer

        if adaptive_std:
            l_log_std = MLP(
                input_shape=input_shape,
                input_var=mean_network.input_layer.input_var,
                output_dim=output_dim,
                hidden_sizes=std_hidden_sizes,
                hidden_nonlinearity=std_nonlinearity,
                output_nonlinearity=None,
            ).output_layer
        else:
            l_log_std = ParamLayer(
                mean_network.input_layer,
                num_units=output_dim,
                param=lasagne.init.Constant(np.log(init_std)),
                name="output_log_std",
                trainable=learn_std,
            )

        LasagnePowered.__init__(self, [l_mean, l_log_std])

        xs_var = mean_network.input_layer.input_var
        ys_var = TT.matrix("ys")
        old_means_var = TT.matrix("old_means")
        old_log_stds_var = TT.matrix("old_log_stds")

        x_mean_var = theano.shared(np.zeros((1, ) + input_shape),
                                   name="x_mean",
                                   broadcastable=(True, ) +
                                   (False, ) * len(input_shape))
        x_std_var = theano.shared(np.ones((1, ) + input_shape),
                                  name="x_std",
                                  broadcastable=(True, ) +
                                  (False, ) * len(input_shape))
        y_mean_var = theano.shared(np.zeros((1, output_dim)),
                                   name="y_mean",
                                   broadcastable=(True, False))
        y_std_var = theano.shared(np.ones((1, output_dim)),
                                  name="y_std",
                                  broadcastable=(True, False))

        normalized_xs_var = (xs_var - x_mean_var) / x_std_var
        normalized_ys_var = (ys_var - y_mean_var) / y_std_var

        normalized_means_var = L.get_output(
            l_mean, {mean_network.input_layer: normalized_xs_var})
        normalized_log_stds_var = L.get_output(
            l_log_std, {mean_network.input_layer: normalized_xs_var})

        means_var = normalized_means_var * y_std_var + y_mean_var
        log_stds_var = normalized_log_stds_var + TT.log(y_std_var)
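        # the network predicts in normalised output space: the mean is rescaled and shifted
        # back, and since the std scales linearly with y_std the log-std only shifts by
        # log(y_std) (because Var(a * X) = a^2 * Var(X))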

        normalized_old_means_var = (old_means_var - y_mean_var) / y_std_var
        normalized_old_log_stds_var = old_log_stds_var - TT.log(y_std_var)

        dist = self._dist = DiagonalGaussian()

        normalized_dist_info_vars = dict(mean=normalized_means_var,
                                         log_std=normalized_log_stds_var)

        mean_kl = TT.mean(
            dist.kl_sym(
                dict(mean=normalized_old_means_var,
                     log_std=normalized_old_log_stds_var),
                normalized_dist_info_vars,
            ))

        loss = -TT.mean(
            dist.log_likelihood_sym(normalized_ys_var,
                                    normalized_dist_info_vars))

        self._f_predict = compile_function([xs_var], means_var)
        self._f_pdists = compile_function([xs_var], [means_var, log_stds_var])
        self._l_mean = l_mean
        self._l_log_std = l_log_std

        optimizer_args = dict(
            loss=loss,
            target=self,
            network_outputs=[normalized_means_var, normalized_log_stds_var],
        )

        if use_trust_region:
            optimizer_args["leq_constraint"] = (mean_kl, step_size)
            optimizer_args["inputs"] = [
                xs_var, ys_var, old_means_var, old_log_stds_var
            ]
        else:
            optimizer_args["inputs"] = [xs_var, ys_var]

        self._optimizer.update_opt(**optimizer_args)

        self._use_trust_region = use_trust_region
        self._name = name

        self._normalize_inputs = normalize_inputs
        self._normalize_outputs = normalize_outputs
        self._x_mean_var = x_mean_var
        self._x_std_var = x_std_var
        self._y_mean_var = y_mean_var
        self._y_std_var = y_std_var
    def __init__(
            self,
            env_spec,
            env,
            pkl_path=None,
            json_path=None,
            npz_path=None,
            trainable_snn=True,
            ##CF - latents units at the input
            latent_dim=3,  # we keep all these as the dim of the output of the other MLP and others that we will need!
            latent_name='categorical',
            bilinear_integration=False,  # again, needs to match!
            resample=False,  # this can change: frequency of resampling the latent?
            hidden_sizes_snn=(32, 32),
            hidden_sizes_selector=(10, 10),
            external_latent=False,
            learn_std=True,
            init_std=1.0,
            adaptive_std=False,
            std_share_network=False,
            std_hidden_sizes=(32, 32),
            std_hidden_nonlinearity=NL.tanh,
            hidden_nonlinearity=NL.tanh,
            output_nonlinearity=None,
            min_std=1e-4,
    ):
        self.latent_dim = latent_dim  ## could I avoid needing this self for the get_action?
        self.latent_name = latent_name
        self.bilinear_integration = bilinear_integration
        self.resample = resample
        self.min_std = min_std
        self.hidden_sizes_snn = hidden_sizes_snn
        self.hidden_sizes_selector = hidden_sizes_selector

        self.pre_fix_latent = np.array([])  # if this is not empty when using reset() it will use this latent
        self.latent_fix = np.array([])  # this will hold the latent variable sampled in reset()
        self.shared_latent_var = theano.shared(self.latent_fix)  # this is for the external latent! update that
        self._set_std_to_0 = False

        self.trainable_snn = trainable_snn
        self.external_latent = external_latent
        self.pkl_path = pkl_path
        self.json_path = json_path
        self.npz_path = npz_path
        self.old_policy = None

        if self.json_path:  # there is another block later, after defining all the NNs, that warm-starts the params of the SNN
            data = json.load(
                open(os.path.join(config.PROJECT_PATH, self.json_path), 'r'))  # I should do this with the json file
            self.old_policy_json = data['json_args']["policy"]
            self.latent_dim = self.old_policy_json['latent_dim']
            self.latent_name = self.old_policy_json['latent_name']
            self.bilinear_integration = self.old_policy_json['bilinear_integration']
            self.resample = self.old_policy_json['resample']  # this might not be needed...
            self.min_std = self.old_policy_json['min_std']
            self.hidden_sizes_snn = self.old_policy_json['hidden_sizes']
        elif self.pkl_path:
            data = joblib.load(os.path.join(config.PROJECT_PATH, self.pkl_path))
            self.old_policy = data["policy"]
            self.latent_dim = self.old_policy.latent_dim
            self.latent_name = self.old_policy.latent_name
            self.bilinear_integration = self.old_policy.bilinear_integration
            self.resample = self.old_policy.resample  # this might not be needed...
            self.min_std = self.old_policy.min_std
            self.hidden_sizes_snn = self.old_policy.hidden_sizes

        if self.latent_name == 'normal':
            self.latent_dist = DiagonalGaussian(self.latent_dim)
            self.latent_dist_info = dict(mean=np.zeros(self.latent_dim), log_std=np.zeros(self.latent_dim))
        elif self.latent_name == 'bernoulli':
            self.latent_dist = Bernoulli(self.latent_dim)
            self.latent_dist_info = dict(p=0.5 * np.ones(self.latent_dim))
        elif self.latent_name == 'categorical':
            self.latent_dist = Categorical(self.latent_dim)
            if self.latent_dim > 0:
                self.latent_dist_info = dict(prob=1. / self.latent_dim * np.ones(self.latent_dim))
            else:
                self.latent_dist_info = dict(prob=np.ones(self.latent_dim))  # this is an empty array
        else:
            raise NotImplementedError

        Serializable.quick_init(self, locals())
        assert isinstance(env_spec.action_space, Box)

        # retrieve dimensions and check consistency
        if isinstance(env, MazeEnv) or isinstance(env, GatherEnv):
            self.obs_robot_dim = env.robot_observation_space.flat_dim
            self.obs_maze_dim = env.maze_observation_space.flat_dim
        elif isinstance(env, NormalizedEnv):
            if isinstance(env.wrapped_env, MazeEnv) or isinstance(env.wrapped_env, GatherEnv):
                self.obs_robot_dim = env.wrapped_env.robot_observation_space.flat_dim
                self.obs_maze_dim = env.wrapped_env.maze_observation_space.flat_dim
            else:
                self.obs_robot_dim = env.wrapped_env.observation_space.flat_dim
                self.obs_maze_dim = 0
        else:
            self.obs_robot_dim = env.observation_space.flat_dim
            self.obs_maze_dim = 0
        # print("the dims of the env are(rob/maze): ", self.obs_robot_dim, self.obs_maze_dim)
        all_obs_dim = env_spec.observation_space.flat_dim
        assert all_obs_dim == self.obs_robot_dim + self.obs_maze_dim

        if self.external_latent:  # in case we want to fix the latent externally
            l_all_obs_var = L.InputLayer(shape=(None,) + (self.obs_robot_dim + self.obs_maze_dim,))
            all_obs_var = l_all_obs_var.input_var
            # l_selection = ConstOutputLayer(incoming=l_all_obs_var, output_var=self.shared_latent_var)
            l_selection = ParamLayer(incoming=l_all_obs_var, num_units=self.latent_dim, param=self.shared_latent_var,
                                     trainable=False) # Rui: change False to True? this is a simple layer that directly outputs self.shared_latent_var
            selection_var = L.get_output(l_selection)

        else:
            # create network with softmax output: it will be the latent 'selector'!
            latent_selection_network = MLP(
                input_shape=(self.obs_robot_dim + self.obs_maze_dim,),
                output_dim=self.latent_dim,
                hidden_sizes=self.hidden_sizes_selector,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=NL.softmax,
            )
            l_all_obs_var = latent_selection_network.input_layer
            all_obs_var = latent_selection_network.input_layer.input_var

            # collect the output to select the behavior of the robot controller (equivalent to latents)
            l_selection = latent_selection_network.output_layer
            selection_var = L.get_output(l_selection)

        # split all_obs into the robot and the maze obs --> ROBOT goes first!!
        l_obs_robot = CropLayer(l_all_obs_var, start_index=None, end_index=self.obs_robot_dim)
        l_obs_maze = CropLayer(l_all_obs_var, start_index=self.obs_robot_dim, end_index=None)
        # for _ in range(10):
        #     print("OK!")
        # print(self.obs_robot_dim)
        # print(self.obs_maze_dim)

        obs_robot_var = all_obs_var[:, :self.obs_robot_dim]
        obs_maze_var = all_obs_var[:, self.obs_robot_dim:]

        # Enlarge obs with the selectors (or latents). Here just computing the final input dim
        if self.bilinear_integration:
            l_obs_snn = BilinearIntegrationLayer([l_obs_robot, l_selection])
        else:
            l_obs_snn = L.ConcatLayer([l_obs_robot, l_selection])

        action_dim = env_spec.action_space.flat_dim

        # create the action network
        mean_network = MLP(
            input_layer=l_obs_snn,  # input the layer that handles the integration of the selector
            output_dim=action_dim,
            hidden_sizes=self.hidden_sizes_snn,
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=output_nonlinearity,
            name="meanMLP",
        )

        self._layers_mean = mean_network.layers
        l_mean = mean_network.output_layer

        if adaptive_std:
            log_std_network = MLP(
                input_layer=l_obs_snn,
                output_dim=action_dim,
                hidden_sizes=std_hidden_sizes,
                hidden_nonlinearity=std_hidden_nonlinearity,
                output_nonlinearity=None,
                name="log_stdMLP"
            )
            l_log_std = log_std_network.output_layer
            self._layers_log_std = log_std_network.layers
        else:
            l_log_std = ParamLayer(
                incoming=mean_network.input_layer,
                num_units=action_dim,
                param=lasagne.init.Constant(np.log(init_std)),
                name="output_log_std",
                trainable=learn_std,
            )
            self._layers_log_std = [l_log_std]

        self._layers_snn = self._layers_mean + self._layers_log_std  # this returns a list with the "snn" layers

        if not self.trainable_snn:
            for layer in self._layers_snn:
                for param, tags in layer.params.items():  # layer.params is an OrderedDict: key=the shared var, val=tags
                    tags.remove("trainable")

        if self.json_path and self.npz_path:
            warm_params_dict = dict(np.load(os.path.join(config.PROJECT_PATH, self.npz_path)))
            # keys = list(param_dict.keys())
            self.set_params_snn(warm_params_dict)
        elif self.pkl_path:
            data = joblib.load(os.path.join(config.PROJECT_PATH, self.pkl_path))
            warm_params = data['policy'].get_params_internal()
            self.set_params_snn(warm_params)

        mean_var, log_std_var = L.get_output([l_mean, l_log_std])

        if self.min_std is not None:
            log_std_var = TT.maximum(log_std_var, np.log(self.min_std))

        self._l_mean = l_mean
        self._l_log_std = l_log_std

        self._dist = DiagonalGaussian(action_dim)

        LasagnePowered.__init__(self, [l_mean, l_log_std])
        super(GaussianMLPPolicy_snn_hier, self).__init__(env_spec)

        # debug
        obs_snn_var = L.get_output(l_obs_snn)
        self._l_obs_snn = ext.compile_function(
            inputs=[all_obs_var],
            outputs=obs_snn_var,
        )
        # self._log_std = ext.compile_function(
        #     inputs=[all_obs_var],
        #     outputs=log_std_var,
        # )
        self._mean = ext.compile_function(
            inputs=[all_obs_var],
            outputs=mean_var,
        )

        self._f_dist = ext.compile_function(
            inputs=[all_obs_var],
            outputs=[mean_var, log_std_var],
        )
        # if I want to monitor the selector output
        self._f_select = ext.compile_function(
            inputs=[all_obs_var],
            outputs=selection_var,
        )
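
        # hedged usage note (hypothetical `policy` / `obs` objects, not part of the original source):
        # the compiled self._f_select above lets one monitor which skill the selector picks, e.g.
        #     probs = policy._f_select([obs])[0]   # softmax over the latent_dim skills
        #     skill = np.argmax(probs)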