class GaussianMLPPolicy_snn_restorable(StochasticPolicy, LasagnePowered, Serializable):
    """
    Gaussian MLP policy conditioned on a latent variable, optionally warm-started from a pickle.

    This stochastic policy allows to pick the latent distribution (Categorical in the paper),
    its dimension and its integration with the observations. The network input is the robot
    part of the observation extended with the latent (plain concatenation, or concatenation
    plus the flattened outer product when ``bilinear_integration`` is set).

    When ``pkl_path`` is given, the latent configuration and the network parameters are taken
    from the policy stored in that file.
    """

    @autoargs.arg('hidden_sizes', type=int, nargs='*',
                  help='list of sizes for the fully-connected hidden layers')
    @autoargs.arg('std_sizes', type=int, nargs='*',
                  help='list of sizes for the fully-connected layers for std, note'
                       'there is a difference in semantics than above: here an empty'
                       'list means that std is independent of input and the last size is ignored')
    @autoargs.arg('initial_std', type=float,
                  help='Initial std')
    @autoargs.arg('std_trainable', type=bool,
                  help='Is std trainable')
    @autoargs.arg('output_nl', type=str,
                  help='nonlinearity for the output layer')
    @autoargs.arg('nonlinearity', type=str,
                  help='nonlinearity used for each hidden layer, can be one '
                       'of tanh, sigmoid')
    @autoargs.arg('bn', type=bool,
                  help='whether to apply batch normalization to hidden layers')
    def __init__(
            self,
            env_spec,
            env,
            latent_dim=2,
            latent_name='bernoulli',
            bilinear_integration=False,
            resample=False,
            hidden_sizes=(32, 32),
            learn_std=True,
            init_std=1.0,
            adaptive_std=False,
            std_share_network=False,
            std_hidden_sizes=(32, 32),
            std_hidden_nonlinearity=NL.tanh,
            hidden_nonlinearity=NL.tanh,
            output_nonlinearity=None,
            min_std=1e-4,
            pkl_path=None,
    ):
        """
        :param env_spec: environment spec; its action space must be a Box
        :param env: environment, used to read the robot / maze observation dimensions
        :param latent_dim: dimension of the latent variables
        :param latent_name: distribution of the latent variables ('normal', 'bernoulli' or 'categorical')
        :param bilinear_integration: Boolean indicator of bilinear integration or simple concatenation
        :param resample: Boolean indicator of resampling at every step or only at the start of the rollout (or
         whenever agent is reset, which can happen several times along the rollout with rollout in utils_snn)
        :param hidden_sizes: sizes of the hidden layers of the mean network
        :param learn_std: whether the state-independent log-std parameter is trainable
        :param init_std: initial std of the state-independent log-std parameter
        :param adaptive_std: if True the log-std is produced by its own MLP on the same input
        :param std_share_network: accepted for interface compatibility; not used in this class
        :param std_hidden_sizes: hidden sizes of the log-std MLP (only used when adaptive_std)
        :param std_hidden_nonlinearity: hidden nonlinearity of the log-std MLP (only used when adaptive_std)
        :param hidden_nonlinearity: nonlinearity used for each hidden layer of the mean network
        :param output_nonlinearity: nonlinearity of the output layer of the mean network
        :param min_std: lower bound applied to the std (through its log); None disables the bound
        :param pkl_path: path, relative to config.PROJECT_PATH, of a pickle whose "policy" entry is used to
         override the latent settings and to warm-start the network parameters
        :raises NotImplementedError: if the latent distribution name is not supported
        """
        self.latent_dim = latent_dim
        self.latent_name = latent_name
        self.bilinear_integration = bilinear_integration
        self.resample = resample
        self.min_std = min_std
        self.hidden_sizes = hidden_sizes

        # if this is not empty when using reset() it will use this latent
        self.pre_fix_latent = np.array([])
        # this will hold the latent variable sampled in reset()
        self.latent_fix = np.array([])

        self._set_std_to_0 = False

        self.pkl_path = pkl_path
        self.old_policy = None

        if self.pkl_path:
            # The restored policy dictates the latent configuration.
            data = joblib.load(os.path.join(config.PROJECT_PATH, self.pkl_path))
            self.old_policy = data["policy"]
            self.latent_dim = self.old_policy.latent_dim
            self.latent_name = self.old_policy.latent_name
            self.bilinear_integration = self.old_policy.bilinear_integration
            self.resample = self.old_policy.resample
            self.min_std = self.old_policy.min_std
            # NOTE(review): the networks below are built with the `hidden_sizes` argument, not with
            # this restored value; they presumably have to match for the warm start -- confirm.
            self.hidden_sizes_snn = self.old_policy.hidden_sizes

        # Use the (possibly restored) self.latent_name so the distribution always agrees with
        # self.latent_dim; the raw argument may differ from what the pickle was trained with.
        if self.latent_name == 'normal':
            self.latent_dist = DiagonalGaussian(self.latent_dim)
            self.latent_dist_info = dict(mean=np.zeros(self.latent_dim),
                                         log_std=np.zeros(self.latent_dim))
        elif self.latent_name == 'bernoulli':
            self.latent_dist = Bernoulli(self.latent_dim)
            self.latent_dist_info = dict(p=0.5 * np.ones(self.latent_dim))
        elif self.latent_name == 'categorical':
            self.latent_dist = Categorical(self.latent_dim)
            if self.latent_dim > 0:
                self.latent_dist_info = dict(prob=1. / self.latent_dim * np.ones(self.latent_dim))
            else:
                # latent_dim == 0: this is an empty array
                self.latent_dist_info = dict(prob=np.ones(self.latent_dim))
        else:
            raise NotImplementedError

        Serializable.quick_init(self, locals())
        assert isinstance(env_spec.action_space, Box)

        # retrieve the robot / maze observation dimensions from the env
        if isinstance(env, (MazeEnv, GatherEnv)):
            self.obs_robot_dim = env.robot_observation_space.flat_dim
            self.obs_maze_dim = env.maze_observation_space.flat_dim
        elif isinstance(env, NormalizedEnv):
            if isinstance(env.wrapped_env, (MazeEnv, GatherEnv)):
                self.obs_robot_dim = env.wrapped_env.robot_observation_space.flat_dim
                self.obs_maze_dim = env.wrapped_env.maze_observation_space.flat_dim
            else:
                self.obs_robot_dim = env.wrapped_env.observation_space.flat_dim
                self.obs_maze_dim = 0
        else:
            self.obs_robot_dim = env.observation_space.flat_dim
            self.obs_maze_dim = 0
        all_obs_dim = env_spec.observation_space.flat_dim
        assert all_obs_dim == self.obs_robot_dim + self.obs_maze_dim

        # The network only sees the robot observation plus the latent (plus their outer
        # product in the bilinear case).
        if self.bilinear_integration:
            obs_dim = self.obs_robot_dim + self.latent_dim + \
                self.obs_robot_dim * self.latent_dim
        else:
            obs_dim = self.obs_robot_dim + self.latent_dim  # here only if concat.

        action_dim = env_spec.action_space.flat_dim

        mean_network = MLP(
            input_shape=(obs_dim,),
            output_dim=action_dim,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=output_nonlinearity,
            name="meanMLP",
        )
        self._layers_mean = mean_network.layers
        l_mean = mean_network.output_layer
        obs_var = mean_network.input_layer.input_var

        if adaptive_std:
            log_std_network = MLP(
                input_shape=(obs_dim,),
                input_var=obs_var,
                output_dim=action_dim,
                hidden_sizes=std_hidden_sizes,
                hidden_nonlinearity=std_hidden_nonlinearity,
                output_nonlinearity=None,
                name="log_stdMLP",
            )
            l_log_std = log_std_network.output_layer
            self._layers_log_std = log_std_network.layers
        else:
            l_log_std = ParamLayer(
                mean_network.input_layer,
                num_units=action_dim,
                param=lasagne.init.Constant(np.log(init_std)),
                name="output_log_std",
                trainable=learn_std,
            )
            self._layers_log_std = [l_log_std]

        # list with all the "snn" layers (mean + log_std)
        self._layers_snn = self._layers_mean + self._layers_log_std

        if self.pkl_path:
            # Warm start from the policy already loaded above (no second read of the file).
            warm_params = self.old_policy.get_params_internal()
            self.set_params_snn(warm_params)

        mean_var, log_std_var = L.get_output([l_mean, l_log_std])

        if self.min_std is not None:
            log_std_var = TT.maximum(log_std_var, np.log(self.min_std))

        self._l_mean = l_mean
        self._l_log_std = l_log_std

        self._dist = DiagonalGaussian(action_dim)

        LasagnePowered.__init__(self, [l_mean, l_log_std])
        super(GaussianMLPPolicy_snn_restorable, self).__init__(env_spec)

        # obs_var here is the *extended* observation (robot obs + latent [+ bilinear term]).
        self._f_dist = ext.compile_function(
            inputs=[obs_var],
            outputs=[mean_var, log_std_var],
        )

    @property
    def latent_space(self):
        """Box space of shape (1,) with infinite bounds."""
        return Box(low=-np.inf, high=np.inf, shape=(1,))

    def dist_info_sym(self, obs_var, latent_var=None):
        """
        Build the symbolic mean / log_std for the given observations and latents.

        :param obs_var: symbolic batch of observations. NOTE: unlike get_actions, the observations
         are used as given here (they are not cropped to the robot part).
        :param latent_var: symbolic batch of latents; when None, the currently fixed latent
         (self.latent_fix) is repeated for every row of obs_var
        :return: dict with the symbolic "mean" and "log_std"
        """
        if latent_var is None:
            # avoid putting the latent as an input: just take the one that is fixed
            latent_var1 = theano.shared(np.expand_dims(self.latent_fix, axis=0))
            latent_var = TT.tile(latent_var1, [obs_var.shape[0], 1])

        # generate the generalized input (append latents to obs.)
        if self.bilinear_integration:
            extended_obs_var = TT.concatenate(
                [obs_var, latent_var,
                 TT.flatten(obs_var[:, :, np.newaxis] * latent_var[:, np.newaxis, :], outdim=2)],
                axis=1)
        else:
            extended_obs_var = TT.concatenate([obs_var, latent_var], axis=1)
        mean_var, log_std_var = L.get_output([self._l_mean, self._l_log_std], extended_obs_var)
        if self.min_std is not None:
            log_std_var = TT.maximum(log_std_var, np.log(self.min_std))
        return dict(mean=mean_var, log_std=log_std_var)

    @overrides
    def get_action(self, observation):
        """Return the action and the agent info for a single observation."""
        actions, outputs = self.get_actions([observation])
        return actions[0], {k: v[0] for k, v in outputs.items()}

    def get_actions(self, observations):
        """
        Sample one action per observation.

        :param observations: sequence of flat observations; only the first obs_robot_dim entries
         of each one are fed to the network
        :return: (actions, dict(mean, log_std, latents)), one row per observation
        """
        # Crop every observation to its robot part (not just the first one), and make it a
        # 2-D array, which is needed for the outer product of the bilinear integration.
        observations = np.array([obs[:self.obs_robot_dim] for obs in observations])
        if self.latent_dim:
            if self.resample:
                # np.array so that the bilinear branch can index it as a 2-D array
                latents = np.array(
                    [self.latent_dist.sample(self.latent_dist_info) for _ in observations])
                print('resampling the latents')
            else:
                # we decide to reset based on whether there is something in latent_fix
                if not np.size(self.latent_fix) == self.latent_dim:
                    self.reset()
                # if we have a pre_fix, reset will put the latent to it
                if len(self.pre_fix_latent) == self.latent_dim:
                    self.reset()  # this overwrites the latent sampled or in latent_fix
                latents = np.tile(self.latent_fix, [len(observations), 1])
            if self.bilinear_integration:
                extended_obs = np.concatenate(
                    [observations, latents,
                     np.reshape(observations[:, :, np.newaxis] * latents[:, np.newaxis, :],
                                (observations.shape[0], -1))],
                    axis=1)
            else:
                extended_obs = np.concatenate([observations, latents], axis=1)
        else:
            latents = np.array([[]] * len(observations))
            extended_obs = observations
        # mean and log_std also depend on the latents (they are part of the network input)
        mean, log_std = self._f_dist(extended_obs)

        if self._set_std_to_0:
            actions = mean
            log_std = -1e6 * np.ones_like(log_std)
        else:
            rnd = np.random.normal(size=mean.shape)
            actions = rnd * np.exp(log_std) + mean
        return actions, dict(mean=mean, log_std=log_std, latents=latents)

    def get_params_snn(self):
        """Return the list of parameters of all the snn layers (mean and log_std)."""
        params = []
        for layer in self._layers_snn:
            params += layer.get_params()
        return params

    # another way will be to do as in parametrized.py and flatten_tensors (in numpy).
    # But with this I check names
    def set_params_snn(self, snn_params):
        """
        Set the snn parameters by name.

        :param snn_params: either a dict mapping parameter name to value, or a list of shared
         variables (their .name and .get_value() are used; this case is not further checked)
        :raises KeyError: if a local parameter name is missing from snn_params
        """
        if isinstance(snn_params, dict):
            params_value_by_name = snn_params
        elif isinstance(snn_params, (list, tuple)):
            params_value_by_name = {param.name: param.get_value() for param in snn_params}
        else:
            params_value_by_name = {}
            print("The snn_params was not understood!")

        local_params = self.get_params_snn()
        for param in local_params:
            param.set_value(params_value_by_name[param.name])

    def set_pre_fix_latent(self, latent):
        """Store the latent that reset() will use instead of sampling."""
        self.pre_fix_latent = np.array(latent)

    def unset_pre_fix_latent(self):
        """Clear the pre-fixed latent so that reset() samples again."""
        self.pre_fix_latent = np.array([])

    @contextmanager
    def fix_latent(self, latent):
        """Context manager that pre-fixes the latent for the duration of the block."""
        self.pre_fix_latent = np.array(latent)
        try:
            yield
        finally:
            # always clear it, even if the body raised
            self.pre_fix_latent = np.array([])

    @contextmanager
    def set_std_to_0(self):
        """Context manager that makes get_actions return the mean for the duration of the block."""
        self._set_std_to_0 = True
        try:
            yield
        finally:
            # always restore stochastic actions, even if the body raised
            self._set_std_to_0 = False

    @overrides
    def reset(self, force_resample_lat=False):
        """
        Executed at the start of every rollout. Will fix the latent if needed.

        Does nothing when resampling at every step or when there is no latent. Otherwise the
        latent is set to the pre-fixed one if there is any (and force_resample_lat is False),
        or sampled from the latent distribution.
        """
        if not self.resample and self.latent_dim:
            if self.pre_fix_latent.size > 0 and not force_resample_lat:
                self.latent_fix = self.pre_fix_latent
            else:
                self.latent_fix = self.latent_dist.sample(self.latent_dist_info)

    def log_diagnostics(self, paths):
        """Record the max / min / average policy std over all the paths."""
        log_stds = np.vstack([path["agent_infos"]["log_std"] for path in paths])
        logger.record_tabular('MaxPolicyStd', np.max(np.exp(log_stds)))
        logger.record_tabular('MinPolicyStd', np.min(np.exp(log_stds)))
        logger.record_tabular('AveragePolicyStd', np.mean(np.exp(log_stds)))

    @property
    def distribution(self):
        """Diagonal Gaussian distribution over actions."""
        return self._dist

    def log_likelihood(self, actions, agent_infos, action_only=True):
        """
        Log-likelihood of the actions given the mean / log_std in agent_infos.

        This assumes the latents FIX to whatever was sampled, hence only the mean and log_std
        are needed and no information about the latents.

        :raises NotImplementedError: if action_only is False
        """
        logli = self._dist.log_likelihood(actions, agent_infos)
        if not action_only:
            raise NotImplementedError
        return logli
class GaussianMLPPolicy_multi_hier(StochasticPolicy, LasagnePowered, Serializable):  # also inherits from Parametrized
    """
    Policy that joins several pre-trained policies and performs a linear combination of their output.
    If a selector is provided, the coefficients of the LC are externally given. Otherwise it's a MLP,
    in which case it can be trained end-to-end.
    """

    def __init__(
            self,
            env_spec,
            env,
            pkl_paths=(),
            json_paths=(),
            npz_paths=(),
            trainable_old=True,
            external_selector=False,
            hidden_sizes_selector=(10, 10),
            learn_std=True,
            init_std=1.0,
            adaptive_std=False,
            std_share_network=False,
            std_hidden_sizes=(32, 32),
            std_hidden_nonlinearity=NL.tanh,
            hidden_nonlinearity=NL.tanh,
            output_nonlinearity=None,
            min_std=1e-4,
    ):
        """
        :param env_spec: environment spec; its action space must be a Box
        :param env: environment, used to read the robot / maze observation dimensions
        :param pkl_paths: tuple/list of pkl paths (relative to config.PROJECT_PATH)
        :param json_paths: tuple/list of json paths; the hidden sizes of every old policy are read from them
        :param npz_paths: tuple/list of npz paths with the parameters of the old policies
        :param trainable_old: Are the old policies still trainable
        :param external_selector: is the linear combination of the old policies outputs fixed externally
        :param hidden_sizes_selector: list of sizes for the hidden layers of the selector MLP
        :param learn_std: Is std trainable
        :param init_std: Initial std
        :param adaptive_std: accepted for interface compatibility; not used in this class
        :param std_share_network: accepted for interface compatibility; not used in this class
        :param std_hidden_sizes: accepted for interface compatibility; not used in this class
        :param std_hidden_nonlinearity: accepted for interface compatibility; not used in this class
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :param output_nonlinearity: nonlinearity for the output layer of every old policy network
        :param min_std: whether to make sure that the std is at least some threshold value, to avoid
         numerical issues
        """
        # define where are the old policies to use and what to do with them:
        self.trainable_old = trainable_old  # whether to keep training the old policies loaded here
        self.pkl_paths = pkl_paths
        self.json_paths = json_paths
        self.npz_paths = npz_paths
        self.selector_dim = max(len(json_paths), len(pkl_paths))  # pkl could be zero if giving npz
        # if not use a selector NN here, just externally fixed selector variable:
        self.external_selector = external_selector
        # reset() copies this into selector_fix whenever it is not empty
        self.pre_fix_selector = np.zeros((self.selector_dim))
        # this will hold the selector value set in reset()
        self.selector_fix = np.zeros((self.selector_dim))
        # shared variable read by the external selector layer; updated in reset()
        self.shared_selector_var = theano.shared(self.selector_fix)
        # else, describe the MLP used:
        self.hidden_sizes_selector = hidden_sizes_selector  # size of the selector NN defined here
        self.min_std = min_std
        self._set_std_to_0 = False

        # not checking that all the old policies have this act_dim
        self.action_dim = env_spec.action_space.flat_dim

        self.old_hidden_sizes = []
        # assume json always given
        for json_path in self.json_paths:
            # `with` so the file handle is closed right after parsing
            with open(os.path.join(config.PROJECT_PATH, json_path), 'r') as json_file:
                data = json.load(json_file)
            old_json_policy = data['json_args']["policy"]
            self.old_hidden_sizes.append(old_json_policy['hidden_sizes'])

        # retrieve dimensions and check consistency
        if isinstance(env, (MazeEnv, GatherEnv)):
            self.obs_robot_dim = env.robot_observation_space.flat_dim
            self.obs_maze_dim = env.maze_observation_space.flat_dim
        elif isinstance(env, NormalizedEnv):
            if isinstance(env.wrapped_env, (MazeEnv, GatherEnv)):
                self.obs_robot_dim = env.wrapped_env.robot_observation_space.flat_dim
                self.obs_maze_dim = env.wrapped_env.maze_observation_space.flat_dim
            else:
                self.obs_robot_dim = env.wrapped_env.observation_space.flat_dim
                self.obs_maze_dim = 0
        else:
            self.obs_robot_dim = env.observation_space.flat_dim
            self.obs_maze_dim = 0
        all_obs_dim = env_spec.observation_space.flat_dim
        assert all_obs_dim == self.obs_robot_dim + self.obs_maze_dim

        Serializable.quick_init(self, locals())
        assert isinstance(env_spec.action_space, Box)

        if self.external_selector:  # in case we want to fix the selector externally
            l_all_obs_var = L.InputLayer(shape=(None,) + (self.obs_robot_dim + self.obs_maze_dim,))
            all_obs_var = l_all_obs_var.input_var
            l_selection = ParamLayer(incoming=l_all_obs_var, num_units=self.selector_dim,
                                     param=self.shared_selector_var, trainable=False)
            selection_var = L.get_output(l_selection)
        else:
            # create network with softmax output: it will be the selector!
            selector_network = MLP(
                input_shape=(self.obs_robot_dim + self.obs_maze_dim,),
                output_dim=self.selector_dim,
                hidden_sizes=self.hidden_sizes_selector,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=NL.softmax,
            )
            l_all_obs_var = selector_network.input_layer
            all_obs_var = selector_network.input_layer.input_var

            # collect the output to select the behavior of the robot controller
            l_selection = selector_network.output_layer
            selection_var = L.get_output(l_selection)

        # the old policies only see the robot part of the observation --> ROBOT goes first!!
        l_obs_robot = CropLayer(l_all_obs_var, start_index=None, end_index=self.obs_robot_dim)

        # create the action networks (kept on self so they can be accessed later, e.g. from reset)
        self.old_l_means = []
        self.old_l_log_stds = []
        self.old_layers = []
        for i in range(self.selector_dim):
            mean_network = MLP(
                input_layer=l_obs_robot,
                output_dim=self.action_dim,
                hidden_sizes=self.old_hidden_sizes[i],
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=output_nonlinearity,
                name="meanMLP{}".format(i),
            )
            self.old_l_means.append(mean_network.output_layer)
            self.old_layers += mean_network.layers

            l_log_std = ParamLayer(
                incoming=mean_network.input_layer,
                num_units=self.action_dim,
                param=lasagne.init.Constant(np.log(init_std)),
                name="output_log_std{}".format(i),
                trainable=learn_std,
            )
            self.old_l_log_stds.append(l_log_std)
            self.old_layers += [l_log_std]

        if not self.trainable_old:
            for layer in self.old_layers:
                # layer.params maps every shared variable to its set of tags
                for tags in layer.params.values():
                    # discard (not remove): a param may already lack the tag,
                    # e.g. the log_std param when learn_std is False
                    tags.discard("trainable")

        # load the parameters of the old policies, renamed to match the i-th sub-network
        if self.json_paths and self.npz_paths:
            old_params_dict = {}
            for i, npz_path in enumerate(self.npz_paths):
                params_dict = dict(np.load(os.path.join(config.PROJECT_PATH, npz_path)))
                for key, value in params_dict.items():
                    old_params_dict[self._indexed_param_name(key, i)] = value
            self.set_old_params(old_params_dict)
        elif self.pkl_paths:
            old_params_dict = {}
            for i, pkl_path in enumerate(self.pkl_paths):
                data = joblib.load(os.path.join(config.PROJECT_PATH, pkl_path))
                params = data['policy'].get_params_internal()
                for param in params:
                    old_params_dict[self._indexed_param_name(param.name, i)] = param.get_value()
            self.set_old_params(old_params_dict)

        # new layers actually selecting the correct output
        l_mean = SumProdLayer(self.old_l_means + [l_selection])
        l_log_std = SumProdLayer(self.old_l_log_stds + [l_selection])
        mean_var, log_std_var = L.get_output([l_mean, l_log_std])

        if self.min_std is not None:
            log_std_var = TT.maximum(log_std_var, np.log(self.min_std))

        self._l_mean = l_mean
        self._l_log_std = l_log_std

        self._dist = DiagonalGaussian(self.action_dim)

        LasagnePowered.__init__(self, [l_mean, l_log_std])
        super(GaussianMLPPolicy_multi_hier, self).__init__(env_spec)

        self._f_old_means = ext.compile_function(
            inputs=[all_obs_var],
            outputs=[L.get_output(l_old_mean) for l_old_mean in self.old_l_means]
        )

        self._f_all_inputs = ext.compile_function(
            inputs=[all_obs_var],
            outputs=[L.get_output(l_old_mean) for l_old_mean in self.old_l_means] + [selection_var]
        )

        self._f_dist = ext.compile_function(
            inputs=[all_obs_var],
            outputs=[mean_var, log_std_var],
        )
        # if I want to monitor the selector output
        self._f_select = ext.compile_function(
            inputs=[all_obs_var],
            outputs=selection_var,
        )

    @staticmethod
    def _indexed_param_name(name, index):
        """Map a parameter name of a single old policy to its name in the index-th sub-network."""
        if name == 'output_log_std.param':
            return 'output_log_std{}.param'.format(index)
        if name[:8] == 'meanMLP_':
            return 'meanMLP{}_'.format(index) + name[8:]
        return 'meanMLP{}_'.format(index) + name

    def get_old_params(self):
        """Return the list of parameters of all the old-policy layers."""
        params = []
        for layer in self.old_layers:
            params += layer.get_params()
        return params

    # another way will be to do as in parametrized.py and flatten_tensors (in numpy).
    # But with this I check names
    def set_old_params(self, old_params):
        """
        Set the parameters of the old-policy layers by name.

        :param old_params: either a dict mapping parameter name to value, or a list of shared
         variables (their .name and .get_value() are used)
        :raises KeyError: if a local parameter name is missing from old_params
        """
        if isinstance(old_params, dict):
            params_value_by_name = old_params
        elif isinstance(old_params, (list, tuple)):
            params_value_by_name = {param.name: param.get_value() for param in old_params}
        else:
            params_value_by_name = {}
            print("The old_params was not understood!")

        local_params = self.get_old_params()
        for param in local_params:
            param.set_value(params_value_by_name[param.name])

    def dist_info_sym(self, obs_var, state_info_var=None):
        """Return a dict with the symbolic "mean" and "log_std" for the given observations."""
        mean_var, log_std_var = L.get_output([self._l_mean, self._l_log_std], obs_var)
        if self.min_std is not None:
            log_std_var = TT.maximum(log_std_var, np.log(self.min_std))
        return dict(mean=mean_var, log_std=log_std_var)

    @overrides
    def get_action(self, observation):
        """Return the action and the agent info for a single observation."""
        actions, outputs = self.get_actions([observation])
        return actions[0], {k: v[0] for k, v in outputs.items()}

    def get_actions(self, observations):
        """
        Sample one action per observation.

        :return: (actions, dict(mean, log_std, selectors)); with set_std_to_0 active the action
         is the mean and the reported log_std is set to -1e6
        """
        selector_output = self._f_select(observations)
        mean, log_std = self._f_dist(observations)

        if self._set_std_to_0:
            actions = mean
            log_std = -1e6 * np.ones_like(log_std)
        else:
            rnd = np.random.normal(size=mean.shape)
            actions = rnd * np.exp(log_std) + mean
        return actions, dict(mean=mean, log_std=log_std, selectors=selector_output)

    def set_pre_fix_selector(self, selector):
        """Store the selector that reset() will copy into selector_fix."""
        self.pre_fix_selector = np.array(selector)

    def unset_pre_fix_selector(self):
        """Clear the pre-fixed selector (reset() then keeps the current selector_fix)."""
        self.pre_fix_selector = np.array([])

    @contextmanager
    def fix_selector(self, selector):
        """Context manager that pre-fixes the selector for the duration of the block."""
        self.pre_fix_selector = np.array(selector)
        try:
            yield
        finally:
            # always clear it, even if the body raised
            self.pre_fix_selector = np.array([])

    @contextmanager
    def set_std_to_0(self):
        """Context manager that makes get_actions return the mean for the duration of the block."""
        self._set_std_to_0 = True
        try:
            yield
        finally:
            # always restore stochastic actions, even if the body raised
            self._set_std_to_0 = False

    @overrides
    def reset(self):
        """Executed at the start of every rollout. Will fix the selector if needed."""
        if self.pre_fix_selector.size > 0:
            self.selector_fix = self.pre_fix_selector
        # this is needed for the external selector
        self.shared_selector_var.set_value(np.array(self.selector_fix))

    def log_diagnostics(self, paths):
        """Record the max / min / average policy std over all the paths."""
        log_stds = np.vstack([path["agent_infos"]["log_std"] for path in paths])
        logger.record_tabular('MaxPolicyStd', np.max(np.exp(log_stds)))
        logger.record_tabular('MinPolicyStd', np.min(np.exp(log_stds)))
        logger.record_tabular('AveragePolicyStd', np.mean(np.exp(log_stds)))

    @property
    def distribution(self):
        """Diagonal Gaussian distribution over actions."""
        return self._dist

    def log_likelihood(self, actions, agent_infos, action_only=True):
        """
        Log-likelihood of the actions given the mean / log_std in agent_infos.

        This assumes the selectors FIX to whatever was sampled, hence only the mean and log_std
        are needed and no information about the selectors.

        :raises NotImplementedError: if action_only is False
        """
        logli = self._dist.log_likelihood(actions, agent_infos)
        if not action_only:
            raise NotImplementedError
        return logli
class GaussianMLPPolicy_snn_hier(StochasticPolicy, LasagnePowered, Serializable): # also inherits form Parametrized @autoargs.arg('hidden_sizes', type=int, nargs='*', help='list of sizes for the fully-connected hidden layers') @autoargs.arg('std_sizes', type=int, nargs='*', help='list of sizes for the fully-connected layers for std, note' 'there is a difference in semantics than above: here an empty' 'list means that std is independent of input and the last size is ignored') @autoargs.arg('initial_std', type=float, help='Initial std') @autoargs.arg('std_trainable', type=bool, help='Is std trainable') @autoargs.arg('output_nl', type=str, help='nonlinearity for the output layer') @autoargs.arg('nonlinearity', type=str, help='nonlinearity used for each hidden layer, can be one ' 'of tanh, sigmoid') @autoargs.arg('bn', type=bool, help='whether to apply batch normalization to hidden layers') def __init__( self, env_spec, env, pkl_path=None, json_path=None, npz_path=None, trainable_snn=True, ##CF - latents units at the input latent_dim=3, # we keep all these as the dim of the output of the other MLP and others that we will need! latent_name='categorical', bilinear_integration=False, # again, needs to match! resample=False, # this can change: frequency of resampling the latent? hidden_sizes_snn=(32, 32), hidden_sizes_selector=(10, 10), external_latent=False, learn_std=True, init_std=1.0, adaptive_std=False, std_share_network=False, std_hidden_sizes=(32, 32), std_hidden_nonlinearity=NL.tanh, hidden_nonlinearity=NL.tanh, output_nonlinearity=None, min_std=1e-4, ): self.latent_dim = latent_dim ## could I avoid needing this self for the get_action? 
self.latent_name = latent_name self.bilinear_integration = bilinear_integration self.resample = resample self.min_std = min_std self.hidden_sizes_snn = hidden_sizes_snn self.hidden_sizes_selector = hidden_sizes_selector self.pre_fix_latent = np.array([]) # if this is not empty when using reset() it will use this latent self.latent_fix = np.array([]) # this will hold the latents variable sampled in reset() self.shared_latent_var = theano.shared(self.latent_fix) # this is for external lat! update that self._set_std_to_0 = False self.trainable_snn = trainable_snn self.external_latent = external_latent self.pkl_path = pkl_path self.json_path = json_path self.npz_path = npz_path self.old_policy = None if self.json_path: # there is another one after defining all the NN to warm-start the params of the SNN data = json.load( open(os.path.join(config.PROJECT_PATH, self.json_path), 'r')) # I should do this with the json file self.old_policy_json = data['json_args']["policy"] self.latent_dim = self.old_policy_json['latent_dim'] self.latent_name = self.old_policy_json['latent_name'] self.bilinear_integration = self.old_policy_json['bilinear_integration'] self.resample = self.old_policy_json['resample'] # this could not be needed... self.min_std = self.old_policy_json['min_std'] self.hidden_sizes_snn = self.old_policy_json['hidden_sizes'] elif self.pkl_path: data = joblib.load(os.path.join(config.PROJECT_PATH, self.pkl_path)) self.old_policy = data["policy"] self.latent_dim = self.old_policy.latent_dim self.latent_name = self.old_policy.latent_name self.bilinear_integration = self.old_policy.bilinear_integration self.resample = self.old_policy.resample # this could not be needed... 
self.min_std = self.old_policy.min_std self.hidden_sizes_snn = self.old_policy.hidden_sizes if self.latent_name == 'normal': self.latent_dist = DiagonalGaussian(self.latent_dim) self.latent_dist_info = dict(mean=np.zeros(self.latent_dim), log_std=np.zeros(self.latent_dim)) elif self.latent_name == 'bernoulli': self.latent_dist = Bernoulli(self.latent_dim) self.latent_dist_info = dict(p=0.5 * np.ones(self.latent_dim)) elif self.latent_name == 'categorical': self.latent_dist = Categorical(self.latent_dim) if self.latent_dim > 0: self.latent_dist_info = dict(prob=1. / self.latent_dim * np.ones(self.latent_dim)) else: self.latent_dist_info = dict(prob=np.ones(self.latent_dim)) # this is an empty array else: raise NotImplementedError Serializable.quick_init(self, locals()) assert isinstance(env_spec.action_space, Box) # retrieve dimensions and check consistency if isinstance(env, MazeEnv) or isinstance(env, GatherEnv): self.obs_robot_dim = env.robot_observation_space.flat_dim self.obs_maze_dim = env.maze_observation_space.flat_dim elif isinstance(env, NormalizedEnv): if isinstance(env.wrapped_env, MazeEnv) or isinstance(env.wrapped_env, GatherEnv): self.obs_robot_dim = env.wrapped_env.robot_observation_space.flat_dim self.obs_maze_dim = env.wrapped_env.maze_observation_space.flat_dim else: self.obs_robot_dim = env.wrapped_env.observation_space.flat_dim self.obs_maze_dim = 0 else: self.obs_robot_dim = env.observation_space.flat_dim self.obs_maze_dim = 0 # print("the dims of the env are(rob/maze): ", self.obs_robot_dim, self.obs_maze_dim) all_obs_dim = env_spec.observation_space.flat_dim assert all_obs_dim == self.obs_robot_dim + self.obs_maze_dim if self.external_latent: # in case we want to fix the latent externally l_all_obs_var = L.InputLayer(shape=(None,) + (self.obs_robot_dim + self.obs_maze_dim,)) all_obs_var = l_all_obs_var.input_var # l_selection = ConstOutputLayer(incoming=l_all_obs_var, output_var=self.shared_latent_var) l_selection = 
ParamLayer(incoming=l_all_obs_var, num_units=self.latent_dim, param=self.shared_latent_var, trainable=False) # Rui: change False to True? this is a simple layer that directly outputs self.shared_latent_var selection_var = L.get_output(l_selection) else: # create network with softmax output: it will be the latent 'selector'! latent_selection_network = MLP( input_shape=(self.obs_robot_dim + self.obs_maze_dim,), output_dim=self.latent_dim, hidden_sizes=self.hidden_sizes_selector, hidden_nonlinearity=hidden_nonlinearity, output_nonlinearity=NL.softmax, ) l_all_obs_var = latent_selection_network.input_layer all_obs_var = latent_selection_network.input_layer.input_var # collect the output to select the behavior of the robot controller (equivalent to latents) l_selection = latent_selection_network.output_layer selection_var = L.get_output(l_selection) # split all_obs into the robot and the maze obs --> ROBOT goes first!! l_obs_robot = CropLayer(l_all_obs_var, start_index=None, end_index=self.obs_robot_dim) l_obs_maze = CropLayer(l_all_obs_var, start_index=self.obs_robot_dim, end_index=None) # for _ in range(10): # print("OK!") # print(self.obs_robot_dim) # print(self.obs_maze_dim) obs_robot_var = all_obs_var[:, :self.obs_robot_dim] obs_maze_var = all_obs_var[:, self.obs_robot_dim:] # Enlarge obs with the selectors (or latents). 
Here just computing the final input dim if self.bilinear_integration: l_obs_snn = BilinearIntegrationLayer([l_obs_robot, l_selection]) else: l_obs_snn = L.ConcatLayer([l_obs_robot, l_selection]) action_dim = env_spec.action_space.flat_dim # create the action network mean_network = MLP( input_layer=l_obs_snn, # input the layer that handles the integration of the selector output_dim=action_dim, hidden_sizes=self.hidden_sizes_snn, hidden_nonlinearity=hidden_nonlinearity, output_nonlinearity=output_nonlinearity, name="meanMLP", ) self._layers_mean = mean_network.layers l_mean = mean_network.output_layer if adaptive_std: log_std_network = MLP( input_layer=l_obs_snn, output_dim=action_dim, hidden_sizes=std_hidden_sizes, hidden_nonlinearity=std_hidden_nonlinearity, output_nonlinearity=None, name="log_stdMLP" ) l_log_std = log_std_network.output_layer self._layers_log_std = log_std_network.layers else: l_log_std = ParamLayer( incoming=mean_network.input_layer, num_units=action_dim, param=lasagne.init.Constant(np.log(init_std)), name="output_log_std", trainable=learn_std, ) self._layers_log_std = [l_log_std] self._layers_snn = self._layers_mean + self._layers_log_std # this returns a list with the "snn" layers if not self.trainable_snn: for layer in self._layers_snn: for param, tags in layer.params.items(): # params of layer are OrDict: key=the shared var, val=tags tags.remove("trainable") if self.json_path and self.npz_path: warm_params_dict = dict(np.load(os.path.join(config.PROJECT_PATH, self.npz_path))) # keys = list(param_dict.keys()) self.set_params_snn(warm_params_dict) elif self.pkl_path: data = joblib.load(os.path.join(config.PROJECT_PATH, self.pkl_path)) warm_params = data['policy'].get_params_internal() self.set_params_snn(warm_params) mean_var, log_std_var = L.get_output([l_mean, l_log_std]) if self.min_std is not None: log_std_var = TT.maximum(log_std_var, np.log(self.min_std)) self._l_mean = l_mean self._l_log_std = l_log_std self._dist = 
DiagonalGaussian(action_dim) LasagnePowered.__init__(self, [l_mean, l_log_std]) super(GaussianMLPPolicy_snn_hier, self).__init__(env_spec) # debug obs_snn_var = L.get_output(l_obs_snn) self._l_obs_snn = ext.compile_function( inputs=[all_obs_var], outputs=obs_snn_var, ) # self._log_std = ext.compile_function( # inputs=[all_obs_var], # outputs=log_std_var, # ) self._mean = ext.compile_function( inputs=[all_obs_var], outputs=mean_var, ) self._f_dist = ext.compile_function( inputs=[all_obs_var], outputs=[mean_var, log_std_var], ) # if I want to monitor the selector output self._f_select = ext.compile_function( inputs=[all_obs_var], outputs=selection_var, ) # # I shouldn't need the latent space anymore @property def latent_space(self): return Box(low=-np.inf, high=np.inf, shape=(1,)) def get_params_snn(self): params = [] for layer in self._layers_snn: params += layer.get_params() return params # another way will be to do as in parametrized.py and flatten_tensors (in numpy). But with this I check names def set_params_snn(self, snn_params): if type( snn_params) is dict: # if the snn_params are a dict with the param name as key and a numpy array as value params_value_by_name = snn_params elif type(snn_params) is list: # if the snn_params are a list of theano variables **NOT CHECKING THIS!!** params_value_by_name = {} for param in snn_params: # print("old", param.name) params_value_by_name[param.name] = param.get_value() else: params_value_by_name = {} print("The snn_params was not understood!") local_params = self.get_params_snn() for param in local_params: # print("new", param.name) param.set_value(params_value_by_name[param.name]) def dist_info_sym(self, obs_var, state_info_var=None): mean_var, log_std_var = L.get_output([self._l_mean, self._l_log_std], obs_var) if self.min_std is not None: log_std_var = TT.maximum(log_std_var, np.log(self.min_std)) return dict(mean=mean_var, log_std=log_std_var) @overrides def get_action(self, observation): # print("obeservation", 
len(observation)) # print("env", env) actions, outputs = self.get_actions([observation]) return actions[0], {k: v[0] for k, v in outputs.items()} def get_actions(self, observations): selector_output = self._f_select(observations) # print("obeservation", len(observations)) mean, log_std = self._f_dist(observations) if self._set_std_to_0: actions = mean log_std = -1e6 * np.ones_like(log_std) else: rnd = np.random.normal(size=mean.shape) actions = rnd * np.exp(log_std) + mean return actions, dict(mean=mean, log_std=log_std, latents=selector_output) def set_pre_fix_latent(self, latent): self.pre_fix_latent = np.array(latent) def unset_pre_fix_latent(self): self.pre_fix_latent = np.array([]) @contextmanager def fix_latent(self, latent): self.pre_fix_latent = np.array(latent) yield self.pre_fix_latent = np.array([]) @contextmanager def set_std_to_0(self): self._set_std_to_0 = True yield self._set_std_to_0 = False @overrides def reset(self): # executed at the start of every rollout. Will fix the latent if needed. if not self.resample: if self.pre_fix_latent.size > 0: self.latent_fix = self.pre_fix_latent else: self.latent_fix = self.latent_dist.sample(self.latent_dist_info) else: pass # this is needed for the external latent!! self.shared_latent_var.set_value(np.array(self.latent_fix)) def log_diagnostics(self, paths): log_stds = np.vstack([path["agent_infos"]["log_std"] for path in paths]) logger.record_tabular('MaxPolicyStd', np.max(np.exp(log_stds))) logger.record_tabular('MinPolicyStd', np.min(np.exp(log_stds))) logger.record_tabular('AveragePolicyStd', np.mean(np.exp(log_stds))) @property def distribution(self): """ We set the distribution to the policy itself since we need some behavior different from a usual diagonal Gaussian distribution. """ return self._dist def log_likelihood(self, actions, agent_infos, action_only=True): # First compute logli of the action. 
This assumes the latents FIX to whatever was sampled, and hence we only # need to use the mean and log_std, but not any information about the latents logli = self._dist.log_likelihood(actions, agent_infos) if not action_only: raise NotImplementedError # if not action_only: # for idx, latent_name in enumerate(self._latent_distributions): # latent_var = dist_info["latent_%d" % idx] # prefix = "latent_%d_" % idx # latent_dist_info = {k[len(prefix):]: v for k, v in dist_info.iteritems() if k.startswith( # prefix)} # logli += latent_name.log_likelihood(latent_var, latent_dist_info) return logli
class StochasticGaussianMLPPolicy(StochasticPolicy, LasagnePowered, Serializable):
    """
    Gaussian policy whose action mean is produced by a StochasticMLP, i.e. a
    network exposing `latent_layers`.

    Every dist-info dict built by this class contains "mean" and "log_std"
    and, for the idx-th latent layer, the latent value under "latent_<idx>"
    plus that layer's distribution parameters under "latent_<idx>_<name>".
    """

    def __init__(
            self,
            env_spec,
            input_latent_vars=None,
            hidden_sizes=(32, 32),
            hidden_latent_vars=None,
            learn_std=True,
            init_std=1.0,
            hidden_nonlinearity=NL.tanh,
            output_nonlinearity=None,
    ):
        """
        :param env_spec: environment spec; its action space must be a Box
        :param input_latent_vars: forwarded unchanged to StochasticMLP
        :param hidden_sizes: sizes of the hidden layers of the mean network
        :param hidden_latent_vars: forwarded unchanged to StochasticMLP
        :param learn_std: whether the log-std parameter is trainable
        :param init_std: initial std; the log-std parameter starts at log(init_std)
        :param hidden_nonlinearity: nonlinearity of the hidden layers
        :param output_nonlinearity: nonlinearity of the mean output layer
        """
        # Must stay the first statement: it records the constructor arguments.
        Serializable.quick_init(self, locals())
        assert isinstance(env_spec.action_space, Box)

        n_obs = env_spec.observation_space.flat_dim
        n_act = env_spec.action_space.flat_dim

        # Network producing the mean of the action distribution.
        net = StochasticMLP(
            input_shape=(n_obs, ),
            input_latent_vars=input_latent_vars,
            output_dim=n_act,
            hidden_sizes=hidden_sizes,
            hidden_latent_vars=hidden_latent_vars,
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=output_nonlinearity,
        )

        # Log-std comes from a ParamLayer hung off the network's input layer,
        # initialised to log(init_std).
        log_std_layer = ParamLayer(
            net.input_layer,
            num_units=n_act,
            param=lasagne.init.Constant(np.log(init_std)),
            name="output_log_std",
            trainable=learn_std,
        )

        self._mean_network = net
        self._n_latent_layers = len(net.latent_layers)
        self._l_mean = net.output_layer
        self._l_log_std = log_std_layer

        LasagnePowered.__init__(self, [self._l_mean, self._l_log_std])
        super(StochasticGaussianMLPPolicy, self).__init__(env_spec)

        # Symbolic dist info on the network's own input variable; every key
        # other than the two Gaussian ones belongs to a latent layer.
        sym_infos = self.dist_info_sym(net.input_var)
        self._latent_keys = sorted(
            key for key in sym_infos if key not in ("mean", "log_std")
        )

        # Only the `extras` half of the result is needed here: it holds the
        # distribution object attached to each latent layer.
        _, extras = get_full_output(
            [self._l_mean, self._l_log_std] + net.latent_layers
        )
        self._latent_distributions = [
            extras[layer]["distribution"] for layer in net.latent_layers
        ]

        self._dist = DiagonalGaussian(n_act)

        self._f_dist_info = ext.compile_function(
            inputs=[net.input_layer.input_var],
            outputs=sym_infos,
        )
        # Compiled lazily, the first time dist_info() receives state infos.
        self._f_dist_info_givens = None

    @staticmethod
    def _latent_dist_info(infos, idx):
        """
        Collect the entries of `infos` whose key starts with "latent_<idx>_",
        returning them keyed by the remainder of the key.
        """
        prefix = "latent_%d_" % idx
        cut = len(prefix)
        return {
            key[cut:]: val
            for key, val in infos.items()
            if key.startswith(prefix)
        }

    def _compile_dist_info_givens(self):
        """
        Compile the function mapping (obs, latent_0, latent_1, ...) to the
        dist info obtained when the latent values are supplied as inputs.
        """
        obs_var = self._mean_network.input_var
        names = ["latent_%d" % idx for idx in range(self._n_latent_layers)]
        latent_vars = [TT.matrix(name) for name in names]
        return ext.compile_function(
            inputs=[obs_var] + latent_vars,
            outputs=self.dist_info_sym(obs_var, dict(zip(names, latent_vars))),
        )

    @property
    def latent_layers(self):
        """Latent layers of the underlying mean network."""
        return self._mean_network.latent_layers

    @property
    def latent_dims(self):
        """`latent_dims` of the underlying mean network."""
        return self._mean_network.latent_dims

    def dist_info(self, obs, state_infos=None):
        """
        Numerically evaluate the dist info for `obs`.

        :param obs: observations fed to the compiled function
        :param state_infos: optional mapping holding a "latent_<idx>" entry
         for every latent layer; when missing or empty the latent values are
         not supplied and the plain compiled function is used
        :return: whatever the compiled dist-info function returns
        """
        if state_infos is None or len(state_infos) == 0:
            return self._f_dist_info(obs)

        if self._f_dist_info_givens is None:
            self._f_dist_info_givens = self._compile_dist_info_givens()

        given_latents = [
            state_infos["latent_%d" % idx]
            for idx in range(self._n_latent_layers)
        ]
        return self._f_dist_info_givens(obs, *given_latents)

    def reset(self):
        """
        No-op. Placeholder for sampling a latent variable and storing it on
        the policy so that it can be reused afterwards.
        """

    def dist_info_sym(self, obs_var, state_info_vars=None):
        """
        Build the symbolic dist info for `obs_var`.

        :param obs_var: symbolic observations, bound to the network input
        :param state_info_vars: optional mapping with "latent_<idx>" entries
         (values given for the latent layers) and optionally
         "latent_<idx>_<name>" entries (dist info given for those layers)
        :return: dict with "mean", "log_std", one "latent_<idx>" per latent
         layer and one "latent_<idx>_<name>" per dist-info entry of that layer
        """
        layers = self._mean_network.latent_layers

        givens = dict()
        given_dist_infos = dict()
        if state_info_vars is not None:
            for idx, layer in enumerate(layers):
                givens[layer] = state_info_vars["latent_%d" % idx]
            for idx, layer in enumerate(layers):
                given_dist_infos[layer] = self._latent_dist_info(
                    state_info_vars, idx)

        all_outputs, extras = get_full_output(
            [self._l_mean, self._l_log_std] + layers,
            inputs={self._mean_network._l_in: obs_var},
            latent_givens=givens,
            latent_dist_infos=given_dist_infos,
        )

        # Outputs come back in the order the layers were requested:
        # mean, log_std, then one entry per latent layer.
        result = {"mean": all_outputs[0], "log_std": all_outputs[1]}
        layer_dist_infos = [extras[layer]["dist_info"] for layer in layers]
        for idx, (latent_var, layer_info) in enumerate(
                zip(all_outputs[2:], layer_dist_infos)):
            result["latent_%d" % idx] = latent_var
            for name, value in layer_info.items():
                result["latent_%d_%s" % (idx, name)] = value
        return result

    def kl_sym(self, old_dist_info_vars, new_dist_info_vars):
        """
        Symbolic KL divergence: the KL of the action distribution plus the KL
        of every latent distribution.
        """
        total = self._dist.kl_sym(old_dist_info_vars, new_dist_info_vars)
        for idx, latent_dist in enumerate(self._latent_distributions):
            total += latent_dist.kl_sym(
                self._latent_dist_info(old_dist_info_vars, idx),
                self._latent_dist_info(new_dist_info_vars, idx),
            )
        return total

    def likelihood_ratio_sym(self, action_var, old_dist_info_vars,
                             new_dist_info_vars):
        """
        Symbolic likelihood ratio: the ratio for the actions multiplied by the
        ratio of every latent variable, the latent values being read from
        `old_dist_info_vars`.
        """
        ratio = self._dist.likelihood_ratio_sym(
            action_var, old_dist_info_vars, new_dist_info_vars)
        for idx, latent_dist in enumerate(self._latent_distributions):
            ratio *= latent_dist.likelihood_ratio_sym(
                old_dist_info_vars["latent_%d" % idx],
                self._latent_dist_info(old_dist_info_vars, idx),
                self._latent_dist_info(new_dist_info_vars, idx),
            )
        return ratio

    def log_likelihood(self, actions, dist_info, action_only=False):
        """
        Log likelihood of the actions, to which the log likelihood of every
        latent variable is added unless `action_only` is true.

        :param actions: actions to score
        :param dist_info: dist-info dict in the layout produced by this policy
        :param action_only: when true, ignore the latent variables
        :return: the resulting log likelihood
        """
        total = self._dist.log_likelihood(actions, dist_info)
        if action_only:
            return total
        for idx, latent_dist in enumerate(self._latent_distributions):
            total += latent_dist.log_likelihood(
                dist_info["latent_%d" % idx],
                self._latent_dist_info(dist_info, idx),
            )
        return total

    def log_likelihood_sym(self, action_var, dist_info_vars):
        """
        Symbolic log likelihood of the actions plus that of every latent
        variable.
        """
        total = self._dist.log_likelihood_sym(action_var, dist_info_vars)
        for idx, latent_dist in enumerate(self._latent_distributions):
            total += latent_dist.log_likelihood_sym(
                dist_info_vars["latent_%d" % idx],
                self._latent_dist_info(dist_info_vars, idx),
            )
        return total

    def entropy(self, dist_info):
        """
        Entropy of the action distribution plus the entropy of every latent
        distribution.
        """
        total = self._dist.entropy(dist_info)
        for idx, latent_dist in enumerate(self._latent_distributions):
            total += latent_dist.entropy(self._latent_dist_info(dist_info, idx))
        return total

    @property
    def dist_info_keys(self):
        """Keys of a dist-info dict: the Gaussian ones, then the latent ones."""
        return ["mean", "log_std"] + self._latent_keys

    @overrides
    def get_action(self, observation):
        """
        Single-observation wrapper around get_actions().

        :return: (action, agent_info) for `observation`, i.e. the first row
         of every batched output
        """
        batch_actions, batch_infos = self.get_actions([observation])
        action = batch_actions[0]
        agent_info = {key: val[0] for key, val in batch_infos.items()}
        return action, agent_info

    def get_actions(self, observations):
        """
        Sample one action per observation.

        :return: (actions, dist_info), with actions = mean + noise * std and
         noise drawn from a standard normal of the same shape as the mean
        """
        infos = self._f_dist_info(observations)
        mean, log_std = infos["mean"], infos["log_std"]
        noise = np.random.normal(size=mean.shape)
        sampled = noise * np.exp(log_std) + mean
        return sampled, infos

    def log_diagnostics(self, paths):
        """Record the mean policy std over all steps of `paths`."""
        stacked_log_stds = np.vstack(
            [path["agent_infos"]["log_std"] for path in paths])
        average_std = np.mean(np.exp(stacked_log_stds))
        logger.record_tabular('AveragePolicyStd', average_std)

    @property
    def distribution(self):
        """
        The policy serves as its own distribution object: the KL, likelihood
        and entropy methods defined on this class also account for the latent
        variables, which a plain diagonal Gaussian would not.
        """
        return self

    @property
    def state_info_keys(self):
        """Sorted latent-related keys of the dist-info dict."""
        return self._latent_keys